Back to Changelog
EmpleoVino

New Minimalist Text Editor

Implemented Basecamp's Trix WYSIWYG text editor

Trix text editor screenshot

We were using an old version of django-prose-editor, and although it looked good in theory, simple and lightweight, it didn't work well in practice, and was impossible to upgrade to the latest version.

So I ended up switching to a custom implementation of Basecamp's Trix WYSIWYG text editor, which is also simple and lightweight, but has been around and well maintained for more than a decade (since at least 2015).

Issues that have been fixed

  • Styles didn't match the real end result
  • Keyboard shortcuts we expected to work didn't actually work
    • For example, pressing Tab or Shift+Tab to control the hierarchy of lists
  • When content was pasted, styles would break and it would be impossible to recover using the WYSIWYG buttons
  • Sometimes they would break even without pasting, just by clicking the buttons in a particular order

After implementing Trix, I had a bad surprise: the HTML it produces looks like this:

<div>
  <!--block-->
  First paragraph text...
  <br><br>
  Second paragraph text...
  <br><br><br><br> <!-- Multiple Enters -->
  Third paragraph text
</div>

Not only does pressing Enter not create a new block, but the block is not even a <p>; it is a <div>.

That might not be a big deal for a comment section or an internal system, but for a job board, accessibility and SEO are paramount.

I seriously considered moving to another editor, but in the end, the workaround was simple enough. I had gemini-2.5-pro write the cleaning method that is called on save(), and Claude Code implement and test it.

    def _clean_trix_html(self, trix_html: str) -> str:
        """
        Convert Trix's paragraph-like <div> blocks into semantic <p> tags.

        Trix wraps text in top-level <div> elements and represents a blank
        line as consecutive <br> tags instead of starting a new block. Each
        top-level <div> that holds only inline content is split on runs of
        two or more <br> tags (whitespace-only text between the <br> tags is
        tolerated), and every resulting segment becomes its own <p>. A
        single <br> is kept as a line break inside its paragraph.

        Top-level <div>s that contain block-level elements are left
        untouched, as is anything that is not a top-level <div>. A <div>
        holding nothing but whitespace, comments and <br> tags is removed.

        Args:
            trix_html: HTML produced by the Trix editor. Any falsy value
                (empty string, None) is accepted.

        Returns:
            The cleaned HTML as serialized by BeautifulSoup (void tags come
            out self-closed, e.g. <br/>), or "" for falsy input.
        """
        if not trix_html:
            return ""

        # Imported here so the method does not rely on Comment being part of
        # the module-level bs4 import.
        from bs4 import Comment

        # Tags that must never end up inside a <p>. A <div> containing any of
        # them is not a plain text block and is left exactly as it came in.
        block_tags = [
            'div', 'p', 'ul', 'ol', 'li', 'blockquote', 'pre',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'table', 'figure', 'hr',
        ]

        soup = BeautifulSoup(trix_html, 'html.parser')

        def is_br(node):
            return node.name == 'br'

        def is_blank_text(node):
            # Text nodes have no tag name. Comments are text nodes too, but
            # they are not whitespace, so they are excluded explicitly.
            return (
                node.name is None
                and not isinstance(node, Comment)
                and not str(node).strip()
            )

        def build_paragraph(nodes):
            # Comments (e.g. Trix's <!--block--> marker) render nothing, so a
            # segment made only of comments and whitespace yields no <p>.
            if not any(
                not isinstance(n, Comment) and str(n).strip() for n in nodes
            ):
                return None
            paragraph = soup.new_tag('p')
            for content_node in nodes:
                paragraph.append(content_node.extract())
            return paragraph

        # Only top-level divs are Trix text blocks; find_all returns a list,
        # so mutating the tree inside the loop is safe.
        for div in soup.find_all('div', recursive=False):
            if div.find(block_tags):
                continue

            # Drop divs that hold nothing visible (whitespace and/or <br>).
            has_text = bool(div.get_text().strip())
            if not has_text and all(is_br(tag) for tag in div.find_all()):
                div.extract()
                continue

            nodes = list(div.contents)
            paragraphs = []
            segment = []
            i = 0
            while i < len(nodes):
                node = nodes[i]

                if is_br(node):
                    # Measure the run of <br> tags starting here, looking
                    # through whitespace-only text between them. run_end
                    # stops right after the last <br> so that whitespace
                    # following the run stays with the next segment.
                    br_count = 1
                    run_end = i + 1
                    j = i + 1
                    while j < len(nodes) and (
                        is_br(nodes[j]) or is_blank_text(nodes[j])
                    ):
                        if is_br(nodes[j]):
                            br_count += 1
                            run_end = j + 1
                        j += 1

                    if br_count >= 2:
                        # Paragraph break: close the current segment and skip
                        # the whole run. The skipped nodes stay in the div,
                        # which is discarded below.
                        paragraph = build_paragraph(segment)
                        if paragraph is not None:
                            paragraphs.append(paragraph)
                        segment = []
                        i = run_end
                        continue

                # Regular content, or a lone <br> acting as a line break.
                segment.append(node)
                i += 1

            # Whatever is left after the last break is the final paragraph.
            paragraph = build_paragraph(segment)
            if paragraph is not None:
                paragraphs.append(paragraph)

            if paragraphs:
                div.replace_with(*paragraphs)
            else:
                # Nothing worth keeping was found in this div.
                div.extract()

        return str(soup)

And the tests

class JobPostHTMLCleaningTestCase(TestCase):
    """
    Tests the _clean_trix_html method for converting Trix editor's
    div-based output to semantic paragraph tags.

    Expected strings use BeautifulSoup's serialization, which self-closes
    void tags (<br> comes back as <br/>).
    """

    def setUp(self):
        # We only need a model instance to call the method,
        # it doesn't need to be saved to the database.
        self.job_post = JobPost()

    def test_simple_div_is_converted_to_p(self):
        """Test a single div with no breaks becomes a single p tag."""
        trix_html = "<div>Simple paragraph content.</div>"
        expected_html = "<p>Simple paragraph content.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_single_br_is_preserved_as_line_break(self):
        """Test that a single <br> is kept as a line break inside a <p> tag."""
        trix_html = "<div>First line.<br>Second line.</div>"
        # Note: BeautifulSoup will self-close the <br> tag.
        expected_html = "<p>First line.<br/>Second line.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_double_br_creates_new_paragraph(self):
        """Test that <br><br> correctly splits content into two <p> tags."""
        trix_html = "<div>First paragraph.<br><br>Second paragraph.</div>"
        expected_html = "<p>First paragraph.</p><p>Second paragraph.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_multiple_brs_create_single_paragraph_break(self):
        """Test that three or more <br> tags are treated as one paragraph break."""
        trix_html = "<div>Paragraph one.<br/><br/><br/><br/>Paragraph two.</div>"
        expected_html = "<p>Paragraph one.</p><p>Paragraph two.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_inline_formatting_is_preserved(self):
        """Test that inline tags like <strong> and <a> are preserved correctly."""
        trix_html = '<div><strong>Bold text</strong> and <em>italic</em>.<br><br>Link to <a href="#">somewhere</a>.</div>'
        expected_html = '<p><strong>Bold text</strong> and <em>italic</em>.</p><p>Link to <a href="#">somewhere</a>.</p>'
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_multiple_trix_divs_are_processed(self):
        """Test that the function handles multiple top-level <div> blocks."""
        trix_html = "<div>First block.</div><div>Second block, line 1.<br><br>Second block, line 2.</div>"
        expected_html = "<p>First block.</p><p>Second block, line 1.</p><p>Second block, line 2.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_non_trix_block_elements_are_ignored(self):
        """Test that divs containing other block elements are not processed."""
        trix_html = "<div><div>Nested block</div></div><blockquote>Quote</blockquote>"
        # The logic should skip the outer div because it contains another block (the nested div).
        # Therefore, the HTML should remain unchanged.
        expected_html = "<div><div>Nested block</div></div><blockquote>Quote</blockquote>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_empty_and_whitespace_divs_produce_no_output(self):
        """Test that empty or whitespace-only divs are effectively removed."""
        trix_html = "<div>First paragraph.</div><div>    </div><div><br><br></div><div>Second paragraph.</div>"
        expected_html = "<p>First paragraph.</p><p>Second paragraph.</p>"
        cleaned_html = self.job_post._clean_trix_html(trix_html)
        self.assertEqual(cleaned_html, expected_html)

    def test_empty_input_returns_empty_string(self):
        """Test that empty or None input returns empty string safely."""
        self.assertEqual(self.job_post._clean_trix_html(""), "")
        self.assertEqual(self.job_post._clean_trix_html(None), "")

    def test_complex_real_world_example(self):
        """Test with the exact example from the user's issue description."""
        trix_html = ('<div>En <strong>Bodegas Pincerna</strong>, una bodega especializada en la '
                    'recuperación de variedades autóctonas de la D.O. León (Prieto Picudo y Albarín Blanco), '
                    'buscamos un/a <strong>Operario/a de Bodega y Campo</strong> para reforzar nuestro equipo.'
                    '<br><br>La persona seleccionada trabajará tanto en el viñedo como en la bodega, '
                    'participando en todas las fases del ciclo del vino y contribuyendo al crecimiento de un '
                    'proyecto con fuerte identidad y valores de calidad.<br><br><br><br><br>Paragraph 3</div>')

        cleaned_html = self.job_post._clean_trix_html(trix_html)

        # Each of the three paragraphs must start where expected...
        self.assertIn('<p>En <strong>Bodegas Pincerna</strong>', cleaned_html)
        self.assertIn('<p>La persona seleccionada', cleaned_html)
        self.assertIn('<p>Paragraph 3</p>', cleaned_html)
        # ...and there must be exactly three of them, so a stray or empty
        # extra paragraph is caught as well.
        self.assertEqual(cleaned_html.count('<p>'), 3)

        # No div may survive, with or without attributes.
        self.assertNotIn('<div', cleaned_html)

        # No paragraph break may survive, whichever way <br> is serialized.
        self.assertNotIn('<br><br>', cleaned_html)
        self.assertNotIn('<br/><br/>', cleaned_html)