Add table 'rowspan' support

Question

Add table 'rowspan' support

Opened this issue 2 years ago · 2 comments

ffolkes1911 commented 2 years ago

I had a quick look at the code, and it seems that the 'colspan' attribute is supported but 'rowspan' is not. Are there any plans to add support?

HTML example

<!DOCTYPE html>
<html>
<head>
<style>
table, th, td {
  border: 1px solid black;
}
</style>
</head>
<body>

<h1>The td rowspan attribute</h1>

<table>
  <tr>
    <th>Month</th>
    <th>Savings</th>
    <th>Savings for holiday!</th>
  </tr>
  <tr>
    <td>January</td>
    <td>$100</td>
    <td rowspan="2">$50</td>
  </tr>
  <tr>
    <td>February</td>
    <td>$80</td>
  </tr>
</table>

</body>
</html>

Parsed MD table

The td rowspan attribute
========================


| Month | Savings | Savings for holiday! |
| --- | --- | --- |
| January | $100 | $50 |
| February | $80 |

Desired MD output

The td rowspan attribute
========================


| Month | Savings | Savings for holiday! |
| --- | --- | --- |
| January | $100 | $50 |
| February | $80 | |

Answer 1 · 2024-08-01T20:08:46.000Z

I had this issue as well, and I was able to get the desired behavior with a customization.

Requires:

pandas
tabulate
html5lib

import pandas as pd

class MyMarkdownConverter(MarkdownConverter):
    """Markdown converter that renders ``<table>`` elements through pandas.

    This is a subclass of the ``MarkdownConverter`` class from the markdownify
    library. It overrides ``convert_table`` to parse the table's HTML into a
    pandas DataFrame and emit that DataFrame as Markdown, which gives the
    desired handling of ``rowspan`` that markdownify does not provide.

    The ``convert_th``, ``convert_tr``, ``convert_td``, ``convert_thead`` and
    ``convert_tbody`` overrides return the element's raw HTML instead of
    Markdown; ``convert_table`` works from ``str(el)`` and ignores the
    ``text`` built from those children.
    """

    def convert_table(self, el, text, convert_as_inline):
        """Convert a ``<table>`` element to a Markdown table via pandas.

        Args:
            el: The table element; its HTML is obtained with ``str(el)``.
            text: Converted child content (unused).
            convert_as_inline: Inline-conversion flag passed by the caller
                (unused).

        Returns:
            The Markdown rendering of the first table pandas finds in the
            element's HTML, with NaN cells replaced by empty strings. If
            pandas cannot parse the HTML, the error is printed and the raw
            table HTML is returned instead.
        """
        # Imported here so the method does not rely on a module-level import
        # of StringIO being present.
        from io import StringIO

        html = str(el)
        try:
            df = pd.read_html(StringIO(html))[0]
            # replace nan with empty string
            df = df.fillna("")
        except Exception as e:
            print(f"Error converting table to DataFrame: {html}")
            print(e)
            # Without a DataFrame there is nothing to render; fall back to the
            # original markup rather than failing on an unbound ``df``.
            return html

        # Convert DataFrame to Markdown
        return df.to_markdown(index=False)

    def convert_th(self, el: NavigableString, text, convert_as_inline):
        """Return the raw HTML of the ``<th>`` element instead of Markdown."""
        return str(el)

    def convert_tr(self, el: NavigableString, text, convert_as_inline):
        """Return the raw HTML of the ``<tr>`` element instead of Markdown."""
        return str(el)

    def convert_td(self, el: NavigableString, text, convert_as_inline):
        """Return the raw HTML of the ``<td>`` element instead of Markdown."""
        return str(el)

    def convert_thead(self, el: NavigableString, text, convert_as_inline):
        """Return the raw HTML of the ``<thead>`` element instead of Markdown."""
        return str(el)

    def convert_tbody(self, el: NavigableString, text, convert_as_inline):
        """Return the raw HTML of the ``<tbody>`` element instead of Markdown."""
        return str(el)

Answer 2 · 2025-03-13T14:00:34.000Z

The solution using pandas introduced a lot of issues:

Tables with no header did not work and required a workaround
Tables with an empty cell in the header did not work
Numbers with European-style separators were changed
Text formatting inside tables no longer worked (no bold, no code, ...)
Other changes to the content

I made another implementation just using tabulate:

from typing import cast

from bs4 import BeautifulSoup
from bs4 import Tag
from markdownify import MarkdownConverter
from tabulate import tabulate


def _parse_span(cell: Tag, attr: str) -> int:
    """Return the integer value of a span attribute of *cell*, at least 1.

    A missing attribute, a value that cannot be parsed as an integer, or a
    value below 1 all yield 1, so a malformed ``rowspan``/``colspan`` never
    raises and never moves the column counter backwards or leaves it stuck.
    """
    try:
        return max(int(cell.get(attr, 1)), 1)  # type: ignore -
    except (TypeError, ValueError):
        return 1


def pad(rows: list[list[Tag]]) -> list[list[Tag]]:
    """Expand ``rowspan``/``colspan`` so every spanned grid position has a cell.

    Args:
        rows: One list of cells per table row, in document order.

    Returns:
        A new list of rows. Each original cell is kept; every additional
        position covered by a ``colspan`` or by a ``rowspan`` from an earlier
        row is filled with a placeholder from ``make_empty_cell()``.

    Note:
        Spans are read with ``_parse_span``, so invalid or non-positive values
        count as 1. In particular ``rowspan="0"`` is treated as a single row,
        not as "span to the end of the row group".
    """
    padded: list[list[Tag]] = []
    # Grid positions (row, column) in later rows that an earlier cell's
    # rowspan already covers, mapped to the placeholder to emit there.
    occ: dict[tuple[int, int], Tag] = {}
    total_rows = len(rows)
    for r, row in enumerate(rows):
        cur: list[Tag] = []
        c = 0
        for cell in row:
            # Emit placeholders for positions taken by rowspans from above
            # before placing the next real cell.
            while (r, c) in occ:
                cur.append(occ.pop((r, c)))
                c += 1
            # A rowspan reaching past the last row has nothing to fill there.
            rs = min(_parse_span(cell, "rowspan"), total_rows - r)
            cs = _parse_span(cell, "colspan")
            cur.append(cell)
            # Append extra cells for colspan
            for _ in range(1, cs):
                cur.append(make_empty_cell())
            # Reserve the covered positions in the following rows only; the
            # current row's extra columns were appended just above.
            for i in range(1, rs):
                for j in range(cs):
                    occ[(r + i, c + j)] = make_empty_cell()
            c += cs
        # Positions reserved directly after the row's last real cell.
        while (r, c) in occ:
            cur.append(occ.pop((r, c)))
            c += 1
        padded.append(cur)
    return padded


def make_empty_cell() -> Tag:
    """Build a placeholder table cell.

    Returns:
        A newly constructed, empty ``<td>`` ``Tag``.
    """
    placeholder = Tag(name="td")
    return placeholder


class TableConverter(MarkdownConverter):
    """Markdown converter that renders tables with ``tabulate``.

    ``convert_table`` collects the table's cells, expands ``rowspan`` and
    ``colspan`` with ``pad()``, converts each cell's content to Markdown and
    lets ``tabulate`` lay out a pipe table. The ``convert_th``,
    ``convert_tr``, ``convert_td``, ``convert_thead`` and ``convert_tbody``
    overrides return their already-converted child text unchanged, so no
    table syntax is added at cell or row level.
    """

    def convert_table(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Convert a ``<table>`` element to a Markdown pipe table.

        Args:
            el: The table element.
            text: Converted child content (unused; cells are converted here).
            parent_tags: Parent tag names passed by the caller (unused).

        Returns:
            The pipe table surrounded by blank lines, or ``""`` when the
            table has no rows or no cells. The first row is used as the
            header when it is non-empty and consists only of ``<th>`` cells;
            otherwise an empty header row is emitted.
        """
        rows = [
            cast(list[Tag], tr.find_all(["td", "th"])) for tr in cast(list[Tag], el.find_all("tr"))
        ]

        if not rows:
            return ""

        padded_rows = pad(rows)
        # A newline inside a cell would split the pipe-table row, so cell
        # content is flattened onto a single line.
        converted = [
            [self.convert(str(cell)).strip().replace("\n", " ") for cell in row]
            for row in padded_rows
        ]

        width = max(len(row) for row in converted)
        if width == 0:
            return ""
        # Give every row the same number of columns; with ragged rows
        # tabulate pads a short header on the left, shifting the titles.
        for row in converted:
            row.extend([""] * (width - len(row)))

        # ``all()`` is true for an empty sequence, so an empty first row must
        # not count as a header.
        has_header = bool(rows[0]) and all(cell.name == "th" for cell in rows[0])
        if has_header:
            headers, body = converted[0], converted[1:]
        else:
            headers, body = [""] * width, converted

        # disable_numparse keeps numeric-looking text exactly as written
        # instead of letting tabulate parse and reformat it.
        table = tabulate(body, headers=headers, tablefmt="pipe", disable_numparse=True)
        # Blank lines keep the table from fusing with adjacent content.
        return "\n\n" + table + "\n\n"

    def convert_th(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Return the converted content of the ``<th>`` without table syntax."""
        return text

    def convert_tr(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Return the converted content of the ``<tr>`` without table syntax."""
        return text

    def convert_td(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Return the converted content of the ``<td>`` without table syntax."""
        return text

    def convert_thead(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Return the converted content of the ``<thead>`` without table syntax."""
        return text

    def convert_tbody(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str:
        """Return the converted content of the ``<tbody>`` without table syntax."""
        return text