Add table 'rowspan' support
Opened this issue · 2 comments
ffolkes1911 commented
Had a quick look at the code and it seems that there's support for 'colspan' attribute, but not 'rowspan'. Any plans to add support?
HTML example
<!DOCTYPE html>
<html>
<head>
<style>
table, th, td {
border: 1px solid black;
}
</style>
</head>
<body>
<h1>The td rowspan attribute</h1>
<table>
<tr>
<th>Month</th>
<th>Savings</th>
<th>Savings for holiday!</th>
</tr>
<tr>
<td>January</td>
<td>$100</td>
<td rowspan="2">$50</td>
</tr>
<tr>
<td>February</td>
<td>$80</td>
</tr>
</table>
</body>
</html>
Parsed MD table
The td rowspan attribute
========================
| Month | Savings | Savings for holiday! |
| --- | --- | --- |
| January | $100 | $50 |
| February | $80 |
Desired MD output
The td rowspan attribute
========================
| Month | Savings | Savings for holiday! |
| --- | --- | --- |
| January | $100 | $50 |
| February | $80 | |
andrewDoing commented
I had this issue as well, and I was able to get the desired behavior with a customization.
Requires:
- pandas
- tabulate
- html5lib
import pandas as pd
class MyMarkdownConverter(MarkdownConverter):
    """A MarkdownConverter that renders ``<table>`` elements through pandas.

    This class is a subclass of the MarkdownConverter class from the
    markdownify library. ``convert_table`` parses the table's HTML into a
    DataFrame and emits that DataFrame as Markdown, which gives the desired
    handling of ``rowspan``. The ``<th>``, ``<tr>``, ``<td>``, ``<thead>`` and
    ``<tbody>`` converters are overridden to return the element's HTML
    unchanged, because ``convert_table`` renders the whole table itself.

    Requires pandas, tabulate and html5lib.
    """

    def convert_table(self, el, text, convert_as_inline):
        """Convert a ``<table>`` element to a Markdown table via a DataFrame.

        Args:
            el: The ``<table>`` element; its HTML (``str(el)``) is what gets parsed.
            text: Converted text of the children (unused here).
            convert_as_inline: Inline-conversion flag passed by the caller (unused here).

        Returns:
            The Markdown table, or the table's original HTML if pandas could
            not parse it.
        """
        # Imported locally so the snippet works with only `import pandas as pd`
        # at module level.
        import logging
        from io import StringIO

        html = str(el)
        try:
            frames = pd.read_html(StringIO(html))
        except Exception:
            # Best effort: previously this path fell through to an unbound
            # `df` and raised UnboundLocalError. Keep the raw HTML instead.
            logging.getLogger(__name__).exception(
                "Error converting table to DataFrame: %s", html
            )
            return html

        # Replace NaN (cells left empty by rowspan/colspan) with empty strings.
        df = frames[0].fillna("")
        # Convert DataFrame to Markdown
        return df.to_markdown(index=False)

    def convert_th(self, el: NavigableString, text, convert_as_inline):
        """Return the ``<th>`` element's HTML unchanged (no Markdown conversion)."""
        return str(el)

    def convert_tr(self, el: NavigableString, text, convert_as_inline):
        """Return the ``<tr>`` element's HTML unchanged (no Markdown conversion)."""
        return str(el)

    def convert_td(self, el: NavigableString, text, convert_as_inline):
        """Return the ``<td>`` element's HTML unchanged (no Markdown conversion)."""
        return str(el)

    def convert_thead(self, el: NavigableString, text, convert_as_inline):
        """Return the ``<thead>`` element's HTML unchanged (no Markdown conversion)."""
        return str(el)

    def convert_tbody(self, el: NavigableString, text, convert_as_inline):
        """Return the ``<tbody>`` element's HTML unchanged (no Markdown conversion)."""
        return str(el)


# Spenhouet commented
The solution using pandas introduced a lot of issues:
- Tables with no header did not work and required a workaround
- Tables with an empty cell in the header did not work
- Numbers with EU separator got changed
- Formatting of text in tables no longer works (no bold, no code, ...)
- Other changes in content
I made another implementation just using tabulate:
from typing import cast
from bs4 import BeautifulSoup
from bs4 import Tag
from markdownify import MarkdownConverter
from tabulate import tabulate
def _span(cell: Tag, attr: str) -> int:
    """Return the cell's ``rowspan``/``colspan`` value as an integer >= 1.

    Missing, empty, non-numeric, zero or negative values all count as 1 so a
    malformed attribute cannot raise or stall the column cursor.
    """
    try:
        value = int(cell.get(attr, 1))  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return 1
    return max(value, 1)


def pad(rows: list[list[Tag]]) -> list[list[Tag]]:
    """Expand rowspan/colspan so each spanned grid position holds a cell.

    Args:
        rows: Table rows, each a list of ``<td>``/``<th>`` cells in document order.

    Returns:
        A new list of rows in which every position covered by a span, other
        than the spanning cell's own position, is filled with an empty cell
        from ``make_empty_cell``. The original cells are reused, not copied.
    """
    padded: list[list[Tag]] = []
    # Grid positions in *later* rows that are covered by a rowspan from above,
    # mapped to the placeholder cell that will fill them.
    occ: dict[tuple[int, int], Tag] = {}

    for r, row in enumerate(rows):
        cur: list[Tag] = []
        c = 0
        for cell in row:
            # Emit placeholders for columns taken by rowspans from earlier rows.
            while (r, c) in occ:
                cur.append(occ.pop((r, c)))
                c += 1

            rs = _span(cell, "rowspan")
            cs = _span(cell, "colspan")

            cur.append(cell)
            # Extra columns of this cell's colspan in the current row.
            cur.extend(make_empty_cell() for _ in range(1, cs))

            # Reserve the covered positions in the following rows only; the
            # current row was filled directly above.
            for i in range(1, rs):
                for j in range(cs):
                    occ[(r + i, c + j)] = make_empty_cell()
            c += cs

        # Trailing columns occupied by rowspans after the row's last real cell.
        while (r, c) in occ:
            cur.append(occ.pop((r, c)))
            c += 1
        padded.append(cur)
    return padded
def make_empty_cell() -> Tag:
    """Create and return a new, empty ``<td>`` Tag."""
    placeholder = Tag(name="td")
    return placeholder
class TableConverter(MarkdownConverter):
    """MarkdownConverter that renders tables with ``tabulate``.

    ``convert_table`` collects the table's cells, expands rowspan/colspan
    with ``pad``, converts each cell to Markdown individually and lays the
    result out as a pipe table. The ``<th>``, ``<tr>``, ``<td>``, ``<thead>``
    and ``<tbody>`` converters simply pass their text through, since the
    table layout is produced entirely by ``convert_table``.
    """

    def _cell_markdown(self, cell: Tag) -> str:
        """Convert one cell to single-line Markdown text."""
        # A pipe-table row must stay on one physical line, so surrounding
        # whitespace is dropped and inner newlines are collapsed to spaces.
        return self.convert(str(cell)).strip().replace("\n", " ")

    def convert_table(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Render a ``<table>`` element as a pipe-format Markdown table.

        Args:
            el: The ``<table>`` element.
            text: Converted text of the children (unused here).
            parent_tags: Parent-tag information passed by the caller (unused here).

        Returns:
            The Markdown table, or ``""`` when the table has no rows or no cells.
        """
        rows = [
            cast(list[Tag], tr.find_all(["td", "th"]))
            for tr in cast(list[Tag], el.find_all("tr"))
        ]
        if not rows:
            return ""

        converted = [[self._cell_markdown(cell) for cell in row] for row in pad(rows)]

        width = max(len(row) for row in converted)
        if width == 0:
            return ""
        # Right-pad every row to the same width so headers and data line up
        # column-for-column even when the source rows are ragged.
        for row in converted:
            row.extend([""] * (width - len(row)))

        first_row = rows[0]
        # An empty first row is not a header (all() alone would be vacuously True).
        has_header = bool(first_row) and all(cell.name == "th" for cell in first_row)
        if has_header:
            return tabulate(converted[1:], headers=converted[0], tablefmt="pipe")
        return tabulate(converted, headers=[""] * width, tablefmt="pipe")

    def convert_th(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Pass the ``<th>`` content's converted text through unchanged."""
        return text

    def convert_tr(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Pass the ``<tr>`` content's converted text through unchanged."""
        return text

    def convert_td(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Pass the ``<td>`` content's converted text through unchanged."""
        return text

    def convert_thead(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Pass the ``<thead>`` content's converted text through unchanged."""
        return text

    def convert_tbody(self, el: Tag, text: str, parent_tags: list[str]) -> str:
        """Pass the ``<tbody>`` content's converted text through unchanged."""
        return text