Parse HTML to Delta

Question

Parse HTML to Delta

matiszz opened this issue 3 years ago · 4 comments

How can I convert an HTML string to delta?

I'm using Quill together with Django Import-Export. The CSV input file contains an HTML string, that I want to save as my QuillField attribute of a model.

I'm using a custom function to get the Quill object, but I don't see how to obtain the delta without implementing the whole parser myself.

def get_quill_field(string):
    """Wrap an HTML string in a Quill object.

    The value handed to ``Quill`` is a JSON document with two keys:
    ``delta`` (whatever ``parse_html_to_delta`` returns for the input) and
    ``html`` (the input itself, unchanged).

    Args:
        string: The HTML markup to store.

    Returns:
        The ``Quill`` instance built from the JSON payload.
    """
    # Imported here so the snippet does not rely on a file-level import.
    import json

    delta = parse_html_to_delta(string)  # HOW TO DO THIS?
    # Serialise with json.dumps instead of gluing strings together: HTML
    # almost always contains double quotes (attribute values) and may contain
    # backslashes or newlines, all of which must be escaped to stay valid
    # JSON. Compact separators and ensure_ascii=False keep the output
    # identical to the hand-built string for inputs that needed no escaping.
    payload = json.dumps(
        {"delta": delta, "html": string},
        separators=(",", ":"),
        ensure_ascii=False,
    )
    return Quill(payload)

Answer 1 · 2021-09-02T09:25:33.000Z

I have the same problem

Answer 2 · 2021-12-07T08:40:10.000Z

Thank you for using this library :)
Converting HTML to delta is handled by the Quill.js library, not this Python library.
(If Django passes an HTML string to the JS library through a template, the HTML is converted to delta during the initialization process of Quill.js and loaded in the browser's memory.)

It would be great if the behavior of Quill.js could be implemented within Python, but I haven't been able to find such a way right now.

Answer 3 · 2023-03-13T10:55:12.000Z

No solutions for this yet?

Answer 4 · 2023-11-23T16:52:02.000Z

I have created a sample parser. It works for most elements. Feel free to update this gist.

from bs4 import BeautifulSoup, NavigableString

def convert_html_to_delta(html_string):
    """Convert an HTML string into a Quill Delta dictionary.

    The markup is parsed with BeautifulSoup's ``html.parser`` and walked once
    from the top down. Every text node is emitted exactly once, carrying the
    formatting accumulated from all of its ancestors, so nested markup such
    as ``<p>Hello <strong>world</strong></p>`` yields one plain op and one
    bold op instead of repeating the same text for every enclosing element.

    Handling per node type:

    * plain text -> ``{"insert": text}`` plus inherited ``attributes``
      (whitespace is kept verbatim; comments and other special string nodes
      are ignored)
    * ``strong``/``b`` -> ``bold: True``; ``em``/``i`` -> ``italic: True``
    * ``a`` -> ``link: <href>`` (empty string when ``href`` is missing)
    * any element -> whatever ``get_style_attributes`` reports for it
    * ``br`` -> ``{"insert": "\\n"}``
    * ``img`` -> ``{"insert": {"image": src}}`` with an ``alt`` attribute;
      images without ``src`` are skipped
    * ``p`` and ``h1``-``h6`` -> their content followed by a newline op; for
      headings the newline carries ``{"header": level}``

    Args:
        html_string: The HTML markup to convert.

    Returns:
        A dictionary of the form ``{"ops": [...]}``.
    """
    header_levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
    inline_formats = {"strong": "bold", "b": "bold", "em": "italic", "i": "italic"}
    ops = []

    def is_line_block(name):
        return name == "p" or name in header_levels

    def walk(node, inherited):
        for child in node.children:
            if isinstance(child, NavigableString):
                # Comments, doctypes, etc. are NavigableString subclasses;
                # only genuine text belongs in the document.
                if type(child) is NavigableString and child:
                    op = {"insert": str(child)}
                    if inherited:
                        # Copy so later ops never share one attribute dict.
                        op["attributes"] = dict(inherited)
                    ops.append(op)
                continue

            name = child.name

            if name == "br":
                # An empty line is commonly written as <p><br></p>; the
                # paragraph's own newline below already represents it.
                if not (is_line_block(node.name) and len(node.contents) == 1):
                    ops.append({"insert": "\n"})
                continue

            if name == "img":
                src = child.get("src")
                if src:  # an image without a source cannot be embedded
                    image_attributes = dict(inherited)
                    image_attributes["alt"] = child.get("alt", "")
                    ops.append(
                        {"insert": {"image": src}, "attributes": image_attributes}
                    )
                continue

            attributes = dict(inherited)
            attributes.update(get_style_attributes(child))
            if name in inline_formats:
                attributes[inline_formats[name]] = True
            elif name == "a":
                attributes["link"] = child.get("href", "")

            walk(child, attributes)

            if is_line_block(name):
                # Delta stores line-level formats on the newline that ends
                # the line, not on the line's text.
                line_end = {"insert": "\n"}
                if name in header_levels:
                    line_end["attributes"] = {"header": header_levels[name]}
                ops.append(line_end)

    walk(BeautifulSoup(html_string, "html.parser"), {})
    return {"ops": ops}

def convert_paragraph(element, ops):
    """Append one op holding the text of a paragraph or heading element.

    The text is rebuilt child by child. Direct children that are bold
    (``b``/``strong``) are wrapped in ``**`` and italic ones (``i``/``em``)
    in ``*``; every other child contributes its text unchanged. Building the
    text positionally, rather than search-and-replacing inside the flattened
    text, keeps the markers on the right fragment when the same words occur
    more than once and avoids ``str.replace("", ...)`` on empty children,
    which would insert markers between every character.

    Args:
        element: The parsed paragraph/heading element.
        ops: The list of Delta ops to append to (modified in place).
    """
    # Local import: the file itself only imports BeautifulSoup and
    # NavigableString from bs4.
    from bs4 import CData

    markers = {"b": "**", "strong": "**", "i": "*", "em": "*"}
    pieces = []
    for child in element.children:
        if isinstance(child, NavigableString):
            # Mirror element.text, which only counts plain text and CDATA;
            # comments and other special strings are left out.
            if type(child) in (NavigableString, CData):
                pieces.append(str(child))
            continue
        inner = child.text
        if not inner:
            continue  # nothing to show, and nothing to wrap in markers
        marker = markers.get(child.name, "")
        pieces.append(marker + inner + marker)

    op = {"insert": "".join(pieces), "attributes": get_style_attributes(element)}
    ops.append(op)
    # class/id are additionally stored as top-level keys of the op.
    op.update(get_class_and_id_attributes(element))

def convert_link(element, ops, href):
    """Append an op for an anchor element.

    The op inserts the element's text with a ``link`` attribute set to
    ``href``; the element's class/id (if any) are merged into the op itself.

    Args:
        element: The parsed anchor element.
        ops: The list of Delta ops to append to (modified in place).
        href: The link target to record.
    """
    link_op = {"insert": element.text, "attributes": {"link": href}}
    ops.append(link_op)
    # Mutating after the append is fine: the list holds the same dict.
    link_op.update(get_class_and_id_attributes(element))

def convert_span(element, ops):
    """Append an op for a span element.

    The op inserts the element's text with the attributes reported by
    ``get_style_attributes``; class/id (if any) are also merged into the op
    itself.

    Args:
        element: The parsed span element.
        ops: The list of Delta ops to append to (modified in place).
    """
    span_op = {
        "insert": element.text,
        "attributes": get_style_attributes(element),
    }
    ops.append(span_op)
    # Mutating after the append is fine: the list holds the same dict.
    span_op.update(get_class_and_id_attributes(element))

def convert_bold(element, ops):
    """Append an op that inserts the element's text marked as bold.

    The element's class/id (if any) are merged into the op itself.

    Args:
        element: The parsed bold element.
        ops: The list of Delta ops to append to (modified in place).
    """
    bold_op = {"insert": element.text, "attributes": {"bold": True}}
    ops.append(bold_op)
    # Mutating after the append is fine: the list holds the same dict.
    bold_op.update(get_class_and_id_attributes(element))

def convert_italic(element, ops):
    """Append an op that inserts the element's text marked as italic.

    The element's class/id (if any) are merged into the op itself.

    Args:
        element: The parsed italic element.
        ops: The list of Delta ops to append to (modified in place).
    """
    italic_op = {"insert": element.text, "attributes": {"italic": True}}
    ops.append(italic_op)
    # Mutating after the append is fine: the list holds the same dict.
    italic_op.update(get_class_and_id_attributes(element))

def get_style_attributes(element):
    """Collect the class, id and inline CSS declarations of an element.

    Args:
        element: A parsed node. Objects without an ``attrs`` attribute
            yield an empty result.

    Returns:
        A dict that may contain ``"class"`` (class names joined by spaces),
        ``"id"``, and one entry per ``name: value`` declaration found in the
        ``style`` attribute. Declaration names and values are stripped of
        surrounding whitespace.
    """
    attributes = {}
    if not hasattr(element, "attrs"):
        return attributes

    if "class" in element.attrs:
        attributes["class"] = " ".join(element["class"])

    if "id" in element.attrs:
        attributes["id"] = element["id"]

    if "style" in element.attrs:
        for declaration in element["style"].split(";"):
            # Split on the first colon only, so values that contain colons
            # themselves (e.g. "url(http://...)") are kept whole.
            name, separator, value = declaration.partition(":")
            name = name.strip()
            if separator and name:
                attributes[name] = value.strip()

    return attributes

def get_class_and_id_attributes(element):
    """Return the element's class and id, when it has them.

    Args:
        element: A parsed node. Objects without an ``attrs`` attribute
            yield an empty result.

    Returns:
        A dict with ``"class"`` (class names joined by single spaces) and/or
        ``"id"``; keys are present only if the element carries them.
    """
    found = {}
    # Guard clause: nodes without an attribute mapping have nothing to report.
    if not hasattr(element, "attrs"):
        return found

    if "class" in element.attrs:
        found["class"] = " ".join(element["class"])
    if "id" in element.attrs:
        found["id"] = element["id"]
    return found