lexborisov/myhtml

Disable url encodings in myhtml_attribute_add?

searene opened this issue · 3 comments

I want to disable the automatically applied URL encodings in myhtml_attribute_add. To be more specific, please check the following example:

#include <myhtml/api.h>

int main(int argc, const char * argv[])
{
    char html[] = "<img/>";

    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);

    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);

    // parse html
    myhtml_parse_fragment(tree, MyENCODING_UTF_8, html, strlen(html), MyHTML_TAG_DIV, MyHTML_NAMESPACE_HTML);

    // get first img from index
    myhtml_collection_t *div_list = myhtml_get_nodes_by_name(tree, NULL, "img", 3, NULL);
    myhtml_tree_node_t *node = div_list->list[0];

    // add an attr
    const char* srcValue = "custom_protocol://resource?id=1&name=apple";
    myhtml_attribute_add(node, "src", 3, srcValue, strlen(srcValue), MyENCODING_UTF_8);

    mycore_string_raw_t str = {0};
    myhtml_serialization_tree_buffer(myhtml_tree_get_document(tree), &str);
    printf("%s", str.data);

    // release resources
    myhtml_collection_destroy(div_list);
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);

    return 0;
}

The output is as follows:

<html><img src="custom_protocol://resource?id=1&amp;name=apple"></img></html>

As you can see, the & between id=1 and name=apple was automatically encoded as &amp;, which is not what I want. I want the value to stay exactly as I passed it; in other words, I want the following result:

<html><img src="custom_protocol://resource?id=1&name=apple"></img></html>

Is there any way to do this?

Hi @searene
This is a serialization problem. I tried reading the value directly from the node:

#include <myhtml/myhtml.h>
#include <myhtml/serialization.h>

int main(int argc, const char * argv[])
{
    char html[] = "<img/>";
    
    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
    
    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);
    
    // parse html
    myhtml_parse_fragment(tree, MyENCODING_UTF_8, html, strlen(html), MyHTML_TAG_DIV, MyHTML_NAMESPACE_HTML);
    
    // get first img from index
    myhtml_collection_t *div_list = myhtml_get_nodes_by_name(tree, NULL, "img", 3, NULL);
    myhtml_tree_node_t *node = div_list->list[0];
    
    // add an attr
    const char* srcValue = "custom_protocol://resource?id=1&name=apple";
    myhtml_attribute_add(node, "src", 3, srcValue, strlen(srcValue), MyENCODING_UTF_8);

    printf("Value is: %s\n", myhtml_node_attribute_first(node)->value.data);

    mycore_string_raw_t str = {0};
    myhtml_serialization_tree_buffer(myhtml_tree_get_document(tree), &str);
    printf("Serialization is: %s\n", str.data);
    
    // release resources
    myhtml_collection_destroy(div_list);
    myhtml_tree_destroy(tree);
    myhtml_destroy(myhtml);
    
    return 0;
}

I will fix it tomorrow.

I highly recommend that you use the lexbor HTML parser. It is a modern and very fast parser (like myhtml).

#include <lexbor/html/parser.h>
#include <lexbor/html/serialize.h>

/*
 * Write a printf-style message followed by a newline to stderr, then
 * terminate the process with EXIT_FAILURE. Wrapped in do/while(0) so the
 * macro behaves as a single statement.
 */
#define FAILED(...)                                                            \
    do {                                                                       \
        fprintf(stderr, __VA_ARGS__);                                          \
        fputc('\n', stderr);                                                   \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
    while (0)


/*
 * Parse "<img>", set a "src" attribute containing '&' on the IMG element and
 * print the pretty-serialized document to stdout.
 * Each failing step is reported through FAILED().
 */
int
main(int argc, const char *argv[])
{
    static const lxb_char_t html[] = "<img>";
    const char *src_value = "custom_protocol://resource?id=1&name=apple";
    lxb_status_t rc;

    /* Create the document */
    lxb_html_document_t *doc = lxb_html_document_create();
    if (doc == NULL) {
        FAILED("Failed to create HTML Document");
    }

    /* Parse the markup; the length excludes the terminating NUL */
    rc = lxb_html_document_parse(doc, html, sizeof(html) - 1);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to parse HTML");
    }

    /* Collection that receives the elements found below */
    lxb_dom_collection_t *found = lxb_dom_collection_make(&doc->dom_document,
                                                          16);
    if (found == NULL) {
        FAILED("Failed to create collection");
    }

    /* The BODY element is the root of the search */
    lxb_dom_element_t *root =
        lxb_dom_interface_element(lxb_html_document_body_element(doc));

    /* Find the IMG elements */
    rc = lxb_dom_elements_by_tag_name(root, found,
                                      (const lxb_char_t *) "img", 3);

    if (rc != LXB_STATUS_OK || lxb_dom_collection_length(found) == 0) {
        FAILED("Failed to find IMG element");
    }

    /* Set the new attribute on the first match */
    lxb_dom_element_t *img = lxb_dom_collection_element(found, 0);
    size_t src_len = strlen(src_value);

    lxb_dom_attr_t *src_attr =
        lxb_dom_element_set_attribute(img, (const lxb_char_t *) "src", 3,
                                      (const lxb_char_t *) src_value, src_len);
    if (src_attr == NULL) {
        FAILED("Failed to create and append new attribute");
    }

    /* Serialize the whole document and print it */
    lexbor_str_t out = {0};
    rc = lxb_html_serialize_pretty_tree_str(lxb_dom_interface_node(doc),
                                            LXB_HTML_SERIALIZE_OPT_UNDEF, 0,
                                            &out);
    if (rc != LXB_STATUS_OK) {
        FAILED("Failed to serialization HTML tree");
    }

    printf("%s\n", out.data);

    /* Release resources */
    lxb_dom_collection_destroy(found, true);
    lxb_html_document_destroy(doc);

    return 0;
}

@lexborisov Thanks for the quick reply! Please keep us updated in this issue when you fix this problem. I appreciate it!

Regarding lexbor, I tried compiling it in a C++ project but encountered some problems. I'll look into the specific cause to see whether the mistake is on my side, and I'll report it in the lexbor repository if it is not.

@lexborisov does lexbor support css selectors?