
Python script to extract specific information from HTML and store it in an XML file

A Python lab project that extracts specific information from HTML pages and stores it in an XML file.

## Task 1

  • Given a site URL, obtain the text and image information from the HTML tags of that site's pages. Use XPath to perform this task.
def parse_html(url):
    """Download the page at *url* and extract its links, text and images.

    The downloaded markup is passed through ``clean_html`` and decoded as
    UTF-8 before parsing. XPath queries then collect every text node, every
    ``<img>`` ``src`` attribute and every ``<a>`` ``href`` attribute; each
    result list is post-processed by the matching ``process_*`` helper.

    :param url: address of the page to download.
    :returns: a ``(urls, text, images)`` tuple holding the results of
        ``process_urls``, ``process_text`` and ``process_images``.
    """
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    page = clean_html(page)

    tree = etree.HTML(page.decode('utf-8'))

    text = process_text(tree.xpath('//text()'))
    images = process_images(tree.xpath('//img/@src'))
    urls = process_urls(tree.xpath('//a/@href'))

    return urls, text, images
  • Generate an XML node for a given page
def generate_xml_page(page_url, text, images):
    """Build the ``<page>`` element that describes one parsed page.

    The element carries the page address in its ``url`` attribute and gets
    one ``<fragment>`` child per item: first every entry of *text* (with
    ``type="text"``), then every entry of *images* (with ``type="image"``).
    The item itself becomes the text content of its fragment.

    :param page_url: address of the page, stored in the ``url`` attribute.
    :param text: iterable of text fragments.
    :param images: iterable of image references.
    :returns: the populated ``<page>`` element.
    """
    page_elem = etree.Element("page", url=page_url)

    # SubElement attaches each fragment to the page; a bare Element would be
    # created and then silently discarded.
    for elem in text:
        fragment = etree.SubElement(page_elem, "fragment", type="text")
        fragment.text = elem

    for img in images:
        fragment = etree.SubElement(page_elem, "fragment", type="image")
        fragment.text = img

    return page_elem
  • Store the parsed information in an XML file
def generate_xml(filename):
    """Parse ``BASE_URL`` and every page it links to, and save them as XML.

    A ``<data>`` root element is created; one ``<page>`` element (built by
    ``generate_xml_page``) is appended for ``BASE_URL`` itself, followed by
    one for each URL returned by ``parse_html(BASE_URL)``, in that order.
    The tree is then written to *filename* as pretty-printed UTF-8 with an
    XML declaration.

    :param filename: path of the XML file to write.
    """
    root = etree.Element("data")

    # The base page is parsed once: it provides both the list of linked
    # pages and the content of the first <page> node.
    urls, text, images = parse_html(BASE_URL)
    root.append(generate_xml_page(BASE_URL, text, images))

    for url in urls:
        _, text, images = parse_html(url)
        # Attach the page to the document; otherwise only an empty <data/>
        # element would be written.
        root.append(generate_xml_page(url, text, images))

    et = etree.ElementTree(root)
    et.write(filename, encoding="utf-8", xml_declaration=True, pretty_print=True)

## Task 2: Using XPath, find the XML node with the maximum number of text fragments

tree = etree.parse("data.xml")
page_cnt = tree.xpath("count(/data/page)")

max_cnt, max_idx = 0, -1

for i in range(int(page_cnt)):
    cnt = tree.xpath("count(//page[{}]/fragment[@type='text'])".format(i + 1))
    if max_cnt < cnt:
        max_cnt = cnt
        max_idx = i + 1

max_cnt = int(max_cnt)
url = tree.xpath("string(/data/page[{}]/@url)".format(max_idx))

print max_idx, max_cnt, url

## Task 3: Analyze the content of a web store and fetch the price, name, description and image of 20 products from that site. Use XPath and a DOM parser to find the appropriate nodes.

def parse_html(url):
    """Download one product page and extract the product details from it.

    The page body is decoded from cp1251 and re-encoded as UTF-8 before it
    is parsed. Name, price and image source are read with ``string(...)``
    XPath queries, so each is an empty string when nothing matches. The
    description is the concatenation of all text nodes under the
    ``description`` span, or an empty string when there are none.

    :param url: location of the product page; it is appended to ``BASE_URL``.
    :returns: ``Product(name, price, desc, image)`` where ``image`` is the
        image source prefixed with ``BASE_URL``.
    """
    response = urllib2.urlopen(BASE_URL + url)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    tree = etree.HTML(page.decode("cp1251").encode('utf-8'))

    name = tree.xpath("string(//div[@itemprop='name']/h1/text())")
    price = tree.xpath("string(//div[@id='optionPrice']/text())")
    image = tree.xpath("string(//img[@class='thumbnail']/@src)")
    desc = tree.xpath("//div[@style='overflow-x: auto']/span[@itemprop='description']//text()")

    # join() copes with a product that has no description; reduce() without
    # an initial value raises TypeError on an empty list.
    desc = "".join(desc)

    return Product(name, price, desc, BASE_URL + image)

## Task 4: Using XSLT, convert the obtained XML file to an XHTML page


# Apply the XSLT stylesheet pattern.xsl to products.xml and save the
# transformed document as products.xhtml.
xslt_root = etree.parse("pattern.xsl")
transorm = etree.XSLT(xslt_root)
xml_root = etree.parse("products.xml")
result_tree = transorm(xml_root)
# str() serialises the XSLT result tree to text.
result = str(result_tree)
with open("products.xhtml", "w") as out:
    out.write(result)


<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

<xsl:template match="/">
    <table border="1">
      <tr bgcolor="#9acd32">
        <th style="text-align:left">Name</th>
        <th style="text-align:left">Price</th>
        <th style="text-align:left">Description</th>
        <th style="text-align:left">Image url</th>
      <xsl:for-each select="data/product">
        <td><xsl:value-of select="name"/></td>
        <td><xsl:value-of select="price"/></td>
        <td><xsl:value-of select="description"/></td>
        <td><xsl:value-of select="image"/></td>