A small collection of functionalities to crawl the web.
The following Python libraries are required:
- requests
- unidecode
- lxml
- cssselect
- urllib
... getting all links

```py
from crawllib import *

(html, headers, status_code) = load("http://example.org/")
for a in html.cssselect("a"):
    print(a.get("href"))
```
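Since `load()` also hands back the response headers and status code, you can skip pages that did not load cleanly. A minimal sketch, assuming `headers` behaves like the mapping returned by requests and that `200` is the success case you care about:

```py
from crawllib import *

(html, headers, status_code) = load("http://example.org/")

if status_code == 200:
    # the Content-Type header tells you what the server actually sent
    print("content type:", headers.get("Content-Type"))
    for a in html.cssselect("a"):
        print(a.get("href"))
else:
    print("skipping page, got status", status_code)
```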
... downloading images

```py
from crawllib import *

content_type = download(
    "https://www.iana.org/_img/2013.1/iana-logo-header.svg",
    slugify("i a n a") + ".svg"
)

download(
    "https://www.iana.org/_img/2013.1/iana-logo-header.svg",
    "/_img/2013.1/iana-logo-header.svg",
    overwrite=True,
    mkdir=True
)
```
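`download()` returns the content type reported by the server, which you could use to pick a file extension instead of hard-coding one. A sketch, assuming the return value is a MIME string such as `image/svg+xml` (the temporary file name here is made up for illustration):

```py
import mimetypes
from crawllib import *

url = "https://www.iana.org/_img/2013.1/iana-logo-header.svg"
content_type = download(url, "logo.tmp", overwrite=True)

# strip parameters like "; charset=utf-8" before guessing;
# guess_extension() maps e.g. "image/svg+xml" -> ".svg"
mime = (content_type or "").split(";")[0].strip()
ext = mimetypes.guess_extension(mime) or ".bin"
print("server says", content_type, "-> extension", ext)
# you could then rename "logo.tmp" accordingly
```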
```py
from crawllib import *

p = PageLoader("https://github.com/maxdoom-com/crawllib")
p.make_absolute_links()  # will try to make all links absolute

def print_a(a):
    print(a.get("href"))

for_all(p.content, "a", print_a)  # will call print_a() for every <a> found
```
I've added two procs to iterate over (cssselect) nodes:

- `for_all(tree, selector, callback)` will call the callback for all nodes found with `tree.cssselect(selector)`
- `for_one(tree, selector, callback)` will call the callback for the first node found with `tree.cssselect(selector)`
```py
from crawllib import *

p = PageLoader("https://github.com/maxdoom-com/crawllib")
p.make_absolute_links()
# Will try to make all links absolute, using the given url or
# the <base href="..."> in the html code.
# The base tag overrides the url when calculating the base url.

def first_a(a):
    print(a.get("href"))

def each_div(div):
    for_one(div, "a", first_a)

for_all(p.content, "div", each_div)  # will call each_div() for each <div> found
```
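To see the base-tag override in action, you can feed `PageLoader` an html string (the two-argument form shown further below) that carries its own `<base href>`. A small sketch, with the html snippet made up for illustration:

```py
from crawllib import *

# hypothetical snippet: the <base> tag points somewhere other than the page url
p = PageLoader("http://example.org/deep/path/", """
<html>
  <head><base href="http://cdn.example.net/assets/"></head>
  <body><a href="logo.png">logo</a></body>
</html>
""")
p.make_absolute_links()

def print_a(a):
    # expected: http://cdn.example.net/assets/logo.png,
    # since the <base> tag wins over the page url
    print(a.get("href"))

for_all(p.content, "a", print_a)
```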
You may now load a string as html:

```py
from crawllib import *

html = load_text("""<html>...</html>""")
```
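The returned object supports the same cssselect-based traversal as the tree you get from `load()` (my assumption, based on the examples above); a minimal sketch with a made-up snippet:

```py
from crawllib import *

html = load_text("""<html><body>
  <h1>Hello</h1>
  <a href="/a">first</a>
  <a href="/b">second</a>
</body></html>""")

for a in html.cssselect("a"):
    print(a.get("href"), "->", a.text)
```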
And you may use the PageLoader class as well:

```py
from crawllib import *

p = PageLoader("http://some.fake/do/main", """<html>...</html>""")
p.make_absolute_links()
```
I've added a simple sqlite3 database class.

```py
from crawllib import Database

db = Database("my.db")

db.executescript("""
    CREATE TABLE IF NOT EXISTS pages ( id INTEGER PRIMARY KEY, url NOT NULL, text );
""")

db.execute("INSERT INTO pages ( url, text ) VALUES(:url, :text)", {
    'url': "http://some.fake/do/main",
    'text': """<html>...</html>""",
})

for page in db.select("SELECT * FROM pages"):
    print(page)
```
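Assuming `select()` forwards bound parameters the same way `execute()` does (my assumption, not confirmed by the examples above), a filtered query would look like this:

```py
from crawllib import Database

db = Database("my.db")

# assumption: select() accepts a parameter mapping like execute() does
for page in db.select("SELECT * FROM pages WHERE url = :url",
                      {'url': "http://some.fake/do/main"}):
    print(page)
```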
```py
from crawllib import *

p = PageLoader("http://some.fake/do/main", """<html>...</html>""")
print(p.tostring())

def print_a_text(a):
    print(element2text(a))

for_all(p.content, "a", print_a_text)
```
Create a store:

```py
db.create("Tests", "foo", "bar", "blub")
```

Drop a store:

```py
db.drop("Tests")
```

Store key/value pairs:

```py
db.store("Tests", "test-1-2-3", {"foo": 1, "bar": 2, "blub": 3})
db.store("Tests", "test-1-2-3", {"foo": 3, "bar": 2, "blub": 1})
db.store("Tests", "test-1-2-3 A", {"foo": 1, "bar": 2, "blub": 3})
```
---
Remove an entry:

```py
db.remove("Tests", "test-1-2-3 A")
```
Print a record:

```py
print(db.get("Tests", "test-1-2-3"))
```
Get all entries:

```py
for entry in db.all("Tests"):
    print(entry)
```
Clear a store:

```py
db.empty("Tests")
```
... installation

```sh
pip install git+https://github.com/maxdoom-com/crawllib
```