Stackoverflow example not working
rish-hyun opened this issue · 2 comments
rish-hyun commented
This is the code
import logging
import requests
from mlscraper import SingleItemPageSample, RuleBasedSingleItemScraper
# Training data: each Stack Overflow question URL maps to the item that
# should be found on that page (here only the question title).
items = {
"https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array": {
"title": "Why is processing a sorted array faster than processing an unsorted array?"
},
"https://stackoverflow.com/questions/927358/how-do-i-undo-the-most-recent-local-commits-in-git": {
"title": "How do I undo the most recent local commits in Git?"
},
"https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do": {
"title": "What does the “yield” keyword do?"
},
}
# Download every training page once, keyed by its URL.
results = {url: requests.get(url) for url in items.keys()}
# Train the scraper: pair each downloaded page body with its expected item,
# then build a scraper from those samples.
samples = [
SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
]
# NOTE: this is the call that raises the ValueError shown in the traceback
# below ("A pseudo-class must be prefixed with a tag name.").
scraper = RuleBasedSingleItemScraper.build(samples)
print("Scraping new question")
# Fetch a question page that is not one of the training samples.
html = requests.get(
"https://stackoverflow.com/questions/2003505/how-do-i-delete-a-git-branch-locally-and-remotely"
).content
# Apply the trained scraper to the unseen page and print what it extracted.
result = scraper.scrape(html)
print("Result: %s" % result)
Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-9f646dab1fca> in <module>()
24 SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
25 ]
---> 26 scraper = RuleBasedSingleItemScraper.build(samples)
27
28 print("Scraping new question")
4 frames
/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in build(samples)
89 matches_per_page_right = [
90 len(m) == 1 and m[0].get_text() == s.item[attr]
---> 91 for m, s in zip(matches_per_page, samples)
92 ]
93 score = sum(matches_per_page_right) / len(samples)
/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in <listcomp>(.0)
88 matches_per_page = (s.page.select(selector) for s in samples)
89 matches_per_page_right = [
---> 90 len(m) == 1 and m[0].get_text() == s.item[attr]
91 for m, s in zip(matches_per_page, samples)
92 ]
/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in <genexpr>(.0)
86 if selector not in selector_scoring:
87 logging.info("testing %s (%d/%d)", selector, i, len(selectors))
---> 88 matches_per_page = (s.page.select(selector) for s in samples)
89 matches_per_page_right = [
90 len(m) == 1 and m[0].get_text() == s.item[attr]
/usr/local/lib/python3.7/dist-packages/mlscraper/parser.py in select(self, css_selector)
28 def select(self, css_selector):
29 try:
---> 30 return [SoupNode(res) for res in self._soup.select(css_selector)]
31 except NotImplementedError:
32 logging.warning(
/usr/local/lib/python3.7/dist-packages/bs4/element.py in select(self, selector, _candidate_generator, limit)
1495 if tag_name == '':
1496 raise ValueError(
-> 1497 "A pseudo-class must be prefixed with a tag name.")
1498 pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1499 found = []
ValueError: A pseudo-class must be prefixed with a tag name.
lorey commented
Hi @rish-hyun, thanks for posting the issue. The error looks to be in the BeautifulSoup library. Could you post the output of `pip freeze` and the logs?