https://www.edureka.co/blog/web-scraping-with-python/ https://stackoverflow.com/questions/59130672/how-to-scrape-pdfs-using-python-specific-content-only
https://www.google.com/robots.txt http://www.robotstxt.org/robotstxt.html
https://librarycarpentry.org/lc-webscraping/setup
#task one: scrape this one; update changes with AJAX if not updated
Set of supported locator strategies.\n ', 'ID': 'id', 'XPATH': 'xpath', 'LINK_TEXT': 'link text', 'PARTIAL_LINK_TEXT': 'partial link text', 'NAME': 'name', 'TAG_NAME': 'tag name', 'CLASS_NAME': 'class name', 'CSS_SELECTOR': 'css selector', '__dict__': <attribute '__dict__' of 'By' objects>, '__weakref__': <attribute '__weakref
/**
 * Collects the elements whose text equals `text`, compared case-insensitively
 * and with surrounding whitespace ignored.
 *
 * Candidates are gathered tag by tag via `document.querySelectorAll`, so the
 * result is ordered by the position of the tag in `possibleElms` first and by
 * document order within each tag. An element is considered at most once even
 * if several selectors match it.
 *
 * @param {string} text - Text to look for.
 * @param {string[]} [possibleElms] - Tag names (or any CSS selectors) to search.
 * @returns {Element[]} Matching elements; empty when nothing matches.
 */
const getElementByText = (text, possibleElms = ['div', 'span', 'h5', 'h4', 'button', 'p', 'h6']) => {
  const searchText = text.toLowerCase().trim();
  // A Set gives O(1) de-duplication of candidates across selectors.
  const seen = new Set();
  const resultElements = [];

  possibleElms.forEach((elmType) => {
    document.querySelectorAll(elmType).forEach((elm) => {
      if (seen.has(elm)) {
        return;
      }
      seen.add(elm);

      // Not every node exposes innerText (e.g. SVG elements); fall back to
      // textContent instead of throwing on `undefined.toLowerCase()`.
      const rawText = typeof elm.innerText === 'string' ? elm.innerText : (elm.textContent || '');
      if (rawText.toLowerCase().trim() === searchText) {
        resultElements.push(elm);
      }
    });
  });

  return resultElements;
};
undefined
// Example call: gather the elements whose text is "sign in" once lower-cased and trimmed.
let maybe = getElementByText('Sign in');
https://simplehtmldom.sourceforge.io/
curl -o curl_test.txt https://www.google.com
#first version: scrape Google image and title
<?php
// First version: load the Google home page with Simple HTML DOM and echo
// the text of one link, the first image, and the first <div>.
require 'simple_html_dom.php';

$html = file_get_html('http://www.google.com/');
if (!$html) {
    // Without a parsed document every find() below would be a call on a non-object.
    exit("Could not load http://www.google.com/\n");
}

// find($selector, $index) is zero-based: index 1 is the second <a> on the page.
$title = $html->find('a', 1);
$image = $html->find('img', 0);
$div = $html->find('div', 0);

// Each lookup may come back empty when the page has no such element,
// so only echo the pieces that were actually found.
if ($title) {
    echo $title->plaintext."<br>\n";
}
if ($image) {
    // The src is prefixed with the site origin, i.e. it is treated as site-relative.
    echo '<img src="https://www.google.com'.$image->src .'">';
}
if ($div) {
    echo $div;
}
?>
https://blog.apify.com/unofficial-google-search-api-from-apify-22a20537a951
# Serialize an email message to a string by writing it into an in-memory text buffer.
# NOTE(review): `msg` is not defined in this snippet; presumably an
# email.message.Message built elsewhere - confirm.
from io import StringIO
from email.generator import Generator
# In-memory text buffer that the generator writes into.
fp = StringIO()
g = Generator(fp, mangle_from_=True, maxheaderlen=60)
# Write the message through the generator into fp.
g.flatten(msg)
# Everything written to the buffer so far, as one str.
text = fp.getvalue()
从Python 3.0开始,StringIO和cStringIO模块已经取消。通过import io模块代替,分别使用io.StringIO或io.BytesIO处理文本和数据。从Python 3邮件流文档能看到相关实现StringIO的代码为:
# Python 3 form of the message-flattening example: StringIO is imported from io.
# NOTE(review): `msg` is not defined in this snippet; presumably an
# email.message.Message built elsewhere - confirm.
from io import StringIO
from email.generator import Generator
# In-memory text buffer that the generator writes into.
fp = StringIO()
g = Generator(fp, mangle_from_=True, maxheaderlen=60)
# Write the message through the generator into fp.
g.flatten(msg)
# Everything written to the buffer so far, as one str.
text = fp.getvalue()
0
I like the Dom Crawler library. Very easy to use, has lots of options like:
// Select the <p> elements that are direct children of <body>, then run the
// closure once per selected node with the node and its index.
$crawler = $crawler
->filter('body > p')
->reduce(function (Crawler $node, $i) {
// True for even indices (0, 2, 4, ...), i.e. every other node.
// NOTE(review): reduce() presumably keeps only nodes whose closure returns true - confirm in the DomCrawler docs.
return ($i % 2) == 0;
});
https://selenium-python.readthedocs.io/api.html
/**
 * Schedules the page body to be resized to a fixed height of 2000px
 * three seconds after the call. Returns nothing.
 */
const x = () => {
  const delayMs = 3000;
  const stretchBody = () => {
    document.body.style.height = '2000px';
  };
  setTimeout(stretchBody, delayMs);
};
/**
 * Schedules a callback four seconds after the call that logs the body's
 * scroll height and then scrolls the window down to that offset.
 */
const n = () => {
  const delayMs = 4000;
  const scrollToBottom = () => {
    const { body } = document;
    console.log(body.scrollHeight);
    window.scrollTo(0, body.scrollHeight);
  };
  setTimeout(scrollToBottom, delayMs);
};
// Start both timers: x schedules its callback for 3000 ms and n for 4000 ms,
// so the scroll in n is scheduled to run after the height change in x.
x();
n();
driver.execute_script
# Retry-loop experiment: pick a proxy and, whenever the attempt raises
# IndexError, advance to the next entry of proxy_list and try again.
proxy_list = ['1', '2', '3', '4', '5', '6']
erro_counter = 0  # index of the proxy currently being tried
proxy = ''
proxys = []
mbool = True  # loop flag: stays True while another attempt is needed
while mbool:
    try:
        proxy = proxy_list[erro_counter]
        # Raises IndexError while proxys is still empty; this is the
        # simulated failure that drives the retry path below.
        print(proxys[0])
        print(proxy)
        mbool = False
    except IndexError:
        erro_counter += 1
        print('error')
        if erro_counter >= len(proxy_list):
            # No proxies left: stop here instead of letting the lookup below
            # raise a second, uncaught IndexError from inside the handler.
            break
        proxy = proxy_list[erro_counter]
        proxys.append('handling')
        mbool = True
// Look up the first element that carries all of these classes.
var element = document.querySelector('.top-nav-item.wt-pb-xs-2.wt-mr-xs-2.wt-display-flex-xs.wt-align-items-center');
// Log a message every time a mouseover event reaches that element.
element.addEventListener('mouseover', function() {
console.log('Event triggered');
});
// Build a synthetic mouseover event that bubbles, is cancelable, and uses this window as its view.
// NOTE(review): the event is only constructed here; nothing in this snippet dispatches it -
// presumably element.dispatchEvent(event) is meant to follow; confirm.
var event = new MouseEvent('mouseover', {
'view': window,
'bubbles': true,
'cancelable': true
});
priceString = "Rs249.5"


def advancedSplit(unformatedtext):
    """Return the part of a price string that starts at its first digit.

    The text is scanned left to right for the first ASCII digit (0-9).

    Args:
        unformatedtext: Raw price text such as "Rs249.5".

    Returns:
        A one-element list holding the substring from the first digit to the
        end of the text (e.g. ["249.5"]), or an empty list when the text
        contains no digit.
    """
    # TODO(review): the text before the first digit is the currency prefix
    # (e.g. "Rs"); it is not part of the return value - confirm whether
    # callers need it (a mapping such as "Rs"/"RS" -> "INR").
    ascii_digits = "0123456789"
    for position, char in enumerate(unformatedtext):
        if char in ascii_digits:
            return [unformatedtext[position:]]
    return []


print(advancedSplit(priceString))