Scrape figures from articles on the Nature website: by studying a large number of Nature paper figures you can learn how to plot and pick color schemes.
Define the search keyword and search range, collect the article links, and scrape the figures from each linked article.
Downloads run in parallel for a dramatic speed-up.
Third-party dependencies: tqdm, requests
python 2.7
Define the search keyword, the range of result pages, and the number of parallel downloads. Multi-word keywords are joined with +, e.g. global+drought:
'''
configuration
'''
####### keyword #######
self.keyword = 'global'
####### pages #######
self.page_start = 1
self.page_end = 5
####### threads #######
self.threads = 20
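The snippets below are fragments of a single script. A minimal preamble they assume (this_root is a hypothetical project path; adjust to your own setup):

import os
import re
import hashlib
import urllib2  # Python 2 only; on Python 3 use urllib.request instead

this_root = 'D:\\nature_spider\\'  # assumption: root directory for all output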
Example search-page URL (the journal parameter restricts results to Nature, Nature Climate Change, Nature Communications, and Scientific Reports):
url = 'https://www.nature.com/search?q=global&order=relevance&journal=nature%2Cnclimate%2Cncomms%2Csrep&page=1'
Each results page is fetched by varying the parameters of the search URL; the article links on the page are then extracted and written to a local txt file (named by the MD5 of the page URL, so already-fetched pages are skipped):
product_url = 'https://www.nature.com/search?q={}&page={}'.format(self.keyword, page)
fname = hashlib.md5(product_url).hexdigest()
f = url_text_dir + '{}.txt'.format(fname)
if os.path.isfile(f):
    # this results page has already been fetched
    return None
request = urllib2.Request(product_url)
response = urllib2.urlopen(request)
body = response.read()
fw = open(f, 'w')
p = re.findall('<a href="/articles/.*? itemprop=', body)
for i in p:
    article = i.split('"')[1]
    url_i = 'https://www.nature.com{}'.format(article)
    fw.write(url_i + '\n')
fw.close()
Next, download a single article page; the HTML is cached locally (again keyed by the URL's MD5) so repeat runs skip the network:
url = 'https://www.nature.com/articles/s41586-020-2035-0'
html_dir = this_root + 'html\\'
Tools().mk_dir(html_dir)
fname = hashlib.md5(url).hexdigest()
if os.path.isfile(html_dir + fname + '.html'):
    body = open(html_dir + fname + '.html', 'r').read()
else:
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    body = response.read()
    outf = html_dir + fname + '.html'
    fw = open(outf, 'w')
    fw.write(body)
    fw.close()
The article title lives in an embedded JavaScript dataLayer object. Strip the assignment and the trailing semicolon, then evaluate the literal with the JS names null/false/true mapped onto Python values:

metadata_p = re.findall('<script data-test="dataLayer">\n.*?\n</script>', body)
metadata = metadata_p[0].split('\n')[1]
metadata = metadata.replace('dataLayer = ', '')
metadata = metadata[:-1]  # drop the trailing ';'
null = None
false = False
true = True
dataLayer = eval(metadata)[0]
article_title = dataLayer['content']['contentInfo']['title']
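For reference, the code above expects the embedded object to look roughly like this (a minimal illustration reconstructed from the fields the code accesses, not the full structure Nature ships):

dataLayer = [{
    'content': {
        'contentInfo': {
            'title': 'Example article title',  # the only field the scraper reads
        },
    },
}]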
Figure files are hosted on media.springernature.com; collect the png and jpg matches and deduplicate them:

png_p = re.findall('media.springernature.com.*?png', body)
jpg_p = re.findall('media.springernature.com.*?jpg', body)
png_p = set(png_p)
jpg_p = set(jpg_p)
figs_url = []
for i in png_p:
    fig_i = 'https://{}'.format(i)
    figs_url.append(fig_i)
for i in jpg_p:
    fig_i = 'https://{}'.format(i)
    figs_url.append(fig_i)
Download the figures. The article title becomes the directory name, so characters that are invalid in Windows filenames are first replaced with '.':
invalid_char = '/\\:*"<>|?'  # characters not allowed in Windows file names
new_char = ''
for char in article_title:
    if char in invalid_char:
        new_char += '.'
    else:
        new_char += char
article_title = new_char
try:
    save_path = this_root + 'jpg\\{}\\{}\\'.format(self.keyword, article_title)
    Tools().mk_dir(save_path, force=True)
except:
    # fall back to an MD5 hash when the title still cannot be used as a
    # directory name, and record the real title in a txt file next to the figures
    article_title_new = hashlib.md5(article_title).hexdigest()
    save_path = this_root + 'jpg\\{}\\{}\\'.format(self.keyword, article_title_new)
    Tools().mk_dir(save_path, force=True)
    f_txt = save_path + 'article_title.txt'
    fw_txt = open(f_txt, 'w')
    fw_txt.write(article_title)
    fw_txt.close()
for url in figs_url:
    try:
        name = url.split('_')[-2]
        suffix = url.split('.')[-1]
        fname = name + '.' + suffix
        if os.path.isfile(save_path + fname):
            continue  # already downloaded
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        body = response.read()
        with open(save_path + fname, 'wb') as f:
            f.write(body)
    except Exception as e:
        pass  # skip figures that fail to download
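The name/suffix split above relies on the naming convention of figure files on media.springernature.com. An illustrative URL (hypothetical path, shown only to demonstrate the split logic the code assumes):

# hypothetical figure URL following the assumed pattern
url = 'https://media.springernature.com/lw685/springer-static/image/art/41586_2020_2035_Fig1_HTML.png'
print(url.split('_')[-2])   # Fig1
print(url.split('.')[-1])   # png  -> saved as Fig1.png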
Build the argument list: read back all the article URLs that were written to the local txt files:
url_text_dir = this_root + 'urls\\{}\\'.format(self.keyword)
all_url = []
for f in os.listdir(url_text_dir):
    fr = open(url_text_dir + f, 'r')
    lines = fr.readlines()
    fr.close()
    for line in lines:
        line = line.split('\n')[0]  # strip the trailing newline
        all_url.append(line)
Run the parallel download (process_or_thread='t' selects a thread pool):
MULTIPROCESS(self.kernel_download,all_url).run(process_or_thread='t',process=self.threads)
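Tools and MULTIPROCESS are helpers from the author's own toolkit and are not shown here. A minimal sketch of the interface the calls above assume, built on multiprocessing.dummy (a thread pool, matching process_or_thread='t'; the process branch is omitted) with a tqdm progress bar:

import os
from multiprocessing.dummy import Pool  # thread pool with the multiprocessing API
from tqdm import tqdm

class Tools:
    def mk_dir(self, d, force=False):
        # create the directory if missing; force=True also creates parents
        if not os.path.isdir(d):
            if force:
                os.makedirs(d)
            else:
                os.mkdir(d)

class MULTIPROCESS:
    def __init__(self, func, args):
        self.func = func
        self.args = args

    def run(self, process_or_thread='t', process=20):
        # map func over args with `process` worker threads and a progress bar
        pool = Pool(process)
        results = list(tqdm(pool.imap(self.func, self.args), total=len(self.args)))
        pool.close()
        pool.join()
        return results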