
Nature Picture Spider

Crawls article figures from the Nature website; browsing a large number of Nature paper figures is a good way to learn how to plot and pick color schemes.

Define a search keyword and a page range, collect the article links, then crawl the figures from each article.

Parallel downloading is supported for a large speed-up.

Third-party dependencies: tqdm, requests

Python 2.7

1. Get Nature article links

Define the search keyword, the range of result pages, and the number of parallel download threads. Multi-word keywords are joined with +, e.g. global+drought.

'''
configuration
'''
####### keyword #######
self.keyword = 'global'

####### pages #######
self.page_start = 1
self.page_end = 5

####### threads #######
self.threads = 20
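
For a multi-word search, the keyword is set with the + separator in place:

####### keyword #######
self.keyword = 'global+drought'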

Example search-page URL:

url = 'https://www.nature.com/search?q=global&order=relevance&journal=nature%2Cnclimate%2Cncomms%2Csrep&page=1'

Varying the q and page parameters in the search URL fetches each results page; the article links on each page are then extracted and written to a local txt file. (The example URL above also restricts the search to specific journals via the journal parameter; the code below varies only q and page.)

product_url = 'https://www.nature.com/search?q={}&page={}'.format(self.keyword, page)
# cache key: name the output file after the MD5 of the URL so each page is fetched only once
fname = hashlib.md5(product_url).hexdigest()
f = url_text_dir + '{}.txt'.format(fname)
if os.path.isfile(f):
    return None
request = urllib2.Request(product_url)
response = urllib2.urlopen(request)
body = response.read()
fw = open(f, 'w')
# each search result is an <a href="/articles/..."> tag with an itemprop attribute
p = re.findall('<a href="/articles/.*? itemprop=', body)
for i in p:
    article = i.split('"')[1]
    url_i = 'https://www.nature.com{}'.format(article)
    fw.write(url_i + '\n')
fw.close()
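
The snippets in this README use urllib2 from the Python 2 standard library. With the requests dependency listed above, the same page fetch reduces to a one-liner; a minimal equivalent sketch:

import requests

# equivalent fetch of the search page with requests
body = requests.get(product_url).text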

2. Get the figure links from a Nature article

url = 'https://www.nature.com/articles/s41586-020-2035-0'
html_dir = this_root + 'html\\'
Tools().mk_dir(html_dir)
# cache the article HTML locally, keyed by the MD5 of the URL
fname = hashlib.md5(url).hexdigest()
if os.path.isfile(html_dir + fname + '.html'):
    body = open(html_dir + fname + '.html', 'r').read()
else:
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    body = response.read()
    outf = html_dir + fname + '.html'
    fw = open(outf, 'w')
    fw.write(body)
    fw.close()
# the article metadata is embedded as JSON in a <script data-test="dataLayer"> tag
metadata_p = re.findall('<script data-test="dataLayer">\n.*?\n</script>', body)
metadata = metadata_p[0].split('\n')[1]
metadata = metadata.replace('dataLayer = ', '')
metadata = metadata[:-1]  # drop the trailing ';'
# map the JSON literals to Python names so eval can parse the array
null = None
false = False
true = True
dataLayer = eval(metadata)[0]

article_title = dataLayer['content']['contentInfo']['title']
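
Since dataLayer is plain JSON, the eval trick above can be avoided entirely: json.loads understands null/true/false natively. A minimal equivalent sketch:

import json

# parse the dataLayer JSON array directly, no null/false/true aliases needed
dataLayer = json.loads(metadata)[0]
article_title = dataLayer['content']['contentInfo']['title']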

# figure assets are hosted on media.springernature.com; grab the png and jpg URLs
png_p = re.findall('media.springernature.com.*?png', body)
jpg_p = re.findall('media.springernature.com.*?jpg', body)
# deduplicate: the same figure URL appears several times in the page
png_p = set(png_p)
jpg_p = set(jpg_p)
figs_url = []
for i in png_p:
    fig_i = 'https://{}'.format(i)
    figs_url.append(fig_i)
for i in jpg_p:
    fig_i = 'https://{}'.format(i)
    figs_url.append(fig_i)
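
Equivalently, the two loops collapse into one list comprehension over the union of both sets:

# merge the png and jpg matches and prepend the scheme in one pass
figs_url = ['https://{}'.format(i) for i in png_p | jpg_p]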

3. Download the figures

Sanitize the article title into a valid Windows directory name, create the directory, and save the original title alongside the figures:

# replace characters that are invalid in Windows file names with '.'
invalid_char = '/\\:*"<>|?'
new_char = ''
for char in article_title:
    if char in invalid_char:
        new_char += '.'
    else:
        new_char += char
article_title = new_char
try:
    save_path = this_root + 'jpg\\{}\\{}\\'.format(self.keyword, article_title)
    Tools().mk_dir(save_path, force=True)
except Exception:
    # fall back to an MD5 name if the sanitized title still fails as a path
    article_title_new = hashlib.md5(article_title).hexdigest()
    save_path = this_root + 'jpg\\{}\\{}\\'.format(self.keyword, article_title_new)
    Tools().mk_dir(save_path, force=True)
# keep the original title next to the figures for reference
f_txt = save_path + 'article_title.txt'
fw_txt = open(f_txt, 'w')
fw_txt.write(article_title)
fw_txt.close()
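
As an aside, the character-by-character sanitization above can be written as a single re.sub; a minimal equivalent sketch:

import re

# replace Windows-invalid filename characters with '.'
article_title = re.sub(r'[/\\:*"<>|?]', '.', article_title)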

for url in figs_url:
    try:
        # figure URLs typically end like ..._Fig1_HTML.png, so the second-to-last
        # '_' field gives a short figure name and the extension gives the suffix
        name = url.split('_')[-2]
        suffix = url.split('.')[-1]
        fname = name + '.' + suffix
        if os.path.isfile(save_path + fname):
            continue  # already downloaded
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        body = response.read()
        with open(save_path + fname, 'wb') as f:
            f.write(body)
    except Exception:
        # skip figures that fail to parse or download
        pass

4. Parallel download

Build the argument list (every article URL collected in step 1):

url_text_dir = this_root + 'urls\\{}\\'.format(self.keyword)
all_url = []
# read back every article link written in step 1
for f in os.listdir(url_text_dir):
    fr = open(url_text_dir + f, 'r')
    lines = fr.readlines()
    fr.close()
    for line in lines:
        all_url.append(line.strip())

Run the parallel download; self.kernel_download is the per-article worker (it wraps steps 2 and 3 above) mapped over all_url:

MULTIPROCESS(self.kernel_download,all_url).run(process_or_thread='t',process=self.threads)
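
MULTIPROCESS is a helper class from the author's own toolbox and its implementation is not shown in this README. The sketch below is a hypothetical stand-in for what such a runner might look like, built on multiprocessing.dummy (a thread pool, matching process_or_thread='t') and the tqdm dependency listed above; the class name and run() signature are taken from the call site, everything else is an assumption:

from multiprocessing.dummy import Pool  # thread pool backend
from tqdm import tqdm

class MULTIPROCESS:
    # hypothetical stand-in: maps func over params with a progress bar
    def __init__(self, func, params):
        self.func = func
        self.params = params

    def run(self, process_or_thread='t', process=20):
        pool = Pool(process)
        # imap yields results as they complete, so tqdm can show live progress
        for _ in tqdm(pool.imap(self.func, self.params), total=len(self.params)):
            pass
        pool.close()
        pool.join()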