How it was created
- Set the HTTP_PROXY and HTTPS_PROXY environment variables before opening VS Code
- Create a devcontainer.env file with SERVER, USERNAME, and PASSWORD variables; these hold the proxy settings used by scrapy-playwright
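A devcontainer.env along these lines matches the variable names read in settings.py further down; the values here are placeholders, not real credentials:
SERVER=http://proxy.example.com:8080
USERNAME=proxy_user
PASSWORD=proxy_password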
- Prepare devcontainer.json, Dockerfile, and .gitignore
- Create a downloaded folder in /workspaces/macro_scrapy/ (this becomes FILES_STORE below)
- Run:
scrapy startproject macro_scrapy
cd macro_scrapy
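scrapy startproject generates Scrapy's standard skeleton; the steps below edit settings.py, items.py, and pipelines.py inside it:
macro_scrapy/
    scrapy.cfg
    macro_scrapy/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/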
- Changes in settings.py (add import os at the top, since the proxy settings read environment variables):
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    "macro_scrapy.pipelines.MacroScrapyPipeline": 1,
}
FILES_STORE = r"/workspaces/macro_scrapy/downloaded"
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
PLAYWRIGHT_LAUNCH_OPTIONS = {
    "headless": True,
    "proxy": {
        "server": os.environ.get("SERVER"),
        "username": os.environ.get("USERNAME"),
        "password": os.environ.get("PASSWORD"),
    },
}
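scrapy-playwright also relies on Twisted's asyncio reactor; on Scrapy versions where it is not already the default, settings.py would additionally need:
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"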
- Changes in items.py:
class MacroScrapyItem(scrapy.Item):
    # fields consumed by the files pipeline, plus a custom name prefix
    file_urls = scrapy.Field()
    original_file_name = scrapy.Field()
    files = scrapy.Field()

class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
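A minimal spider sketch showing how these fields would be filled; the spider name, URL, and selector are placeholders, not part of the project:
import scrapy
from macro_scrapy.items import MacroScrapyItem

class ExampleFilesSpider(scrapy.Spider):
    name = "example_files"  # placeholder name

    def start_requests(self):
        # "playwright": True routes the request through scrapy-playwright
        yield scrapy.Request("https://example.com/reports", meta={"playwright": True})

    def parse(self, response):
        for href in response.css("a::attr(href)").getall():  # placeholder selector
            item = MacroScrapyItem()
            item["file_urls"] = [response.urljoin(href)]  # downloaded by the pipeline
            item["original_file_name"] = ["report"]       # used as a file name prefix
            yield item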
- Changes in pipelines.py:
from datetime import datetime
from scrapy.pipelines.files import FilesPipeline

class MacroScrapyPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # <YYYYMMDD>_<original_file_name>_<last URL segment>
        file_name: str = datetime.today().strftime('%Y%m%d') + "_" + item['original_file_name'][0] + "_" + request.url.split("/")[-1]
        return file_name
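With this file_path, a file requested from, for example, https://example.com/data.xlsx with original_file_name ["report"] on 2024-05-01 would be saved under FILES_STORE as 20240501_report_data.xlsx.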