/scrapy-net

Scrapy, the web scraping tool – a naive implementation in C#

Primary Language: C#

scrapy.net

// Rule tree describing what to scrape. Each rule pairs a CSS selector with a
// ScrapyRuleType:
//   - Source    : the rule carries a nested ScrapySource with its own rule list
//                 (NOTE(review): presumably the client follows the matched links
//                 and applies the nested rules to each page — confirm against
//                 ScrapyClient).
//   - Text      : the rule has a Name under which the matched text is reported.
//   - Attribute : like Text, but reads the attribute named by `Attribute`.
// Nesting here is: categories -> (category name, pagination) -> product -> fields.
var rules = new List<ScrapyRule>
{
    new ScrapyRule
    {
        Selector = ".list-item a", // categories
        Type = ScrapyRuleType.Source,
        Source = new ScrapySource(new List<ScrapyRule>
        {
            new ScrapyRule
            {
                Selector = ".list-item.selected a", // category name
                Type = ScrapyRuleType.Text,
                Name = "Category"
            },
            new ScrapyRule
            {
                Selector = ".page-next", // pagination
                Type = ScrapyRuleType.Source,
                Source = new ScrapySource(new List<ScrapyRule>
                {
                    new ScrapyRule
                    {
                        Selector = ".product-name a", // product
                        Type = ScrapyRuleType.Source,
                        Source = new ScrapySource(new List<ScrapyRule>
                        {
                            new ScrapyRule
                            {
                                Name = "MetaKeywords", // meta keywords
                                Selector = "meta[name=keywords]",
                                Attribute = "content",
                                Type = ScrapyRuleType.Attribute
                            },
                            new ScrapyRule
                            {
                                Name = "Name", // product name
                                Selector = ".product-details h1",
                                Type = ScrapyRuleType.Text
                            },
                            new ScrapyRule
                            {
                                Name = "Price", // product price
                                Selector = ".price",
                                Type = ScrapyRuleType.Text
                            }
                        }) // end of product field rules
                    }
                }) // end of pagination rules
            }
        }) // end of category rules
    }
};

// Site root, used both as the top-level source URL and as the client's BaseUrl.
const string baseUrl = "https://scrapethissite.com/";

// Top-level source built from `rules`; its Name is also used to build the
// output path below.
var source = new ScrapySource(rules)
{
    Name = "scrapy",
    Url = baseUrl
};

// Location handed to the client through ScrapyOptions.Path.
var path = $@"D:\Scrapy\{source.Name}";

// Receives every item passed to the Dump callback.
// NOTE(review): the element type is `object` because the type Dump hands to its
// callback is not visible here — narrow it to the real content type.
// NOTE(review): List<T> is not thread-safe; if Dump callbacks can run
// concurrently (MaxDegreeOfParallelism is 10), switch to a concurrent collection.
var products = new List<object>();

// init client
var client = new ScrapyClient(new ScrapyOptions
{
    BaseUrl = baseUrl,
    WaitForSourceTimeout = 500, // NOTE(review): unit not shown here, presumably milliseconds — confirm
    MaxDegreeOfParallelism = 10,
    Path = path
})
.Dump((content) =>
{
    // Keep each dumped item in memory for later use.
    products.Add(content);
})
.Log((message) =>
{
    // Forward the client's log messages to the console.
    Console.WriteLine(message);
});

// start scraping
await client.ScrapeAsync(source);