web-Scraping-test

Create a web application which takes a website URL as an input and provides general information about the contents of the page:

  • HTML Version
  • Page Title
  • Headings count by level
  • Number of internal and external links
  • Number of inaccessible links
  • If a page contains a login form

To start the project

make -B start-app

or

go run cmd/web-Scraping-test/main.go serve

APIs

Visit URL

curl --request POST \
  --url http://localhost:8080/apis/visit_url \
  --header 'content-type: application/json' \
  --data '{
 "max_depth": 2,
 "url": "https://www.google.com/"
}'

Input Description

  • URL (POST) : /apis/visit_url
{
 "max_depth": 1,
 "url": "http://go-colly.org/"
}

// MaxDepth limits the recursion depth of visited URLs.

// Set it to 0 for infinite recursion (default).

// url is the address of the page to visit.

Output

  • HTTP status: 200 OK
{
  "data": {
    "html_version": "",
    "page_title": "Scraping Framework for Golang",
    "headings": {
      "h1": 0,
      "h2": 6,
      "h3": 0,
      "h4": 1,
      "h5": 0,
      "h6": 0
    },
    "external_internal_links_amount": 25,
    "inaccessible_links_amount": 0,
    "contains_login_form": false,
    "page_info": {
      "links": {
        "http://go-colly.org/": 3,
        "http://go-colly.org/articles/": 2,
        "http://go-colly.org/contact/": 1,
        "http://go-colly.org/datasets/": 2,
        "http://go-colly.org/docs/": 4,
        "http://go-colly.org/services/": 3,
        "http://go-colly.org/sitemap.xml": 1,
        "https://github.com/gocolly/colly": 5,
        "https://github.com/gocolly/colly/blob/master/LICENSE.txt": 1,
        "https://github.com/gocolly/site/": 1,
        "https://godoc.org/github.com/gocolly/colly": 2
      }
    }
  },
  "success": true
}

Project Structure

.
├── cmd
│   └── web-Scraping-test
│       └── main.go
├── dto
│   └── domain.go
├── _exemple
│   ├── colly_basic_exemple.go
│   └── link_exemple.go
├── go.mod
├── go.sum
├── Makefile
├── pkg
│   └── crawl.go
├── README.md
└── svc
    ├── cmd
    │   └── serve
    │       └── serve.go
    ├── configs
    │   └── configs.go
    └── rest
        ├── handlers.go
        ├── response.go
        └── server.go

10 directories, 14 files
package pkg

import (
 "time"
 "web-Scraping-test/dto"

 "github.com/PuerkitoBio/goquery"
 "github.com/gocolly/colly"
 "github.com/sirupsen/logrus"
)

// Crawl visits domain.URL with a colly collector and aggregates statistics
// about the pages it reaches into a response created by dto.NewDomainResponce:
// HTML version, page title, heading counts per level, link counts, the number
// of failed requests and whether a password input was seen.
//
// domain.MaxDepth is handed to the collector as its recursion limit. Every
// page processed by the collector feeds the same response, so the counters
// accumulate across pages while HTMLVersion and PageTitle hold the values of
// the page handled last.
//
// Log receives one line per request and one line per failure. Errors are
// logged rather than returned; the response is returned in every case.
func Crawl(Log *logrus.Logger, domain dto.Domain) *dto.DomainResponce {
	res := dto.NewDomainResponce()
	c := colly.NewCollector()

	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	c.MaxDepth = domain.MaxDepth

	// A LimitRule must carry a domain pattern; without one Limit rejects the
	// rule and no delay is applied. "*" applies the delay to every domain.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Delay:       1 * time.Second, // Fixed delay between requests.
		RandomDelay: 1 * time.Second, // Additional random delay on top.
	}); err != nil {
		Log.Errorln("Something went wrong:", err)
	}

	// An xmlns attribute on <html> is treated as a pre-HTML5 (XHTML) marker.
	c.OnHTML("html", func(e *colly.HTMLElement) {
		if e.Attr("xmlns") != "" {
			res.HTMLVersion = "< 5"
		} else {
			res.HTMLVersion = "5"
		}
	})

	c.OnHTML("title", func(e *colly.HTMLElement) {
		res.PageTitle = e.Text
	})

	// A password field is taken as evidence of a login form.
	c.OnHTML(`input[type="password"]`, func(e *colly.HTMLElement) {
		res.ContainsLoginForm = true
	})

	// Count every heading exactly once per page, keyed by its tag name
	// ("h1" .. "h6"), regardless of how deeply it is nested.
	c.OnHTML("html", func(e *colly.HTMLElement) {
		e.DOM.Find("h1, h2, h3, h4, h5, h6").Each(func(_ int, s *goquery.Selection) {
			res.Headings[goquery.NodeName(s)]++
		})
	})

	// Count each resolvable link, then try to follow it.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			return
		}
		res.PageInfo.Links[link]++
		res.ExternalAndInternalLinksAmount++

		// Visit also refuses links for expected reasons (already visited,
		// depth limit reached), so a refusal is only worth a debug line.
		// Failed requests are counted by the OnError callback below.
		if err := e.Request.Visit(link); err != nil {
			Log.Debugln("Skipping", link, err)
		}
	})

	c.OnRequest(func(r *colly.Request) {
		Log.Println("Visiting", r.URL)
	})

	c.OnError(func(_ *colly.Response, err error) {
		Log.Errorln("Something went wrong:", err)
		res.InaccessibleLinksAmount++
	})

	// Failed requests have already been reported through OnError; this also
	// surfaces start URLs that are rejected before any request is made.
	if err := c.Visit(domain.URL); err != nil {
		Log.Errorln("Something went wrong:", err)
	}

	return res
}