webscraping-samples

To explore how to do web scraping

Overview

This exercise shows how to extract keywords from a webpage using JavaScript.

Steps

The example code below is run in the Chrome DevTools console. The page is https://forums.macrumors.com/forums/macrumors-com-news-discussion.4/

To start off, figure out which tag holds the list to be extracted. Here, the titles on the page are stored in <h3> elements:

// Exploratory: list every <h3> element on the page (the titles are held in these headings).
document.getElementsByTagName('h3');
// Exploratory: typeof reports 'object' -- the result is an array-like HTMLCollection, not an Array.
typeof document.getElementsByTagName('h3');
// Keep a reference to the collection of <h3> elements.
const allHeadings = document.getElementsByTagName('h3');
// Copy the collection into a real Array so that .map() and other Array methods are available.
const allHeadings_list = Array.from(allHeadings);
// Pull the rendered text out of each <h3> element; one string per title.
const titles = allHeadings_list.map(e => e.innerText);
// Split every title into its words, producing one array of words per title.
// Splitting on runs of whitespace (not a single ' ') and dropping empty strings
// keeps repeated spaces, tabs and line breaks from producing empty "words".
const keywords = titles.map(t => t.split(/\s+/).filter(Boolean));

// Exploratory: inspect the word list of the first title to check the split result.
keywords[0];
// Flatten the per-title word arrays into one single array of words.
const words = [].concat(...keywords);
// Sort in place. The default sort compares UTF-16 code units, so capitalized
// words are ordered before lowercase ones rather than strictly alphabetically.
words.sort();
/**
 * Predicate for Array.prototype.filter that keeps only the first occurrence
 * of each value, so the filtered array contains every value exactly once.
 *
 * @param {*} value - The element currently being tested.
 * @param {number} index - Position of that element in the array.
 * @param {Array} self - The array being filtered.
 * @returns {boolean} true when `index` is the first position at which
 *   `value` appears in `self` (compared with strict equality by indexOf).
 */
function onlyUnique(value, index, self) {
    const firstPosition = self.indexOf(value);
    const isFirstOccurrence = firstPosition === index;
    return isFirstOccurrence;
}

// Drop duplicates: onlyUnique keeps just the first occurrence of each word.
// Declared with const so the snippet does not rely on an implicit global.
const uniqueWords = words.filter(onlyUnique);

Next, I want to filter out the stopwords so that only the meaningful keywords remain:

// Define a list of common English stopwords (all lowercase) to exclude from the keywords.
const stopwords = [
  'about', 'after', 'all', 'also', 'am', 'an', 'and', 'another', 'any', 'are', 'as', 'at', 'be',
  'because', 'been', 'before', 'being', 'between', 'both', 'but', 'by', 'came', 'can',
  'come', 'could', 'did', 'do', 'each', 'for', 'from', 'get', 'got', 'has', 'had',
  'he', 'have', 'her', 'here', 'him', 'himself', 'his', 'how', 'if', 'in', 'into',
  'is', 'it', 'like', 'make', 'many', 'me', 'might', 'more', 'most', 'much', 'must',
  'my', 'never', 'now', 'of', 'on', 'only', 'or', 'other', 'our', 'out', 'over',
  'said', 'same', 'see', 'should', 'since', 'some', 'still', 'such', 'take', 'than',
  'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'those',
  'through', 'to', 'too', 'under', 'up', 'very', 'was', 'way', 'we', 'well', 'were',
  'what', 'where', 'which', 'while', 'who', 'with', 'would', 'you', 'your', 'a', 'i'];
// A Set gives a direct membership lookup instead of scanning the whole list for every word.
const stopwordSet = new Set(stopwords);
// Get the final list of keywords without stopwords. Each word is lower-cased for the
// lookup only, because the stopword list is lowercase and a capitalized word such as
// 'The' would otherwise slip through; the kept words retain their original casing.
const keywords_list = uniqueWords.filter(w => !stopwordSet.has(w.toLowerCase()));