Node-based web scraping tool that displays the top N most frequently occurring words on a site. Data analysts beware: here comes the ultimate word counter.
Check out the deployed version: https://ttt-se-intern.herokuapp.com/
Libraries and Plugins used:
- Express Js
- mongoose
- node-fetch
var express = require("express"),
bodyParser = require("body-parser"),
app = express(),
mongoose = require("mongoose"),
fetch = require("node-fetch");
bodyParser and mongoose initial setup
// Database: point mongoose at the global Promise implementation, then open
// the connection to the local "ttt-se" database.
mongoose.Promise = global.Promise;
mongoose.connect("mongodb://localhost:27017/ttt-se");

// Request parsing: accept both URL-encoded form bodies and JSON bodies.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// Rendering: use EJS as the view engine and serve static files from the
// "public" directory that sits next to this file.
app.set("view engine", "ejs");
app.use(express.static(`${__dirname}/public`));
app GET for the index page
// GET /: render the "index" view.
app.get("/", (req, res) => {
  res.render("index");
});
Index page aka ( The main screen ).
Submitting the form on the above page leads to a POST request to the /submit route
// POST /submit: download the hosted text file, count how often each word
// occurs (case-insensitively), and render the "sorted" view with the words
// ordered from most to least frequent, together with the requested N.
// If the download or processing fails, a 500 response is sent instead of
// leaving the request hanging.
app.post("/submit", function(req, res){
  // N is the "hN" field of the request body; it is handed to the view
  // unchanged, and the view decides whether it is usable.
  const N = req.body.hN;
  console.log(N);
  fetch("http://terriblytinytales.com/test.txt").then(function(response){
    // Do not count the words of an error page as if it were the text file.
    if (!response.ok) {
      throw new Error("Unexpected response status " + response.status);
    }
    return response.text();
  }).then(function(body){
    // Null-prototype object: words such as "constructor" or "__proto__" must
    // not collide with members inherited from Object.prototype.
    const wordCount = Object.create(null);
    // Split on runs of whitespace and drop empty strings so that consecutive
    // or leading/trailing whitespace does not produce an empty "word".
    const words = body.split(/\s+/).filter(Boolean);
    for (let i = 0; i < words.length; i++) {
      const word = words[i].toLowerCase();
      wordCount[word] = (wordCount[word] || 0) + 1;
    }
    // Declared locally; an undeclared assignment would create a global that
    // is shared between concurrent requests.
    const keysSorted = Object.keys(wordCount).sort(function(a, b){
      return wordCount[b] - wordCount[a];
    });
    res.render("sorted", {keysSorted: keysSorted, N: N});
  }).catch(function(err){
    console.error(err);
    // Only answer if nothing has been sent yet.
    if (!res.headersSent) {
      res.status(500).send("Unable to fetch or process the word list.");
    }
  });
});
The data from the hosted text file is retrieved using the node module node-fetch
fetch("http://terriblytinytales.com/test.txt").then(function(res){
return res.text();
}).then(function(body){}
Once the hosted text file is retrieved, its words are separated using the JavaScript split method.
var words = body.split(/\s/);
A loop iterates over all of the words, and the count of each word is stored in an object called "wordCount"
for(var i = 0;i < words.length;i++)
wordCount[words[i].toLowerCase()] = (wordCount[words[i].toLowerCase()] || 0) + 1;
The wordCount object has each word as a property and its number of occurrences as the value
{
"word": occurrences
}
The keys of the wordCount object are then sorted in descending order of their occurrence count
keysSorted = Object.keys(wordCount).sort(function(a, b){return wordCount[b] - wordCount[a]});
Finally, the sorted page is rendered; the sorted keys and the user-requested N value are passed to the view through res.render
res.render("sorted", {keysSorted: keysSorted, N: N});
On the final sorted page, the most frequently occurring words are displayed, up to the user-provided N value.
A check is performed at the beginning of the sorted page to see if the user-provided value is valid.
A loop runs on the server side, inside the template, over the keysSorted array up to the requested N value.
<% if(N < keysSorted.length && N > 0){ %>
If the user provides an invalid entry, a basic (okayish) error page is rendered