/drawler

A lightweight Java framework for web pages crawling and datas matching.

Primary LanguageJava

drawler

A lightweght Java framework for web pages crawling and datas matching. This is a beta version, so some errors may occur when running.

Example 1: Crawl web pages from Internet

import drawler.crawl.Drawler;

public class HelloCrawler throws Exception {

	public static void main(String args[]) {
		
		Drawler drawler = new Drawler();
		
		// The start pages that want to crawl
		String[] startURLs = new String[] {
			"http://www.dianping.com/member/4200487/reviews?pg=1&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0",
			"http://www.dianping.com/member/2691446/reviews?pg=1&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0",
			"http://www.dianping.com/member/437769/reviews?pg=1&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0",
			"http://www.dianping.com/member/50227/reviews?pg=1&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0",
			"http://www.dianping.com/member/19148264/reviews?pg=1&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0"
		};
		
		// Add the URL regex that want to scan for more links
		drawler.urlScan.add("http://www\\.dianping\\.com/member/[0-9]+/reviews\\?pg=[0-9]+&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0");
		// Add the URL regex that want to download
		drawler.urlDownload.add("http://www\\.dianping\\.com/member/[0-9]+/reviews\\?pg=[0-9]+&reviewCityId=0&reviewShopType=0&c=0&shopTypeIndex=0");
		
		// Set the scan depth of the drawler
		drawler.scanDepth = 100;
		// Set the count of pages to download in scanned page
		drawler.downloadCount = 10;
		
		// Set the number of thread to run the program
		drawler.maxThreads = 10;
		
		// Set the milliseconds pause per N connections
		drawler.toPause = true;
		drawler.pauseperconnections = 200;
		drawler.pausemseconds = 500000;
		
		// The folder to save the downloaded web pages
		drawler.localpath = "cache/www_dianping_com/";
		
		// Start the crawler
		for (String url : startURLs)
			drawler.followLink(url);
	}
	
}

Example 2: Find objective datas from downloaded web pages

import drawler.match.Matcher;
import drawler.match.io.FileInput;
import drawler.match.sql.SQLOutput;
import drawler.match.pattern.Filter;
import drawler.match.pattern.html.HTMLPattern;

public class HelloMatcher {

	public static void main(String[] args) {
		
		// The matcher object to find datas in web pages;
		Matcher matcher = new Matcher();

		// The file system source object. It can be expanded to other sources, like databases or online pages
		FileInput input = new FileInput();
		input.addPath("cache/www_dianping_com/");
		matcher.setInput(input);
		
		// The mysql output obejct. Can be expanded to other outputer, like file system.
		// The arguments are: database URL, user name, password, database name, table name
		SQLOutput output = new SQLOutput("localhost", "root", "root", "www_dianping_com", "peoples");
		matcher.setOutput(output);
		
		matcher.addPattern("shopID", "INT", new HTMLPattern(
			"<a href=\"http://www.dianping.com/shop/",
			"[0-9]+",
			"\" class=\"J_rpttitle\" title=\"\""));
			
		matcher.addPattern("taste", "INT", new HTMLPattern(
			"<span>口味:",
			"[0-9]",
			"<em class"));
			
		matcher.addPattern("environment", "INT", new HTMLPattern(
			"<span>环境:",
			"[0-9]",
			"<em class"));
			
		matcher.addPattern("service", "INT", new HTMLPattern(
			"<span>服务:",
			"[0-9]",
			"<em class"));
			
		matcher.addPattern("comment", "VARCHAR(1000)", new HTMLPattern(
			"<div class=\"mode-tc comm-entry\">",
			".*?",
			"</div>",
			new Filter() {
				public String filter(String content) {
					return content.replace("<br/>","");
				}
			}));
			
		matcher.addPattern("date", "DATE", new HTMLPattern(
			"<span class=\"col-exp\">.*?于",
			".*?",
			"</span>",
			new Filter() {
				public String filter(String content) {
					return "20" + content;
				}
			}));
			
		matcher.match();
	}
	
}