价格爬虫故障
Closed this issue · 1 comments
klren0312 commented
检查原因
klren0312 commented
使用工具
https://github.com/klren0312/ZSpider
验证问题代码
const cheerio = require('cheerio')
const tableParser = require('cheerio-tableparser')
const rp = require('request-promise')
const complete = new Set()
/**
* '2':'圆钢'
* '3':'螺纹(14mm)'
* '4':'螺纹(12mm)'
* '6':'盘螺(10mm)'
* '8':'盘螺(6mm)'
* '9':'盘螺(8mm)'
* '10':'螺纹(16mm)'
* '11':'螺纹(18mm)'
* '12':'螺纹(20mm)'
* '13':'螺纹(22mm)'
* '14':'螺纹(25mm)'
* '15':'螺纹(28mm)'
* '16':'螺纹(32mm)'
*/
// 钢材字典
const ironDict = {
'螺纹钢Φ12-14': [3, 4],
'螺纹钢Φ16-25': [10, 11, 12, 13, 14],
'螺纹钢Ф28-32': [15, 16],
'盘螺Φ8-10': [6, 9],
'盘螺Φ6': [8]
}
// 每天晚上11点半更新价格
const today = `07-03`
console.log(today)
async function startup () {
const link = await checkNew()
console.log(link)
const res = await getPrice(link)
console.log(res)
const filterRes = res.filter(v => v.name.indexOf('高线') === -1)
console.log(filterRes)
filterRes.forEach(v => {
console.log(v.name + v.size)
console.log(ironDict)
const ids = ironDict[v.name + v.size]
console.log(ids)
for (let i = 0, len = ids.length; i < len; i++) {
console.log({
id: ids[i],
price: v.price
})
}
})
}
startup()
/**
* 判断是否有当天数据
*/
async function checkNew () {
const listUrl = 'http://hq.zgw.com/hefei/jiancai.html'
const body = await rp({ uri: listUrl })
const $ = cheerio.load(body)
let list = $('body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div.lm_list > ul:nth-child(2)').children('li:first-child').text().trim()
if (list.match(/(\d+-\d+)/)[0] === `07-03`) {
const link = $('body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div.lm_list > ul:nth-child(2) li:first-child a').attr('href')
return `http://hq.zgw.com${link}`
} else {
return false
}
}
/**
* 获取价格
* @param {string} url 价格页链接
*/
async function getPrice (url) {
const tableSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div > div.lm_m > div.lm_mt > div.article > table'
const body = await rp({ uri: url })
const $ = cheerio.load(body)
const priceCol = spiderTable(tableSelector, $, false)[4]
priceCol.shift()
const pricesFormat = priceCol.map(v => $(v).data().type)
const otherData = spiderTable(tableSelector, $, true)
const nameCol = otherData[0]
const sizeCol = otherData[1]
const modelCol = otherData[2]
const typeCol = otherData[3]
nameCol.shift()
sizeCol.shift()
modelCol.shift()
typeCol.shift()
const ironArr = []
for (let i = 0, len = nameCol.length; i < len; i++) {
ironArr.push({
name: nameCol[i],
size: sizeCol[i],
model: modelCol[i],
type: typeCol[i],
price: pricesFormat[i]
})
}
let afterArr = ironArr.filter(v => v.type.indexOf('马钢') !== -1)
afterArr = afterArr.filter(v => v.model !== 'HRB400E')
afterArr = afterArr.filter(v => v.name.indexOf('高线') === -1)
return afterArr
}
/**
* 数字补零
* @param {number} num 数字
*/
function zeroPadding (num) {
let n = num.toString()
if (n.length === 1) {
return `0${n}`
} else {
return n
}
}
/**
* 解析表格数据
* @param {string} tableSelector 表格选择器
* @param {function} $ cheerio
* @param {boolean} noHtml 是否需要清除html标签
*/
function spiderTable (tableSelector, $, noHtml) {
tableParser($)
return $(tableSelector).parsetable(false, false, noHtml)
}