第一个网页的内容可以这样爬,你自己去学。
我把第一个网页里对应省份的数据爬出来了。
代码如下:
import requests
from lxml import etree
# Fetch the page at `url`, pull four parallel lists out of it with XPath
# (link, city name, and two numeric-looking fields) and print one dict per
# list entry. The surrounding post describes this as the per-province data
# from the first page.
#
# Relies on the `requests` and `lxml.etree` imports at the top of the file.

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
}
url = 'http://liuyan.people.com.cn/home?p=0'

# A timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() stops early instead of parsing an HTTP error page.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
# Decode the body with the encoding detected from its content rather than
# the one guessed from the response headers.
r.encoding = r.apparent_encoding
dom = etree.HTML(r.text)

# Absolute XPaths: they are tied to the exact layout of the page and will
# silently match nothing if that layout changes.
xpath_link = '/html/body/div[5]/div[4]/ul/li/b/a/@href'
xpath_city = '/html/body/div[5]/div[4]/ul/li/b/a/text()'
# NOTE(review): presumably the message count and the reply count for each
# entry -- confirm against the live page.
xpath_sum = '/html/body/div[5]/div[4]/ul/li/p[1]/i/text()'
xpath_reply_sum = '/html/body/div[5]/div[4]/ul/li/p[2]/i/text()'

links = dom.xpath(xpath_link)
citys = dom.xpath(xpath_city)
# Named `sums` (not `sum`) so the builtin sum() is not shadowed.
sums = dom.xpath(xpath_sum)
reply_sum = dom.xpath(xpath_reply_sum)

# Combine the four parallel lists into one record per entry. zip() stops at
# the shortest list, so a partially matched page yields fewer records rather
# than an IndexError.
data = [
    {'link': link, 'city': city, 'sum': total, 'reply_sum': replies}
    for link, city, total, replies in zip(links, citys, sums, reply_sum)
]

for record in data:
    print(record)
--
FROM 111.182.26.*