# 二、完全代码 (Part 2: complete code)
import requests
import os  # kept from the original import line; not used in the visible code
from lxml import etree

# Browser User-Agent so the site serves the normal HTML page instead of
# rejecting the request as a bot.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}


def get_text(url):
    """Fetch *url* and return the response body as text."""
    response = requests.get(url, headers=headers).text
    return response


def nex_page_parse(url):
    """Scrape one joke detail page.

    Prints the title and body to stdout, appends them to '糗事百科.txt',
    and increments the global ``count`` (defined in ``__main__``).
    """
    global count
    response = requests.get(url, headers=headers).text
    soup = etree.HTML(response)
    # The original '//[@id=...]' is invalid XPath and raises XPathEvalError;
    # '//*[@id=...]' ("any element with this id") is the correct form.
    title = soup.xpath('//*[@id="content"]/div/div[2]/h1/text()')
    content = soup.xpath('//*[@id="single-next-link"]/div/text()')
    res = ''.join(content)
    print("正在抓取第%d条糗事..." % count)
    print(url)
    print(title[0].rstrip())
    print(res)
    with open('糗事百科.txt', 'a', encoding='utf-8') as f:
        f.write(title[0] + res)
    print("第%d条糗事抓取完毕!\n" % count)
    count += 1


def get_urls(html):
    """Return all detail-page hrefs found on a listing page *html*.

    Bug fix: the original overwrote ``urls`` on every iteration (returning
    only the last tag's links) and shadowed the builtin ``list``; here the
    results are accumulated with ``extend``.
    """
    soup = etree.HTML(html)
    tags = soup.xpath('//*[@id="content"]/div/div[2]')
    urls = []
    for tag in tags:
        urls.extend(tag.xpath('./div/a[1]/@href'))
    return urls


if __name__ == '__main__':
    count = 1  # running number of jokes scraped so far
    # NOTE(review): range(14) requests pages 0..13; page 0 may not exist on
    # the site — confirm whether range(1, 14) was intended.
    page_urls = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(14)]
    for page_url in page_urls:
        text = get_text(page_url)
        # Use a distinct loop variable: the original reused ``url`` here,
        # clobbering the outer listing-page URL mid-iteration.
        for href in get_urls(text):
            nex_page_parse('https://www.qiushibaike.com' + href)
