1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
import json
from pprint import pprint

import requests
from lxml import etree
class qiubai:
    """Scraper for the post listing pages of qiushibaike.cc.

    For every listing page it extracts one record per post, writes the page's
    records to a JSON file, and downloads each post's avatar icon and (when
    present) photo into numbered ``.jpg`` files. All files are created in the
    current working directory.
    """

    # Seconds to wait on the server before giving up. Without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, page_count=2):
        """Set up the URL template and request headers.

        Args:
            page_count: Number of listing pages to scrape, starting at page 1.
                Defaults to 2.
        """
        self.url_temp = "http://www.qiushibaike.cc/?&p={}"
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/79.0.3945.130 Safari/537.36"
            )
        }
        self.page_count = page_count

    def get_url_list(self):
        """Return the listing-page URLs for pages 1..``page_count``."""
        return [
            self.url_temp.format(page)
            for page in range(1, self.page_count + 1)
        ]

    def _get(self, url):
        """Send a GET request with the scraper's headers and timeout.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status (so an error page is never mistaken for
                real content).
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response

    def parse_url(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text."""
        return self._get(url).content.decode()

    def parse_photo_url(self, url):
        """Fetch ``url`` and return the raw response body as bytes."""
        return self._get(url).content

    @staticmethod
    def _first(nodes):
        """Return the first XPath match, or ``None`` when nothing matched."""
        return nodes[0] if nodes else None

    def get_content_list(self, html_str):
        """Extract one record per post from a listing page.

        Args:
            html_str: HTML source of a listing page.

        Returns:
            A list of dicts with the keys ``title``, ``user``, ``content``,
            ``time``, ``good``, ``bad``, ``icon_url`` and ``photo_url``.
            ``content`` is the list of matched text nodes; every other value
            is the first XPath match, or ``None`` when the post has no such
            element (so one incomplete post does not abort the whole page).
        """
        html_element = etree.HTML(html_str)
        content_list = []
        for div in html_element.xpath("//div[@class='block untagged']"):
            first = self._first
            content_list.append({
                "title": first(div.xpath(".//a[contains(@id,'title')]/text()")),
                "user": first(div.xpath(".//img/@alt")),
                "content": div.xpath(
                    ".//div[@class='content']/p/text()|.//div[@class='content']/text()"
                ),
                "time": first(div.xpath(".//div[@class='content']/@title")),
                "good": first(div.xpath(".//div[@class='up']/a/text()")),
                "bad": first(div.xpath(".//div[@class='down']/a/text()")),
                "icon_url": first(
                    div.xpath(".//div[@style='float:right;']/img/@src")
                ),
                "photo_url": first(div.xpath(".//div[@class='thumb']/img/@src")),
            })
        return content_list

    def save_content(self, content_list, num):
        """Write a page's records to ``第{num}页.json`` as one JSON array.

        The file is overwritten rather than appended to, and the records are
        written as a single array, so the result is always a valid JSON
        document that ``json.load`` can read back.

        Args:
            content_list: Records of one page, as returned by
                ``get_content_list``.
            num: 1-based page number used in the file name.
        """
        file_path = "第{}页.json".format(num)
        with open(file_path, "w", encoding="utf8") as f:
            json.dump(content_list, f, ensure_ascii=False, indent=2)

    @staticmethod
    def _save_bytes(file_path, data):
        """Write ``data`` to ``file_path`` in binary mode."""
        with open(file_path, "wb") as f:
            f.write(data)

    def save_icon(self, icon_byt, num):
        """Write avatar image bytes to ``icon第{num}张.jpg``."""
        self._save_bytes("icon第{}张.jpg".format(num), icon_byt)

    def save_photo(self, photo_byt, num):
        """Write post photo bytes to ``photo第{num}张.jpg``."""
        self._save_bytes("photo第{}张.jpg".format(num), photo_byt)

    def run(self):
        """Scrape every listing page and save its records, icons and photos.

        Icons and photos are numbered independently, starting at 1 and
        counting across all pages in the order the posts are encountered.
        """
        icon_count = 0
        photo_count = 0
        for page_num, url in enumerate(self.get_url_list(), start=1):
            content_list = self.get_content_list(self.parse_url(url))
            self.save_content(content_list, page_num)
            for content in content_list:
                icon_url = content["icon_url"]
                if icon_url is not None:
                    icon_count += 1
                    self.save_icon(self.parse_photo_url(icon_url), icon_count)
                # Not every post carries a photo.
                photo_url = content["photo_url"]
                if photo_url is not None:
                    photo_count += 1
                    self.save_photo(
                        self.parse_photo_url(photo_url), photo_count
                    )
if __name__ == "__main__":
    # Script entry point: build the scraper and run the full crawl.
    scraper = qiubai()
    scraper.run()
|