def get_page_url(url):
    """Return the article links found on one listing page.

    Downloads *url* and extracts the ``href`` of every ``<dd><a>`` element.

    Args:
        url: Absolute URL of a listing page.

    Returns:
        A list of href strings (possibly empty). If the download or the
        parsing fails, the string ``'error'`` is returned instead; this
        sentinel is kept for backward compatibility, so callers must check
        for it before iterating.
    """
    try:
        # `headers` is defined elsewhere in the module (presumably the HTTP
        # request headers, e.g. a User-Agent). It must be passed as
        # `headers=`; `params=` would append it to the query string instead.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        return _parse_html(response.content).xpath('//dd/a/@href')
    except Exception:
        # Best effort: any network/parse failure is reported through the
        # sentinel. `Exception` (not a bare except) lets Ctrl-C through.
        return 'error'


def _parse_html(content):
    """Parse raw response bytes as UTF-8 HTML and return the lxml root."""
    # Setting `response.encoding` has no effect on `response.content`, so
    # the intended UTF-8 decoding is requested from the parser itself.
    return etree.HTML(content, parser=etree.HTMLParser(encoding="utf-8"))


def _save_article(article_url):
    """Download one article and append its text to ``xiaohua/<title>.txt``.

    Returns True when a file was written, False when the page has no
    ``<h1>`` title to name the file after.
    """
    response = requests.get(
        article_url, headers=headers, timeout=REQUEST_TIMEOUT
    )
    page = _parse_html(response.content)

    title = page.xpath('//h1/text()')
    if not title:
        return False

    paragraphs = page.xpath(ARTICLE_TEXT_XPATH)

    # Characters that are illegal in file names would otherwise make
    # open() fail or write outside the output directory.
    file_name = title[0].translate(_UNSAFE_FILENAME_CHARS)
    with open(OUTPUT_DIR + file_name + ".txt", 'a', encoding="utf-8") as f:
        for paragraph in paragraphs:
            # Drop every whitespace character, one text node per line.
            f.write("".join(paragraph.split()))
            f.write('\n')
    return True


def main(start_page=400, max_misses=5):
    """Crawl listing pages from *start_page* upwards and save each article.

    The crawl stops after *max_misses* consecutive listing pages that fail
    to load or contain no links, instead of looping forever.
    """
    import os

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    page_id = start_page
    misses = 0
    while misses < max_misses:
        links = get_page_url(main_url + "/wenzi/" + str(page_id))
        page_id += 1

        # 'error' is the failure sentinel; iterating it would request
        # one bogus URL per character.
        if links == 'error' or not links:
            misses += 1
            continue
        misses = 0

        for link in links:
            try:
                _save_article(main_url + link)
            except Exception as exc:
                # One broken article must not abort the whole crawl.
                print("skipped " + main_url + link + ": " + str(exc))


main_url = "https://www.biedoul.com"

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Directory (with trailing slash) that receives one text file per article.
OUTPUT_DIR = "xiaohua/"

# Union of the three places the article body text has been seen in; the
# alternatives must be joined with `|`, otherwise the expression selects
# descendants of text nodes and never matches anything.
ARTICLE_TEXT_XPATH = (
    '//div[@class="cc2"]/p/text()'
    ' | //div[@class="cc2"]/p/font/text()'
    ' | //div[@class="c"]/div[@class="cc2"]/text()'
)

# Maps characters that are not allowed in file names to underscores.
_UNSAFE_FILENAME_CHARS = str.maketrans({c: "_" for c in '\\/:*?"<>|'})


if __name__ == '__main__':
    main()