1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
| import re import requests from lxml import etree
class Spider(object):
    """Fetch acwing.com problem pages and print the entries marked as passed.

    The spider formats a page index into ``base_url``, downloads each page
    with a fixed set of request headers, and extracts the link text of the
    table rows that carry the "passed" marker.

    Attributes:
        base_url: URL template; ``{}`` is replaced by the page index.
        headers: HTTP headers (cookie, referer, user agent) sent with
            every request.
        count: Running total of names printed by :meth:`run`.
    """

    def __init__(self):
        self.base_url = 'https://www.acwing.com/problem/{}/'
        self.headers = {
            # NOTE(review): a session cookie is hard-coded below; consider
            # loading it from configuration instead of source code.
            'Cookie': 'csrftoken=sukdhRyZvZ5xIU4si62uFjnPrsTOB4tEHDoAzK9TmUA74TCVsF; sessionid='
                      'y5owkpblzawdlw1p3rx91jwm6; file_2510567_readed=""; file_362295_readed=""; question_6210_readed=""; file_3495_readed=""; file_2897078_readed=""; file_2482882_readed=""; file_20826_readed=""; file_2917651_readed=""; file_17080_readed=""; file_03179_readed=""; question_623319_readed=""; question_623315_readed=""; file_2848720_readed=""; file_4818_readed=""; file_28216_readed=""; file_8708readed=""; file_2209490_readed=""; file_1111647_readed=""; file_421808_readed=""; file_1874010_readed=""; file_2312_readed=""; file_143921_readed=""; file_5296_readed=""; file_140683_readed=""; file_383397_readed=""; file_588001_readed=""; file_1120777_readed=""; file_144026_readed=""; file_70686_readed=""; file_1112271_readed=""; file_2331251_readed=""; file_2808_readed=""; file_144598_readed=""; file_70694_readed=""; file_2926390_readed=""; file_2926598_readed=""',
            'Referer': 'https://www.acwing.com/about/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        }
        self.count = 0

    def get_html(self, url, timeout=10):
        """Download *url* with the spider's headers and return the body text.

        Args:
            url: Address to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled connection would block the
                whole crawl indefinitely.

        Returns:
            The response body decoded as text.
        """
        response = requests.get(
            url=url,
            headers=self.headers,
            timeout=timeout,
        )
        return response.text

    def xpath_func(self, html):
        """Extract the stripped names of the rows marked as passed.

        Args:
            html: HTML source of one page.

        Returns:
            A list of link texts, whitespace-stripped, taken from table
            rows that contain a ``span`` whose title is the "passed"
            marker. An empty *html* yields an empty list.
        """
        if not html:
            # Nothing to parse in an empty document.
            return []
        name_bds = '//tbody/tr[./td/span[@title="已通过这道题目"]]/td/a/text()'
        base_obj = etree.HTML(html)
        return [name.strip() for name in base_obj.xpath(name_bds)]

    def re_func(self, html, re_bds):
        """Return every match of the regular expression *re_bds* in *html*.

        The pattern is compiled with ``re.S`` so ``.`` also matches
        newlines.

        Args:
            html: Text to search.
            re_bds: Regular-expression source.

        Returns:
            The list produced by ``findall``.
        """
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    def parse_html(self, url):
        """Fetch *url* and return the names extracted by :meth:`xpath_func`."""
        return self.xpath_func(self.get_html(url))

    def run(self, first_page=1, last_page=79):
        """Ask for confirmation, then crawl the pages and print the names.

        Each name found is printed as one table row and counted in
        ``self.count``; the total is printed at the end. Any answer other
        than ``'Y'`` prints the exit message and does nothing else.

        Args:
            first_page: First page index substituted into ``base_url``.
            last_page: Last page index (inclusive).
        """
        warning = input('您马上就要爬取acwing了,看一下你的做题数,您的劳动成果将会在下面展示出来,确定要看吗?(Y/N)')
        if warning != 'Y':
            print('已经退出,你这个弱者')
            return

        print('爬虫系统已经启动...正在努力抓取,请稍等....')
        print('+---------------------------------+')
        print('| name |')
        print('+---------------------------------+')
        for page in range(first_page, last_page + 1):
            url = self.base_url.format(page)
            for name in self.parse_html(url):
                self.count += 1
                print('| ' + name)
        print('+---------------------------------+')
        print('经过您的不懈努力,您一共做了' + str(self.count) + '道题,继续努力!!')
# Script entry point: when executed directly this only prints a greeting;
# Spider.run() is not invoked from here.
if __name__ == "__main__":
    print("hello")
|