Python 爬虫-抓取小说《盗墓笔记-怒海潜沙》-白红宇

Python 爬虫-抓取小说《盗墓笔记-怒海潜沙》

阅读量：7088 次

发布时间：2019-06-28

本文共 1411 字，大约阅读时间需要 4 分钟。

最近想看盗墓笔记，看了一下网页代码，竟然不是js防爬虫，那就用简单的代码爬下了一节：

"""爬取盗墓笔记小说-七星鲁王宫"""from urllib.request import urlopenfrom bs4 import BeautifulSoupfrom docx import Documentimport osclass Download():    def __init__(self):        self.baseUrl = 'http://www.daomubiji.com/nu-hai-qian-sha-'        self.basePath = os.path.dirname(__file__)    def makedir(self, name):        path = os.path.join(self.basePath, name)        isExist = os.path.exists(path)        if not isExist:            os.makedirs(path)            print('File has been created.')        else:            print('The file is existed.')        #切换到该目录下        os.chdir(path)    def connect(self, url):        try:            html = urlopen(url)            print(url)            obj = BeautifulSoup(html, 'lxml')        except:            print('This page is not existed.')        return obj    def getContent(self):        doc = Document()        self.makedir('storyFiles')        for page in range(1,47):            if page < 10:                url = self.baseUrl + '0' + str(page) + '.html'            else:                url = self.baseUrl + str(page) + '.html'            obj = self.connect(url)            content = obj.find('article', {
   'class': 'article-content'})            doc.add_paragraph(content.text)        doc.save('盗墓笔记-怒海潜沙.doc')if __name__ == '__main__':    obj = Download()    obj.getContent()

转载于:https://www.cnblogs.com/fredkeke/p/6646781.html

你可能感兴趣的文章

Leetcode_Wildcard Matching