[Python] Scraping Web Novels

I really enjoy reading web novels, but reading them in the browser has some drawbacks:
 it needs a network connection
 it burns through mobile data
 waiting for the next chapter to load at a cliffhanger is painful
 browsing web pages on an e-reader is painful
Given all that, scraping the pages with a program and reading the result on an e-reader is much more convenient, so I wrote this one. I actually finished it quite a while ago; since this semester's Python final project was to come up with a small project of our own, lazy old me just tidied the program up and handed it in as the report. The code below uses the Banxia novel site (banxia.co) as the example.

(1) Import the required packages
 BeautifulSoup: parses the HTML
 requests: sends the HTTP requests
 time: adds a delay between requests so the site doesn't block us
BeautifulSoup and requests are third-party packages (pip install beautifulsoup4 requests); time is in the standard library.

from bs4 import BeautifulSoup
import requests
import time

(2) Define the headers and send the request

myHeader={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
        "Accept-Language": "zh-TW,zh;q=0.9", 
        "Host": "www.banxia.co", 
        "Referer": "www.banxia.co",
        "Sec-Ch-Ua": "\"Chromium\";v=\"106\", \"Microsoft Edge\";v=\"106\", \"Not;A=Brand\";v=\"99\"", 
        "Sec-Ch-Ua-Mobile": "?0", 
        "Sec-Ch-Ua-Platform": "\"Windows\"", 
        "Sec-Fetch-Dest": "document", 
        "Sec-Fetch-Mode": "navigate", 
        "Sec-Fetch-Site": "cross-site", 
        "Sec-Fetch-User": "?1", 
        "Upgrade-Insecure-Requests": "1", 
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47", 
        "X-Amzn-Trace-Id": "Root=1-6358e27a-27a9c71d021410583ec2cd27"
}
home_req = requests.get(url, headers=myHeader)    # url is the novel's homepage (read from input in the full program)
home_req.encoding = home_req.apparent_encoding    # let requests guess the page's encoding
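
Before parsing, it is worth confirming the request actually succeeded. A minimal check (raise_for_status() raises an HTTPError for any 4xx/5xx response):

home_req.raise_for_status()                       # bail out early on 4xx/5xx
print(home_req.status_code, home_req.encoding)    # e.g. 200 utf-8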

(3) Find the tags we need on the novel's homepage
 book title
 author
 chapter links and chapter count (the count is just the number of link tags; see the toy example below)

(Screenshot: locating the title and author tags in the page source)
(Screenshot: locating all the chapter links; the length of the returned list gives the chapter count)
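
For reference, here is a toy reconstruction of the markup these selectors assume; the HTML below is illustrative, not copied verbatim from banxia.co:

from bs4 import BeautifulSoup

sample = '''
<div class="book-describe">
  <h1>Some Book Title</h1>
  <p><a href="/author/1">Some Author</a></p>
</div>
<div class="book-list clearfix">
  <ul>
    <li><a href="/book/1.html" title="Chapter 1">Chapter 1</a></li>
    <li><a href="/book/2.html" title="Chapter 2">Chapter 2</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(sample, 'html.parser')
describe = soup.find_all('div', class_='book-describe')[0]
print(describe.select('h1')[0].text)    # Some Book Title
print(describe.select('a')[0].text)     # Some Author
chapters = soup.find_all('div', class_='book-list clearfix')[0].select('a')
print(len(chapters))                    # 2, i.e. the chapter count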

(4) Parse the page with BeautifulSoup, extract the pieces we need, and write the book info to a file

# Extract the book info (parse the homepage once and reuse the soup)
home_soup = BeautifulSoup(home_req.text, 'html.parser')
home_name = home_soup.find_all('div', class_='book-describe')[0].select('h1')[0].text
home_auth = home_soup.find_all('div', class_='book-describe')[0].select('a')[0].text
home_list = home_soup.find_all('div', class_='book-list clearfix')[0].select('a')

# Write the book info to <title>.txt in the current directory
path = '.\\' + home_name + '.txt'
f = open(path, 'a', encoding='UTF-8')
f.write('Book info\n\n\n')
f.write('Title: ' + home_name + '\n\n')
f.write('Author: ' + home_auth + '\n\n\n\n')
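
One design note: the file is opened in append mode ('a'), so re-running the script adds a second copy of everything. A sketch of a safer variant, using 'w' and a context manager:

with open(path, 'w', encoding='UTF-8') as f:   # 'w' starts fresh instead of appending
    f.write('Book info\n\n\n')
    f.write('Title: ' + home_name + '\n\n')
    f.write('Author: ' + home_auth + '\n\n\n\n')
    # ...the chapter loop from step (6) would go here, so the file closes automatically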

(5) Find the tag we need on a chapter page (the chapter text itself)
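
Before looping over every chapter, it is worth sanity-checking the selector on a single page. A quick test, reusing url, myHeader, and home_list from the steps above (the chapter body sits in the element with id='nr1'):

test_url = url + home_list[0]['href'][home_list[0]['href'].rfind('/') + 1:]
test_req = requests.get(test_url, headers=myHeader)
test_req.encoding = 'big5'
test_soup = BeautifulSoup(test_req.text, 'html.parser')
print(test_soup.find_all(id='nr1')[0].text[:200])   # first 200 characters of the chapter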

(6) Request each chapter page in turn and extract the novel text (you have to delay between requests, or the site blocks you QQ)

pageNum = 0                                        # progress counter
for s in home_list:
    time.sleep(2)                                  # throttle so the site doesn't block us
    article_url = url + s['href'][s['href'].rfind('/') + 1:]
    article_req = requests.get(article_url, headers=myHeader)
    article_req.encoding = 'big5'
    article_content = BeautifulSoup(article_req.text, 'html.parser').find_all(id='nr1')[0].text
    # normalize quotation marks to 「」 and strip the site's indentation padding
    article_text = '\n' + article_content.replace('“', '「').replace('”', '」').replace('            ', '').replace('    ', '  ')
    f.write(s['title'].replace('“', '「').replace('”', '」') + '\n')
    f.write(article_text + '\n\n\n')
    done = int(pageNum / len(home_list) * 100 / 2.5)
    print('\rProgress: ' + '{:>7}'.format(str(round(pageNum / len(home_list) * 100, 2))) + '%|' + '∎' * done + ' ' * (40 - done) + '|', end='')
    pageNum += 1
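
If a request still fails now and then (timeouts, temporary blocks), a retry helper with a growing delay is one way to harden the loop. This is a sketch rather than part of the original program, and fetch_with_retry is a name introduced here:

def fetch_with_retry(target_url, headers, retries=3, base_delay=2):
    # Hypothetical helper: retry a GET a few times, backing off between attempts.
    for attempt in range(retries):
        try:
            resp = requests.get(target_url, headers=headers, timeout=10)
            resp.raise_for_status()                  # treat 4xx/5xx as failures too
            return resp
        except requests.RequestException:
            time.sleep(base_delay * (attempt + 1))   # back off: 2 s, 4 s, 6 s...
    raise RuntimeError('giving up on ' + target_url)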

(7) Full program

 The full program additionally includes the progress bar and wraps the run in try-except so errors are reported instead of crashing with a traceback.

from bs4 import BeautifulSoup
import requests
import time
print('Please review this with VS Code, thanks.')   # note left over from the course submission
def spider(url):
    myHeader={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
        "Accept-Language": "zh-TW,zh;q=0.9", 
        "Host": "www.banxia.co", 
        "Referer": "www.banxia.co",
        "Sec-Ch-Ua": "\"Chromium\";v=\"106\", \"Microsoft Edge\";v=\"106\", \"Not;A=Brand\";v=\"99\"", 
        "Sec-Ch-Ua-Mobile": "?0", 
        "Sec-Ch-Ua-Platform": "\"Windows\"", 
        "Sec-Fetch-Dest": "document", 
        "Sec-Fetch-Mode": "navigate", 
        "Sec-Fetch-Site": "cross-site", 
        "Sec-Fetch-User": "?1", 
        "Upgrade-Insecure-Requests": "1", 
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47", 
        "X-Amzn-Trace-Id": "Root=1-6358e27a-27a9c71d021410583ec2cd27"
    }
    home_req = requests.get(url, headers=myHeader)
    home_req.encoding = home_req.apparent_encoding
    home_soup = BeautifulSoup(home_req.text, 'html.parser')   # parse the homepage once, reuse below
    home_name = home_soup.find_all('div', class_='book-describe')[0].select('h1')[0].text
    home_auth = home_soup.find_all('div', class_='book-describe')[0].select('a')[0].text
    home_list = home_soup.find_all('div', class_='book-list clearfix')[0].select('a')
    path = '.\\' + home_name + '.txt'
    f = open(path, 'a', encoding='UTF-8')
    f.write('Book info\n\n\n')
    f.write('Title: ' + home_name + '\n\n')
    f.write('Author: ' + home_auth + '\n\n\n\n')
    
    pageNum = 0
    for s in home_list:
        time.sleep(2)                                  # throttle so the site doesn't block us
        article_url = url + s['href'][s['href'].rfind('/') + 1:]
        article_req = requests.get(article_url, headers=myHeader)
        article_req.encoding = 'big5'
        article_content = BeautifulSoup(article_req.text, 'html.parser').find_all(id='nr1')[0].text
        # normalize quotation marks to 「」 and strip the site's indentation padding
        article_text = '\n' + article_content.replace('“', '「').replace('”', '」').replace('            ', '').replace('    ', '  ')
        f.write(s['title'].replace('“', '「').replace('”', '」') + '\n')
        f.write(article_text + '\n\n\n')
        done = int(pageNum / len(home_list) * 100 / 2.5)
        print('\rProgress: ' + '{:>7}'.format(str(round(pageNum / len(home_list) * 100, 2))) + '%|' + '∎' * done + ' ' * (40 - done) + '|', end='')
        pageNum += 1
    f.close()
try:
    url = input('Enter the Banxia novel page URL: ')
    spider(url)
    print('\rProgress: ' + '{:>7}'.format('100.0') + '%|' + '∎' * 40 + '|', end='')
    print(' Done.')
except Exception as e:        # report the error instead of dying with a traceback
    print(' Error:', e)

(8) Sample run

(Screenshot: downloading, with the progress bar advancing)
(Screenshot: download complete)
(Screenshot: the contents of the resulting text file)

A long-overdue update _(:3 」∠ )_
