Python Web Scraper

A simple Python web scraper.

Use requests to fetch the page, parse the returned HTML with bs4 (BeautifulSoup), and finally save the files with urllib.
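For example, a minimal sketch of those three steps (the URL and link selector here are placeholders, not the real site):

import re
import urllib.request

import requests
from bs4 import BeautifulSoup

# 1. Fetch the page with requests.
resp = requests.get("https://example.com/books")     # placeholder URL
resp.encoding = 'UTF-8'

# 2. Parse the HTML with BeautifulSoup and pick a download link.
soup = BeautifulSoup(resp.text, 'lxml')
link = soup.find('a', href=re.compile(r'\.epub$'))   # placeholder selector

# 3. Save the linked file to disk with urllib.
if link is not None:
    urllib.request.urlretrieve(link['href'], 'book.epub')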

The default encoding requests picks is very likely ISO-8859-1 (its fallback when the response headers don't declare a charset).

So before reading .text, set html.encoding = 'UTF-8'.
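You can see the fallback and the fix directly on the response object (a sketch with a placeholder URL):

import requests

resp = requests.get("https://example.com/")  # placeholder URL
print(resp.encoding)             # often 'ISO-8859-1' when the header has no charset
print(resp.apparent_encoding)    # requests' guess from the body, e.g. 'utf-8'
resp.encoding = 'UTF-8'          # must be set before reading resp.text
page = resp.text                 # now decoded as UTF-8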

range(1, int(numbers)) returns [1, ..., numbers-1], so use range(1, int(numbers)+1) to include the last page.
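A quick check in the REPL:

list(range(1, 5))        # [1, 2, 3, 4]   -- the stop value is excluded
list(range(1, 5 + 1))    # [1, 2, 3, 4, 5]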

To merge lists you can write array = array + [....] (the form used in the script below); note that array += [....] also works and is equivalent to array.extend([....]).
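Both forms merge lists the same way:

books = [1, 2]
books = books + [3, 4]   # builds a new list -> [1, 2, 3, 4]
books += [5, 6]          # extends in place, same as books.extend([5, 6])

The full script: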

from bs4 import BeautifulSoup
import requests
import re
import os
import sys
import urllib.request
from tqdm import tqdm


author = ''


def is_epub(url):
    # Keep only the <a> tags whose href points at an epub download anchor.
    # Not every <a> tag has an href, so .get() avoids a KeyError.
    return '#epub' in url.get('href', '')


def main():
    booklist = []

    # Fetch the author's index page; force UTF-8 before reading .text,
    # otherwise requests may fall back to ISO-8859-1 and garble the text.
    URL = f"https://tw.ixdzs.com/author/{author}"
    html = requests.get(URL)
    html.encoding = 'UTF-8'
    soup = BeautifulSoup(html.text, 'lxml')

    # The link titled「最後一頁」("last page") carries the total page count
    # in its query string, e.g. ...?page=12.
    number = soup.find_all('a', title="最後一頁")[0]
    numbers = re.search(r'page=(\d+)$', number['href']).group(1)

    # Walk every listing page; range() excludes the stop value, hence the +1.
    for num in range(1, int(numbers) + 1):
        URL = f"https://tw.ixdzs.com/author/{author}?page={num}"
        html = requests.get(URL)
        html.encoding = 'UTF-8'
        soup = BeautifulSoup(html.text, 'lxml')
        booklist = booklist + soup.find_all('a')

    # Keep only the epub download links and make sure the output folder exists.
    urllist = filter(is_epub, booklist)
    if not os.path.exists(author):
        os.makedirs(author)

    # tqdm shows a progress bar while the books download.
    for url in tqdm(list(urllist)):
        # The book id sits in the href, e.g. /d/123/45678/#epub_down -> 45678.
        book_id = re.search(r'/d/\d+/(\d+)/#epub_down', url['href']).group(1)
        try:
            # Strip the trailing "epub下載" (epub download) from the link title.
            title = re.search(r'(.*)epub下載', url['title']).group(1)
        except Exception:
            print(url['title'])
            title = url['title']
        # Skip books that are already on disk.
        if os.path.exists(f"{author}/{title}.epub"):
            continue
        urllib.request.urlretrieve(
            f'https://tw.ixdzs.com/down/{book_id}_4', f'{author}/{title}.epub')


if __name__ == "__main__":
    print(sys.argv)
    if len(sys.argv) < 2:
        raise SyntaxError("Insufficient arguments.")
    # The author id comes from the first command-line argument.
    author = sys.argv[1]
    main()
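To run it, pass the author value used in the site's /author/ URL as the first command-line argument (the file name crawler.py here is just an assumption):

python crawler.py <author>

The epub files land in a folder named after that argument, and books that already exist on disk are skipped.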