import os
import re
import urllib.parse
import requests
from lxml import html
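
# This script bulk-downloads the .torrent files that specific uploaders have
# posted on worldwidetorrents.me: it walks each user's torrents-user.php
# listing, page by page, and saves every torrent not already present on disk.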

# Uploader IDs to scrape; '#' comments after each ID are ignored by the parser
# in the __main__ block below.
ID_STRING = '''
36     # Nemesis43
4445   # Shipjolly
20217  # RubrumPopulus
'''

def check_torrent(href):
    """Match download links like:
    download.php?id=445&name=Back%20to%20the%20Future%20011%20%282016%29%20%28Digital%29%20%28Kileko-Empire%29.cbr.torrent"""
    download_pattern = re.compile(r'^download\.php\?id=\d{1,20}&name=(.+)')
    return download_pattern.search(href)


def get_filename(href):
    """Extract and URL-decode the file name from a download link like:
    download.php?id=445&name=Back%20to%20the%20Future%20011%20%282016%29%20%28Digital%29%20%28Kileko-Empire%29.cbr.torrent"""
    download_pattern = re.compile(r'^download\.php\?id=\d{1,20}&name=(.+)')
    download_match = download_pattern.match(href)
    filename = ''
    if download_match:
        filename = download_match.group(1)
        filename = urllib.parse.unquote(filename)
    return filename


def check_page(href):
    """Match pagination links like: torrents-user.php?id=36&page=239"""
    page_pattern = re.compile(r'^torrents-user\.php\?id=\d{1,20}&page=\d{1,20}')
    return page_pattern.search(href)


def get_page_num(href):
    """Extract the page number from a pagination link like:
    torrents-user.php?id=36&page=239"""
    p_num = re.compile(r'^torrents-user\.php\?id=\d{1,20}&page=(\d{1,20})')
    m_num = p_num.match(href)
    if m_num:  # the captured group is all digits by construction
        return int(m_num.group(1))
    return 0
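
# A quick sanity check of the helpers above (illustrative values only):
#   get_filename('download.php?id=445&name=Foo%20Bar.cbr.torrent')
#       -> 'Foo Bar.cbr.torrent'
#   get_page_num('torrents-user.php?id=36&page=239') -> 239
#   get_page_num('not-a-page-link') -> 0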

def download_f_url(download_url, file_path):
    print(download_url)
    response = requests.get(url=download_url, headers=header, cookies=cookie)
    response.raise_for_status()  # avoid leaving an empty file behind on a failed request
    with open(file_path, "wb") as file:
        file.write(response.content)


def get_html(url):
    header['Referer'] = url
    page = requests.get(url=url, headers=header, cookies=cookie)
    return page.text

def get_hrefs(user_id, page_num):
    url = 'https://worldwidetorrents.me/torrents-user.php'
    param = {
        'id': user_id,
        'page': page_num,
    }
    # The browser sends e.g. referer: https://worldwidetorrents.me/account-details.php?id=4445
    referer = url + '?id=' + str(user_id)
    header['Referer'] = referer
    page = requests.get(url=url, headers=header, cookies=cookie, params=param)
    tree = html.fromstring(page.text)
    # ==================== collect the link list ====================
    hrefs = tree.xpath('//center/table//a[@href]/@href')
    return hrefs
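
# Note: the XPath above returns every href in the results table, which mixes
# pagination links ('torrents-user.php?id=...&page=...') with download links
# ('download.php?id=...&name=...'); check_page/check_torrent tell them apart.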

def ReadHtml(user_id, set_page=999):
    sub_dir = os.path.join(file_dir, str(user_id))
    os.makedirs(sub_dir, exist_ok=True)  # make sure the per-user folder exists
    # ================ read page 0 to discover the page count ================
    # e.g. https://worldwidetorrents.me/torrents-user.php?id=36&page=237
    page_num = 0
    hrefs = get_hrefs(user_id, page_num)
    page_hrefs = [href for href in hrefs if check_page(href)]
    page_nums = [get_page_num(href) for href in page_hrefs]
    if not page_nums:
        max_page = 1
    else:
        max_page = max(page_nums)
    if max_page == 0:
        max_page = 1
    print('max_page:', max_page)
    torrents = []  # running list of all torrent links seen
    wwt_prefix = 'https://worldwidetorrents.me/'
    end_page = min(max_page, set_page)
    print('end_page:', end_page)
    # ================ read every page ================
    for page_num in range(0, end_page + 1):
        hrefs = get_hrefs(user_id, page_num)
        torrent_hrefs = [href for href in hrefs if check_torrent(href)]
        print(page_num)
        torrents.extend(torrent_hrefs)
        for torrent in torrent_hrefs:
            download_url = wwt_prefix + torrent
            filename = get_filename(torrent)
            if not filename.lower().endswith('.torrent'):
                filename += '.torrent'
            print(filename)
            file_path = os.path.join(sub_dir, filename)
            if not os.path.exists(file_path):  # skip files that already exist
                try:
                    download_f_url(download_url, file_path)
                except Exception:
                    print('Attention, download failed:', download_url)
            else:
                # print(filename + ' already downloaded')
                pass
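
# Example call (illustrative): fetch at most the first three listing pages of
# a single uploader; pages are numbered from 0, so set_page=2 covers pages
# 0..2 inclusive.
#   ReadHtml(36, set_page=2)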

if __name__ == '__main__':
    # Open https://worldwidetorrents.me/, then copy your cookie and User-Agent
    # from the browser's dev tools and paste them below; they expire after a
    # while and have to be filled in again.
    cookie = {'key': 'value'}  # requests expects a dict of cookies, not a raw string
    header = {'User-Agent': 'your_user_agent'}
    file_dir = 'your_destination_folder'  # directory to save the .torrent files into
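    # A minimal sketch (assumption: 'raw' holds the Cookie header value as
    # copied from the browser, e.g. 'uid=123; pass=abc') for turning that raw
    # string into the dict that requests expects:
    #   raw = 'uid=123; pass=abc'
    #   cookie = dict(p.split('=', 1) for p in raw.split('; '))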
    id_lines = [line.split('#')[0].strip() for line in ID_STRING.splitlines()]
    search_list = sorted(set(int(line) for line in id_lines if line.isnumeric()))
    for user_id in search_list:
        try:
            ReadHtml(user_id, set_page=10)  # set_page = how many listing pages to fetch
        except Exception:
            pass