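# Download the article history of a WeChat Official Account from the
# mp.weixin.qq.com admin backend. Each article is saved as a local HTML file
# (named after its aid) inside a folder named after the article title, with
# its images downloaded into an images/ subfolder and the links in the HTML
# rewritten to point at the local copies.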
# Import modules
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Save the downloaded HTML page and its images
def save(search_response, html_dir, file_name):
    # Directory where the HTML file is stored
    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), html_dir)
    # Directory where the images are stored
    image_dir = os.path.join(html_path, 'images')
    # Create the folders if they do not exist yet (this also creates html_path)
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    domain = 'https://mp.weixin.qq.com/s'
    # Save the HTML first ...
    save_html(search_response, html_path, file_name)
    # ... then download the images and rewrite the links to the local copies
    save_file_to_local(html_path, image_dir, search_response, domain, file_name)

# Download the images of one article to the local image folder
def save_file_to_local(html_path, image_dir, search_response, domain, file_name):
    # Parse the already-downloaded page with lxml (save() has written the
    # HTML to disk, so there is no need to save it again here)
    obj = BeautifulSoup(search_response.content, 'lxml')
    # Find all img tags on the page
    imgs = obj.find_all('img')
    # Collect the image links; WeChat pages lazy-load images via data-src,
    # so prefer that attribute and fall back to a non-empty src
    urls = []
    for img in imgs:
        if img.get('data-src'):
            urls.append(img['data-src'])
        elif img.get('src'):
            urls.append(img['src'])

    # Walk through the image links, saving each one locally as 0.jpeg, 1.jpeg, ...
    for i, each_url in enumerate(urls):
        # Normalise the link depending on how the article references the image
        if each_url.startswith('//'):
            # Protocol-relative link
            r_pic = requests.get('https:' + each_url)
        elif each_url.startswith('/') and each_url.endswith('gif'):
            # Site-relative gif link
            r_pic = requests.get(domain + each_url)
        elif each_url.endswith(('png', 'jpg', 'gif', 'jpeg')):
            # Absolute link
            r_pic = requests.get(each_url)
        else:
            # Unrecognised link format: skip it so we never write a stale or
            # undefined response
            continue
        # Absolute path of the local copy
        t = os.path.join(image_dir, str(i) + '.jpeg')
        print('This article has ' + str(len(urls)) + ' images; processing number ' + str(i + 1) + '...')
        # Save the image to the target directory
        with open(t, 'wb') as fw:
            fw.write(r_pic.content)
        # Rewrite the old or relative link in the HTML to the local path
        update_file(each_url, t, html_path, file_name)

# Save the HTML content to a local file
def save_html(url_content, html_path, file_name):
    # Write out the raw response body
    with open(os.path.join(html_path, file_name + '.html'), 'wb') as f:
        f.write(url_content.content)
    return url_content

# Rewrite the HTML file, replacing an image link with its local path
def update_file(old, new, html_path, file_name):
    html_file = os.path.join(html_path, file_name + '.html')
    bak_file = os.path.join(html_path, file_name + '_bak.html')
    # Open two files: read from the original, write the modified content to a backup
    with open(html_file, encoding='utf-8') as f, open(bak_file, 'w', encoding='utf-8') as fw:
        # Replace the link on every line, and turn the lazy-load data-src
        # attribute into a plain src so browsers actually load the image
        for line in f:
            new_line = line.replace(old, new)
            new_line = new_line.replace("data-src", "src")
            # Write into the new file
            fw.write(new_line)
    # When done, delete the original file
    os.remove(html_file)
    # Brief pause between delete and rename
    time.sleep(5)
    # Rename the backup to the original .html name
    os.rename(bak_file, html_file)

# Read the login cookies from cookie.txt
with open("cookie.txt", "r") as file:
    cookie = file.read()
cookies = json.loads(cookie)
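# cookie.txt is expected to hold the mp.weixin.qq.com cookies, captured after
# logging in to the Official Account platform, serialised as a JSON object of
# name/value pairs, e.g. (hypothetical values):
# {"slave_sid": "...", "slave_user": "gh_xxxxxxxxxxxx", "bizuin": "..."}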
url = "https://mp.weixin.qq.com"
# Request the Official Account platform homepage; with valid cookies it
# redirects to a URL that carries the session token
response = requests.get(url, cookies=cookies)
# Pull the token out of the redirected URL
token_match = re.findall(r'token=(\d+)', str(response.url))
if not token_match:
    raise SystemExit('No token in the redirect URL; the cookies have likely expired')
token = token_match[0]
# Request headers for the article-list API
headers = {
    "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
    "Host": "mp.weixin.qq.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
}
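# Notes on the appmsg list endpoint used below (observed behaviour, not a
# documented API): action=list_ex lists published articles, begin/count page
# through them five at a time, and fakeid identifies the target account;
# replace the hard-coded fakeid with that of the account you want to archive.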

# Loop over the first 10 pages of articles, 5 per page
for j in range(1, 11):
    begin = (j - 1) * 5
    # Request the article list for the current page
    request_url = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=" + str(begin) + "&count=5&fakeid=MzU1NDk2MzQyNg==&type=9&query=&token=" + token + "&lang=zh_CN&f=json&ajax=1"
    search_response = requests.get(request_url, cookies=cookies, headers=headers)
    # Pull the article list out of the returned JSON
    re_text = search_response.json()
    app_msg_list = re_text.get("app_msg_list")
    # Walk through the articles on the current page
    for item in app_msg_list:
        # Use the title (spaces removed) as the directory name; it holds the
        # article HTML and its images
        dir_name = item["title"].replace(' ', '')
        print("Downloading article: " + dir_name)
        # Request the article URL to fetch its content
        response = requests.get(item["link"], cookies=cookies, headers=headers)
        # Save the article locally
        save(response, dir_name, item["aid"])
        print(dir_name + " finished downloading!")
        # Requesting too fast may get you throttled by WeChat; wait 10 seconds
        time.sleep(10)