
Commit 7c0391d

python example
1 parent 94f4b25 commit 7c0391d

4 files changed: +185 −0 lines changed

jiguang/README.md

Lines changed: 12 additions & 0 deletions
# Python Code Examples

- [mpToHtml](https://github.com/JustDoPython/python-examples/tree/master/jiguang/mpToHtml): scrape WeChat official-account articles and save them as HTML
- [tushare](https://github.com/JustDoPython/python-examples/tree/master/jiguang/tushare): fetch stock-market trading data with Python

---

The learning path from beginner to engineer.

Follow the official account "python 技术" and reply "python" to learn and exchange together.

![](http://favorites.ren/assets/images/python.jpg)

jiguang/mpToHtml/gen_cookies.py

Lines changed: 16 additions & 0 deletions
import json

# Cookie string copied from the browser
cookie_str = "pgv_pvid=9551991123; pac_uid=89sdjfklas; XWINDEXGREY=0; pgv_pvi=89273492834; tvfe_boss_uuid=lkjslkdf090; RK=lksdf900; ptcz=kjalsjdflkjklsjfdkljslkfdjljsdfk; ua_id=ioje9899fsndfklsdf-DKiowiekfjhsd0Dw=; h_uid=lkdlsodifsdf; mm_lang=zh_CN; ts_uid=0938450938405; mobileUV=98394jsdfjsd8sdf; \
……(middle part omitted) \
EXIV96Zg=sNOaZlBxE37T1tqbsOL/qzHBtiHUNZSxr6TMqpb8Z9k="

cookie = {}
# Walk through the cookie entries
for cookies in cookie_str.split("; "):
    # Split on the first "=" only, since cookie values may themselves contain "="
    cookie_item = cookies.split("=", 1)
    cookie[cookie_item[0]] = cookie_item[1]
# Write the cookies to a local file
with open('cookie.txt', "w") as file:
    # Serialize the dict as JSON
    file.write(json.dumps(cookie))
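
As a quick sanity check, the saved file can be loaded straight back into a dict and handed to requests, which mirrors how gzh_download.py below consumes it. A minimal sketch, assuming gen_cookies.py has already written cookie.txt to the working directory:

import json
import requests

# Load the cookie dict written by gen_cookies.py
with open('cookie.txt', 'r') as file:
    cookies = json.loads(file.read())

# requests accepts a plain dict for its cookies argument
response = requests.get("https://mp.weixin.qq.com", cookies=cookies)
# With a valid login session, the platform redirects to a URL carrying a token parameter
print(response.url)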

jiguang/mpToHtml/gzh_download.py

Lines changed: 128 additions & 0 deletions
# Import modules
import requests
import json
import re
import time
from bs4 import BeautifulSoup
import os

# Save the downloaded HTML page and its images
def save(search_response, html_dir, file_name):
    # Directory for the HTML file
    htmlDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), html_dir)
    # Directory for the images
    targetDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), html_dir + '/images')
    # Create the image folder if it does not exist
    if not os.path.isdir(targetDir):
        os.makedirs(targetDir)
    domain = 'https://mp.weixin.qq.com/s'
    # Save the HTML
    save_html(search_response, htmlDir, file_name)
    # Save the images
    save_file_to_local(htmlDir, targetDir, search_response, domain, file_name)

# Save the images locally
def save_file_to_local(htmlDir, targetDir, search_response, domain, file_name):
    # Parse the returned page with lxml (save() has already written the HTML to disk)
    obj = BeautifulSoup(search_response.content, 'lxml')
    # Find all img tags
    imgs = obj.find_all('img')
    # Collect the image links on the page
    urls = []
    for img in imgs:
        if 'data-src' in str(img):
            urls.append(img['data-src'])
        elif 'src=""' in str(img):
            pass
        elif "src" not in str(img):
            pass
        else:
            urls.append(img['src'])

    # Download every image into the target folder, named 0, 1, 2, ...
    i = 0
    for each_url in urls:
        # Handle the different image URL formats used in articles
        if each_url.startswith('//'):
            new_url = 'https:' + each_url
            r_pic = requests.get(new_url)
        elif each_url.startswith('/') and each_url.endswith('gif'):
            new_url = domain + each_url
            r_pic = requests.get(new_url)
        elif each_url.endswith('png') or each_url.endswith('jpg') or each_url.endswith('gif') or each_url.endswith('jpeg'):
            r_pic = requests.get(each_url)
        else:
            # Skip links that match none of the known formats
            continue
        # Absolute path for this image
        t = os.path.join(targetDir, str(i) + '.jpeg')
        print('This article has ' + str(len(urls)) + ' images; processing number ' + str(i + 1) + '...')
        # Write the image bytes to the local file
        with open(t, 'wb') as fw:
            fw.write(r_pic.content)
        i += 1
        # Point the old or relative link at the local copy
        update_file(each_url, t, htmlDir, file_name)

# Save the HTML locally
def save_html(url_content, htmlDir, file_name):
    with open(htmlDir + "/" + file_name + '.html', 'wb') as f:
        # Write the response body to the file
        f.write(url_content.content)
    return url_content

# Rewrite the HTML file so image paths point at the local copies
def update_file(old, new, htmlDir, file_name):
    # Open two files: read the original, write the modified content to the other
    with open(htmlDir + "/" + file_name + '.html', encoding='utf-8') as f, open(htmlDir + "/" + file_name + '_bak.html', 'w', encoding='utf-8') as fw:
        # Replace the path line by line with replace()
        for line in f:
            new_line = line.replace(old, new)
            new_line = new_line.replace("data-src", "src")
            # Write the result to the new file
            fw.write(new_line)
    # When done, delete the original file
    os.remove(htmlDir + "/" + file_name + '.html')
    time.sleep(5)
    # Rename the new file back to .html
    os.rename(htmlDir + "/" + file_name + '_bak.html', htmlDir + "/" + file_name + '.html')

# Read cookie.txt
with open("cookie.txt", "r") as file:
    cookie = file.read()
cookies = json.loads(cookie)
url = "https://mp.weixin.qq.com"
# Request the official-account platform
response = requests.get(url, cookies=cookies)
# Extract the token from the redirected URL
token = re.findall(r'token=(\d+)', str(response.url))[0]
# Set the request headers
headers = {
    "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
    "Host": "mp.weixin.qq.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
}

# Loop over the first nine pages of articles, five per page
for j in range(1, 10):
    begin = (j - 1) * 5
    # Request the article list for the current page
    requestUrl = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=" + str(begin) + "&count=5&fakeid=MzU1NDk2MzQyNg==&type=9&query=&token=" + token + "&lang=zh_CN&f=json&ajax=1"
    search_response = requests.get(requestUrl, cookies=cookies, headers=headers)
    # Parse the returned JSON
    re_text = search_response.json()
    # Named msg_list to avoid shadowing the built-in list
    msg_list = re_text.get("app_msg_list")
    # Iterate over the articles on this page
    for i in msg_list:
        # The directory is named after the title and holds the HTML and images
        dir_name = i["title"].replace(' ', '')
        print("Downloading article: " + dir_name)
        # Request the article URL to get its content
        response = requests.get(i["link"], cookies=cookies, headers=headers)
        # Save the article locally
        save(response, dir_name, i["aid"])
        print(dir_name + " downloaded!")
        # Requesting too fast may draw WeChat's attention, so wait 10 seconds
        time.sleep(10)
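
One fragile spot in the loop above: it always requests nine pages and assumes app_msg_list is present, so an account with fewer articles would make the inner loop iterate over an empty result or raise a TypeError on None. Below is a hedged sketch of a more defensive variant; the function name fetch_article_list and its pages/per_page parameters are illustrative additions, while the endpoint, query parameters, and fields come from the script above:

import requests

# A defensive version of the paging loop (illustrative sketch, not the original script)
def fetch_article_list(token, cookies, headers, fakeid="MzU1NDk2MzQyNg==", pages=9, per_page=5):
    base = "https://mp.weixin.qq.com/cgi-bin/appmsg"
    for page in range(pages):
        params = {
            "action": "list_ex", "begin": page * per_page, "count": per_page,
            "fakeid": fakeid, "type": 9, "query": "", "token": token,
            "lang": "zh_CN", "f": "json", "ajax": 1,
        }
        resp = requests.get(base, params=params, cookies=cookies, headers=headers)
        msg_list = resp.json().get("app_msg_list")
        # An exhausted account returns an empty (or missing) list; stop paging then
        if not msg_list:
            return
        for item in msg_list:
            # Each item carries at least the title, link, and aid used above
            yield item

With this, the main loop becomes a single `for article in fetch_article_list(token, cookies, headers): ...` that stops on its own when the account runs out of articles.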

jiguang/tushare/my-tushare.py

Lines changed: 29 additions & 0 deletions
# Import the package
import tushare as tu

# Get three years of history for the SSE Composite index (000001)
tu.get_hist_data('000001')

# We can also restrict the data to a date range
tu.get_hist_data('000001', start='2020-01-05', end='2020-02-05')

# Get the current quotes for all stocks
tu.get_today_all()

# Get real-time quotes for Moutai (600519) and Gree (000651)
data = tu.get_realtime_quotes(['600519', '000651'])

# Only display selected columns
data[['code', 'name', 'price', 'bid', 'ask', 'volume', 'amount', 'time']]

# Or fetch the SSE Composite, SZSE Component, CSI 300, SSE 50, SME board, and ChiNext indices
tu.get_realtime_quotes(['sh', 'sz', 'hs300', 'sz50', 'zxb', 'cyb'])

# Get the overall market quotes
data = tu.get_index()

# Get Moutai's big-deal data for the given date; the default threshold is 400 lots
tu.get_sina_dd('600519', date='2020-03-27')

# Get deals of 100 lots or more
tu.get_sina_dd('600519', date='2020-03-27', vol=100)
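
The classic tushare calls above return pandas DataFrames, so slicing and persisting the results is plain pandas. A minimal sketch, assuming the free tushare package (not tushare pro) and its pandas dependency are installed and the data source is reachable:

import tushare as tu

# Fetch the daily history for 000001 and keep the closing prices
data = tu.get_hist_data('000001')
closes = data['close']

# Persist the full frame as CSV for later analysis
data.to_csv('000001_hist.csv')
print(closes.head())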
