1+ import json
2+ import httpx
3+ from urllib .parse import quote
4+ from typing import Dict
5+ import jmespath
6+
# Shared HTTP client whose headers mimic the Instagram web app so the
# private web API accepts our requests.
client = httpx.Client(
    headers={
        # App id the official Instagram web client sends; required by the API.
        "x-ig-app-id": "936619743392459",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "*/*",
    }
)
16+
def scrape_user_id(username: str):
    """Look up an Instagram user's numeric id from their username.

    Args:
        username: Instagram handle, without the leading '@'.

    Returns:
        The user id (a string) or None if the response carries no id.
    """
    # quote() protects against handles containing URL-reserved characters;
    # the original URL also had a stray trailing space, removed here.
    result = client.get(
        f"https://i.instagram.com/api/v1/users/web_profile_info/?username={quote(username)}",
    )
    # httpx decodes the JSON body for us (was: json.loads(result.content)).
    data = result.json()
    user_data = data["data"]["user"]
    return user_data.get("id")
26+
def parse_post(data: Dict) -> Dict:
    """Reduce a raw GraphQL post node to the fields we track.

    Args:
        data: The "node" dict of a timeline-media edge.

    Returns:
        Dict with keys: shortcode, likes, comments (missing paths map to None).
    """
    # JMESPath multi-select hash; whitespace in the expression is insignificant.
    query = """{
        shortcode: shortcode,
        likes: edge_media_preview_like.count,
        comments: edge_media_to_comment.count
    }"""
    return jmespath.search(query, data)
34+
def scrape_user_posts(user_id: str, session: httpx.Client, page_size: int = 12):
    """Yield parsed posts for *user_id*, paging through Instagram's GraphQL feed.

    Args:
        user_id: Numeric Instagram user id, as a string.
        session: httpx.Client used to issue the requests.
        page_size: Posts requested per page (12 mirrors the web client).

    Yields:
        One dict per post, as produced by parse_post().
    """
    base_url = "https://www.instagram.com/graphql/query/?query_hash=e769aa130647d2354c40ea6a439bfc08&variables="
    variables = {
        "id": user_id,
        "first": page_size,
        "after": None,  # pagination cursor; None fetches the first page
    }
    page_number = 1
    while True:
        resp = session.get(base_url + quote(json.dumps(variables)))
        all_posts_data = resp.json()
        posts = all_posts_data["data"]["user"]["edge_owner_to_timeline_media"]
        # Default to [] so a page without "edges" yields nothing instead of
        # raising TypeError when iterating None (was .get() with no default).
        for edge in posts.get("edges", []):
            yield parse_post(edge.get("node"))
        page_info = posts.get("page_info", {})
        if page_number == 1:
            print(f"scraping total {posts['count']} posts of {user_id}")
        else:
            print(f"scraping page {page_number}")
        if not page_info.get("has_next_page"):
            break
        # Guard against the API repeating the same cursor forever.
        if variables["after"] == page_info["end_cursor"]:
            break
        variables["after"] = page_info["end_cursor"]
        page_number += 1
60+
61+
def get_all_posts(user_id):
    """Collect every post of *user_id* into a list, using a fresh client.

    A dedicated httpx.Client with no timeout is used because a full timeline
    scrape can take a long time; the context manager closes it afterwards.
    """
    with httpx.Client(timeout=None) as session:
        return list(scrape_user_posts(str(user_id), session))
# (GitHub page residue — "0 commit comments" — commented out; it is not code)