@@ -28,6 +28,9 @@ def clean_text(txt: str) -> str:
2828 txt = re .sub (r":[\w_]+:" , "" , txt )
2929 # remove > quoted lines (common in Reddit replies)
3030 txt = re .sub (r"(^|\n)>.*" , "" , txt )
31+ # strip bot signature footers: everything from "*I am a bot" to the end of the text
32+ BOT_FOOTER = re .compile (r"\*I am a bot.*?$" , flags = re .I | re .S )
33+ txt = BOT_FOOTER .sub ("" , txt )
3134 # collapse whitespace
3235 txt = re .sub (r"\s+" , " " , txt ).strip ()
3336 return txt
@@ -39,18 +42,21 @@ def scrape(sub_size_map):
3942
4043 for sub , sub_size in sub_size_map .items ():
4144 got_size = 0
45+ SKIP_AUTHORS = {"AutoModerator" }
46+ FLAIR_BLACKLIST = {"announcement" , "meta" , "megathread" }
47+ SEEN_TITLES = set ()
4248 for post in reddit .subreddit (sub ).top (
4349 limit = None , time_filter = "all"
4450 ): # , time_filter="all"
4551 # don't need to scrape more if got_size already matches sub_size
4652 if got_size >= sub_size :
4753 break
4854
49- SKIP_AUTHORS = {"AutoModerator" }
50- FLAIR_BLACKLIST = {"announcement" , "meta" , "megathread" }
51- SEEN_TITLES = set ()
52-
5355 try :
56+ # sort the order of comments first to avoid error before handling the post
57+ post ._comments = None # low-level cache clear
58+ post .comment_sort = "best" # sort by reddit's "best" ranking
59+
5460 # skip any link or image post, no posts with score lower than 1, no pinned/mod posts, ban any “over 18” content, no locked thread, no crossposts
5561 if (
5662 not post .is_self
@@ -82,7 +88,6 @@ def scrape(sub_size_map):
8288 continue
8389
8490 # get the answer as the highest-score comment
85- post .comment_sort = "best" # sort by reddit's "best" ranking
8691 post .comments .replace_more (
8792 limit = 0
8893 ) # replace_more(limit=0) prevents getting more comments that are yet to be fetched. We just need the best comments.
@@ -105,7 +110,7 @@ def _comment_quality(c):
105110 and post .author
106111 and c .author .name == post .author .name
107112 )
108- or c .score < 3 # require a few up-votes
113+ or c .score < 2 # require a few up-votes
109114 or len (c .body .split ()) < 30 # avoid one-liners
110115 )
111116 ]
0 commit comments