Skip to content

Commit 300f419

Browse files
committed
fixed sorting reddit comments by best
1 parent 263d92c commit 300f419

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

src/korea_travel_guide/data.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ def clean_text(txt: str) -> str:
2828
txt = re.sub(r":[\w_]+:", "", txt)
2929
# collapse > quoted blocks (common in Reddit replies)
3030
txt = re.sub(r"(^|\n)>.*", "", txt)
31+
# strip bot footers: drop everything from "*I am a bot" to the end of the text
32+
BOT_FOOTER = re.compile(r"\*I am a bot.*?$", flags=re.I | re.S)
33+
txt = BOT_FOOTER.sub("", txt)
3134
# collapse whitespace
3235
txt = re.sub(r"\s+", " ", txt).strip()
3336
return txt
@@ -39,18 +42,21 @@ def scrape(sub_size_map):
3942

4043
for sub, sub_size in sub_size_map.items():
4144
got_size = 0
45+
SKIP_AUTHORS = {"AutoModerator"}
46+
FLAIR_BLACKLIST = {"announcement", "meta", "megathread"}
47+
SEEN_TITLES = set()
4248
for post in reddit.subreddit(sub).top(
4349
limit=None, time_filter="all"
4450
): # , time_filter="all"
4551
# don't need to scrape more if got_size already matches sub_size
4652
if got_size >= sub_size:
4753
break
4854

49-
SKIP_AUTHORS = {"AutoModerator"}
50-
FLAIR_BLACKLIST = {"announcement", "meta", "megathread"}
51-
SEEN_TITLES = set()
52-
5355
try:
56+
# set the comment sort order first, before handling the post, to avoid an error
57+
post._comments = None # low-level cache clear
58+
post.comment_sort = "best" # sort by reddit's "best" ranking
59+
5460
# skip any link or image post, no posts with score lower than 1, no pinned/mod posts, ban any “over 18” content, no locked thread, no crossposts
5561
if (
5662
not post.is_self
@@ -82,7 +88,6 @@ def scrape(sub_size_map):
8288
continue
8389

8490
# get the answer as the highest-score comment
85-
post.comment_sort = "best" # sort by reddit's "best" ranking
8691
post.comments.replace_more(
8792
limit=0
8893
) # replace_more(limit=0) prevents getting more comments that are yet to be fetched. We just need the best comments.
@@ -105,7 +110,7 @@ def _comment_quality(c):
105110
and post.author
106111
and c.author.name == post.author.name
107112
)
108-
or c.score < 3 # require a few up-votes
113+
or c.score < 2 # require a few up-votes
109114
or len(c.body.split()) < 30 # avoid one-liners
110115
)
111116
]

0 commit comments

Comments
 (0)