From d1f75a76244dca9f54906c72d1bd674a560fb173 Mon Sep 17 00:00:00 2001 From: Dan Stoner Date: Sat, 9 Sep 2023 16:28:43 -0400 Subject: [PATCH] add FETCH_DELAY for mediaing fetcher --- idigbio_ingestion/mediaing/fetcher.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/idigbio_ingestion/mediaing/fetcher.py b/idigbio_ingestion/mediaing/fetcher.py index ac35595..187af7b 100644 --- a/idigbio_ingestion/mediaing/fetcher.py +++ b/idigbio_ingestion/mediaing/fetcher.py @@ -230,6 +230,9 @@ class FetchItem(object): #: Can http connections be reuesed? REUSE_CONN = True + #: Delay in seconds (throttle) to add before each request + FETCH_DELAY = 0 + #: class (static) variable for the session to use session = None @@ -268,6 +271,8 @@ def content(self): def get_media(self): "This calls get_media and handles all the failure scenarios" + # Throttle when FETCH_DELAY is set + sleep(self.FETCH_DELAY) try: self.fetch() self.validate() @@ -478,3 +483,10 @@ def get_media(self): class ArctosItem(FetchItem): FETCHER_COUNT = 1 REUSE_CONN = False + # 12s is slightly longer than the current (2023.09.09) value of crawl-delay in their robots.txt: + # + # User-agent: * + # crawl-delay: 10 + # ... + # + FETCH_DELAY = 12