From 14224485ee056d604750858c939b7729d21c5115 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Rodrigues?=
Date: Mon, 9 Nov 2020 14:23:01 -0300
Subject: [PATCH 1/7] Adiciona python-indexer

---
 sapl-logs/python-indexer.py | 191 ++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100755 sapl-logs/python-indexer.py

diff --git a/sapl-logs/python-indexer.py b/sapl-logs/python-indexer.py
new file mode 100755
index 000000000..c4d0c4192
--- /dev/null
+++ b/sapl-logs/python-indexer.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import logging
+import sys
+import requests
+import json
+import time
+import re
+
+# TODO: inserir timestamp no logging do python-indexer.py
+
+USE_SOLR = os.getenv('USE_SOLR', True)  # TODO: trocar por False em produção
+SOLR_BASE_URL = os.getenv('SOLR_URL', 'http://localhost:8983') + '/solr'
+
+SOLR_UPDATE_URL = f'{SOLR_BASE_URL}/sapl-logs/update?commitWithin=1000'
+
+SOLR_COLLECTION_STATUS = (
+    f'{SOLR_BASE_URL}/sapl-logs/admin/ping?distrib=true&wt=json'
+)
+
+BATCH_SIZE = 10  # https://lucidworks.com/post/really-batch-updates-solr-2/
+
+previous = None
+
+buffer = []
+payload = []
+
+num_docs = 0
+total_docs = 0
+
+# logging setup
+logfilename = 'python-indexer.log'
+
+logging.basicConfig(
+    filename=logfilename,
+    filemode='w+',
+    level=logging.INFO
+)
+
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+logger = logging.getLogger('python-indexer')
+logger.setLevel(logging.DEBUG)
+
+print(f"The logging of this program is done at {logfilename}")
+
+
+def push_to_solr():
+    logger.debug(f"Sending {len(payload)} documents to Solr")
+
+    r = requests.post(
+        SOLR_UPDATE_URL,
+        data=json.dumps(payload),
+        headers={'Content-Type': 'application/json; charset=utf-8'}
+    )
+    logger.debug(r.content)
+
+
+def parse_fields(groups):
+    from datetime import datetime as dt
+
+    iso8601 = "{} {}".format(groups[1], groups[2].replace(",", "."))
+    d = dt.fromisoformat(iso8601)
+    datetime = d.strftime('%Y-%m-%dT%H:%M:%SZ')
+
+    # datetime = groups[1] + "T" + groups[2].split(',')[0] + "Z"
+
+    fields = {
+        'level': groups[0],
+        'datetime': datetime
+    }
+
+    parts = groups[3].split()
+    fields['server'] = parts[0]
+    fields['path'] = parts[1]
+
+    # format: sapl.painel.views:get_votos:497
+    function = parts[2].split(':')
+    fields['app_file'] = function[0]
+    fields['method'] = function[1]
+    fields['line_number'] = function[2]
+    fields['function'] = parts[2]
+
+    fields['message'] = ' '.join(parts[3:])
+
+    return fields
+
+
+def parse_logs(line):
+    global previous
+
+    # discard empty lines
+    if not line.strip():
+        return
+
+    pattern = (
+        "^(ERROR|INFO|DEBUG|WARNING)"
+        + r'\s+(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2},\d+)\s+(.*)'
+    )
+    match = re.match(pattern, line)
+
+    if match:
+        groups = match.groups()
+        fields = parse_fields(groups)
+        fields['line'] = line
+
+        # if match but buffer is full then there was a stack trace before
+        if buffer and previous:
+            previous['stacktrace'] = ''.join(buffer)
+            buffer.clear()
+        elif not previous:
+            buffer.clear()  # un-garbaged trash
+
+        # append the previous one
+        if previous:
+            payload.append(previous)
+
+        # delay append of current (it may have stacktrace)
+        previous = fields
+    else:
+        # while not match again collect into buffer
+        buffer.append(line)
+
+    logger.debug(len(payload))
+
+
+def follow(fd):
+    """ generator function that yields new lines in a file """
+
+    # seek the end of the file
+    fd.seek(0, os.SEEK_END)
+
+    # start infinite loop
+    while True:
+        # read last line of file
+        line = fd.readline()
+        # sleep if file hasn't been updated
+        if not line:
+            time.sleep(0.1)
+            continue
+
+        yield line
+
+
+def check_solr():
+    try:
+        r = requests.get(SOLR_BASE_URL)
+        if r.status_code == 200:
+            print(f"Solr server at {SOLR_BASE_URL} is up and running...")
+
+        print("Checking collection health...")
+
+        r = requests.get(SOLR_COLLECTION_STATUS)
+        data = r.json()
+        if data['status'] == 'OK':
+            print("Collection sapl-logs is healthy")
+
+    except Exception as e:
+        logger.error(
+            "Exception: " + str(e) +
+            f"\nError connecting to Solr at {SOLR_COLLECTION_STATUS}"
+        )
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+
+    if not USE_SOLR:
+        print(f"USE_SOLR={USE_SOLR}")
+        sys.exit(0)
+
+    check_solr()
+
+    filename = sys.argv[1]
+    print(f"Opening log file {filename}...")
+    logfile = open(filename, 'r')
+    loglines = follow(logfile)
+
+    # iterate over the generator
+    for line in loglines:
+        logger.debug(f"Current payload size: {len(payload)}")
+        parse_logs(line)
+
+        num_docs = (num_docs + 1) % BATCH_SIZE
+        if num_docs == 0 and payload:
+            push_to_solr()
+            total_docs += len(payload)
+            payload.clear()
+
+    push_to_solr()

From 5d414d1d080b186cb2ee094ebaba9cf8013194f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Rodrigues?=
Date: Wed, 11 Nov 2020 15:56:24 -0300
Subject: [PATCH 2/7] =?UTF-8?q?Refatora=C3=A7=C3=A3o?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sapl-logs/python-indexer.py | 72 ++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 25 deletions(-)

diff --git a/sapl-logs/python-indexer.py b/sapl-logs/python-indexer.py
index c4d0c4192..47160f5ae 100755
--- a/sapl-logs/python-indexer.py
+++ b/sapl-logs/python-indexer.py
@@ -1,26 +1,30 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import os
+from decouple import config
+from random import randint
+
 import logging
 import sys
+import time
 import requests
 import json
-import time
+import os
 import re
 
 # TODO: inserir timestamp no logging do python-indexer.py
 
-USE_SOLR = os.getenv('USE_SOLR', True)  # TODO: trocar por False em produção
-SOLR_BASE_URL = os.getenv('SOLR_URL', 'http://localhost:8983') + '/solr'
+# TODO: trocar por False em produção
+USE_SOLR = config('USE_SOLR', default="True", cast=bool)
+SOLR_BASE_URL = config('SOLR_URL', default="http://localhost:8983") + '/solr'
 
 SOLR_UPDATE_URL = f'{SOLR_BASE_URL}/sapl-logs/update?commitWithin=1000'
 
 SOLR_COLLECTION_STATUS = (
     f'{SOLR_BASE_URL}/sapl-logs/admin/ping?distrib=true&wt=json'
 )
 
-BATCH_SIZE = 10  # https://lucidworks.com/post/really-batch-updates-solr-2/
+BATCH_SIZE = 5  # https://lucidworks.com/post/really-batch-updates-solr-2/
 
 previous = None
 
@@ -46,15 +50,40 @@
 print(f"The logging of this program is done at {logfilename}")
 
 
+def exp_backoff(func):
+    def inner_func(*args, **kwargs):
+        MAX_SLEEP_TIME = 180  # 3 min
+
+        iter = 0
+
+        while True:
+            try:
+                func(*args, **kwargs)
+                break
+            except Exception as e:
+                logger.error(
+                    "Exception: " + str(e)  # +
+                    # f"\nError connecting to Solr at {SOLR_CONNECTION_STATUS}
+                )
+
+                jitter = randint(0, 5)
+                sleep_time = min(2**iter + jitter, MAX_SLEEP_TIME)
+                time.sleep(sleep_time)
+
+                iter += 1
+
+    return inner_func
+
+
+@exp_backoff
 def push_to_solr():
     logger.debug(f"Sending {len(payload)} documents to Solr")
 
-    r = requests.post(
+    requests.post(
         SOLR_UPDATE_URL,
         data=json.dumps(payload),
         headers={'Content-Type': 'application/json; charset=utf-8'}
     )
-    logger.debug(r.content)
 
 
 def parse_fields(groups):
@@ -143,25 +172,18 @@ def follow(fd):
     yield line
 
 
+@exp_backoff
 def check_solr():
-    try:
-        r = requests.get(SOLR_BASE_URL)
-        if r.status_code == 200:
-            print(f"Solr server at {SOLR_BASE_URL} is up and running...")
-
-        print("Checking collection health...")
-
-        r = requests.get(SOLR_COLLECTION_STATUS)
-        data = r.json()
-        if data['status'] == 'OK':
-            print("Collection sapl-logs is healthy")
-
-    except Exception as e:
-        logger.error(
-            "Exception: " + str(e) +
-            f"\nError connecting to Solr at {SOLR_COLLECTION_STATUS}"
-        )
-        sys.exit(1)
+    r = requests.get(SOLR_BASE_URL)
+    if r.status_code == 200:
+        print(f"Solr server at {SOLR_BASE_URL} is up and running...")
+
+    print("Checking collection health...")
+
+    r = requests.get(SOLR_COLLECTION_STATUS)
+    data = r.json()
+    if r.ok and data['status'] == "OK":
+        print("Collection sapl-logs is healthy")
 
 
 if __name__ == '__main__':

From b24e84a4574028315dd163a5c32b8d4beabc9c76 Mon Sep 17 00:00:00 2001
From: Edward <9326037+edwardoliveira@users.noreply.github.com>
Date: Wed, 11 Nov 2020 16:15:52 -0300
Subject: [PATCH 3/7] Update sapl-logs/python-indexer.py

---
 sapl-logs/python-indexer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sapl-logs/python-indexer.py b/sapl-logs/python-indexer.py
index 47160f5ae..23806ee4b 100755
--- a/sapl-logs/python-indexer.py
+++ b/sapl-logs/python-indexer.py
@@ -175,7 +175,7 @@ def follow(fd):
 @exp_backoff
 def check_solr():
     r = requests.get(SOLR_BASE_URL)
-    if r.status_code == 200:
+    if r.ok and r.status_code == 200:
         print(f"Solr server at {SOLR_BASE_URL} is up and running...")
 
     print("Checking collection health...")

From b281bad1a51ba99523d63fe3737b4c90db1a202e Mon Sep 17 00:00:00 2001
From: Edward <9326037+edwardoliveira@users.noreply.github.com>
Date: Wed, 11 Nov 2020 16:16:02 -0300
Subject: [PATCH 4/7] Update sapl-logs/python-indexer.py

---
 sapl-logs/python-indexer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sapl-logs/python-indexer.py b/sapl-logs/python-indexer.py
index 23806ee4b..e2e811f37 100755
--- a/sapl-logs/python-indexer.py
+++ b/sapl-logs/python-indexer.py
@@ -15,7 +15,7 @@
 # TODO: inserir timestamp no logging do python-indexer.py
 
 # TODO: trocar por False em produção
-USE_SOLR = config('USE_SOLR', default="True", cast=bool)
+USE_SOLR = config('USE_SOLR', default="False", cast=bool)
 SOLR_BASE_URL = config('SOLR_URL', default="http://localhost:8983") + '/solr'
 
 SOLR_UPDATE_URL = f'{SOLR_BASE_URL}/sapl-logs/update?commitWithin=1000'

From e65e21f4827538d9c8e953b80efe5f27cf54b901 Mon Sep 17 00:00:00 2001
From: eribeiro
Date: Wed, 11 Nov 2020 16:40:31 -0300
Subject: [PATCH 5/7] Refactorings

---
 sapl-logs/python-indexer.py | 42 ++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/sapl-logs/python-indexer.py b/sapl-logs/python-indexer.py
index e2e811f37..e8a202679 100755
--- a/sapl-logs/python-indexer.py
+++ b/sapl-logs/python-indexer.py
@@ -12,9 +12,6 @@
 import os
 import re
 
-# TODO: inserir timestamp no logging do python-indexer.py
-
-# TODO: trocar por False em produção
 USE_SOLR = config('USE_SOLR', default="False", cast=bool)
 SOLR_BASE_URL = config('SOLR_URL', default="http://localhost:8983") + '/solr'
 
@@ -44,8 +41,8 @@
 )
 
 logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
-logger = logging.getLogger('python-indexer')
-logger.setLevel(logging.DEBUG)
+logger = logging.getLogger('python-indexer.py')
+logger.setLevel(logging.INFO)
 
 print(f"The logging of this program is done at {logfilename}")
 
@@ -66,15 +63,29 @@ def inner_func(*args, **kwargs):
                     # f"\nError connecting to Solr at {SOLR_CONNECTION_STATUS}
                 )
 
-                jitter = randint(0, 5)
+                jitter = randint(0, 10)
                 sleep_time = min(2**iter + jitter, MAX_SLEEP_TIME)
+                print(f"Retrying in {sleep_time} seconds... ")
                 time.sleep(sleep_time)
-
                 iter += 1
 
     return inner_func
 
 
+@exp_backoff
+def check_solr():
+    r = requests.get(SOLR_BASE_URL)
+    if r.ok and r.status_code == 200:
+        print(f"Solr server at {SOLR_BASE_URL} is up and running...")
+
+    print("Checking collection health...")
+
+    r = requests.get(SOLR_COLLECTION_STATUS)
+    data = r.json()
+    if r.ok and data['status'] == "OK":
+        print("Collection sapl-logs is healthy")
+
+
 @exp_backoff
 def push_to_solr():
     logger.debug(f"Sending {len(payload)} documents to Solr")
@@ -172,24 +183,11 @@ def follow(fd):
     yield line
 
 
-@exp_backoff
-def check_solr():
-    r = requests.get(SOLR_BASE_URL)
-    if r.ok and r.status_code == 200:
-        print(f"Solr server at {SOLR_BASE_URL} is up and running...")
-
-    print("Checking collection health...")
-
-    r = requests.get(SOLR_COLLECTION_STATUS)
-    data = r.json()
-    if r.ok and data['status'] == "OK":
-        print("Collection sapl-logs is healthy")
-
-
 if __name__ == '__main__':
 
     if not USE_SOLR:
-        print(f"USE_SOLR={USE_SOLR}")
+        print("Solr não habilitado, saindo de python-indexer.py")
+        logger.info(f"USE_SOLR={USE_SOLR}")
         sys.exit(0)
 
     check_solr()

From 5d4ac6b0fbb063a1efc96e728c5824bce086827c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vin=C3=ADcius=20Cantu=C3=A1ria?=
Date: Thu, 12 Nov 2020 16:04:43 -0300
Subject: [PATCH 6/7] Adiciona abertura do sapl-logs no docker/start

---
 docker/start.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/start.sh b/docker/start.sh
index 868942bfd..bab351e02 100755
--- a/docker/start.sh
+++ b/docker/start.sh
@@ -79,6 +79,7 @@ if [ "${USE_SOLR-False}" == "True" ] || [ "${USE_SOLR-False}" == "true" ]; then
   echo "Connecting to solr..."
   python3 solr_api.py -u $SOLR_URL -c $SOLR_COLLECTION -s $NUM_SHARDS -rf $RF -ms $MAX_SHARDS_PER_NODE &
   # python3 manage.py rebuild_index --noinput &
+  python3 ../sapl-logs/python-indexer.py &
 else
   echo "Solr is offline, not possible to connect."
 fi

From e84eb1698204a92cc7868ea4a55ca50ef6bc9369 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vin=C3=ADcius=20Cantu=C3=A1ria?=
Date: Wed, 25 Nov 2020 15:00:06 -0300
Subject: [PATCH 7/7] Corrige abertura do sapl-logs no docker/start

---
 docker/Dockerfile | 2 ++
 docker/start.sh   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 69ef33de3..0885c5ba5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -47,6 +47,8 @@ COPY docker/config/nginx/sapl.conf /etc/nginx/conf.d
 COPY docker/config/nginx/nginx.conf /etc/nginx/nginx.conf
 COPY docker/config/env_dockerfile /var/interlegis/sapl/sapl/.env
 
+COPY sapl-logs/python-indexer.py $HOME
+
 RUN python3 manage.py collectstatic --noinput --clear
 
 # Remove .env(fake) e sapl.db da imagem
diff --git a/docker/start.sh b/docker/start.sh
index bab351e02..59072d7b6 100755
--- a/docker/start.sh
+++ b/docker/start.sh
@@ -79,7 +79,7 @@ if [ "${USE_SOLR-False}" == "True" ] || [ "${USE_SOLR-False}" == "true" ]; then
   echo "Connecting to solr..."
   python3 solr_api.py -u $SOLR_URL -c $SOLR_COLLECTION -s $NUM_SHARDS -rf $RF -ms $MAX_SHARDS_PER_NODE &
   # python3 manage.py rebuild_index --noinput &
-  python3 ../sapl-logs/python-indexer.py &
+  nohup python-indexer.py &
 else
   echo "Solr is offline, not possible to connect."
 fi
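
A note on the retry logic that PATCH 2/7 introduces and PATCH 5/7 tunes: exp_backoff wraps a callable in an endless retry loop, doubling the wait after each failure (2**iter), adding random jitter so that multiple indexers do not retry in lockstep, and capping each wait at MAX_SLEEP_TIME (180 s). The sketch below is a minimal standalone restatement of that pattern, not code taken from the patches: flaky() is an invented demo target, and iter is renamed to attempt here to avoid shadowing the Python builtin that the patched code shadows.

#!/usr/bin/env python
# Standalone sketch of the exp_backoff pattern from PATCH 2/7 and 5/7.
# flaky() is a made-up demo target; it is not part of the patch series.

import time
from random import randint


def exp_backoff(func):
    def inner_func(*args, **kwargs):
        MAX_SLEEP_TIME = 180  # cap each wait at 3 minutes

        attempt = 0  # called `iter` in the patches

        while True:
            try:
                func(*args, **kwargs)
                break  # success: stop retrying
            except Exception as e:
                # exponential wait (1s, 2s, 4s, ...) plus 0-10s of jitter
                jitter = randint(0, 10)
                sleep_time = min(2**attempt + jitter, MAX_SLEEP_TIME)
                print(f"{e}; retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
                attempt += 1

    return inner_func


@exp_backoff
def flaky(state={'calls': 0}):
    # fails twice, then succeeds, to exercise the retry loop
    state['calls'] += 1
    if state['calls'] < 3:
        raise ConnectionError(f"simulated failure #{state['calls']}")
    print(f"succeeded on attempt {state['calls']}")


if __name__ == '__main__':
    flaky()

Run as-is, the sketch fails twice, printing a retry message and sleeping up to roughly 12 seconds each time, then prints the success line. python-indexer.py applies the same decorator to check_solr() and push_to_solr(), so a transient Solr outage stalls the indexer instead of killing it.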