From 86b8194d33c266096e2c58d5889705727fb3ff6b Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Fri, 3 Nov 2017 10:59:15 -0400 Subject: [PATCH 1/4] Adding custom sentry sanitizer --- tests/server/test_sanitize.py | 76 ++++++++++++++++++++++++++++++++++ waterbutler/server/app.py | 3 +- waterbutler/server/sanitize.py | 71 +++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 tests/server/test_sanitize.py create mode 100644 waterbutler/server/sanitize.py diff --git a/tests/server/test_sanitize.py b/tests/server/test_sanitize.py new file mode 100644 index 000000000..852d2ad9a --- /dev/null +++ b/tests/server/test_sanitize.py @@ -0,0 +1,76 @@ +import pytest +from unittest import mock + +from waterbutler.server.sanitize import WBSanitizer + + +@pytest.fixture +def sanitizer(): + return WBSanitizer(mock.Mock()) + + +class TestWBSanitizer: + # The sanitize function changes some strings and dictionaries + # you put into it, so you need to explicitly test most things + + MASK = '*' * 8 + + def test_no_sanitization(self, sanitizer): + assert sanitizer.sanitize('thing', 'ghost science') == 'ghost science' + + def test_fields_sanitized(self, sanitizer): + fields = sanitizer.FIELDS + for field in fields: + assert sanitizer.sanitize(field, 'free speech') == self.MASK + + def test_value_is_none(self, sanitizer): + assert sanitizer.sanitize('great hair', None) is None + + def test_sanitize_credit_card(self, sanitizer): + assert sanitizer.sanitize('credit', '424242424242424') == self.MASK + assert sanitizer.sanitize('credit', '4242424242424243333333') != self.MASK + + def test_sanitize_dictionary(self, sanitizer): + value_dict = { + 'great_entry': 'very much not a secret or credit card' + } + + result = sanitizer.sanitize('value_dict', value_dict) + assert result == { + 'great_entry': 'very much not a secret or credit card' + } + + sanitize_dict = { + 'key': 'secret', + 'okay_value': 'bears are awesome' + } + result = result = sanitizer.sanitize('sanitize_dict', sanitize_dict) + + # Sanity check + assert result != { + 'key': 'secret', + 'okay_value': 'bears are awesome' + } + + assert result == { + 'key': '*' * 8, + 'okay_value': 'bears are awesome' + } + + def test_dataverse_secret(self, sanitizer): + + # Named oddly because if you call it `dv_secret` it will get sanitized by a different + # part of the sanitizer + dv_value = 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + assert sanitizer.sanitize('dv_value', dv_value) == self.MASK + + dv_value = 'random characters and other things aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + expected = 'random characters and other things ' + self.MASK + assert sanitizer.sanitize('dv_value', dv_value) == expected + + def test_bytes(self, sanitizer): + key = b'key' + assert sanitizer.sanitize(key, 'bossy yogurt') == self.MASK + + other_key = b'should_be_safe' + assert sanitizer.sanitize(other_key, 'snow science') == 'snow science' diff --git a/waterbutler/server/app.py b/waterbutler/server/app.py index db87fd22a..fb131e355 100644 --- a/waterbutler/server/app.py +++ b/waterbutler/server/app.py @@ -44,7 +44,8 @@ def make_app(debug): [(r'/status', handlers.StatusHandler)], debug=debug, ) - app.sentry_client = AsyncSentryClient(settings.SENTRY_DSN, release=__version__) + app.sentry_client = AsyncSentryClient(settings.SENTRY_DSN, release=__version__, + processors=('waterbutler.server.sanitize.WBSanitizer',)) return app diff --git a/waterbutler/server/sanitize.py b/waterbutler/server/sanitize.py new file mode 100644 index 000000000..2d11eed47 --- /dev/null +++ b/waterbutler/server/sanitize.py @@ -0,0 +1,71 @@ +import re + +from raven.processors import SanitizePasswordsProcessor + + +class WBSanitizer(SanitizePasswordsProcessor): + """Asterisk out things that look like passwords, keys, etc.""" + + # Store mask as a fixed length for security + MASK = '*' * 8 + + # Token and key added from original. Key is used by Dataverse + FIELDS = frozenset([ + 'password', + 'secret', + 'passwd', + 'authorization', + 'api_key', + 'apikey', + 'sentry_dsn', + 'access_token', + 'key', + 'token', + ]) + + # Credit card regex left intact from original processor + # While we should never have credit card information, its still best to perform the check + # and keep old functionality + VALUES_RE = re.compile(r'^(?:\d[ -]*?){13,16}$') + + # Should specifically match Dataverse secrets. Key format checked on demo and on Harvard + DATAVERSE_SECRET_RE = re.compile(r'[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]' + '{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}') + + def sanitize(self, key, value): + """Overload the sanitize function of the `SanitizePasswordsProcessor'.""" + if value is None: + return + + # Part of the original method. Looks for credit cards to sanitize + if isinstance(value, str) and self.VALUES_RE.search(value): + return self.MASK + + if isinstance(value, dict): + for item in value: + if item in self.FIELDS: + value[item] = self.MASK + + # Check for Dataverse secrets + if isinstance(value, str): + matches = self.DATAVERSE_SECRET_RE.findall(value) + for match in matches: + value = value.replace(match, self.MASK) + + # key can be a NoneType + if not key: + return value + + # Just in case we have bytes here, we want to turn them into text + # properly without failing so we can perform our check. + if isinstance(key, bytes): + key = key.decode('utf-8', 'replace') + else: + key = str(key) + + key = key.lower() + for field in self.FIELDS: + if field in key: + return self.MASK + + return value From 99a95e2a3ec494e6f5642c702ceb0b21d5b188fb Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Fri, 17 Nov 2017 10:37:29 -0500 Subject: [PATCH 2/4] Add sanitize nested lists and dictionaries --- tests/server/test_sanitize.py | 152 +++++++++++++++++++++++++++++---- waterbutler/server/sanitize.py | 13 ++- 2 files changed, 147 insertions(+), 18 deletions(-) diff --git a/tests/server/test_sanitize.py b/tests/server/test_sanitize.py index 852d2ad9a..66c483f3a 100644 --- a/tests/server/test_sanitize.py +++ b/tests/server/test_sanitize.py @@ -26,10 +26,36 @@ def test_fields_sanitized(self, sanitizer): def test_value_is_none(self, sanitizer): assert sanitizer.sanitize('great hair', None) is None + def test_key_is_none(self, sanitizer): + assert sanitizer.sanitize(None, 'best day ever') is 'best day ever' + def test_sanitize_credit_card(self, sanitizer): assert sanitizer.sanitize('credit', '424242424242424') == self.MASK + # This string is not censored since it is out of the range of what it considers + # to be a credit card assert sanitizer.sanitize('credit', '4242424242424243333333') != self.MASK + def test_none_key_is_sanitized(self, sanitizer): + assert sanitizer.sanitize(None, '424242424242424') == self.MASK + # This string is not censored since it is out of the range of what it considers + # to be a credit card + assert sanitizer.sanitize(None, '4242424242424243333333') != self.MASK + + def test_dataverse_secret(self, sanitizer): + + # Named oddly because if you call it `dv_secret` it will get sanitized by a different + # part of the sanitizer + dv_value = 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + assert sanitizer.sanitize('dv_value', dv_value) == self.MASK + + dv_value = 'random characters and other things aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + expected = 'random characters and other things ' + self.MASK + assert sanitizer.sanitize('dv_value', dv_value) == expected + + def test_bytes(self, sanitizer): + assert sanitizer.sanitize(b'key', 'bossy yogurt') == self.MASK + assert sanitizer.sanitize(b'should_be_safe', 'snow science') == 'snow science' + def test_sanitize_dictionary(self, sanitizer): value_dict = { 'great_entry': 'very much not a secret or credit card' @@ -44,7 +70,7 @@ def test_sanitize_dictionary(self, sanitizer): 'key': 'secret', 'okay_value': 'bears are awesome' } - result = result = sanitizer.sanitize('sanitize_dict', sanitize_dict) + result = sanitizer.sanitize('sanitize_dict', sanitize_dict) # Sanity check assert result != { @@ -53,24 +79,118 @@ def test_sanitize_dictionary(self, sanitizer): } assert result == { - 'key': '*' * 8, + 'key': self.MASK, 'okay_value': 'bears are awesome' } - def test_dataverse_secret(self, sanitizer): - - # Named oddly because if you call it `dv_secret` it will get sanitized by a different - # part of the sanitizer - dv_value = 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' - assert sanitizer.sanitize('dv_value', dv_value) == self.MASK + def test_nested_dictionary(self, sanitizer): + value_dict = { + 'value': { + 'other': 'words', + 'key': 'this will be censored', + 'secret': { + 'secret': { + 'secret': 'pie is great' + } + }, + 'new': 'best' + } + } - dv_value = 'random characters and other things aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' - expected = 'random characters and other things ' + self.MASK - assert sanitizer.sanitize('dv_value', dv_value) == expected + result = sanitizer.sanitize('value_dict', value_dict) + assert result == { + 'value': { + 'other': 'words', + 'key': self.MASK, + 'secret': self.MASK, + 'new': 'best' + } + } - def test_bytes(self, sanitizer): - key = b'key' - assert sanitizer.sanitize(key, 'bossy yogurt') == self.MASK + def test_nested_dictionary_with_list(self, sanitizer): + value_dict = { + 'value': { + 'other': 'words', + 'key': 'this will be censored', + 'secret': { + 'value': ['bunch', 'of', 'semi', 'random', 'beige', 'run'] + + }, + 'not_hidden': { + 'list_of_dict': [ + {'value': 'value'}, + {'key': 'secret'} + ] + }, + 'new': 'best' + } + } + result = sanitizer.sanitize('value_dict', value_dict) + assert result == { + 'value': { + 'other': 'words', + 'key': self.MASK, + 'secret': self.MASK, + 'not_hidden': { + 'list_of_dict': [ + {'value': 'value'}, + {'key': self.MASK} + ] + }, + 'new': 'best' + } + } - other_key = b'should_be_safe' - assert sanitizer.sanitize(other_key, 'snow science') == 'snow science' + def test_sanitize_list(self, sanitizer): + value_list = [ + 'blarg', + '10', + 'key', + 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + ] + + result = sanitizer.sanitize('value_list', value_list) + + assert result == [ + 'blarg', + '10', + 'key', + self.MASK + ] + + def test_sanitize_nested_lists(self, sanitizer): + value_list = [ + [ + 'blarg', + '10', + 'key', + 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + ], + 'blarg', + 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc', + [[[[[[[ + ['check out this level of nested'], 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc' + ]]]]]]], + { + 'key': 'red leaves', + 'secret': [[[[[[[[]]]]]]]] + } + ] + + result = sanitizer.sanitize('value_list', value_list) + + assert result == [ + [ + 'blarg', + '10', + 'key', + self.MASK + ], + 'blarg', + self.MASK, + [[[[[[[['check out this level of nested'], self.MASK]]]]]]], + { + 'key': self.MASK, + 'secret': self.MASK + } + ] diff --git a/waterbutler/server/sanitize.py b/waterbutler/server/sanitize.py index 2d11eed47..3d3e92d5a 100644 --- a/waterbutler/server/sanitize.py +++ b/waterbutler/server/sanitize.py @@ -34,6 +34,7 @@ class WBSanitizer(SanitizePasswordsProcessor): def sanitize(self, key, value): """Overload the sanitize function of the `SanitizePasswordsProcessor'.""" + if value is None: return @@ -43,8 +44,14 @@ def sanitize(self, key, value): if isinstance(value, dict): for item in value: - if item in self.FIELDS: - value[item] = self.MASK + value[item] = self.sanitize(item, value[item]) + + if isinstance(value, list): + new_list = [] + for item in value: + new_list.append(self.sanitize(key, item)) + + value = new_list # Check for Dataverse secrets if isinstance(value, str): @@ -53,12 +60,14 @@ def sanitize(self, key, value): value = value.replace(match, self.MASK) # key can be a NoneType + # This sould be after the regex checks incase a `None` key is a token if not key: return value # Just in case we have bytes here, we want to turn them into text # properly without failing so we can perform our check. if isinstance(key, bytes): + # May want a try/except block around this, but for now it should be okay key = key.decode('utf-8', 'replace') else: key = str(key) From f0d7f3e9d133ce8a4bb5810d739fc5a9a35a234b Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Tue, 28 Nov 2017 10:10:12 -0500 Subject: [PATCH 3/4] change test and import order --- tests/server/test_sanitize.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/server/test_sanitize.py b/tests/server/test_sanitize.py index 66c483f3a..3d8872391 100644 --- a/tests/server/test_sanitize.py +++ b/tests/server/test_sanitize.py @@ -1,6 +1,7 @@ -import pytest from unittest import mock +import pytest + from waterbutler.server.sanitize import WBSanitizer @@ -72,12 +73,6 @@ def test_sanitize_dictionary(self, sanitizer): } result = sanitizer.sanitize('sanitize_dict', sanitize_dict) - # Sanity check - assert result != { - 'key': 'secret', - 'okay_value': 'bears are awesome' - } - assert result == { 'key': self.MASK, 'okay_value': 'bears are awesome' From 55b871ecbf717be93ab18303a69b6ee407eb5180 Mon Sep 17 00:00:00 2001 From: TomBaxter Date: Wed, 27 Dec 2017 09:09:26 -0500 Subject: [PATCH 4/4] Refactor custom raven processor SVCS-334 Refactor subclass of SanitizePasswordsProcessor to take better advantage of inheritance. --- tests/server/test_sanitize.py | 2 +- waterbutler/server/sanitize.py | 60 +++++++--------------------------- 2 files changed, 13 insertions(+), 49 deletions(-) diff --git a/tests/server/test_sanitize.py b/tests/server/test_sanitize.py index 3d8872391..e7a2fda23 100644 --- a/tests/server/test_sanitize.py +++ b/tests/server/test_sanitize.py @@ -7,7 +7,7 @@ @pytest.fixture def sanitizer(): - return WBSanitizer(mock.Mock()) + return WBSanitizer() class TestWBSanitizer: diff --git a/waterbutler/server/sanitize.py b/waterbutler/server/sanitize.py index 3d3e92d5a..d2bacf1ca 100644 --- a/waterbutler/server/sanitize.py +++ b/waterbutler/server/sanitize.py @@ -4,43 +4,26 @@ class WBSanitizer(SanitizePasswordsProcessor): - """Asterisk out things that look like passwords, keys, etc.""" + """ + Use parent class to asterisk out things that look like passwords, credit card numbers, + and API keys in frames, http, and basic extra data. - # Store mask as a fixed length for security - MASK = '*' * 8 - - # Token and key added from original. Key is used by Dataverse - FIELDS = frozenset([ - 'password', - 'secret', - 'passwd', - 'authorization', - 'api_key', - 'apikey', - 'sentry_dsn', - 'access_token', - 'key', - 'token', - ]) - - # Credit card regex left intact from original processor - # While we should never have credit card information, its still best to perform the check - # and keep old functionality - VALUES_RE = re.compile(r'^(?:\d[ -]*?){13,16}$') + In addition, asterisk out Dataverse formatted ouath tokens. + """ # Should specifically match Dataverse secrets. Key format checked on demo and on Harvard DATAVERSE_SECRET_RE = re.compile(r'[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]' '{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}') - def sanitize(self, key, value): - """Overload the sanitize function of the `SanitizePasswordsProcessor'.""" + def __init__(self): + # As of raven version 6.4 this attribute name has been changed from FIELDS to KEYS. + # Will need to be updated when we upgrade. + self.FIELDS = self.FIELDS.union(['key', 'token', 'refresh_token']) - if value is None: - return + def sanitize(self, key, value): + """Subclass the sanitize function of the `SanitizePasswordsProcessor'.""" - # Part of the original method. Looks for credit cards to sanitize - if isinstance(value, str) and self.VALUES_RE.search(value): - return self.MASK + value = SanitizePasswordsProcessor.sanitize(self, key, value) if isinstance(value, dict): for item in value: @@ -50,7 +33,6 @@ def sanitize(self, key, value): new_list = [] for item in value: new_list.append(self.sanitize(key, item)) - value = new_list # Check for Dataverse secrets @@ -59,22 +41,4 @@ def sanitize(self, key, value): for match in matches: value = value.replace(match, self.MASK) - # key can be a NoneType - # This sould be after the regex checks incase a `None` key is a token - if not key: - return value - - # Just in case we have bytes here, we want to turn them into text - # properly without failing so we can perform our check. - if isinstance(key, bytes): - # May want a try/except block around this, but for now it should be okay - key = key.decode('utf-8', 'replace') - else: - key = str(key) - - key = key.lower() - for field in self.FIELDS: - if field in key: - return self.MASK - return value