Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions tests/server/test_sanitize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
from unittest import mock

import pytest

from waterbutler.server.sanitize import WBSanitizer


@pytest.fixture
def sanitizer():
return WBSanitizer(mock.Mock())


class TestWBSanitizer:
# The sanitize function changes some strings and dictionaries
# you put into it, so you need to explicitly test most things

MASK = '*' * 8

def test_no_sanitization(self, sanitizer):
assert sanitizer.sanitize('thing', 'ghost science') == 'ghost science'

def test_fields_sanitized(self, sanitizer):
fields = sanitizer.FIELDS
for field in fields:
assert sanitizer.sanitize(field, 'free speech') == self.MASK

def test_value_is_none(self, sanitizer):
assert sanitizer.sanitize('great hair', None) is None

def test_key_is_none(self, sanitizer):
assert sanitizer.sanitize(None, 'best day ever') is 'best day ever'

def test_sanitize_credit_card(self, sanitizer):
assert sanitizer.sanitize('credit', '424242424242424') == self.MASK
# This string is not censored since it is out of the range of what it considers
# to be a credit card
assert sanitizer.sanitize('credit', '4242424242424243333333') != self.MASK

def test_none_key_is_sanitized(self, sanitizer):
assert sanitizer.sanitize(None, '424242424242424') == self.MASK
# This string is not censored since it is out of the range of what it considers
# to be a credit card
assert sanitizer.sanitize(None, '4242424242424243333333') != self.MASK

def test_dataverse_secret(self, sanitizer):

# Named oddly because if you call it `dv_secret` it will get sanitized by a different
# part of the sanitizer
dv_value = 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc'
assert sanitizer.sanitize('dv_value', dv_value) == self.MASK

dv_value = 'random characters and other things aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc'
expected = 'random characters and other things ' + self.MASK
assert sanitizer.sanitize('dv_value', dv_value) == expected

def test_bytes(self, sanitizer):
assert sanitizer.sanitize(b'key', 'bossy yogurt') == self.MASK
assert sanitizer.sanitize(b'should_be_safe', 'snow science') == 'snow science'

def test_sanitize_dictionary(self, sanitizer):
value_dict = {
'great_entry': 'very much not a secret or credit card'
}

result = sanitizer.sanitize('value_dict', value_dict)
assert result == {
'great_entry': 'very much not a secret or credit card'
}

sanitize_dict = {
'key': 'secret',
'okay_value': 'bears are awesome'
}
result = sanitizer.sanitize('sanitize_dict', sanitize_dict)

assert result == {
'key': self.MASK,
'okay_value': 'bears are awesome'
}

def test_nested_dictionary(self, sanitizer):
value_dict = {
'value': {
'other': 'words',
'key': 'this will be censored',
'secret': {
'secret': {
'secret': 'pie is great'
}
},
'new': 'best'
}
}

result = sanitizer.sanitize('value_dict', value_dict)
assert result == {
'value': {
'other': 'words',
'key': self.MASK,
'secret': self.MASK,
'new': 'best'
}
}

def test_nested_dictionary_with_list(self, sanitizer):
value_dict = {
'value': {
'other': 'words',
'key': 'this will be censored',
'secret': {
'value': ['bunch', 'of', 'semi', 'random', 'beige', 'run']

},
'not_hidden': {
'list_of_dict': [
{'value': 'value'},
{'key': 'secret'}
]
},
'new': 'best'
}
}
result = sanitizer.sanitize('value_dict', value_dict)
assert result == {
'value': {
'other': 'words',
'key': self.MASK,
'secret': self.MASK,
'not_hidden': {
'list_of_dict': [
{'value': 'value'},
{'key': self.MASK}
]
},
'new': 'best'
}
}

def test_sanitize_list(self, sanitizer):
value_list = [
'blarg',
'10',
'key',
'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc'
]

result = sanitizer.sanitize('value_list', value_list)

assert result == [
'blarg',
'10',
'key',
self.MASK
]

def test_sanitize_nested_lists(self, sanitizer):
value_list = [
[
'blarg',
'10',
'key',
'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc'
],
'blarg',
'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc',
[[[[[[[
['check out this level of nested'], 'aaaaaaaa-bbbb-bbbb-bbbb-cccccccccccc'
]]]]]]],
{
'key': 'red leaves',
'secret': [[[[[[[[]]]]]]]]
}
]

result = sanitizer.sanitize('value_list', value_list)

assert result == [
[
'blarg',
'10',
'key',
self.MASK
],
'blarg',
self.MASK,
[[[[[[[['check out this level of nested'], self.MASK]]]]]]],
{
'key': self.MASK,
'secret': self.MASK
}
]
3 changes: 2 additions & 1 deletion waterbutler/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def make_app(debug):
[(r'/status', handlers.StatusHandler)],
debug=debug,
)
app.sentry_client = AsyncSentryClient(settings.SENTRY_DSN, release=waterbutler.__version__)
app.sentry_client = AsyncSentryClient(settings.SENTRY_DSN, release=waterbutler.__version__,
Copy link
Contributor

@cslzchen cslzchen Nov 29, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to update this after bumping setuptools to 37.0.0. Update: ignore this comment

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cslzchen Can you please clarify this comment?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TomBaxter Thanks for catching this. I probably was thinking about MFR. No need to change.

processors=('waterbutler.server.sanitize.WBSanitizer',))
return app


Expand Down
80 changes: 80 additions & 0 deletions waterbutler/server/sanitize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import re

from raven.processors import SanitizePasswordsProcessor


class WBSanitizer(SanitizePasswordsProcessor):
"""Asterisk out things that look like passwords, keys, etc."""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make better use of parent class.


# Store mask as a fixed length for security
MASK = '*' * 8

# Token and key added from original. Key is used by Dataverse
FIELDS = frozenset([
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For phase 2 consideration: double check the list.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cslzchen Can you please clarify this comment?

Copy link
Contributor

@cslzchen cslzchen Dec 26, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TomBaxter My suggestion is check if there are other possible keys that may store sensitive or secret information. Here is the default keys from SanitizePasswordProcessor: https://github.com/htquach/raven-python/blob/master/raven/processors.py#L72.

    FIELDS = frozenset([
        'password', 'secret', 'passwd', 'authorization', 'api_key', 'apikey',
        'pw', 'cred'
    ])

We don't have 'pw', 'cred' and we added 'access_token', 'token', 'key' and 'sentry_dsn'. Other possible ones are 'refresh_token', 'code', etc. We need to understand what we need and what we don't.

Copy link
Member

@TomBaxter TomBaxter Dec 26, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood. I plan on making better use of inheritance so that we don't miss any defaults in the parent. I'll also check the sentry data for Dataverse, for other possible things to sanitize. I'll definitely add refresh_token.

'password',
'secret',
'passwd',
'authorization',
'api_key',
'apikey',
'sentry_dsn',
'access_token',
'key',
'token',
])

# Credit card regex left intact from original processor
# While we should never have credit card information, its still best to perform the check
# and keep old functionality
VALUES_RE = re.compile(r'^(?:\d[ -]*?){13,16}$')

# Should specifically match Dataverse secrets. Key format checked on demo and on Harvard
DATAVERSE_SECRET_RE = re.compile(r'[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]'
'{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}')

def sanitize(self, key, value):
"""Overload the sanitize function of the `SanitizePasswordsProcessor'."""

if value is None:
return

# Part of the original method. Looks for credit cards to sanitize
if isinstance(value, str) and self.VALUES_RE.search(value):
return self.MASK

if isinstance(value, dict):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need recursion for dict + a test fot it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Complete, for lists as well

for item in value:
value[item] = self.sanitize(item, value[item])

if isinstance(value, list):
new_list = []
for item in value:
new_list.append(self.sanitize(key, item))

value = new_list

# Check for Dataverse secrets
if isinstance(value, str):
matches = self.DATAVERSE_SECRET_RE.findall(value)
for match in matches:
value = value.replace(match, self.MASK)

# key can be a NoneType
# This sould be after the regex checks incase a `None` key is a token
if not key:
return value

# Just in case we have bytes here, we want to turn them into text
# properly without failing so we can perform our check.
if isinstance(key, bytes):
# May want a try/except block around this, but for now it should be okay
key = key.decode('utf-8', 'replace')
else:
key = str(key)

key = key.lower()
for field in self.FIELDS:
if field in key:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check key directly from self.FIELDS

return self.MASK

return value