From e6b7ee91da22a5349a3492b9e28b0eb267b19a43 Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Sat, 25 Mar 2017 18:57:42 +0000 Subject: [PATCH 1/2] add elasticsearch/README.md Document the elasticsearch experiments we tried on the Hack Day on 5th March 2017. --- src/elasticsearch/README.md | 129 ++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 src/elasticsearch/README.md diff --git a/src/elasticsearch/README.md b/src/elasticsearch/README.md new file mode 100644 index 0000000..fcb1567 --- /dev/null +++ b/src/elasticsearch/README.md @@ -0,0 +1,129 @@ +# Elasticsearch + +For now, all we have is some rough notes on the experiments we did +during the Hack Day on 5th March 2017. + +## Details of experiments + +This is what we tried: + +- Install elasticsearch + +- `/usr/share/elasticsearch/bin/plugin install mobz/elasticsearch-head` + +- Go to http://localhost:9200/_plugin/head/ + +- Click on the `Any Request` tab + +- Create an index `muti` with settings by submitting the following + "query" (API request): + + - POST http://localhost:9200/muti + + { + "analysis": { + "char_filter": { + "&_to_and": { + "type": "mapping", + "mappings": [ + "&=> and " + ] + } + }, + "filter": { + "filter_stop": { + "type": "stop" + }, + "filter_shingle": { + "type": "shingle", + "max_shingle_size": 5, + "min_shingle_size": 2, + "output_unigrams": "true" + } + }, + "analyzer": { + "analyzer_shingle": { + "type": "custom", + "char_filter": [ + "html_strip", + "&_to_and" + ], + "tokenizer": "standard", + "filter": [ + "standard", + "lowercase", + "filter_stop", + "filter_shingle" + ] + } + } + } + } + +- Create types for the `muti` index + + - PUT http://localhost:9200/muti/meetings/_mapping + + { + "properties": { + "meetingId": { + "index": "not_analyzed", + "type": "string" + }, + "minister": { + "analyzer": "analyzer_shingle", + "type": "string" + }, + "department": { + "analyzer": "analyzer_shingle", + "type": "string" + }, + "organization": { + 
"analyzer": "analyzer_shingle", + "type": "string" + }, + "reason": { + "analyzer": "analyzer_shingle", + "type": "string" + } + } + } + +- Index documents + + - POST http://localhost:9200/muti/meetings where the body takes + this format: + + { + "meetingId" : meeting_ref, + "minister" : minister, + "department" : department, + "organization" : org, + "reason" : reason + } + + See https://github.com/aspiers/MinistersUnderTheInfluence/releases/tag/elasticsearch-push-2017-03-05 + for code which we successfully used to do this. + +- Try an unfiltered search with aggregations + + - POST http://localhost:9200/muti/meetings/_search + + { + "from": 0, + "query": { + "match_all": {} + }, + "aggs": { + "organization": { + "terms": { + "field": "organization" + } + }, + "ministers": { + "terms": { + "field": "minister" + } + } + } + } From 81ad2b4b21f5b7b94c0829a026d9f3f31e811edf Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Sat, 25 Mar 2017 19:01:27 +0000 Subject: [PATCH 2/2] add nltk/README.md Document the nltk experiments we tried on the Hack Day on 5th March 2017. --- src/python/nltk/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 src/python/nltk/README.md diff --git a/src/python/nltk/README.md b/src/python/nltk/README.md new file mode 100644 index 0000000..5da9214 --- /dev/null +++ b/src/python/nltk/README.md @@ -0,0 +1,33 @@ +# Natural Language Toolkit in MUTI +For now, all we have is some rough notes on the experiments we did +during the Hack Day on 5th March 2017.
+ +## Details of experiments + +This is what we tried: + +- Run `python` to get an interactive REPL + +- `import nltk` + +- Run `nltk.download_shell()` and fetch the following: + - `words` + - `averaged_perceptron_tagger` + - `punkt` + - `maxent_ne_chunker` + +- Run the following: + + from nltk import pos_tag, ne_chunk, word_tokenize + + # Pick a random chunk of text + x = "Public sector mutuals and co-ops, Government Digital Services - Digital by Default, Implications of the Francis report (Mid Staffs) for all public services" + tree = ne_chunk(pos_tag(word_tokenize(x))) + + for node in tree: + if isinstance(node, nltk.tree.Tree): + print(node) + +The results didn't look very useful and we gave up / ran out of time +at that point.