From 4b8cf19c3e0872d72e5cefd698cf7594be8fdf4a Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 6 Jun 2021 17:37:39 +0300 Subject: [PATCH 01/21] Ex3 --- src/webdata/{SlowIndexWriter.java => IndexWriter.java} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename src/webdata/{SlowIndexWriter.java => IndexWriter.java} (98%) diff --git a/src/webdata/SlowIndexWriter.java b/src/webdata/IndexWriter.java similarity index 98% rename from src/webdata/SlowIndexWriter.java rename to src/webdata/IndexWriter.java index 0833cb2..255fa7e 100644 --- a/src/webdata/SlowIndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -5,7 +5,7 @@ import java.nio.file.Path; import java.util.*; -public class SlowIndexWriter { +public class IndexWriter { private TreeMap> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review. private TreeMap> productIds; private TreeMap> reviewIds; @@ -21,7 +21,7 @@ public class SlowIndexWriter { * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data */ - public void slowWrite(String inputFile, String dir) { + public void write(String inputFile, String dir) { this.dir = dir; createDicts(inputFile); createDir(); @@ -200,7 +200,7 @@ private void createReviewIndex() { /** * Save the given object to disk under the given name. The file is saved to the dir that was passed to the - * SlowWrite() function. + * write() function. */ private void saveToDir(String name, Object obj) { FileOutputStream fileOut = null; From 90fdb441a517689fcdd1a60d018821c2f8a7580b Mon Sep 17 00:00:00 2001 From: nirnts Date: Sun, 6 Jun 2021 18:28:16 +0300 Subject: [PATCH 02/21] test commit --- src/webdata/DataParser.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 4df436b..0856f21 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -30,6 +30,7 @@ public DataParser(String inputFile) throws IOException { } } allReviews.add(parse_review(review.toString())); // add the last review + // Comment test } /** From 3a6e781e6d30f0b27b6fae2bb7c40137dd843e5d Mon Sep 17 00:00:00 2001 From: nirnts Date: Tue, 8 Jun 2021 09:23:56 +0300 Subject: [PATCH 03/21] ReviewSearch added. 
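Adds a ReviewSearch class that ranks reviews on top of an IndexReader.
In this commit only languageModelSearch is filled in; vectorSpaceSearch
and productSearch are still stubs returning null.

A minimal usage sketch (the index directory and query terms mirror the
demo main() in this patch and are illustrative only; an index must
already exist on disk, and java.util.* is assumed to be imported):

    IndexReader ir = new IndexReader("./Data_Index");
    ReviewSearch rs = new ReviewSearch(ir);
    // review ids of the 10 highest-ranked reviews under the mixture model, lambda = 0.4
    Enumeration<Integer> top = rs.languageModelSearch(
            Collections.enumeration(Arrays.asList("what", "the", "hell")), 0.4, 10);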
--- src/webdata/ReviewSearch.java | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/webdata/ReviewSearch.java diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java new file mode 100644 index 0000000..7bc1e9f --- /dev/null +++ b/src/webdata/ReviewSearch.java @@ -0,0 +1,82 @@ +package webdata; + +import java.util.*; + +public class ReviewSearch { + private IndexReader ir; + /** + * Constructor + */ + public ReviewSearch(IndexReader iReader) { + this.ir = iReader; + } + + /** + * Returns a list of the id-s of the k most highly ranked reviews for the + * given query, using the vector space ranking function lnn.ltc (using the + * SMART notation) + * The list should be sorted by the ranking + */ + public Enumeration vectorSpaceSearch(Enumeration query, int k) { + return null; + } + + private HashMap getDocScores(String token) { + HashMap scores = new HashMap<>(); + Enumeration docFreqs = ir.getReviewsWithToken(token); + while (docFreqs.hasMoreElements()) { + int docId = docFreqs.nextElement(); + int freq = docFreqs.nextElement(); + double val = ((double) freq / ir.getReviewLength(docId)); + scores.put(docId, val); + } + return scores; + } + + /** + * Returns a list of the id-s of the k most highly ranked reviews for the + * given query, using the language model ranking function, smoothed using a + * mixture model with the given value of lambda + * The list should be sorted by the ranking + */ + public Enumeration languageModelSearch(Enumeration query,double lambda, int k) { + HashMap scores= new HashMap<>(); + int toks = 0; + while (query.hasMoreElements()) { + String token = query.nextElement(); + toks++; + double smooth = (1 - lambda) * (double) ir.getTokenCollectionFrequency(token) / ir.getTokenSizeOfReviews(); + HashMap tokenScores = getDocScores(token); + for (Map.Entry ent : tokenScores.entrySet()) { + double val = lambda * ent.getValue() + smooth; + scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); + } + } + List> list = new ArrayList<>(scores.entrySet()); + list.sort(Map.Entry.comparingByValue()); + ArrayList result = new ArrayList<>(); + for (int i = 0; i < Math.min(k, list.size()); i++) { + result.add(list.get(list.size() - i - 1).getKey()); + } + return Collections.enumeration(result); + } + + /** + * Returns a list of the id-s of the k most highly ranked productIds for the + * given query using a function of your choice + * The list should be sorted by the ranking + */ + public Collection productSearch(Enumeration query, int k) { + return null; + } + + public static void main(String[] args) { + String dir = "./Data_Index"; +// IndexWriter iw = new IndexWriter(); +// iw.write("./1000.txt", dir); + + IndexReader ir = new IndexReader(dir); + ReviewSearch rs = new ReviewSearch(ir); + rs.languageModelSearch(Collections.enumeration(Arrays.asList("what", "the", "hell")), 0.4, 10); + } +} \ No newline at end of file From 9d5fcd11e860efaa56b25c9215e4d61face31996 Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 12:07:31 +0300 Subject: [PATCH 04/21] Vector Space Model --- src/webdata/IndexWriter.java | 7 +++ src/webdata/ReviewSearch.java | 105 ++++++++++++++++++++++++++++------ 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 255fa7e..ec174cb 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -216,4 +216,11 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } + + public 
static void main(String[] args) { + String input_file = "./1000.txt"; + String dir = "./Data_Index"; + IndexWriter ir = new IndexWriter(); + ir.write(input_file, dir); + } } \ No newline at end of file diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 7bc1e9f..ec4567e 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -1,6 +1,7 @@ package webdata; import java.util.*; +import java.lang.Math.*; public class ReviewSearch { private IndexReader ir; @@ -18,21 +19,39 @@ public ReviewSearch(IndexReader iReader) { * The list should be sorted by the ranking */ public Enumeration vectorSpaceSearch(Enumeration query, int k) { - return null; - } + // read entire query and compute query scores: + HashMap fullQuery = new HashMap<>(); + while (query.hasMoreElements()) { + String token = query.nextElement(); + if (fullQuery.containsKey(token)){ + fullQuery.put(token, fullQuery.get(token) + 1); + } else { + fullQuery.put(token, 1); + } + } + HashMap queryScores = this.computeTokenQueryScore(fullQuery); - private HashMap getDocScores(String token) { - HashMap scores = new HashMap<>(); - Enumeration docFreqs = ir.getReviewsWithToken(token); - while (docFreqs.hasMoreElements()) { - int docId = docFreqs.nextElement(); - int freq = docFreqs.nextElement(); - double val = ((double) freq / ir.getReviewLength(docId)); - scores.put(docId, val); + HashMap scores= new HashMap<>(); + for (String token: fullQuery.keySet()){ + HashMap docScores = this.getDocScores(token, "vectorSpace"); + double tokenQueryScore = queryScores.get(token); + for (int doc: docScores.keySet()){ + double curScore = tokenQueryScore * docScores.get(doc); + if (scores.containsKey(doc)) { + scores.put(doc, scores.get(doc) + curScore); + } else { + scores.put(doc, curScore); + } + } } - return scores; + // sort the map and return the ids of the k highest scores: + return kHighestScores(scores, k); } + + + + /** * Returns a list of the id-s of the k most highly ranked reviews for the * given query, using the language model ranking function, smoothed using a @@ -46,19 +65,13 @@ public Enumeration languageModelSearch(Enumeration query,double String token = query.nextElement(); toks++; double smooth = (1 - lambda) * (double) ir.getTokenCollectionFrequency(token) / ir.getTokenSizeOfReviews(); - HashMap tokenScores = getDocScores(token); + HashMap tokenScores = getDocScores(token, "languageModel"); for (Map.Entry ent : tokenScores.entrySet()) { double val = lambda * ent.getValue() + smooth; scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); } } - List> list = new ArrayList<>(scores.entrySet()); - list.sort(Map.Entry.comparingByValue()); - ArrayList result = new ArrayList<>(); - for (int i = 0; i < Math.min(k, list.size()); i++) { - result.add(list.get(list.size() - i - 1).getKey()); - } - return Collections.enumeration(result); + return kHighestScores(scores, k); } /** @@ -70,6 +83,60 @@ public Collection productSearch(Enumeration query, int k) { return null; } + private HashMap computeTokenQueryScore(HashMap query) { + HashMap scores = new HashMap<>(); + + // compute the tf and idf values of every token: + for (String token: query.keySet()) { + float tf = (float) (1 + Math.log10(query.get(token))); + float df = (float) Math.log10(ir.getNumberOfReviews() / ir.getTokenFrequency(token)); + scores.put(token, tf*df); + } + + // compute the norm of the vector: + double vectorNorm = 0; + for (double score: scores.values()){ + vectorNorm += Math.pow(score, 2); + } + + // normalize the 
values by dividing in the vector's norm: + for (String token: scores.keySet()){ + float normalizedScore = (float) (scores.get(token) / vectorNorm); + scores.put(token, normalizedScore); + } + return scores; + } + + private HashMap getDocScores(String token, String model) { + HashMap scores = new HashMap<>(); + Enumeration docFreqs = ir.getReviewsWithToken(token); + while (docFreqs.hasMoreElements()) { + int docId = docFreqs.nextElement(); + int freq = docFreqs.nextElement(); + double val = 0; + if (model.equals("languageModel")) { + val = ((double) freq / ir.getReviewLength(docId)); + } else if (model.equals("vectorSpace")){ + val = ((double) 1 + Math.log10(freq)); + } else { + System.out.println("Please provide the name of the search for computing docScores. Options are: [languageModel, vectorSpace]"); + System.exit(1); + } + scores.put(docId, val); + } + return scores; + } + + private Enumeration kHighestScores(HashMap scores, int k){ + List> list = new ArrayList<>(scores.entrySet()); + list.sort(Map.Entry.comparingByValue()); + ArrayList result = new ArrayList<>(); + for (int i = 0; i < Math.min(k, list.size()); i++) { + result.add(list.get(list.size() - i - 1).getKey()); + } + return Collections.enumeration(result); + } + public static void main(String[] args) { String dir = "./Data_Index"; // IndexWriter iw = new IndexWriter(); From 6c2d194f000f79d53ebc7bbe9050a790f5fd06e8 Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 12:24:10 +0300 Subject: [PATCH 05/21] data reading changes --- src/webdata/DataLoader.java | 63 ++++++++++++++++++ src/webdata/DataParser.java | 117 +++++++++++++++++++++++++--------- src/webdata/ReviewSearch.java | 12 ++-- 3 files changed, 156 insertions(+), 36 deletions(-) create mode 100644 src/webdata/DataLoader.java diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java new file mode 100644 index 0000000..fd6c9f3 --- /dev/null +++ b/src/webdata/DataLoader.java @@ -0,0 +1,63 @@ +package webdata; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; + +public class DataLoader implements Iterable> { + private BufferedReader br; + private ArrayList reviewStrings; + + public DataLoader(String inputFile) throws FileNotFoundException { + br = new BufferedReader(new FileReader(inputFile)); + reviewStrings = new ArrayList<>(); + } + + public ArrayList readSingleReview() { + String line; + try { + while((line = br.readLine()) != null) { + if (line.contains("product/productId") && reviewStrings.size() != 0) { + ArrayList ret = reviewStrings; + reviewStrings = new ArrayList(); + reviewStrings.add(line); + return ret; + } + reviewStrings.add(line); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return reviewStrings; + } + + public Iterator> iterator() { + return new Iterator<>() { + @Override + public boolean hasNext(){ + try { + br.mark(1); + int i = br.read(); + br.reset(); + return (i != -1); + } catch (IOException e) { + return false; + } + } + + @Override + public ArrayList next() { + return readSingleReview(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 0856f21..61366e1 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -5,53 +5,110 @@ public class DataParser { - ArrayList> allReviews = new ArrayList<>(); - public 
static final List INTEREST_FIELDS = Arrays.asList("productId", "score", "helpfulness", "text"); + public class Review{ + private String text; + private String productId; + private String score; + private String helpfulness; + public String getText() { + return text; + } + + public String getProductId() { + return productId; + } + + public String getHelpfulness() { + return helpfulness; + } + + public String getScore() { + return score; + } + + public void setHelpfulness(String helpfulness) { + this.helpfulness = helpfulness; + } + + public void setProductId(String productId) { + this.productId = productId; + } + + public void setScore(String score) { + this.score = score; + } + + public void setText(String text) { + this.text = text; + } + } /** * Given product review data, parses the data and creates a new list where each entry i contains hashmap with the fields * of the review, i.e: productId->value, score->value, helpfulness->value, text->value. * inputFile is the path to the file containing the review data */ - public DataParser(String inputFile) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(inputFile)); - String line; - StringBuilder review = new StringBuilder(); - while((line = br.readLine()) != null) { - if (line.contains("product/productId")){ - if (!review.toString().equals("")){ - allReviews.add(parse_review(review.toString())); - } - review = new StringBuilder(line); - } - else{ - review.append(line); - } + public List parseData(List rawReviews){ + ArrayList allReviews = new ArrayList<>(); + for (String review: rawReviews){ + allReviews.add(parseReview(review)); } - allReviews.add(parse_review(review.toString())); // add the last review - // Comment test + return allReviews; } /** - * Given a single review, parse the review and return a hash table containing only the relevant fields of the - * review, i.e: productId, score, helpfulness, text. - * @param review: the review that should be parsed. - * @return a hash table where the keys are the relevant fields mentioned above and their corresponding values. + * Given a single review, parse the review and return a Review object, containing all relevant information from the + * given review, i.e. productId, score, helpfulness and text. 
*/ - private static HashMap parse_review(String review){ - List fields = Arrays.asList(review.split("review/")); - HashMap review_fields = new HashMap(); + public Review parseReview(String review){ + ArrayList fields = new ArrayList<>(Arrays.asList(review.split("review/"))); + Review parsedReview = new Review(); - review_fields.put("productId", fields.get(0).split(": ")[1].split("product/")[0]); + parsedReview.setProductId(fields.get(0).split(": ")[1].split("product/")[0]); for (int i=1; i field_value = Arrays.asList(field.split(": ")); - if (INTEREST_FIELDS.contains(field_value.get(0))) { - review_fields.put(field_value.get(0), String.join(":", field_value.subList(1, field_value.size()))); + List fieldValue = Arrays.asList(field.split(": ")); + if (fieldValue.get(0).equals("text")) { + parsedReview.setText(String.join(": ", fieldValue.subList(1, fieldValue.size()))); + } else if (fieldValue.get(0).equals("helpfulness")) { + parsedReview.setHelpfulness(fieldValue.get(1)); + } else if (fieldValue.get(0).equals("score")) { + parsedReview.setScore(fieldValue.get(1)); + } + } + return parsedReview; + } + + public Review parseReview(ArrayList review){ + Review parsedReview = new Review(); + StringBuilder text = new StringBuilder(); + boolean readingText = false; + for (String line : review){ + if (readingText && !line.equals("")) { + text.append(" "); + text.append(line); + continue; + } + int prefix = line.indexOf("/"); + int delim = line.indexOf(":"); + if (prefix == -1 || delim == -1 || delim < prefix) { + continue; + } + String field = line.substring(prefix + 1, delim); + if (field.equals("text")){ + text.append(line.substring(delim + 2)); + readingText = true; + } else if (field.equals("productId")) { + parsedReview.setProductId(line.substring(delim + 2)); + } else if (field.equals("helpfulness")) { + parsedReview.setHelpfulness(line.substring(delim + 2)); + } else if (field.equals("score")) { + parsedReview.setScore(line.substring(delim + 2)); } } - return review_fields; + parsedReview.setText(text.toString()); + return parsedReview; } } diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index ec4567e..a55f1f4 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -29,7 +29,7 @@ public Enumeration vectorSpaceSearch(Enumeration query, int k) fullQuery.put(token, 1); } } - HashMap queryScores = this.computeTokenQueryScore(fullQuery); + HashMap queryScores = this.computeTokenQueryScore(fullQuery); HashMap scores= new HashMap<>(); for (String token: fullQuery.keySet()){ @@ -83,13 +83,13 @@ public Collection productSearch(Enumeration query, int k) { return null; } - private HashMap computeTokenQueryScore(HashMap query) { - HashMap scores = new HashMap<>(); + private HashMap computeTokenQueryScore(HashMap query) { + HashMap scores = new HashMap<>(); // compute the tf and idf values of every token: for (String token: query.keySet()) { - float tf = (float) (1 + Math.log10(query.get(token))); - float df = (float) Math.log10(ir.getNumberOfReviews() / ir.getTokenFrequency(token)); + double tf = 1 + Math.log10(query.get(token)); + double df = Math.log10((double) ir.getNumberOfReviews() / ir.getTokenFrequency(token)); scores.put(token, tf*df); } @@ -101,7 +101,7 @@ private HashMap computeTokenQueryScore(HashMap q // normalize the values by dividing in the vector's norm: for (String token: scores.keySet()){ - float normalizedScore = (float) (scores.get(token) / vectorNorm); + double normalizedScore = scores.get(token) / vectorNorm; scores.put(token, 
normalizedScore); } return scores; From 0390496899fafcd0b0e3360ceae65a253e10d1f7 Mon Sep 17 00:00:00 2001 From: nirnts Date: Thu, 10 Jun 2021 12:27:03 +0300 Subject: [PATCH 06/21] IndexWriter --- src/webdata/IndexWriter.java | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index ec174cb..6bba144 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -8,7 +8,7 @@ public class IndexWriter { private TreeMap> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review. private TreeMap> productIds; - private TreeMap> reviewIds; + private LinkedList> reviewIds; private String dir; private static final String PRODUCT_INDEX_FILE = "product_index.txt"; @@ -65,18 +65,23 @@ private void createDicts(String inputFile){ tokenDict = new TreeMap<>(); reviewIds = new TreeMap<>(); - DataParser dataParser = null; + DataLoader dataLoader = null; + DataParser dataParser = new DataParser(); try { - dataParser = new DataParser(inputFile); + dataLoader = new DataLoader(inputFile); } catch (IOException e) { + e.printStackTrace(); System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } - - for (int i = 0; i < dataParser.allReviews.size(); i++) { - addProductId(dataParser.allReviews.get(i).get("productId"), i + 1); + int i=1; + int readTokens = 0; + for (ArrayList s: dataLoader){ + DataParser.Review review = dataParser.parseReview(s); + addProductId(review.getProductId(), i + 1); int length = addReviewText(dataParser.allReviews.get(i).get("text"), i + 1); - addReviewId(dataParser.allReviews.get(i), i, length); + addReviewId(review, i, length); + i++; } } @@ -129,14 +134,16 @@ private void addProductId(String productId, int reviewId) { /** * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. 
*/ - private void addReviewId(HashMap review, int reviewId, int length) { - reviewIds.put(reviewId, new ArrayList<>()); + private void addReviewId(DataParser.Review review, int reviewId, int length) { + ArrayList vals = new ArrayList<>(); + // 0 - productId, 1 - score, 2 - helpfulness, 3 - length - for (String field : DataParser.INTEREST_FIELDS) { - if (field.equals("text")) { continue; } - reviewIds.get(reviewId).add(review.get(field)); - } - reviewIds.get(reviewId).add(String.valueOf(length)); + vals.add(review.getProductId()); + vals.add(review.getScore()); + vals.add(review.getHelpfulness()); + vals.add(String.valueOf(length)); + + reviewIds.add(vals); } /** From 6164db5567318c18fc8025ae00b13f87aea585ac Mon Sep 17 00:00:00 2001 From: nirnts Date: Thu, 10 Jun 2021 12:33:07 +0300 Subject: [PATCH 07/21] IndexWriter 2 --- src/webdata/IndexWriter.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 6bba144..7817374 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -63,7 +63,7 @@ private void createDir(){ private void createDicts(String inputFile){ productIds = new TreeMap<>(); tokenDict = new TreeMap<>(); - reviewIds = new TreeMap<>(); + reviewIds = new LinkedList<>(); DataLoader dataLoader = null; DataParser dataParser = new DataParser(); @@ -79,8 +79,9 @@ private void createDicts(String inputFile){ for (ArrayList s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i + 1); - int length = addReviewText(dataParser.allReviews.get(i).get("text"), i + 1); + int length = addReviewText(review.getText(), i); addReviewId(review, i, length); + readTokens += length; i++; } } @@ -187,11 +188,16 @@ private void createTokenIndex(){ */ private void createReviewIndex() { // Revise the review dictionary to the correct structure & change productIDs to product index - LinkedList> dictValues = new LinkedList<>(); - for (int review : reviewIds.keySet()) { - ArrayList vals = reviewIds.get(review); + ArrayList> dictValues = new ArrayList<>(); + HashMap productDict = new HashMap<>(productIds.size()); + int i = 0; + for (String productId: productIds.keySet()){ + productDict.put(productId, i); + i++; + } + for (ArrayList vals : reviewIds) { ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); - new_vals.set(ReviewIndex.PRODUCTID_INDEX, productIds.headMap(vals.get(0)).size()); + new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); String[] helpf = vals.get(2).split("/"); new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); From b51afbe6801cefedebbb86ec972df66278e56537 Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 13:58:19 +0300 Subject: [PATCH 08/21] Fixed index-writer --- src/webdata/IndexWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 7817374..43e5644 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -23,8 +23,8 @@ public class IndexWriter { */ public void write(String inputFile, String dir) { this.dir = dir; - createDicts(inputFile); createDir(); + createDicts(inputFile); createProductIndex(); createTokenIndex(); createReviewIndex(); @@ -78,7 +78,7 @@ private void createDicts(String inputFile){ int readTokens = 0; for (ArrayList s: dataLoader){ DataParser.Review review 
= dataParser.parseReview(s); - addProductId(review.getProductId(), i + 1); + addProductId(review.getProductId(), i); int length = addReviewText(review.getText(), i); addReviewId(review, i, length); readTokens += length; From 056742c81998011402fe109b2af721bd42f02278 Mon Sep 17 00:00:00 2001 From: nirnts Date: Thu, 10 Jun 2021 14:00:59 +0300 Subject: [PATCH 09/21] comparator --- src/webdata/ReviewSearch.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index a55f1f4..13d0011 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -67,10 +67,13 @@ public Enumeration languageModelSearch(Enumeration query,double double smooth = (1 - lambda) * (double) ir.getTokenCollectionFrequency(token) / ir.getTokenSizeOfReviews(); HashMap tokenScores = getDocScores(token, "languageModel"); for (Map.Entry ent : tokenScores.entrySet()) { - double val = lambda * ent.getValue() + smooth; - scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); +// double val = lambda * ent.getValue() + smooth; + double val = Math.log(lambda * ent.getValue() + smooth); +// scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); + scores.merge(ent.getKey(), val * toks, (x, y) -> x + val); } } + scores.replaceAll((key, v) -> Math.exp(v)); return kHighestScores(scores, k); } @@ -130,6 +133,14 @@ private HashMap getDocScores(String token, String model) { private Enumeration kHighestScores(HashMap scores, int k){ List> list = new ArrayList<>(scores.entrySet()); list.sort(Map.Entry.comparingByValue()); + list.sort((x, y) -> { + int cmp = x.getValue().compareTo(y.getValue()); + if (cmp == 0) { + return x.getKey().compareTo(y.getKey()); + } else { + return cmp; + } + }); ArrayList result = new ArrayList<>(); for (int i = 0; i < Math.min(k, list.size()); i++) { result.add(list.get(list.size() - i - 1).getKey()); From ce87d87faa9d185888177b0cecff24fb4893a69e Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 14:13:52 +0300 Subject: [PATCH 10/21] passing vector model test --- src/webdata/ReviewSearch.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 13d0011..7f3283d 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -132,9 +132,8 @@ private HashMap getDocScores(String token, String model) { private Enumeration kHighestScores(HashMap scores, int k){ List> list = new ArrayList<>(scores.entrySet()); - list.sort(Map.Entry.comparingByValue()); list.sort((x, y) -> { - int cmp = x.getValue().compareTo(y.getValue()); + int cmp = y.getValue().compareTo(x.getValue()); if (cmp == 0) { return x.getKey().compareTo(y.getKey()); } else { @@ -143,7 +142,7 @@ private Enumeration kHighestScores(HashMap scores, int }); ArrayList result = new ArrayList<>(); for (int i = 0; i < Math.min(k, list.size()); i++) { - result.add(list.get(list.size() - i - 1).getKey()); + result.add(list.get(i).getKey()); } return Collections.enumeration(result); } From 59e70eddfe7e8c297af43a97bf216203fadf305e Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 14:55:38 +0300 Subject: [PATCH 11/21] language model smooth fix --- src/webdata/ReviewSearch.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 7f3283d..bf36cd6 100644 --- a/src/webdata/ReviewSearch.java +++ 
b/src/webdata/ReviewSearch.java @@ -67,13 +67,13 @@ public Enumeration languageModelSearch(Enumeration query,double double smooth = (1 - lambda) * (double) ir.getTokenCollectionFrequency(token) / ir.getTokenSizeOfReviews(); HashMap tokenScores = getDocScores(token, "languageModel"); for (Map.Entry ent : tokenScores.entrySet()) { -// double val = lambda * ent.getValue() + smooth; - double val = Math.log(lambda * ent.getValue() + smooth); -// scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); - scores.merge(ent.getKey(), val * toks, (x, y) -> x + val); + double val = lambda * ent.getValue() + smooth; +// double val = Math.log(lambda * ent.getValue() + smooth); + scores.merge(ent.getKey(), val * Math.pow(smooth, toks-1), (x, y) -> x*val); +// scores.merge(ent.getKey(), val * toks, (x, y) -> x + val); } } - scores.replaceAll((key, v) -> Math.exp(v)); +// scores.replaceAll((key, v) -> Math.exp(v)); return kHighestScores(scores, k); } From 4dc3f1fc4671110b100f339637fdd0bf6acee1af Mon Sep 17 00:00:00 2001 From: nirnts Date: Thu, 10 Jun 2021 16:02:49 +0300 Subject: [PATCH 12/21] Passed ReviewSearch tests --- src/webdata/ReviewSearch.java | 44 +++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 13d0011..a912e47 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -59,21 +59,42 @@ public Enumeration vectorSpaceSearch(Enumeration query, int k) * The list should be sorted by the ranking */ public Enumeration languageModelSearch(Enumeration query,double lambda, int k) { - HashMap scores= new HashMap<>(); - int toks = 0; + HashMap scores = new HashMap<>(); + double total_smooth = 1; while (query.hasMoreElements()) { String token = query.nextElement(); - toks++; double smooth = (1 - lambda) * (double) ir.getTokenCollectionFrequency(token) / ir.getTokenSizeOfReviews(); HashMap tokenScores = getDocScores(token, "languageModel"); - for (Map.Entry ent : tokenScores.entrySet()) { -// double val = lambda * ent.getValue() + smooth; - double val = Math.log(lambda * ent.getValue() + smooth); -// scores.merge(ent.getKey(), Math.pow(val, toks), (x, y) -> x*val); - scores.merge(ent.getKey(), val * toks, (x, y) -> x + val); + + // Update existing keys + for (Map.Entry ent : scores.entrySet()) { + if (tokenScores.containsKey(ent.getKey())){ + scores.put(ent.getKey(), ent.getValue() * (lambda * tokenScores.get(ent.getKey()) + smooth)); + } else { + scores.put(ent.getKey(), ent.getValue() * smooth); + } + } + Set tokenScoresKeys = tokenScores.keySet(); + tokenScoresKeys.removeAll(scores.keySet()); + + // Add remaining, new keys + for (int key : tokenScoresKeys) { + scores.put(key, (lambda * tokenScores.get(key) + smooth) * total_smooth); } + total_smooth *= smooth; } - scores.replaceAll((key, v) -> Math.exp(v)); + + // If k is larger than the number of results to the query, append the lowest reviewId's +// int diff = k - scores.size(); +// for (int i=1; i <= ir.getNumberOfReviews(); i++) { +// if (diff <= 0) { +// break; +// } +// if (!scores.containsKey(i)) { +// scores.put(i, total_smooth); +// diff--; +// } +// } return kHighestScores(scores, k); } @@ -132,9 +153,8 @@ private HashMap getDocScores(String token, String model) { private Enumeration kHighestScores(HashMap scores, int k){ List> list = new ArrayList<>(scores.entrySet()); - list.sort(Map.Entry.comparingByValue()); list.sort((x, y) -> { - int cmp = x.getValue().compareTo(y.getValue()); + int cmp = 
y.getValue().compareTo(x.getValue()); if (cmp == 0) { return x.getKey().compareTo(y.getKey()); } else { @@ -143,7 +163,7 @@ private Enumeration kHighestScores(HashMap scores, int }); ArrayList result = new ArrayList<>(); for (int i = 0; i < Math.min(k, list.size()); i++) { - result.add(list.get(list.size() - i - 1).getKey()); + result.add(list.get(i).getKey()); } return Collections.enumeration(result); } From e21da2124af167ed573c6f857595e8cd9e8c490f Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 16:16:31 +0300 Subject: [PATCH 13/21] languageModel & vectorModel after tests --- src/webdata/ReviewSearch.java | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index a912e47..7037e4d 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -83,18 +83,6 @@ public Enumeration languageModelSearch(Enumeration query,double } total_smooth *= smooth; } - - // If k is larger than the number of results to the query, append the lowest reviewId's -// int diff = k - scores.size(); -// for (int i=1; i <= ir.getNumberOfReviews(); i++) { -// if (diff <= 0) { -// break; -// } -// if (!scores.containsKey(i)) { -// scores.put(i, total_smooth); -// diff--; -// } -// } return kHighestScores(scores, k); } From 4409d45040636c9b89965d14c9b7cac7bbae2537 Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 10 Jun 2021 18:44:21 +0300 Subject: [PATCH 14/21] Added productSeardch --- src/webdata/ReviewSearch.java | 43 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 7037e4d..921fad4 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -1,5 +1,6 @@ package webdata; +import javax.lang.model.type.ArrayType; import java.util.*; import java.lang.Math.*; @@ -48,10 +49,6 @@ public Enumeration vectorSpaceSearch(Enumeration query, int k) return kHighestScores(scores, k); } - - - - /** * Returns a list of the id-s of the k most highly ranked reviews for the * given query, using the language model ranking function, smoothed using a @@ -92,7 +89,35 @@ public Enumeration languageModelSearch(Enumeration query,double * The list should be sorted by the ranking */ public Collection productSearch(Enumeration query, int k) { - return null; + Enumeration relevantReviews = this.vectorSpaceSearch(query, this.ir.getNumberOfReviews()); + HashMap>> products = new HashMap<>(); + int reviewRank = 1; + while (relevantReviews.hasMoreElements()) { + int reviewId = relevantReviews.nextElement(); + String productId = ir.getProductId(reviewId); + if (!products.containsKey(productId)){ + products.put(productId, new ArrayList<>()); + } + products.get(productId).add(new ArrayList<>(Arrays.asList(reviewId, reviewRank))); + reviewRank++; + } + HashMap productRelevance = new HashMap<>(); + for (Map.Entry>> product: products.entrySet()){ + productRelevance.put(product.getKey() ,this.getProductRelevance(product.getValue())); + } + + HashMap productQuality = new HashMap<>(); + for (String product: products.keySet()){ + productQuality.put(this.getProductQuality(product)); + } + + double alpha = 0.5; + HashMap productScores = new HashMap<>(); + for (String product: productRelevance.keySet()){ + productScores.put(product, alpha*productRelevance.get(product) + (1-alpha)*productQuality.get(product)); + } + Enumeration topProducts = kHighestScores(productScores, k); + return Collections.list(topProducts); } private 
HashMap computeTokenQueryScore(HashMap query) { @@ -139,8 +164,8 @@ private HashMap getDocScores(String token, String model) { return scores; } - private Enumeration kHighestScores(HashMap scores, int k){ - List> list = new ArrayList<>(scores.entrySet()); + private > Enumeration kHighestScores(HashMap scores, int k){ + List> list = new ArrayList<>(scores.entrySet()); list.sort((x, y) -> { int cmp = y.getValue().compareTo(x.getValue()); if (cmp == 0) { @@ -149,7 +174,7 @@ private Enumeration kHighestScores(HashMap scores, int return cmp; } }); - ArrayList result = new ArrayList<>(); + ArrayList result = new ArrayList<>(); for (int i = 0; i < Math.min(k, list.size()); i++) { result.add(list.get(i).getKey()); } @@ -163,6 +188,6 @@ public static void main(String[] args) { IndexReader ir = new IndexReader(dir); ReviewSearch rs = new ReviewSearch(ir); - rs.languageModelSearch(Collections.enumeration(Arrays.asList("what", "the", "hell")), 0.4, 10); +// rs.productSearch(Collections.enumeration(Arrays.asList("dog")), 10); } } \ No newline at end of file From 75c1d5b902d2d543232cc8e50ac0342ec6db13f3 Mon Sep 17 00:00:00 2001 From: nirnts Date: Sat, 19 Jun 2021 02:43:42 +0300 Subject: [PATCH 15/21] started getProductRelevance + small changes --- src/webdata/ReviewSearch.java | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 921fad4..421f221 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -108,7 +108,7 @@ public Collection productSearch(Enumeration query, int k) { HashMap productQuality = new HashMap<>(); for (String product: products.keySet()){ - productQuality.put(this.getProductQuality(product)); + productQuality.put(product, this.getProductQuality(product)); } double alpha = 0.5; @@ -120,6 +120,22 @@ public Collection productSearch(Enumeration query, int k) { return Collections.list(topProducts); } + private double getProductRelevance(ArrayList> reviews) { + + for (ArrayList vals : reviews) { + int reviewScore = ir.getReviewScore(vals.get(0)); + int reviewHelpfulnessNumerator = ir.getReviewHelpfulnessNumerator(vals.get(0)); + int reviewHelpfulnessDenominator = ir.getReviewHelpfulnessDenominator(vals.get(0)); + int reviewLength = ir.getReviewLength(vals.get(0)); + } + + return 1; + } + + private double getProductQuality(String p) { + return 1; + } + private HashMap computeTokenQueryScore(HashMap query) { HashMap scores = new HashMap<>(); @@ -188,6 +204,6 @@ public static void main(String[] args) { IndexReader ir = new IndexReader(dir); ReviewSearch rs = new ReviewSearch(ir); -// rs.productSearch(Collections.enumeration(Arrays.asList("dog")), 10); + rs.productSearch(Collections.enumeration(Arrays.asList("dog")), 10); } } \ No newline at end of file From 14f6e59369c864df225944e7601f134c242e7c00 Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 20 Jun 2021 10:27:31 +0300 Subject: [PATCH 16/21] Added product quality --- src/webdata/ReviewSearch.java | 54 +++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 421f221..2d6be47 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -133,7 +133,57 @@ private double getProductRelevance(ArrayList> reviews) { } private double getProductQuality(String p) { - return 1; + int REVIEW_SCORE = 0; + int REVIEW_LENGTH = 1; + int REVIEW_NUMERATOR = 2; + int REVIEW_DENOMINATOR = 3; + double 
ALPHA = 0.5; + + // todo: use score, helpfulness and length + Enumeration productReviews = this.ir.getProductReviews(p); + // HashMap where each key is a review id and the values are an array list of [score, length, helpfulness numerator, helpfulness denominator] + HashMap> reviewsInfo = new HashMap<>(); + + // Save the length longest review and greatest denominator value for normalization: + int maxLength = 0; + int maxDenominator = 0; + + // Extract all necessary information from all reviews: + while (productReviews.hasMoreElements()) { + int reviewId = productReviews.nextElement(); + int reviewScore = this.ir.getReviewScore(reviewId); + int numerator = this.ir.getReviewHelpfulnessNumerator(reviewId); + int denominator = this.ir.getReviewHelpfulnessDenominator(reviewId); + int length = this.ir.getReviewLength(reviewId); + + // check that denominator is larger than numerator, if not swap: + if (numerator > denominator){ + numerator = denominator; + denominator = this.ir.getReviewHelpfulnessNumerator(reviewId); + } + + if (length > maxLength){ + maxLength = length; + } + if (denominator > maxDenominator){ + maxDenominator = denominator; + } + + reviewsInfo.put(reviewId, new ArrayList<>(Arrays.asList(reviewScore, length, numerator, denominator))); + } + + double productQuality = 0; + for (ArrayList review: reviewsInfo.values()){ + double helpfulness; + if (review.get(REVIEW_DENOMINATOR) > 0) { + double normalizedDenominator = (double) review.get(REVIEW_DENOMINATOR) / maxDenominator; + helpfulness = (double) (review.get(REVIEW_NUMERATOR) / review.get(REVIEW_DENOMINATOR)) * normalizedDenominator; + } else {helpfulness = 0.05;} + double normalizedLength = (double) review.get(REVIEW_LENGTH) / maxLength; + double totalReviewScore = (ALPHA * normalizedLength + (1 - ALPHA) * helpfulness) * review.get(REVIEW_SCORE); + productQuality += (totalReviewScore / reviewsInfo.size()); + } + return productQuality; } private HashMap computeTokenQueryScore(HashMap query) { @@ -204,6 +254,6 @@ public static void main(String[] args) { IndexReader ir = new IndexReader(dir); ReviewSearch rs = new ReviewSearch(ir); - rs.productSearch(Collections.enumeration(Arrays.asList("dog")), 10); + rs.productSearch(Collections.enumeration(Arrays.asList("pop", "tart", "tarts")), 10); } } \ No newline at end of file From 7669af7fbd63f06d29ddabeb3f07c6851a8e811b Mon Sep 17 00:00:00 2001 From: Nir Nitskansky Date: Sun, 20 Jun 2021 11:46:47 +0300 Subject: [PATCH 17/21] ReviewSearch changes --- src/webdata/ReviewSearch.java | 37 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 2d6be47..3a903d8 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -122,25 +122,15 @@ public Collection productSearch(Enumeration query, int k) { private double getProductRelevance(ArrayList> reviews) { - for (ArrayList vals : reviews) { - int reviewScore = ir.getReviewScore(vals.get(0)); - int reviewHelpfulnessNumerator = ir.getReviewHelpfulnessNumerator(vals.get(0)); - int reviewHelpfulnessDenominator = ir.getReviewHelpfulnessDenominator(vals.get(0)); - int reviewLength = ir.getReviewLength(vals.get(0)); - } - return 1; } - private double getProductQuality(String p) { + private HashMap reviewNormalizedScore(Enumeration productReviews) { int REVIEW_SCORE = 0; int REVIEW_LENGTH = 1; int REVIEW_NUMERATOR = 2; int REVIEW_DENOMINATOR = 3; double ALPHA = 0.5; - - // todo: use score, helpfulness and length - 
Enumeration productReviews = this.ir.getProductReviews(p); // HashMap where each key is a review id and the values are an array list of [score, length, helpfulness numerator, helpfulness denominator] HashMap> reviewsInfo = new HashMap<>(); @@ -172,16 +162,25 @@ private double getProductQuality(String p) { reviewsInfo.put(reviewId, new ArrayList<>(Arrays.asList(reviewScore, length, numerator, denominator))); } - double productQuality = 0; - for (ArrayList review: reviewsInfo.values()){ + HashMap ret = new HashMap<>(); + for (Map.Entry> entry: reviewsInfo.entrySet()){ double helpfulness; - if (review.get(REVIEW_DENOMINATOR) > 0) { - double normalizedDenominator = (double) review.get(REVIEW_DENOMINATOR) / maxDenominator; - helpfulness = (double) (review.get(REVIEW_NUMERATOR) / review.get(REVIEW_DENOMINATOR)) * normalizedDenominator; + if (entry.getValue().get(REVIEW_DENOMINATOR) > 0) { + double normalizedDenominator = (double) entry.getValue().get(REVIEW_DENOMINATOR) / maxDenominator; + helpfulness = (double) (entry.getValue().get(REVIEW_NUMERATOR) / entry.getValue().get(REVIEW_DENOMINATOR)) * normalizedDenominator; } else {helpfulness = 0.05;} - double normalizedLength = (double) review.get(REVIEW_LENGTH) / maxLength; - double totalReviewScore = (ALPHA * normalizedLength + (1 - ALPHA) * helpfulness) * review.get(REVIEW_SCORE); - productQuality += (totalReviewScore / reviewsInfo.size()); + double normalizedLength = (double) entry.getValue().get(REVIEW_LENGTH) / maxLength; + double totalReviewScore = (ALPHA * normalizedLength + (1 - ALPHA) * helpfulness) * entry.getValue().get(REVIEW_SCORE); + ret.put(entry.getKey(), totalReviewScore); + } + return ret; + } + + double productQuality = 0; + private double getProductQuality(String p) { + HashMap normalizedScores = reviewNormalizedScore(this.ir.getProductReviews(p)); + for (Double nscore: normalizedScores.values()){ + productQuality += (nscore / normalizedScores.size()); } return productQuality; } From 87d21e5167caae19fa7b3051f5be81c759933a54 Mon Sep 17 00:00:00 2001 From: Nir Nitskansky Date: Sun, 20 Jun 2021 16:37:43 +0300 Subject: [PATCH 18/21] productRelevance --- src/webdata/ReviewSearch.java | 60 ++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 3a903d8..bd7daa3 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -101,17 +101,17 @@ public Collection productSearch(Enumeration query, int k) { products.get(productId).add(new ArrayList<>(Arrays.asList(reviewId, reviewRank))); reviewRank++; } - HashMap productRelevance = new HashMap<>(); - for (Map.Entry>> product: products.entrySet()){ - productRelevance.put(product.getKey() ,this.getProductRelevance(product.getValue())); - } + HashMap productRelevance = getProductRelevance(products); +// for (Map.Entry>> product: products.entrySet()){ +// productRelevance.put(product.getKey() ,this.getProductRelevance(product.getValue())); +// } HashMap productQuality = new HashMap<>(); for (String product: products.keySet()){ productQuality.put(product, this.getProductQuality(product)); } - double alpha = 0.5; + double alpha = 0.9; HashMap productScores = new HashMap<>(); for (String product: productRelevance.keySet()){ productScores.put(product, alpha*productRelevance.get(product) + (1-alpha)*productQuality.get(product)); @@ -120,12 +120,39 @@ public Collection productSearch(Enumeration query, int k) { return Collections.list(topProducts); } - private double 
getProductRelevance(ArrayList> reviews) { + private HashMap getProductRelevance(HashMap>> products) { + HashMap nrmlzd = reviewNormalized(new Enumeration() { + Iterator>> it = products.values().iterator(); + Iterator> cur = null; + @Override + public boolean hasMoreElements() { + if (!it.hasNext()) { + return cur != null && cur.hasNext(); + } + return true; + } + + @Override + public Integer nextElement() { + if (cur == null || !cur.hasNext()) { + cur = it.next().iterator(); + } + return cur.next().get(0); + } + }, false); - return 1; + HashMap ret = new HashMap<>(); + for (Map.Entry>> ent : products.entrySet()) { + double score = 0; + for (ArrayList review : ent.getValue()) { + score += (1 / (double) review.get(1)) * nrmlzd.get(review.get(0)) * 5; + } + ret.put(ent.getKey(), score); + } + return ret; } - private HashMap reviewNormalizedScore(Enumeration productReviews) { + private HashMap reviewNormalized(Enumeration productReviews, boolean score) { int REVIEW_SCORE = 0; int REVIEW_LENGTH = 1; int REVIEW_NUMERATOR = 2; @@ -167,18 +194,22 @@ private HashMap reviewNormalizedScore(Enumeration prod double helpfulness; if (entry.getValue().get(REVIEW_DENOMINATOR) > 0) { double normalizedDenominator = (double) entry.getValue().get(REVIEW_DENOMINATOR) / maxDenominator; - helpfulness = (double) (entry.getValue().get(REVIEW_NUMERATOR) / entry.getValue().get(REVIEW_DENOMINATOR)) * normalizedDenominator; + helpfulness = (entry.getValue().get(REVIEW_NUMERATOR) / (double) entry.getValue().get(REVIEW_DENOMINATOR)) * normalizedDenominator; } else {helpfulness = 0.05;} double normalizedLength = (double) entry.getValue().get(REVIEW_LENGTH) / maxLength; - double totalReviewScore = (ALPHA * normalizedLength + (1 - ALPHA) * helpfulness) * entry.getValue().get(REVIEW_SCORE); + double totalReviewScore = (ALPHA * normalizedLength + (1 - ALPHA) * helpfulness); + if (score) { + totalReviewScore *= entry.getValue().get(REVIEW_SCORE); + } ret.put(entry.getKey(), totalReviewScore); } return ret; } - double productQuality = 0; + private double getProductQuality(String p) { - HashMap normalizedScores = reviewNormalizedScore(this.ir.getProductReviews(p)); + double productQuality = 0; + HashMap normalizedScores = reviewNormalized(this.ir.getProductReviews(p), true); for (Double nscore: normalizedScores.values()){ productQuality += (nscore / normalizedScores.size()); } @@ -247,12 +278,11 @@ private > Enumeration kHighestScores(HashMap Date: Sun, 20 Jun 2021 17:01:20 +0300 Subject: [PATCH 19/21] another one --- src/webdata/ReviewSearch.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index bd7daa3..72fea31 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -283,6 +283,9 @@ public static void main(String[] args) { // iw.write("./1000.txt", dir); IndexReader ir = new IndexReader(dir); ReviewSearch rs = new ReviewSearch(ir); - rs.productSearch(Collections.enumeration(Arrays.asList("tart", "pop")), 10); + Enumeration asd = rs.vectorSpaceSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 10); + Enumeration asd2 = rs.languageModelSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 0.6, 10); + Collection asd3 = rs.productSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 10); + } } \ No newline at end of file From d8731378379d054485790c8872e4590c92f96b62 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 21 Jun 2021 11:09:31 +0300 Subject: 
[PATCH 20/21] added log to productQuality --- src/webdata/ReviewSearch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 72fea31..92194ce 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -213,7 +213,7 @@ private double getProductQuality(String p) { for (Double nscore: normalizedScores.values()){ productQuality += (nscore / normalizedScores.size()); } - return productQuality; + return productQuality * Math.log(normalizedScores.size()+1); } private HashMap computeTokenQueryScore(HashMap query) { From 1f9047b9a61c8231ed27198627de0ce1db023380 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 21 Jun 2021 15:37:21 +0300 Subject: [PATCH 21/21] final code --- src/webdata/IndexWriter.java | 7 ------- src/webdata/ReviewSearch.java | 17 +---------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 43e5644..76997a6 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -229,11 +229,4 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } - - public static void main(String[] args) { - String input_file = "./1000.txt"; - String dir = "./Data_Index"; - IndexWriter ir = new IndexWriter(); - ir.write(input_file, dir); - } } \ No newline at end of file diff --git a/src/webdata/ReviewSearch.java b/src/webdata/ReviewSearch.java index 92194ce..d458572 100644 --- a/src/webdata/ReviewSearch.java +++ b/src/webdata/ReviewSearch.java @@ -102,16 +102,13 @@ public Collection productSearch(Enumeration query, int k) { reviewRank++; } HashMap productRelevance = getProductRelevance(products); -// for (Map.Entry>> product: products.entrySet()){ -// productRelevance.put(product.getKey() ,this.getProductRelevance(product.getValue())); -// } HashMap productQuality = new HashMap<>(); for (String product: products.keySet()){ productQuality.put(product, this.getProductQuality(product)); } - double alpha = 0.9; + double alpha = 0.7; HashMap productScores = new HashMap<>(); for (String product: productRelevance.keySet()){ productScores.put(product, alpha*productRelevance.get(product) + (1-alpha)*productQuality.get(product)); @@ -276,16 +273,4 @@ private > Enumeration kHighestScores(HashMap asd = rs.vectorSpaceSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 10); - Enumeration asd2 = rs.languageModelSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 0.6, 10); - Collection asd3 = rs.productSearch(Collections.enumeration(Arrays.asList("aasjdjkasdjkasjkdhasjkd")), 10); - - } } \ No newline at end of file
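
End-to-end usage of the classes touched in this series, for reference,
since the final patch removes the demo main() methods. The input file,
index directory and query terms below are illustrative only; java.util.*
and the webdata package are assumed to be available:

    IndexWriter iw = new IndexWriter();
    iw.write("./1000.txt", "./Data_Index");            // build the on-disk index
    IndexReader ir = new IndexReader("./Data_Index");
    ReviewSearch rs = new ReviewSearch(ir);
    // review ids ranked with the lnn.ltc vector space model
    Enumeration<Integer> topReviews = rs.vectorSpaceSearch(
            Collections.enumeration(Arrays.asList("pop", "tart")), 10);
    // product ids ranked by combined relevance and quality
    Collection<String> topProducts = rs.productSearch(
            Collections.enumeration(Arrays.asList("pop", "tart")), 10);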