From b24ee9096fd366bf2d3028a434774bacc89f3e69 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 3 May 2021 15:16:10 +0300 Subject: [PATCH 01/55] Ex2 first commit --- src/webdata/{SlowIndexWriter.java => IndexWriter.java} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/webdata/{SlowIndexWriter.java => IndexWriter.java} (99%) diff --git a/src/webdata/SlowIndexWriter.java b/src/webdata/IndexWriter.java similarity index 99% rename from src/webdata/SlowIndexWriter.java rename to src/webdata/IndexWriter.java index 0833cb2..569285d 100644 --- a/src/webdata/SlowIndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -5,7 +5,7 @@ import java.nio.file.Path; import java.util.*; -public class SlowIndexWriter { +public class IndexWriter { private TreeMap> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review. private TreeMap> productIds; private TreeMap> reviewIds; From aa21bccaef1eb7eed79c8edae05f1344cac44427 Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 3 May 2021 17:21:22 +0300 Subject: [PATCH 02/55] DataLoader added. --- src/webdata/DataLoader.java | 95 +++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/webdata/DataLoader.java diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java new file mode 100644 index 0000000..ad20537 --- /dev/null +++ b/src/webdata/DataLoader.java @@ -0,0 +1,95 @@ +package webdata; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +public class DataLoader implements Iterable { + private BufferedReader br; + private StringBuilder stringBuffer; + + public DataLoader(String inputFile) throws FileNotFoundException { + br = new BufferedReader(new FileReader(inputFile)); + stringBuffer = new StringBuilder(); + } + + public String readSingleReview() { + String line; + try { + while((line = br.readLine()) != null) { + if (line.contains("product/productId") && stringBuffer.length() != 0) { + String ret = stringBuffer.toString(); + stringBuffer = new StringBuilder(line); + return ret; + } + stringBuffer.append(line); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return stringBuffer.toString(); + } + + public List readMultipleReviews(int num) { + LinkedList ret = new LinkedList<>(); + for (int i = 0; i < num; i++) { + ret.add(readSingleReview()); + } + return ret; + } + + public Iterator iterator() { + return new Iterator<>() { + + private int currentIndex = 0; + + @Override + public boolean hasNext(){ + try { + return br.ready(); + } catch (IOException e) { + return false; + } + } + + @Override + public String next() { + return readSingleReview(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + +// public static void main(String[] args) { +// DataLoader dl = null; +// try { +// dl = new DataLoader("./100.txt"); +// } catch (FileNotFoundException e) { +// e.printStackTrace(); +// System.exit(1); +// } +// +// for (int i = 0; i < 5; i++) { +// String s = dl.readSingleReview(); +// System.out.println(s); +// } +// +// ArrayList readd = new ArrayList<>(dl.readMultipleReviews(10)); +// System.out.println(readd.size()); +// +// for (String s : dl) { +// readd.add(s); +// } +// System.out.println(readd.size()); +// } +} From 
aa14759e2b57397029a89348c30873f348a6af4e Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 3 May 2021 18:04:19 +0300 Subject: [PATCH 03/55] updated data parser --- src/webdata/DataParser.java | 117 ++++++++++++++++++++++++++--------- src/webdata/IndexWriter.java | 48 +++++++++----- 2 files changed, 120 insertions(+), 45 deletions(-) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 4df436b..4ba3dbe 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -5,8 +5,46 @@ public class DataParser { - ArrayList> allReviews = new ArrayList<>(); - public static final List INTEREST_FIELDS = Arrays.asList("productId", "score", "helpfulness", "text"); + public class Review{ + private String text; + private String productId; + private String score; + private String helpfulness; + + public String getText() { + return text; + } + + public String getProductId() { + return productId; + } + + public String getHelpfulness() { + return helpfulness; + } + + public String getScore() { + return score; + } + + public void setHelpfulness(String helpfulness) { + this.helpfulness = helpfulness; + } + + public void setProductId(String productId) { + this.productId = productId; + } + + public void setScore(String score) { + this.score = score; + } + + public void setText(String text) { + this.text = text; + } + } + +// public static final List INTEREST_FIELDS = Arrays.asList("productId", "score", "helpfulness", "text"); /** @@ -14,43 +52,66 @@ public class DataParser { * of the review, i.e: productId->value, score->value, helpfulness->value, text->value. * inputFile is the path to the file containing the review data */ - public DataParser(String inputFile) throws IOException { - BufferedReader br = new BufferedReader(new FileReader(inputFile)); - String line; - StringBuilder review = new StringBuilder(); - while((line = br.readLine()) != null) { - if (line.contains("product/productId")){ - if (!review.toString().equals("")){ - allReviews.add(parse_review(review.toString())); - } - review = new StringBuilder(line); - } - else{ - review.append(line); - } +// public DataParser(String inputFile) throws IOException { +// allReviews.add(parse_review(review.toString())); // add the last review +// } + + public List parseData(List rawReviews){ + LinkedList allReviews = new LinkedList<>(); + for (String review: rawReviews){ + allReviews.add(parseReview(review)); } - allReviews.add(parse_review(review.toString())); // add the last review + return allReviews; } /** - * Given a single review, parse the review and return a hash table containing only the relevant fields of the - * review, i.e: productId, score, helpfulness, text. - * @param review: the review that should be parsed. - * @return a hash table where the keys are the relevant fields mentioned above and their corresponding values. + * Given a single review, parse the review and return a Review object, containing all relevant information from the + * given review, i.e. productId, score, helpfulness and text. 
*/ - private static HashMap parse_review(String review){ + public Review parseReview(String review){ List fields = Arrays.asList(review.split("review/")); - HashMap review_fields = new HashMap(); + Review parsedReview = new Review(); - review_fields.put("productId", fields.get(0).split(": ")[1].split("product/")[0]); + parsedReview.setProductId(fields.get(0).split(": ")[1].split("product/")[0]); for (int i=1; i field_value = Arrays.asList(field.split(": ")); - if (INTEREST_FIELDS.contains(field_value.get(0))) { - review_fields.put(field_value.get(0), String.join(":", field_value.subList(1, field_value.size()))); + List fieldValue = Arrays.asList(field.split(": ")); + switch (fieldValue.get(0)) { + case "text" -> parsedReview.setText(String.join(":", fieldValue.subList(1, fieldValue.size()))); + case "helpfulness" -> parsedReview.setHelpfulness(fieldValue.get(1)); + case "score" -> parsedReview.setScore(fieldValue.get(1)); } } - return review_fields; + return parsedReview; } + +// public static void main(String[] args) throws IOException { +// String inputFile = "./100.txt"; +// BufferedReader br = new BufferedReader(new FileReader(inputFile)); +// String line; +// StringBuilder review = new StringBuilder(); +// List data = new ArrayList<>(); +// boolean stopFlag = false; +// int i = 0; +// while(!stopFlag && (line = br.readLine()) != null) { +// if (line.contains("product/productId")){ +// if (i > 0) { +// data.add(review.toString()); +// stopFlag = true; +// } +// else { +// review.append(line); +// i++; +// } +// } +// else{ +// review.append(line); +// } +// } +// DataParser dt = new DataParser(); +// dt.parseData(data); +// List reviews = dt.getParsedData(); +// System.out.println("daniel"); +// } } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 569285d..158ed2b 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -15,13 +15,14 @@ public class IndexWriter { private static final String REVIEW_INDEX_FILE = "review_index.txt"; private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; + private static final int REVIEWS_TO_LOAD = 1000; /** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data */ - public void slowWrite(String inputFile, String dir) { + public void write(String inputFile, String dir) { this.dir = dir; createDicts(inputFile); createDir(); @@ -58,26 +59,39 @@ private void createDir(){ /** * Create temporary dictionaries that will store all information, before saving the indices to the disk. 
- * @param inputFile + * @param inputFile the file containing all reviews */ private void createDicts(String inputFile){ productIds = new TreeMap<>(); tokenDict = new TreeMap<>(); reviewIds = new TreeMap<>(); - DataParser dataParser = null; + DataLoader dataLoader = null; + DataParser dataParser = new DataParser(); try { - dataParser = new DataParser(inputFile); + dataLoader = new DataLoader(inputFile); } catch (IOException e) { System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } + int i=0; + for (String s: dataLoader){ + DataParser.Review review = dataParser.parseReview(s); + addProductId(review.getProductId(), i + 1); + int length = addReviewText(review.getText(), i + 1); +// addReviewId(review, i, length); + i++; + if (i == REVIEWS_TO_LOAD){ + i = 0; + productIds.clear(); + tokenDict.clear(); + reviewIds.clear(); + // todo: save the current dicts to disk + } - for (int i = 0; i < dataParser.allReviews.size(); i++) { - addProductId(dataParser.allReviews.get(i).get("productId"), i + 1); - int length = addReviewText(dataParser.allReviews.get(i).get("text"), i + 1); - addReviewId(dataParser.allReviews.get(i), i, length); } + + // todo: merge all dictionaries } /** @@ -129,15 +143,15 @@ private void addProductId(String productId, int reviewId) { /** * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. */ - private void addReviewId(HashMap review, int reviewId, int length) { - reviewIds.put(reviewId, new ArrayList<>()); - // 0 - productId, 1 - score, 2 - helpfulness, 3 - length - for (String field : DataParser.INTEREST_FIELDS) { - if (field.equals("text")) { continue; } - reviewIds.get(reviewId).add(review.get(field)); - } - reviewIds.get(reviewId).add(String.valueOf(length)); - } +// private void addReviewId(DataParser.Review review, int reviewId, int length) { +// reviewIds.put(reviewId, new ArrayList<>()); +// // 0 - productId, 1 - score, 2 - helpfulness, 3 - length +// for (String field : DataParser.INTEREST_FIELDS) { +// if (field.equals("text")) { continue; } +// reviewIds.get(reviewId).add(review.get(field)); +// } +// reviewIds.get(reviewId).add(String.valueOf(length)); +// } /** * Creates and saves to the disk the product index, i.e. all the information that is related to products. 
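Patches 02-03 replace the old all-at-once DataParser constructor with a streaming pair: DataLoader reads the raw review file one block at a time (a new block starts at each "product/productId" line) and exposes the blocks through an Iterator, while DataParser.parseReview turns a single block into a Review object carrying productId, score, helpfulness and text. A minimal sketch of how the two classes compose is shown below; the ParseDemo driver class is not part of the patch series, and the "./100.txt" sample file is the one referenced in the commented-out main methods.

    package webdata;

    import java.io.FileNotFoundException;

    // Hypothetical driver (not in the repo): stream raw reviews with DataLoader
    // and parse each block with DataParser, mirroring IndexWriter.createDicts.
    public class ParseDemo {
        public static void main(String[] args) throws FileNotFoundException {
            DataLoader loader = new DataLoader("./100.txt"); // sample file from the commented-out mains
            DataParser parser = new DataParser();
            int reviewId = 0;
            for (String raw : loader) {                      // DataLoader yields one review block per iteration
                DataParser.Review review = parser.parseReview(raw);
                reviewId++;
                String text = review.getText();
                System.out.println(reviewId + ": " + review.getProductId()
                        + " score=" + review.getScore()
                        + " helpfulness=" + review.getHelpfulness()
                        + " textLength=" + (text == null ? 0 : text.length()));
            }
        }
    }

IndexWriter.createDicts in this same patch drives an identical loop, feeding each parsed review into addProductId and addReviewText (addReviewId is still commented out at this point).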
From d679760212e9b20d7ebddd9367bc4298ff42de25 Mon Sep 17 00:00:00 2001 From: nirnts Date: Tue, 4 May 2021 12:25:05 +0300 Subject: [PATCH 04/55] dl change --- src/webdata/DataLoader.java | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java index ad20537..e0202d5 100644 --- a/src/webdata/DataLoader.java +++ b/src/webdata/DataLoader.java @@ -4,7 +4,6 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -46,9 +45,6 @@ public List readMultipleReviews(int num) { public Iterator iterator() { return new Iterator<>() { - - private int currentIndex = 0; - @Override public boolean hasNext(){ try { @@ -69,27 +65,4 @@ public void remove() { } }; } - -// public static void main(String[] args) { -// DataLoader dl = null; -// try { -// dl = new DataLoader("./100.txt"); -// } catch (FileNotFoundException e) { -// e.printStackTrace(); -// System.exit(1); -// } -// -// for (int i = 0; i < 5; i++) { -// String s = dl.readSingleReview(); -// System.out.println(s); -// } -// -// ArrayList readd = new ArrayList<>(dl.readMultipleReviews(10)); -// System.out.println(readd.size()); -// -// for (String s : dl) { -// readd.add(s); -// } -// System.out.println(readd.size()); -// } } From be529b3c92c3e1bee9b879702a5589a18f2ad6b8 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 4 May 2021 17:54:19 +0300 Subject: [PATCH 05/55] addReviewText before saving files --- src/webdata/IndexWriter.java | 59 +++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 158ed2b..2b4e296 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -6,9 +6,10 @@ import java.util.*; public class IndexWriter { - private TreeMap> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review. + private TreeMap tokenDict; // token and tokenId private TreeMap> productIds; private TreeMap> reviewIds; + private ArrayList tokenBuffer; // array list containing termIds and docIds pairs private String dir; private static final String PRODUCT_INDEX_FILE = "product_index.txt"; @@ -16,6 +17,7 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int REVIEWS_TO_LOAD = 1000; + private static final int TOKEN_BUFFER_SIZE = 1000000; /** @@ -27,7 +29,7 @@ public void write(String inputFile, String dir) { createDicts(inputFile); createDir(); createProductIndex(); - createTokenIndex(); +// createTokenIndex(); createReviewIndex(); } @@ -109,19 +111,14 @@ private int addReviewText(String reviewText, int reviewIndex){ } reviewLength += 1; token = token.toLowerCase(); - if (tokenDict.containsKey(token)){ // token already exists, update its entry - List tokenInfo = tokenDict.get(token); - // check if the current review was already added to the token's review list. If yes, increase the # appearances of the token, else add it with # appearance = 1. 
- if (tokenInfo.get(tokenInfo.size()-2) == reviewIndex){ - tokenInfo.set(tokenInfo.size()-1 ,tokenInfo.get(tokenInfo.size()-1) + 1); - } else { // token appears first time in the given review - tokenInfo.add(reviewIndex); - tokenInfo.add(1); - } - } - else{ // token seen for the first time, add a new entry for it - tokenDict.put(token, new ArrayList<>(Arrays.asList(reviewIndex, 1))); + int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); + tokenBuffer.add(termId); + tokenBuffer.add(reviewIndex); + if (tokenBuffer.size() >= TOKEN_BUFFER_SIZE){ + this.sortBuffer(); + this.saveBuffer(); } + } return reviewLength; } @@ -175,19 +172,19 @@ private void createProductIndex() { * Creates the index file for the tokens in the collection. * The index is created using the k-1-in-k front coding method. */ - private void createTokenIndex(){ - LinkedList tokens = new LinkedList<>(tokenDict.keySet()); - ArrayList> vals = new ArrayList<>(tokenDict.values()); - int k = 8; - - KFront kf = new KFront(true); - kf.createKFront(k, tokens); - - TokensIndex tIdx = new TokensIndex(k, this.dir); - tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); - - saveToDir(TOKEN_INDEX_FILE, tIdx); - } +// private void createTokenIndex(){ +// LinkedList tokens = new LinkedList<>(tokenDict.keySet()); +// ArrayList> vals = new ArrayList<>(tokenDict.values()); +// int k = 8; +// +// KFront kf = new KFront(true); +// kf.createKFront(k, tokens); +// +// TokensIndex tIdx = new TokensIndex(k, this.dir); +// tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); +// +// saveToDir(TOKEN_INDEX_FILE, tIdx); +// } /** * Creates and saves to the disk the review index which hold all information related to reviews. @@ -230,4 +227,12 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } + + public static void main(String[] args) { + String inputFile = "./100.txt"; + String dir = "./Data_Index"; + IndexWriter indexWriter = new IndexWriter(); + indexWriter.write(inputFile, dir); + System.out.println("here"); + } } \ No newline at end of file From c9b9b6658b1e2e4cf6f2586c4e0d85dcbd4ac747 Mon Sep 17 00:00:00 2001 From: nirnts Date: Wed, 5 May 2021 17:57:25 +0300 Subject: [PATCH 06/55] sortBuffer + saveBuffer --- src/webdata/IndexWriter.java | 67 +++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 2b4e296..072f922 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -9,7 +9,11 @@ public class IndexWriter { private TreeMap tokenDict; // token and tokenId private TreeMap> productIds; private TreeMap> reviewIds; - private ArrayList tokenBuffer; // array list containing termIds and docIds pairs + + private int[][] tokenBuffer; // Array of termID, docID pairs. 
Regular array to sort in-place + private int tokenBufferPointer; + private ObjectOutputStream tokenBufferWriter; + private String dir; private static final String PRODUCT_INDEX_FILE = "product_index.txt"; @@ -17,7 +21,7 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int REVIEWS_TO_LOAD = 1000; - private static final int TOKEN_BUFFER_SIZE = 1000000; + private static final int TOKEN_BUFFER_SIZE = 1000; /** @@ -26,11 +30,11 @@ public class IndexWriter { */ public void write(String inputFile, String dir) { this.dir = dir; - createDicts(inputFile); createDir(); - createProductIndex(); + createDicts(inputFile); +// createProductIndex(); // createTokenIndex(); - createReviewIndex(); +// createReviewIndex(); } /** @@ -68,6 +72,15 @@ private void createDicts(String inputFile){ tokenDict = new TreeMap<>(); reviewIds = new TreeMap<>(); + tokenBuffer = new int[2][TOKEN_BUFFER_SIZE]; + tokenBufferPointer = 0; + try { + tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/tokenpairs.txt", true)); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + DataLoader dataLoader = null; DataParser dataParser = new DataParser(); try { @@ -92,6 +105,12 @@ private void createDicts(String inputFile){ } } + try { + tokenBufferWriter.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } // todo: merge all dictionaries } @@ -112,17 +131,47 @@ private int addReviewText(String reviewText, int reviewIndex){ reviewLength += 1; token = token.toLowerCase(); int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); - tokenBuffer.add(termId); - tokenBuffer.add(reviewIndex); - if (tokenBuffer.size() >= TOKEN_BUFFER_SIZE){ + tokenBuffer[0][tokenBufferPointer] = termId; + tokenBuffer[1][tokenBufferPointer] = reviewIndex; + tokenBufferPointer++; + if (tokenBufferPointer >= TOKEN_BUFFER_SIZE){ this.sortBuffer(); this.saveBuffer(); + this.clearBuffer(); } - } return reviewLength; } + private void sortBuffer() { + // TODO Currently this is not in-place. + Arrays.sort(tokenBuffer, Comparator.comparingInt(a -> a[0])); + } + + private void saveBuffer() { + for (int i = 0; i < TOKEN_BUFFER_SIZE; i++) { + try { + tokenBufferWriter.writeInt(tokenBuffer[0][i]); + tokenBufferWriter.writeInt(tokenBuffer[1][i]); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } + + // TODO should we write the entire buffer? +// try { +// tokenBufferWriter.writeObject(tokenBuffer); +// } catch (IOException e) { +// e.printStackTrace(); +// } + } + + private void clearBuffer() { + tokenBuffer = new int[2][TOKEN_BUFFER_SIZE]; + tokenBufferPointer = 0; + } + /** * Update the productId dictionary by adding to it the given product. If the product already exists, it adds review * id to the reviews that are matching to this product. 
From c3c382e88ac2547f346556256c691f86d8558e8e Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 9 May 2021 16:10:15 +0300 Subject: [PATCH 07/55] Before sorting tokens alphanumerically --- src/webdata/IndexWriter.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 072f922..5c83a8e 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -6,7 +6,8 @@ import java.util.*; public class IndexWriter { - private TreeMap tokenDict; // token and tokenId + private HashMap tokenDict; // token: tokenId + private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; private TreeMap> reviewIds; @@ -21,7 +22,7 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int REVIEWS_TO_LOAD = 1000; - private static final int TOKEN_BUFFER_SIZE = 1000; + private static final int TOKEN_BUFFER_SIZE = 10; /** @@ -69,8 +70,9 @@ private void createDir(){ */ private void createDicts(String inputFile){ productIds = new TreeMap<>(); - tokenDict = new TreeMap<>(); + tokenDict = new HashMap<>(); reviewIds = new TreeMap<>(); + invertedTokenDict = new ArrayList<>(); tokenBuffer = new int[2][TOKEN_BUFFER_SIZE]; tokenBufferPointer = 0; @@ -121,9 +123,10 @@ private void createDicts(String inputFile){ * @param reviewIndex the number of the given review. * @return the number of tokens in the given review text. */ - private int addReviewText(String reviewText, int reviewIndex){ + private int addReviewText(String reviewText, int reviewIndex){ String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens int reviewLength = 0; + tokens = new String[]{"I", "bought", "I", "I"}; for (String token: tokens){ if (!token.matches("[a-zA-Z0-9]+")){ continue; @@ -131,10 +134,11 @@ private int addReviewText(String reviewText, int reviewIndex){ reviewLength += 1; token = token.toLowerCase(); int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); + if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[0][tokenBufferPointer] = termId; tokenBuffer[1][tokenBufferPointer] = reviewIndex; tokenBufferPointer++; - if (tokenBufferPointer >= TOKEN_BUFFER_SIZE){ + if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ this.sortBuffer(); this.saveBuffer(); this.clearBuffer(); @@ -146,6 +150,7 @@ private int addReviewText(String reviewText, int reviewIndex){ private void sortBuffer() { // TODO Currently this is not in-place. Arrays.sort(tokenBuffer, Comparator.comparingInt(a -> a[0])); +// Arrays.sort(tokenBuffer, (a, b) -> invertedTokenDict.get(a[0]).compareTo(invertedTokenDict.get(a[1]))); } private void saveBuffer() { From 3eb81ec58355da7f3188ad7c5856cb72626a3971 Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 9 May 2021 21:42:13 +0300 Subject: [PATCH 08/55] Added ExternalMergeSort - before implementing --- src/webdata/IndexWriter.java | 51 ++++++++++++++---------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 5c83a8e..337c4c1 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -13,7 +13,7 @@ public class IndexWriter { private int[][] tokenBuffer; // Array of termID, docID pairs. 
Regular array to sort in-place private int tokenBufferPointer; - private ObjectOutputStream tokenBufferWriter; + private int tokenFilesNumber = 0; private String dir; @@ -25,6 +25,7 @@ public class IndexWriter { private static final int TOKEN_BUFFER_SIZE = 10; + /** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data @@ -74,14 +75,9 @@ private void createDicts(String inputFile){ reviewIds = new TreeMap<>(); invertedTokenDict = new ArrayList<>(); - tokenBuffer = new int[2][TOKEN_BUFFER_SIZE]; - tokenBufferPointer = 0; - try { - tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/tokenpairs.txt", true)); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } +// tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; +// tokenBufferPointer = 0; + this.clearBuffer(); DataLoader dataLoader = null; DataParser dataParser = new DataParser(); @@ -107,13 +103,6 @@ private void createDicts(String inputFile){ } } - try { - tokenBufferWriter.close(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - // todo: merge all dictionaries } @@ -126,7 +115,7 @@ private void createDicts(String inputFile){ private int addReviewText(String reviewText, int reviewIndex){ String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens int reviewLength = 0; - tokens = new String[]{"I", "bought", "I", "I"}; +// tokens = new String[]{"I", "bought", "I", "I"}; for (String token: tokens){ if (!token.matches("[a-zA-Z0-9]+")){ continue; @@ -135,8 +124,8 @@ private int addReviewText(String reviewText, int reviewIndex){ token = token.toLowerCase(); int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict - tokenBuffer[0][tokenBufferPointer] = termId; - tokenBuffer[1][tokenBufferPointer] = reviewIndex; + tokenBuffer[tokenBufferPointer][0] = termId; + tokenBuffer[tokenBufferPointer][1] = reviewIndex; tokenBufferPointer++; if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ this.sortBuffer(); @@ -148,32 +137,30 @@ private int addReviewText(String reviewText, int reviewIndex){ } private void sortBuffer() { - // TODO Currently this is not in-place. - Arrays.sort(tokenBuffer, Comparator.comparingInt(a -> a[0])); -// Arrays.sort(tokenBuffer, (a, b) -> invertedTokenDict.get(a[0]).compareTo(invertedTokenDict.get(a[1]))); + Arrays.sort(tokenBuffer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); } private void saveBuffer() { + ObjectOutputStream tokenBufferWriter = null; + try { + tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/tokenpairs_" + tokenFilesNumber + ".txt")); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } for (int i = 0; i < TOKEN_BUFFER_SIZE; i++) { try { - tokenBufferWriter.writeInt(tokenBuffer[0][i]); - tokenBufferWriter.writeInt(tokenBuffer[1][i]); + tokenBufferWriter.writeInt(tokenBuffer[i][0]); + tokenBufferWriter.writeInt(tokenBuffer[i][1]); } catch (IOException e) { e.printStackTrace(); System.exit(1); } } - - // TODO should we write the entire buffer? 
-// try { -// tokenBufferWriter.writeObject(tokenBuffer); -// } catch (IOException e) { -// e.printStackTrace(); -// } } private void clearBuffer() { - tokenBuffer = new int[2][TOKEN_BUFFER_SIZE]; + tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; tokenBufferPointer = 0; } From bfdfa1cd4d62724e05fa38ced519000dd6fd6d2f Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 10 May 2021 18:42:43 +0300 Subject: [PATCH 09/55] ExternalMergeSort --- src/webdata/ExternalMergeSort.java | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 src/webdata/ExternalMergeSort.java diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java new file mode 100644 index 0000000..182b1f7 --- /dev/null +++ b/src/webdata/ExternalMergeSort.java @@ -0,0 +1,25 @@ +package webdata; + +import java.util.List; + +public class ExternalMergeSort { + private List invertedTokenDict; + private String filePrefix; + int numFiles; + int blockSize; + + ExternalMergeSort(List invertedTokenDict, int numFiles, String filePrefix, int blockSize){ + this.invertedTokenDict = invertedTokenDict; + this.numFiles = numFiles; + this.filePrefix = filePrefix; + this.blockSize = blockSize; + } + + /** + * Merge all files in the given range + */ + private void mergeFiles(int start, int end){ + // todo: iterate over all files in the range and take every time the smallest element. + // when a block is full, save it to the new file until done with all files + } +} From dc575127db7833f432d95b186f380eee3ae91c48 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 11 May 2021 19:05:48 +0300 Subject: [PATCH 10/55] ExternalMergeSort - before debugging --- src/webdata/ExternalMergeSort.java | 171 ++++++++++++++++++++++++++--- src/webdata/IndexWriter.java | 35 +++--- 2 files changed, 177 insertions(+), 29 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 182b1f7..f18b9ce 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -1,25 +1,168 @@ package webdata; -import java.util.List; +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; public class ExternalMergeSort { - private List invertedTokenDict; - private String filePrefix; - int numFiles; - int blockSize; +// private List invertedTokenDict; + private Comparator cmp; +// private String filePrefix; + private int numFiles; // current number of files to merge + private int blockSize; + private String dir; + private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. + private int savedFiles; // number of files that were saved in the current iteration. - ExternalMergeSort(List invertedTokenDict, int numFiles, String filePrefix, int blockSize){ - this.invertedTokenDict = invertedTokenDict; + private int AVAILABLE_BLOCKS = 60000; + + ExternalMergeSort(Comparator cmp, int numFiles, int blockSize, String dir){ + this.cmp = cmp; this.numFiles = numFiles; - this.filePrefix = filePrefix; +// this.filePrefix = filePrefix; this.blockSize = blockSize; + this.dir = dir; + this.iteration = 1; + this.savedFiles = 0; } - /** - * Merge all files in the given range - */ - private void mergeFiles(int start, int end){ - // todo: iterate over all files in the range and take every time the smallest element. 
- // when a block is full, save it to the new file until done with all files + public void sort(){ + try { + SingleMerge singleMerge = new SingleMerge(1, numFiles); + } catch (IOException e){ + e.printStackTrace(); + System.exit(1); + } + + } + + + + /** Holds all the information required for a single iteration of the merge-sort algorithm */ + private class SingleMerge{ + private ArrayList fileReaders; + private ArrayList> fileDeques; + private final int dequeSize; + private int[] outputBlock; + private int outputPtr; + private ObjectOutputStream mergedOutput; + + + private SingleMerge(int start, int end) throws IOException { + // make a new dir for the files of this iteration: + Files.createDirectories(Path.of(dir + "/iteration_" + (iteration+1))); + + this.dequeSize = (AVAILABLE_BLOCKS - 1) / (end-start); + this.mergedOutput = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_" + (iteration+1) + "/" + (savedFiles+1))); + for (int i=start; i<=end; i++){ + FileInputStream fileIn = new FileInputStream(dir + "/iteration_" + iteration + "/" + i); + this.fileReaders.add(new ObjectInputStream(fileIn)); + this.fileDeques.add(new ArrayDeque(this.dequeSize)); + } + } + + private void merge() throws IOException { + this.clearOutputBlock(); + this.loadAll(); + while (!this.areAllDequesEmpty()){ + int minIndex = this.getMin(); + this.extractMin(minIndex); + } + this.saveOutputBlock(); // needed in case the block wasn't full + this.removeDir(dir + "/iteration_" + iteration); // remove the temp dir in which the files of this iteration were stored + } + + /** Add the first element in the deque[minIndex] to the output block. + * If the block is full, save it to the output file and clear the block. + * If the deque is empty, load the next elements in the file given in minIndex. + */ + private void extractMin(int minIndex) throws IOException { + int[] minPair = fileDeques.get(minIndex).pollFirst(); + this.outputBlock[this.outputPtr] = minPair[0]; + this.outputBlock[this.outputPtr + 1] = minPair[1]; + this.outputPtr += 2; + if (this.outputPtr == blockSize * 2){ + this.saveOutputBlock(); + this.clearOutputBlock(); + } + if (fileDeques.get(minIndex).isEmpty() && fileReaders.get(minIndex) != null){ + this.loadData(minIndex, dequeSize); + } + } + + /** Return the index of the minimal element of the first elements (smallest elements) in all deques. 
*/ + private int getMin(){ + int minIndex = -1; + for (int i=0; i 0){ + if (minIndex == -1) { + minIndex = i; + } else if (cmp.compare(fileDeques.get(minIndex).peekFirst()[0], fileDeques.get(i).getFirst()[0]) > 0){ + minIndex = i; + } + } + } + return minIndex; + } + + private void loadAll() throws IOException { + for (int i = 0; i <= this.fileReaders.size(); i++){ + this.loadData(i, this.dequeSize); + } + } + + /** Load numbBlocks from the file given by index i to the matching deque*/ + private void loadData(int i, int numBlocks) throws IOException { + for (int j=0; j d: fileDeques){ + if (!d.isEmpty()){ + return false; + } + } + return true; + } + + private void clearOutputBlock(){ + outputBlock = new int[blockSize*2]; + outputPtr = 0; + } + + private void saveOutputBlock() throws IOException { + for (int i = 0; i < this.outputPtr; i++){ + this.mergedOutput.writeInt(this.outputBlock[i]); + } + } + + private void removeDir(String dir){ + File dirToRemove = new File(dir); + File[] contents = dirToRemove.listFiles(); + if (contents != null) { + for (File file : contents) { + file.delete(); + } + } + dirToRemove.delete(); + } + + + } + + } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 337c4c1..392788c 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -22,7 +22,7 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int REVIEWS_TO_LOAD = 1000; - private static final int TOKEN_BUFFER_SIZE = 10; + private static final int TOKEN_BUFFER_SIZE = 60000; @@ -75,8 +75,14 @@ private void createDicts(String inputFile){ reviewIds = new TreeMap<>(); invertedTokenDict = new ArrayList<>(); -// tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; -// tokenBufferPointer = 0; + // todo: remove the directory creation from here! 
+ try { + Files.createDirectories(Path.of(dir + "/iteration_1")); + } catch (IOException e) { + e.printStackTrace(); + } + + this.clearBuffer(); DataLoader dataLoader = null; @@ -84,6 +90,7 @@ private void createDicts(String inputFile){ try { dataLoader = new DataLoader(inputFile); } catch (IOException e) { + e.printStackTrace(); System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } @@ -93,17 +100,14 @@ private void createDicts(String inputFile){ addProductId(review.getProductId(), i + 1); int length = addReviewText(review.getText(), i + 1); // addReviewId(review, i, length); - i++; - if (i == REVIEWS_TO_LOAD){ - i = 0; - productIds.clear(); - tokenDict.clear(); - reviewIds.clear(); - // todo: save the current dicts to disk - } - } - // todo: merge all dictionaries + this.saveBuffer(); + + // todo: merge sort all files - maybe move to a new function + Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); + ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, 1000, dir); + externalMergeSort.sort(); + } /** @@ -142,13 +146,14 @@ private void sortBuffer() { private void saveBuffer() { ObjectOutputStream tokenBufferWriter = null; + this.tokenFilesNumber++; try { - tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/tokenpairs_" + tokenFilesNumber + ".txt")); + tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber + ".txt")); } catch (IOException e) { e.printStackTrace(); System.exit(1); } - for (int i = 0; i < TOKEN_BUFFER_SIZE; i++) { + for (int i = 0; i < tokenBufferPointer; i++) { try { tokenBufferWriter.writeInt(tokenBuffer[i][0]); tokenBufferWriter.writeInt(tokenBuffer[i][1]); From c66a86188dd1dace8013d6626c85f51d96f0564a Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 11 May 2021 20:01:57 +0300 Subject: [PATCH 11/55] ExternalMergeSort - train debugging --- src/webdata/ExternalMergeSort.java | 28 ++++++++++++++++------------ src/webdata/IndexWriter.java | 8 ++++---- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index f18b9ce..b2c1bad 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -10,18 +10,18 @@ public class ExternalMergeSort { private Comparator cmp; // private String filePrefix; private int numFiles; // current number of files to merge - private int blockSize; + private int pairsInBlock; private String dir; private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. 
private int AVAILABLE_BLOCKS = 60000; - ExternalMergeSort(Comparator cmp, int numFiles, int blockSize, String dir){ + ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; this.numFiles = numFiles; // this.filePrefix = filePrefix; - this.blockSize = blockSize; + this.pairsInBlock = pairsInBlock; this.dir = dir; this.iteration = 1; this.savedFiles = 0; @@ -30,6 +30,7 @@ public class ExternalMergeSort { public void sort(){ try { SingleMerge singleMerge = new SingleMerge(1, numFiles); + singleMerge.merge(); } catch (IOException e){ e.printStackTrace(); System.exit(1); @@ -44,7 +45,7 @@ public void sort(){ private class SingleMerge{ private ArrayList fileReaders; private ArrayList> fileDeques; - private final int dequeSize; + private final int numPairsInDeque; private int[] outputBlock; private int outputPtr; private ObjectOutputStream mergedOutput; @@ -54,12 +55,15 @@ private SingleMerge(int start, int end) throws IOException { // make a new dir for the files of this iteration: Files.createDirectories(Path.of(dir + "/iteration_" + (iteration+1))); - this.dequeSize = (AVAILABLE_BLOCKS - 1) / (end-start); + this.numPairsInDeque = ((AVAILABLE_BLOCKS - 1) / (end-start+1)) * pairsInBlock; this.mergedOutput = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_" + (iteration+1) + "/" + (savedFiles+1))); + this.fileReaders = new ArrayList<>(end-start+1); + this.fileDeques = new ArrayList<>(end-start+1); + for (int i=start; i<=end; i++){ FileInputStream fileIn = new FileInputStream(dir + "/iteration_" + iteration + "/" + i); this.fileReaders.add(new ObjectInputStream(fileIn)); - this.fileDeques.add(new ArrayDeque(this.dequeSize)); + this.fileDeques.add(new ArrayDeque(this.numPairsInDeque)); } } @@ -83,12 +87,12 @@ private void extractMin(int minIndex) throws IOException { this.outputBlock[this.outputPtr] = minPair[0]; this.outputBlock[this.outputPtr + 1] = minPair[1]; this.outputPtr += 2; - if (this.outputPtr == blockSize * 2){ + if (this.outputPtr == pairsInBlock * 2){ this.saveOutputBlock(); this.clearOutputBlock(); } if (fileDeques.get(minIndex).isEmpty() && fileReaders.get(minIndex) != null){ - this.loadData(minIndex, dequeSize); + this.loadData(minIndex, numPairsInDeque); } } @@ -108,14 +112,14 @@ private int getMin(){ } private void loadAll() throws IOException { - for (int i = 0; i <= this.fileReaders.size(); i++){ - this.loadData(i, this.dequeSize); + for (int i = 0; i < this.fileReaders.size(); i++){ + this.loadData(i, this.numPairsInDeque); } } /** Load numbBlocks from the file given by index i to the matching deque*/ private void loadData(int i, int numBlocks) throws IOException { - for (int j=0; j cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); - ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, 1000, dir); + ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); externalMergeSort.sort(); } @@ -148,7 +148,7 @@ private void saveBuffer() { ObjectOutputStream tokenBufferWriter = null; this.tokenFilesNumber++; try { - tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber + ".txt")); + tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber)); } catch (IOException e) { e.printStackTrace(); System.exit(1); From ed15c2b608756ace5ffa4293cb39c07c1e36d700 Mon Sep 17 00:00:00 2001 From: darkushin Date: Fri, 14 May 2021 19:14:22 +0300 Subject: [PATCH 12/55] 
Beginning of TokenIndex --- src/webdata/ExternalMergeSort.java | 3 +- src/webdata/IndexWriter.java | 64 +++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index b2c1bad..c9214c5 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -28,6 +28,7 @@ public class ExternalMergeSort { } public void sort(){ + // todo: need to handle the case where more than one iteration is needed try { SingleMerge singleMerge = new SingleMerge(1, numFiles); singleMerge.merge(); @@ -119,7 +120,7 @@ private void loadAll() throws IOException { /** Load numbBlocks from the file given by index i to the matching deque*/ private void loadData(int i, int numBlocks) throws IOException { - for (int j = 0; j tokenDict; // token: tokenId + private HashMap> tokenDict; // token: tokenId private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; private TreeMap> reviewIds; @@ -35,7 +35,7 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); // createProductIndex(); -// createTokenIndex(); + createTokenIndex(); // createReviewIndex(); } @@ -82,7 +82,6 @@ private void createDicts(String inputFile){ e.printStackTrace(); } - this.clearBuffer(); DataLoader dataLoader = null; @@ -119,14 +118,14 @@ private void createDicts(String inputFile){ private int addReviewText(String reviewText, int reviewIndex){ String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens int reviewLength = 0; -// tokens = new String[]{"I", "bought", "I", "I"}; for (String token: tokens){ if (!token.matches("[a-zA-Z0-9]+")){ continue; } reviewLength += 1; token = token.toLowerCase(); - int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); + ArrayList termIdArr = tokenDict.computeIfAbsent(token, k -> new ArrayList(tokenDict.size())); + int termId = termIdArr.get(0); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[tokenBufferPointer][0] = termId; tokenBuffer[tokenBufferPointer][1] = reviewIndex; @@ -218,19 +217,21 @@ private void createProductIndex() { * Creates the index file for the tokens in the collection. * The index is created using the k-1-in-k front coding method. */ -// private void createTokenIndex(){ -// LinkedList tokens = new LinkedList<>(tokenDict.keySet()); -// ArrayList> vals = new ArrayList<>(tokenDict.values()); -// int k = 8; -// -// KFront kf = new KFront(true); -// kf.createKFront(k, tokens); -// -// TokensIndex tIdx = new TokensIndex(k, this.dir); -// tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); -// -// saveToDir(TOKEN_INDEX_FILE, tIdx); -// } + private void createTokenIndex(){ + LinkedList tokens = new LinkedList<>(tokenDict.keySet()); + this.prepareTokenValues(); + + ArrayList> vals = new ArrayList<>(tokenDict.values()); + int k = 8; + + KFront kf = new KFront(true); + kf.createKFront(k, tokens); + + TokensIndex tIdx = new TokensIndex(k, this.dir); + tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); + + saveToDir(TOKEN_INDEX_FILE, tIdx); + } /** * Creates and saves to the disk the review index which hold all information related to reviews. 
@@ -274,6 +275,33 @@ private void saveToDir(String name, Object obj) { } } + /** + * Read the termID-docID file, and convert all the appearances to the format of token:[doc1-#appearances, doc2-#appearance] + * this way, the same code as in ex1 can be used to create the token index. + */ + private void prepareTokenValues(){ + // todo: figure out how to get the file name + String fileName = "bla"; + FileInputStream fileIn = null; + try { + fileIn = new FileInputStream(fileName); + ObjectInputStream file = new ObjectInputStream(fileIn); + } catch (IOException e) { + e.printStackTrace(); + } + int previousTokenId = 0; + int previousDocId = 0; + + // while we didn't reach EOF, read two integers at a time - termID and docID. + // for every such pair, check if the termID is the same as the termID of the previous: + // If not - find the token matching to the termID (using invertedTermId dict) and add the list created here to the tokenDict. + // If yes - continue to update the list of this token - this list is the same as in ex1: pairs of docId-#appearances, i.e. for every document count the appearances of the token in the doc (can be done easily because they are consecutive in this case)/ + // For every pair, as the termID is the same, check if the docId matches the previous docId: + // If yes - raise the count for this docId + // If not - add a new entry for this docId and set its appearances to 1 + + } + public static void main(String[] args) { String inputFile = "./100.txt"; String dir = "./Data_Index"; From 3cbe60718e546c4597f5b3b069ddb8e405c1be6d Mon Sep 17 00:00:00 2001 From: darkushin Date: Sat, 15 May 2021 20:00:20 +0300 Subject: [PATCH 13/55] prepareTokenDict --- src/webdata/IndexWriter.java | 56 +++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index b51fda4..d306fe1 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -77,7 +77,7 @@ private void createDicts(String inputFile){ // todo: remove the directory creation from here! try { - Files.createDirectories(Path.of(dir + "/iteration_1")); + Files.createDirectories(Path.of(this.dir + "/iteration_1")); } catch (IOException e) { e.printStackTrace(); } @@ -124,7 +124,7 @@ private int addReviewText(String reviewText, int reviewIndex){ } reviewLength += 1; token = token.toLowerCase(); - ArrayList termIdArr = tokenDict.computeIfAbsent(token, k -> new ArrayList(tokenDict.size())); + ArrayList termIdArr = tokenDict.computeIfAbsent(token, k -> new ArrayList(Arrays.asList(tokenDict.size()))); int termId = termIdArr.get(0); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[tokenBufferPointer][0] = termId; @@ -218,9 +218,10 @@ private void createProductIndex() { * The index is created using the k-1-in-k front coding method. */ private void createTokenIndex(){ + // Convert the current tokenDict of {token:termId} pairs to {token:[docId1,#freq1,docId2,#freq2,...]} format. + this.prepareTokenDict(); + // todo: need to sort the dictionary by keys! 
LinkedList tokens = new LinkedList<>(tokenDict.keySet()); - this.prepareTokenValues(); - ArrayList> vals = new ArrayList<>(tokenDict.values()); int k = 8; @@ -279,18 +280,53 @@ private void saveToDir(String name, Object obj) { * Read the termID-docID file, and convert all the appearances to the format of token:[doc1-#appearances, doc2-#appearance] * this way, the same code as in ex1 can be used to create the token index. */ - private void prepareTokenValues(){ - // todo: figure out how to get the file name - String fileName = "bla"; + private void prepareTokenDict(){ + // todo: change the fileName to be according to the directory! + String fileName = this.dir + "/iteration_2/1"; FileInputStream fileIn = null; + ObjectInputStream termFile = null; try { fileIn = new FileInputStream(fileName); - ObjectInputStream file = new ObjectInputStream(fileIn); + termFile = new ObjectInputStream(fileIn); } catch (IOException e) { e.printStackTrace(); } - int previousTokenId = 0; - int previousDocId = 0; + + // read all the integers from the file until reaching EOF + try{ + int previousTermId = 0; + int previousDocId = 0; + ArrayList tokenVals = new ArrayList<>(); // odd places-docId, even places-freq in doc. + while (true){ // todo: ugly solution, any better idea? + int termId = termFile.readInt(); + int docId = termFile.readInt(); + if (termId == previousTermId){ + if (docId == previousDocId){ // token already appeared in the doc - increment the frequency + tokenVals.set(tokenVals.size()-1, tokenVals.get(tokenVals.size()-1) + 1); + } else { // first appearance of the token in this doc + tokenVals.addAll(Arrays.asList(docId, 1)); + previousDocId = docId; + } + } else { + // save the values of the previous token: + String token = invertedTokenDict.get(previousTermId); + tokenDict.put(token, tokenVals); + + // start a new array for the new term: + tokenVals = new ArrayList<>(Arrays.asList(docId, 1)); + previousTermId = termId; + previousDocId = docId; + } + } + + } catch (EOFException e){ // reached EOF and finished converting all tokens. + return; + } catch (Exception e){ + e.printStackTrace(); + System.out.println("Error occurred while converting token dict."); + System.exit(1); + } + // while we didn't reach EOF, read two integers at a time - termID and docID. // for every such pair, check if the termID is the same as the termID of the previous: From 12d27f2889ec878114386d3b83e48b79ddfc86ba Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 17 May 2021 14:58:23 +0300 Subject: [PATCH 14/55] other dicts --- src/webdata/IndexWriter.java | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index d306fe1..865a557 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -34,9 +34,9 @@ public void write(String inputFile, String dir) { this.dir = dir; createDir(); createDicts(inputFile); -// createProductIndex(); + createProductIndex(); createTokenIndex(); -// createReviewIndex(); + createReviewIndex(); } /** @@ -98,7 +98,7 @@ private void createDicts(String inputFile){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i + 1); int length = addReviewText(review.getText(), i + 1); -// addReviewId(review, i, length); + addReviewId(review, i, length); } this.saveBuffer(); @@ -185,15 +185,14 @@ private void addProductId(String productId, int reviewId) { /** * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. 
*/ -// private void addReviewId(DataParser.Review review, int reviewId, int length) { -// reviewIds.put(reviewId, new ArrayList<>()); -// // 0 - productId, 1 - score, 2 - helpfulness, 3 - length -// for (String field : DataParser.INTEREST_FIELDS) { -// if (field.equals("text")) { continue; } -// reviewIds.get(reviewId).add(review.get(field)); -// } -// reviewIds.get(reviewId).add(String.valueOf(length)); -// } + private void addReviewId(DataParser.Review review, int reviewId, int length) { + reviewIds.put(reviewId, new ArrayList<>()); + // 0 - productId, 1 - score, 2 - helpfulness, 3 - length + reviewIds.get(reviewId).add(review.getProductId()); + reviewIds.get(reviewId).add(review.getScore()); + reviewIds.get(reviewId).add(review.getHelpfulness()); + reviewIds.get(reviewId).add(String.valueOf(length)); + } /** * Creates and saves to the disk the product index, i.e. all the information that is related to products. From f681df9cbf5dab266b93689a2a8d9dc4b55ed567 Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 17 May 2021 19:04:20 +0300 Subject: [PATCH 15/55] Sorting works? --- src/webdata/ExternalMergeSort.java | 54 +++++++++++++-- src/webdata/IndexWriter.java | 108 ++++++++++++++++++++++------- 2 files changed, 130 insertions(+), 32 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index c9214c5..4997f37 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -6,7 +6,7 @@ import java.util.*; public class ExternalMergeSort { -// private List invertedTokenDict; + private List inv; private Comparator cmp; // private String filePrefix; private int numFiles; // current number of files to merge @@ -15,9 +15,9 @@ public class ExternalMergeSort { private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. 
- private int AVAILABLE_BLOCKS = 60000; + private int AVAILABLE_BLOCKS = 5000; - ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ + ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir, List inv){ this.cmp = cmp; this.numFiles = numFiles; // this.filePrefix = filePrefix; @@ -25,6 +25,8 @@ public class ExternalMergeSort { this.dir = dir; this.iteration = 1; this.savedFiles = 0; + + this.inv = inv; } public void sort(){ @@ -72,13 +74,29 @@ private void merge() throws IOException { this.clearOutputBlock(); this.loadAll(); while (!this.areAllDequesEmpty()){ + ArrayList heads = getHeads(); int minIndex = this.getMin(); this.extractMin(minIndex); } this.saveOutputBlock(); // needed in case the block wasn't full + mergedOutput.close(); + // TODO: For some reason, the last few files are not removed this.removeDir(dir + "/iteration_" + iteration); // remove the temp dir in which the files of this iteration were stored } + private ArrayList getHeads() { + ArrayList heads = new ArrayList<>(); + for (int i=0; i 0){ if (minIndex == -1) { minIndex = i; - } else if (cmp.compare(fileDeques.get(minIndex).peekFirst()[0], fileDeques.get(i).getFirst()[0]) > 0){ + } else if (cmp.compare(fileDeques.get(minIndex).getFirst()[0], fileDeques.get(i).getFirst()[0]) > 0){ minIndex = i; } } @@ -119,8 +137,30 @@ private void loadAll() throws IOException { } /** Load numbBlocks from the file given by index i to the matching deque*/ - private void loadData(int i, int numBlocks) throws IOException { - for (int j = 0; j cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); - ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); + + for (int j = 1; j <= tokenFilesNumber; j++) { + System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); + System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); + } + + ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir, invertedTokenDict); externalMergeSort.sort(); + System.out.println(isFileSorted(dir + "/iteration_2/1", cmp)); + } + // TODO: for debugging. Remove this later + private boolean isFileSorted(String fileName, Comparator cmp) { + FileInputStream fileIn; + ObjectInputStream ois; + long tot = 0; + try { + fileIn = new FileInputStream(fileName); + ois = new ObjectInputStream(fileIn); + int prev = ois.readInt(); + int prevDocId = ois.readInt(); + tot++; + while (true) { + int cur = ois.readInt(); + int docId = ois.readInt(); + if (cmp.compare(prev, cur) > 0) { + System.out.println("Oops! Occured in " + tot); + } + prev = cur; + prevDocId = docId; + tot++; + } + } catch (EOFException ex) { + System.out.println("Read " + tot + " pairs."); + return true; + } catch (IOException ex) { + ex.printStackTrace(); + System.exit(1); + } + return true; + } + private long countNumsInFile(String fileName) { + FileInputStream fileIn; + ObjectInputStream ois; + long tot = 0; + try { + fileIn = new FileInputStream(fileName); + ois = new ObjectInputStream(fileIn); + while (true) { + ois.readInt(); + tot++; + } + } catch (EOFException ex) { + return tot; + } catch (IOException ex) { + ex.printStackTrace(); + System.exit(1); + } + return tot; } + /** * Split the given text of the i-th review into tokens and add them to the tokens dictionary. * @param reviewText the text of the review that should be added. 
@@ -132,7 +195,12 @@ private int addReviewText(String reviewText, int reviewIndex){ tokenBufferPointer++; if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ this.sortBuffer(); - this.saveBuffer(); + try { + this.saveBuffer(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } this.clearBuffer(); } } @@ -143,24 +211,14 @@ private void sortBuffer() { Arrays.sort(tokenBuffer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); } - private void saveBuffer() { - ObjectOutputStream tokenBufferWriter = null; + private void saveBuffer() throws IOException { this.tokenFilesNumber++; - try { - tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber)); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } + ObjectOutputStream tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber)); for (int i = 0; i < tokenBufferPointer; i++) { - try { - tokenBufferWriter.writeInt(tokenBuffer[i][0]); - tokenBufferWriter.writeInt(tokenBuffer[i][1]); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } + tokenBufferWriter.writeInt(tokenBuffer[i][0]); + tokenBufferWriter.writeInt(tokenBuffer[i][1]); } + tokenBufferWriter.close(); } private void clearBuffer() { @@ -338,7 +396,7 @@ private void prepareTokenDict(){ } public static void main(String[] args) { - String inputFile = "./100.txt"; + String inputFile = "./1000.txt"; String dir = "./Data_Index"; IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); From 87b6bd5e79a5c69e7cfe3e4de775161d27dc4a3c Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 17 May 2021 20:24:18 +0300 Subject: [PATCH 16/55] EM sort should be working (almost?) --- src/webdata/ExternalMergeSort.java | 69 ++++++++++++++++++------------ src/webdata/IndexWriter.java | 6 +-- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 4997f37..43e2a69 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -8,14 +8,14 @@ public class ExternalMergeSort { private List inv; private Comparator cmp; -// private String filePrefix; + private String folderName = "/iteration_"; private int numFiles; // current number of files to merge private int pairsInBlock; private String dir; private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. - private int AVAILABLE_BLOCKS = 5000; + private int AVAILABLE_BLOCKS = 4; ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir, List inv){ this.cmp = cmp; @@ -30,19 +30,47 @@ public class ExternalMergeSort { } public void sort(){ - // todo: need to handle the case where more than one iteration is needed - try { - SingleMerge singleMerge = new SingleMerge(1, numFiles); - singleMerge.merge(); - } catch (IOException e){ - e.printStackTrace(); - System.exit(1); - } + while (numFiles > 1) { + try { + Files.createDirectories(Path.of(dir + folderName + (iteration+1))); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + for (int i = 0; i < Math.ceil((float) numFiles / (AVAILABLE_BLOCKS - 1)); i++) { + int end = Math.min(numFiles, (i + 1) * (AVAILABLE_BLOCKS - 1)); + try { + // TODO: Handle case when start == end? 
+ SingleMerge sm = new SingleMerge(i * (AVAILABLE_BLOCKS - 1) + 1, end); + sm.merge(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } + this.removeDir(dir + folderName + iteration); // remove the temp dir in which the files of this iteration were stored + numFiles = savedFiles; + savedFiles = 0; + iteration++; + } } + private void removeDir(String dir){ + File dirToRemove = new File(dir); + File[] contents = dirToRemove.listFiles(); + if (contents != null) { + for (File file : contents) { + file.delete(); + } + } + dirToRemove.delete(); + } + public String getFinalFile() { + return dir + folderName + iteration + "/1"; + } /** Holds all the information required for a single iteration of the merge-sort algorithm */ private class SingleMerge{ @@ -55,16 +83,13 @@ private class SingleMerge{ private SingleMerge(int start, int end) throws IOException { - // make a new dir for the files of this iteration: - Files.createDirectories(Path.of(dir + "/iteration_" + (iteration+1))); - this.numPairsInDeque = ((AVAILABLE_BLOCKS - 1) / (end-start+1)) * pairsInBlock; - this.mergedOutput = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_" + (iteration+1) + "/" + (savedFiles+1))); + this.mergedOutput = new ObjectOutputStream(new FileOutputStream(dir + folderName + (iteration+1) + "/" + (savedFiles+1))); this.fileReaders = new ArrayList<>(end-start+1); this.fileDeques = new ArrayList<>(end-start+1); for (int i=start; i<=end; i++){ - FileInputStream fileIn = new FileInputStream(dir + "/iteration_" + iteration + "/" + i); + FileInputStream fileIn = new FileInputStream(dir + folderName + iteration + "/" + i); this.fileReaders.add(new ObjectInputStream(fileIn)); this.fileDeques.add(new ArrayDeque(this.numPairsInDeque)); } @@ -80,8 +105,7 @@ private void merge() throws IOException { } this.saveOutputBlock(); // needed in case the block wasn't full mergedOutput.close(); - // TODO: For some reason, the last few files are not removed - this.removeDir(dir + "/iteration_" + iteration); // remove the temp dir in which the files of this iteration were stored + savedFiles++; } private ArrayList getHeads() { @@ -195,16 +219,7 @@ private void saveOutputBlock() throws IOException { } } - private void removeDir(String dir){ - File dirToRemove = new File(dir); - File[] contents = dirToRemove.listFiles(); - if (contents != null) { - for (File file : contents) { - file.delete(); - } - } - dirToRemove.delete(); - } + } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index e6a8184..59b2911 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -116,9 +116,9 @@ private void createDicts(String inputFile){ System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); } - ExternalMergeSort externalMergeSort = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir, invertedTokenDict); - externalMergeSort.sort(); - System.out.println(isFileSorted(dir + "/iteration_2/1", cmp)); + ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir, invertedTokenDict); + ems.sort(); + System.out.println(isFileSorted(ems.getFinalFile(), cmp)); } // TODO: for debugging. 
Remove this later From 1b19874b272bcbc8c9b3a21ceef211a851f69ea6 Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 17 May 2021 22:35:50 +0300 Subject: [PATCH 17/55] sort improvements --- src/webdata/ExternalMergeSort.java | 40 ++++++++++++++---------------- src/webdata/IndexWriter.java | 26 +++++++++++++------ 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 43e2a69..d808682 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -6,9 +6,8 @@ import java.util.*; public class ExternalMergeSort { - private List inv; private Comparator cmp; - private String folderName = "/iteration_"; + public static String folderName = "/iteration_"; private int numFiles; // current number of files to merge private int pairsInBlock; private String dir; @@ -17,7 +16,7 @@ public class ExternalMergeSort { private int AVAILABLE_BLOCKS = 4; - ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir, List inv){ + ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; this.numFiles = numFiles; // this.filePrefix = filePrefix; @@ -25,8 +24,6 @@ public class ExternalMergeSort { this.dir = dir; this.iteration = 1; this.savedFiles = 0; - - this.inv = inv; } public void sort(){ @@ -55,6 +52,9 @@ public void sort(){ savedFiles = 0; iteration++; } + File sorted = new File(dir + folderName + iteration + "/1"); + sorted.renameTo(new File(dir + "/1")); + removeDir(dir + folderName + iteration); } private void removeDir(String dir){ @@ -68,10 +68,6 @@ private void removeDir(String dir){ dirToRemove.delete(); } - public String getFinalFile() { - return dir + folderName + iteration + "/1"; - } - /** Holds all the information required for a single iteration of the merge-sort algorithm */ private class SingleMerge{ private ArrayList fileReaders; @@ -99,7 +95,7 @@ private void merge() throws IOException { this.clearOutputBlock(); this.loadAll(); while (!this.areAllDequesEmpty()){ - ArrayList heads = getHeads(); +// ArrayList heads = getHeads(); int minIndex = this.getMin(); this.extractMin(minIndex); } @@ -108,18 +104,18 @@ private void merge() throws IOException { savedFiles++; } - private ArrayList getHeads() { - ArrayList heads = new ArrayList<>(); - for (int i=0; i getHeads() { +// ArrayList heads = new ArrayList<>(); +// for (int i=0; i cmp) { - FileInputStream fileIn; - ObjectInputStream ois; + FileInputStream fileIn = null; + ObjectInputStream ois = null; long tot = 0; try { fileIn = new FileInputStream(fileName); @@ -144,6 +144,12 @@ private boolean isFileSorted(String fileName, Comparator cmp) { } } catch (EOFException ex) { System.out.println("Read " + tot + " pairs."); + try { + ois.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } return true; } catch (IOException ex) { ex.printStackTrace(); @@ -153,7 +159,7 @@ private boolean isFileSorted(String fileName, Comparator cmp) { } private long countNumsInFile(String fileName) { FileInputStream fileIn; - ObjectInputStream ois; + ObjectInputStream ois = null; long tot = 0; try { fileIn = new FileInputStream(fileName); @@ -163,6 +169,12 @@ private long countNumsInFile(String fileName) { tot++; } } catch (EOFException ex) { + try { + ois.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } return tot; } catch (IOException ex) { ex.printStackTrace(); @@ -213,7 +225,7 @@ private void sortBuffer() { private void saveBuffer() throws 
IOException { this.tokenFilesNumber++; - ObjectOutputStream tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + "/iteration_1/" + tokenFilesNumber)); + ObjectOutputStream tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + ExternalMergeSort.folderName + "1/" + tokenFilesNumber)); for (int i = 0; i < tokenBufferPointer; i++) { tokenBufferWriter.writeInt(tokenBuffer[i][0]); tokenBufferWriter.writeInt(tokenBuffer[i][1]); From baad054cabe5e63ee0aa554ffb5f0fa522a72f54 Mon Sep 17 00:00:00 2001 From: nirnts Date: Tue, 18 May 2021 17:48:05 +0300 Subject: [PATCH 18/55] Token Index is (almost?) done. --- src/webdata/DataLoader.java | 5 ++- src/webdata/ExternalMergeSort.java | 2 +- src/webdata/IndexWriter.java | 25 ++++++++------- src/webdata/PairsLoader.java | 32 +++++++++++++++++++ src/webdata/TokensIndex.java | 51 ++++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 13 deletions(-) create mode 100644 src/webdata/PairsLoader.java diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java index e0202d5..da0df2e 100644 --- a/src/webdata/DataLoader.java +++ b/src/webdata/DataLoader.java @@ -48,7 +48,10 @@ public Iterator iterator() { @Override public boolean hasNext(){ try { - return br.ready(); + br.mark(1); + int i = br.read(); + br.reset(); + return (i != -1); } catch (IOException e) { return false; } diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index d808682..195c136 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -14,7 +14,7 @@ public class ExternalMergeSort { private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. - private int AVAILABLE_BLOCKS = 4; + private int AVAILABLE_BLOCKS = 1000; ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 7c1ab8b..c72e880 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -33,8 +33,10 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); createProductIndex(); - createTokenIndex(); createReviewIndex(); + productIds = null; + reviewIds = null; // Clears memory? + createTokenIndex(); } /** @@ -136,7 +138,9 @@ private boolean isFileSorted(String fileName, Comparator cmp) { int cur = ois.readInt(); int docId = ois.readInt(); if (cmp.compare(prev, cur) > 0) { - System.out.println("Oops! Occured in " + tot); + System.out.println("Terms not sorted. Occured in " + tot); + } else if ((cmp.compare(prev, cur) == 0) && (prevDocId > docId)) { + System.out.println("DocIds not sorted. Occured in " + tot); } prev = cur; prevDocId = docId; @@ -220,7 +224,7 @@ private int addReviewText(String reviewText, int reviewIndex){ } private void sortBuffer() { - Arrays.sort(tokenBuffer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); + Arrays.sort(tokenBuffer,0, tokenBufferPointer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); } private void saveBuffer() throws IOException { @@ -287,19 +291,18 @@ private void createProductIndex() { * The index is created using the k-1-in-k front coding method. */ private void createTokenIndex(){ - // Convert the current tokenDict of {token:termId} pairs to {token:[docId1,#freq1,docId2,#freq2,...]} format. 
- this.prepareTokenDict(); - // todo: need to sort the dictionary by keys! +// // Convert the current tokenDict of {token:termId} pairs to {token:[docId1,#freq1,docId2,#freq2,...]} format. +// this.prepareTokenDict(); + LinkedList tokens = new LinkedList<>(tokenDict.keySet()); - ArrayList> vals = new ArrayList<>(tokenDict.values()); + Collections.sort(tokens); + ArrayList> vals = new ArrayList<>(tokenDict.values()); // TODO: I think we can throw away the termIDs at this point + tokenDict = null; int k = 8; - KFront kf = new KFront(true); kf.createKFront(k, tokens); - TokensIndex tIdx = new TokensIndex(k, this.dir); - tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); - + tIdx.insertData2(kf.getTable(), kf.getConcatString(), dir + "/1"); saveToDir(TOKEN_INDEX_FILE, tIdx); } diff --git a/src/webdata/PairsLoader.java b/src/webdata/PairsLoader.java new file mode 100644 index 0000000..df2dec5 --- /dev/null +++ b/src/webdata/PairsLoader.java @@ -0,0 +1,32 @@ +package webdata; + +import java.io.EOFException; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.ObjectInputStream; + +public class PairsLoader { + ObjectInputStream ois = null; + + public PairsLoader(String file) { + try { + ois = new ObjectInputStream(new FileInputStream(file)); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public int[] readPair() { + int[] pair = new int[2]; + try { + pair[0] = ois.readInt(); + pair[1] = ois.readInt(); + } catch (EOFException e) { + return null; + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return pair; + } +} diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 7c85d92..f15a760 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -5,6 +5,7 @@ import java.math.BigInteger; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; public class TokensIndex implements Serializable { @@ -113,6 +114,56 @@ public void insertData(List> tokensData, ArrayList> tokensData, String concatString, String pairsFilename) { + dictString = concatString; + PairsLoader pl = new PairsLoader(pairsFilename); + int offset = 0; + int[] curPair = pl.readPair(); // This should correspond to the first token + for (int i=0; i< tokensData.size(); i++){ + List tokenData = tokensData.get(i); + TokenInfo token = new TokenInfo(); + LinkedList invertedIdx = new LinkedList<>(); + + invertedIdx.add(curPair[1]); + invertedIdx.add(1); + token.frequency++; + token.collectionFrequency++; + int[] nextPair = pl.readPair(); + while (nextPair != null && nextPair[0] == curPair[0]){ + if (nextPair[1] == curPair[1]) { // Token repetition inside the same doc + int docFreq = invertedIdx.removeLast(); + invertedIdx.add(docFreq + 1); + } else { + invertedIdx.add(nextPair[1]); + invertedIdx.add(1); + token.collectionFrequency++; + } + token.frequency++; + curPair = nextPair; + nextPair = pl.readPair(); + } + curPair = nextPair; // Save the pair for the next token + + try { + token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + saveInvertedIndex(invertedIdx); + numTokens += token.collectionFrequency; + token.length = tokenData.get(TOKEN_LENGTH).shortValue(); + if (offset == 0){ + token.stringInfo = tokenData.get(POINTER_INDEX).shortValue(); + } else { + token.stringInfo = tokenData.get(PREFIX_INDEX).shortValue(); + } + offset++; + offset = offset % k; + this.data.add(token); + } + } 
+ /** * Create a sub list of the given list containing only the odd/even elements in the array * @param inputList the list that should be sliced From 3a5e5dcaae9210746d7aa5e9266af11ca6cea5f2 Mon Sep 17 00:00:00 2001 From: darkushin Date: Fri, 21 May 2021 14:41:48 +0300 Subject: [PATCH 19/55] Code after debugging, before cleaning --- src/webdata/DataParser.java | 10 ++++---- src/webdata/IndexWriter.java | 16 ++++++------- src/webdata/TokensIndex.java | 44 ++++-------------------------------- 3 files changed, 17 insertions(+), 53 deletions(-) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 4ba3dbe..a2c44c0 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -76,10 +76,12 @@ public Review parseReview(String review){ for (int i=1; i fieldValue = Arrays.asList(field.split(": ")); - switch (fieldValue.get(0)) { - case "text" -> parsedReview.setText(String.join(":", fieldValue.subList(1, fieldValue.size()))); - case "helpfulness" -> parsedReview.setHelpfulness(fieldValue.get(1)); - case "score" -> parsedReview.setScore(fieldValue.get(1)); + if (fieldValue.get(0).equals("text")) { + parsedReview.setText(String.join(":", fieldValue.subList(1, fieldValue.size()))); + } else if (fieldValue.get(0).equals("helpfulness")) { + parsedReview.setHelpfulness(fieldValue.get(1)); + } else if (fieldValue.get(0).equals("score")) { + parsedReview.setScore(fieldValue.get(1)); } } return parsedReview; diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index c72e880..64a6cb0 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -6,7 +6,7 @@ import java.util.*; public class IndexWriter { - private HashMap> tokenDict; // token: tokenId + private HashMap tokenDict; // token: tokenId private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; private TreeMap> reviewIds; @@ -203,8 +203,7 @@ private int addReviewText(String reviewText, int reviewIndex){ } reviewLength += 1; token = token.toLowerCase(); - ArrayList termIdArr = tokenDict.computeIfAbsent(token, k -> new ArrayList(Arrays.asList(tokenDict.size()))); - int termId = termIdArr.get(0); + int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[tokenBufferPointer][0] = termId; tokenBuffer[tokenBufferPointer][1] = reviewIndex; @@ -291,18 +290,14 @@ private void createProductIndex() { * The index is created using the k-1-in-k front coding method. */ private void createTokenIndex(){ -// // Convert the current tokenDict of {token:termId} pairs to {token:[docId1,#freq1,docId2,#freq2,...]} format. 
-// this.prepareTokenDict(); - LinkedList tokens = new LinkedList<>(tokenDict.keySet()); Collections.sort(tokens); - ArrayList> vals = new ArrayList<>(tokenDict.values()); // TODO: I think we can throw away the termIDs at this point tokenDict = null; int k = 8; KFront kf = new KFront(true); kf.createKFront(k, tokens); TokensIndex tIdx = new TokensIndex(k, this.dir); - tIdx.insertData2(kf.getTable(), kf.getConcatString(), dir + "/1"); + tIdx.insertData(kf.getTable(), kf.getConcatString(), dir + "/1"); saveToDir(TOKEN_INDEX_FILE, tIdx); } @@ -382,7 +377,7 @@ private void prepareTokenDict(){ } else { // save the values of the previous token: String token = invertedTokenDict.get(previousTermId); - tokenDict.put(token, tokenVals); +// tokenDict.put(token, tokenVals); // start a new array for the new term: tokenVals = new ArrayList<>(Arrays.asList(docId, 1)); @@ -415,6 +410,9 @@ public static void main(String[] args) { String dir = "./Data_Index"; IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); +// Comparator cmp = Comparator.comparing(a -> indexWriter.invertedTokenDict.get(a)); + +// indexWriter.isFileSorted("./Data_Index/1", cmp); System.out.println("here"); } } \ No newline at end of file diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index f15a760..59259e1 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -44,7 +44,7 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; - private ArrayList data; + public ArrayList data; private String dictString; private int numTokens; // the total number of tokens in the collection, including repetitions private int k; @@ -78,43 +78,7 @@ private void createRandomAccessFile(){ } } - /** - * Insert the given information of token properties into the index format that should be saved. - * @param tokensData the data of the token containing its pointer/prefix length and token length as created in the KFront class. - * @param tokensVals a list of reviewId-num appearances of reviews containing every token and the number the token appeared in every review. - * @param concatString the concatenated string of all tokens in the collection, created by the KFront class. 
- */ - public void insertData(List> tokensData, ArrayList> tokensVals, String concatString){ - dictString = concatString; - int offset = 0; - for (int i=0; i< tokensData.size(); i++){ - List tokenData = tokensData.get(i); - List tokenVal = tokensVals.get(i); - TokenInfo token = new TokenInfo(); - token.length = tokenData.get(TOKEN_LENGTH).shortValue(); - token.frequency = (short) (tokenVal.size() / 2); - token.collectionFrequency = (short) subListVals(tokenVal, "even").stream().mapToInt(Integer::intValue).sum(); - numTokens += token.getCollectionFrequency(); - try { - token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); - } catch (IOException e) { - System.out.println("Error occurred while accessing the token_inverted_index file"); - e.printStackTrace(); - System.exit(1); - } - saveInvertedIndex(tokenVal); - if (offset == 0){ - token.stringInfo = tokenData.get(POINTER_INDEX).shortValue(); - } else { - token.stringInfo = tokenData.get(PREFIX_INDEX).shortValue(); - } - offset++; - offset = offset % k; - this.data.add(token); - } - } - - public void insertData2(List> tokensData, String concatString, String pairsFilename) { + public void insertData(List> tokensData, String concatString, String pairsFilename) { dictString = concatString; PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; @@ -136,9 +100,9 @@ public void insertData2(List> tokensData, String concatString, Str } else { invertedIdx.add(nextPair[1]); invertedIdx.add(1); - token.collectionFrequency++; + token.frequency++; } - token.frequency++; + token.collectionFrequency++; curPair = nextPair; nextPair = pl.readPair(); } From da01cbe481fb462cb797ec82338bc2ee5bc52276 Mon Sep 17 00:00:00 2001 From: darkushin Date: Sat, 22 May 2021 18:02:30 +0300 Subject: [PATCH 20/55] Cleaned code, before optimization --- src/webdata/DataParser.java | 29 ------------- src/webdata/ExternalMergeSort.java | 15 ------- src/webdata/IndexWriter.java | 68 +----------------------------- 3 files changed, 1 insertion(+), 111 deletions(-) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index a2c44c0..5ec8687 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -86,34 +86,5 @@ public Review parseReview(String review){ } return parsedReview; } - -// public static void main(String[] args) throws IOException { -// String inputFile = "./100.txt"; -// BufferedReader br = new BufferedReader(new FileReader(inputFile)); -// String line; -// StringBuilder review = new StringBuilder(); -// List data = new ArrayList<>(); -// boolean stopFlag = false; -// int i = 0; -// while(!stopFlag && (line = br.readLine()) != null) { -// if (line.contains("product/productId")){ -// if (i > 0) { -// data.add(review.toString()); -// stopFlag = true; -// } -// else { -// review.append(line); -// i++; -// } -// } -// else{ -// review.append(line); -// } -// } -// DataParser dt = new DataParser(); -// dt.parseData(data); -// List reviews = dt.getParsedData(); -// System.out.println("daniel"); -// } } diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 195c136..785229e 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -19,7 +19,6 @@ public class ExternalMergeSort { ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; this.numFiles = numFiles; -// this.filePrefix = filePrefix; this.pairsInBlock = pairsInBlock; this.dir = dir; this.iteration = 1; @@ -95,7 +94,6 @@ private void merge() throws 
IOException { this.clearOutputBlock(); this.loadAll(); while (!this.areAllDequesEmpty()){ -// ArrayList heads = getHeads(); int minIndex = this.getMin(); this.extractMin(minIndex); } @@ -104,19 +102,6 @@ private void merge() throws IOException { savedFiles++; } -// private ArrayList getHeads() { -// ArrayList heads = new ArrayList<>(); -// for (int i=0; i cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); for (int j = 1; j <= tokenFilesNumber; j++) { @@ -343,76 +342,11 @@ private void saveToDir(String name, Object obj) { } } - /** - * Read the termID-docID file, and convert all the appearances to the format of token:[doc1-#appearances, doc2-#appearance] - * this way, the same code as in ex1 can be used to create the token index. - */ - private void prepareTokenDict(){ - // todo: change the fileName to be according to the directory! - String fileName = this.dir + "/iteration_2/1"; - FileInputStream fileIn = null; - ObjectInputStream termFile = null; - try { - fileIn = new FileInputStream(fileName); - termFile = new ObjectInputStream(fileIn); - } catch (IOException e) { - e.printStackTrace(); - } - - // read all the integers from the file until reaching EOF - try{ - int previousTermId = 0; - int previousDocId = 0; - ArrayList tokenVals = new ArrayList<>(); // odd places-docId, even places-freq in doc. - while (true){ // todo: ugly solution, any better idea? - int termId = termFile.readInt(); - int docId = termFile.readInt(); - if (termId == previousTermId){ - if (docId == previousDocId){ // token already appeared in the doc - increment the frequency - tokenVals.set(tokenVals.size()-1, tokenVals.get(tokenVals.size()-1) + 1); - } else { // first appearance of the token in this doc - tokenVals.addAll(Arrays.asList(docId, 1)); - previousDocId = docId; - } - } else { - // save the values of the previous token: - String token = invertedTokenDict.get(previousTermId); -// tokenDict.put(token, tokenVals); - - // start a new array for the new term: - tokenVals = new ArrayList<>(Arrays.asList(docId, 1)); - previousTermId = termId; - previousDocId = docId; - } - } - - } catch (EOFException e){ // reached EOF and finished converting all tokens. - return; - } catch (Exception e){ - e.printStackTrace(); - System.out.println("Error occurred while converting token dict."); - System.exit(1); - } - - - // while we didn't reach EOF, read two integers at a time - termID and docID. - // for every such pair, check if the termID is the same as the termID of the previous: - // If not - find the token matching to the termID (using invertedTermId dict) and add the list created here to the tokenDict. - // If yes - continue to update the list of this token - this list is the same as in ex1: pairs of docId-#appearances, i.e. 
for every document count the appearances of the token in the doc (can be done easily because they are consecutive in this case)/ - // For every pair, as the termID is the same, check if the docId matches the previous docId: - // If yes - raise the count for this docId - // If not - add a new entry for this docId and set its appearances to 1 - - } - public static void main(String[] args) { - String inputFile = "./1000.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; String dir = "./Data_Index"; IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); -// Comparator cmp = Comparator.comparing(a -> indexWriter.invertedTokenDict.get(a)); - -// indexWriter.isFileSorted("./Data_Index/1", cmp); System.out.println("here"); } } \ No newline at end of file From d8c4a18e02ea08711a0972c9200104fd26a2e67e Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 24 May 2021 11:26:18 +0300 Subject: [PATCH 21/55] Updated dictString writing in TokensIndex --- src/webdata/ExternalMergeSort.java | 2 +- src/webdata/IndexWriter.java | 29 +++++++++++++++++++++-------- src/webdata/TokensIndex.java | 10 ++++++++-- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 785229e..6d190f3 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -14,7 +14,7 @@ public class ExternalMergeSort { private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. - private int AVAILABLE_BLOCKS = 1000; + private int AVAILABLE_BLOCKS = 50000; ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 56413c4..ac4275f 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -4,6 +4,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.Date; + public class IndexWriter { private HashMap tokenDict; // token: tokenId @@ -22,7 +24,11 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int TOKEN_BUFFER_SIZE = 5000; // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. + private static final int M = 50000; + private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. + + int NUM_REVIEWS = 100000; // todo: remove before submission! 
+ /** * Given product review data, creates an on disk index @@ -100,6 +106,9 @@ private void createDicts(String inputFile){ int length = addReviewText(review.getText(), i); addReviewId(review, i, length); i++; + + // todo: remove this part - is used only to test with specific number of reviews + if (i > NUM_REVIEWS) { break;} } this.sortBuffer(); try { @@ -112,14 +121,14 @@ private void createDicts(String inputFile){ Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); - for (int j = 1; j <= tokenFilesNumber; j++) { - System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); - System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); - } +// for (int j = 1; j <= tokenFilesNumber; j++) { +// System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); +// System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); +// } ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); ems.sort(); - System.out.println(isFileSorted(dir + "/1", cmp)); +// System.out.println(isFileSorted(dir + "/1", cmp)); } // TODO: for debugging. Remove this later @@ -292,7 +301,7 @@ private void createTokenIndex(){ LinkedList tokens = new LinkedList<>(tokenDict.keySet()); Collections.sort(tokens); tokenDict = null; - int k = 8; + int k = 256; KFront kf = new KFront(true); kf.createKFront(k, tokens); TokensIndex tIdx = new TokensIndex(k, this.dir); @@ -343,10 +352,14 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; + String inputFile = "./1000.txt"; String dir = "./Data_Index"; + long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); + long endTime = new Date().getTime(); + System.out.println("Indexing Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); System.out.println("here"); } } \ No newline at end of file diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 59259e1..bc565c8 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.*; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; @@ -46,6 +47,7 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { public ArrayList data; private String dictString; + private int dictBytes; private int numTokens; // the total number of tokens in the collection, including repetitions private int k; private String dir; @@ -54,6 +56,7 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { public TokensIndex(int k, String dir) { this.data = new ArrayList<>(); this.dictString = null; + this.dictBytes = 0; this.numTokens = 0; this.k = k; this.dir = dir; @@ -126,6 +129,7 @@ public void insertData(List> tokensData, String concatString, Stri offset = offset % k; this.data.add(token); } + this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } /** @@ -247,7 +251,8 @@ public int search(String str) { private void readObject(ObjectInputStream inputFile) throws IOException, ClassNotFoundException { k = inputFile.readInt(); - dictString = 
inputFile.readUTF(); + dictBytes = inputFile.readInt(); + dictString = new String(inputFile.readNBytes(dictBytes), StandardCharsets.UTF_8); numTokens = inputFile.readInt(); data = (ArrayList) inputFile.readObject(); @@ -255,7 +260,8 @@ private void readObject(ObjectInputStream inputFile) throws IOException, ClassNo private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeInt(this.k); - outputFile.writeUTF(this.dictString); + outputFile.writeInt(this.dictBytes); + outputFile.writeBytes(this.dictString); outputFile.writeInt(this.numTokens); outputFile.writeObject(this.data); } From 5e6ebbe30ab505c25682906055c8fe7a24d10959 Mon Sep 17 00:00:00 2001 From: Daniel Arkushin Date: Mon, 24 May 2021 11:52:27 +0300 Subject: [PATCH 22/55] test cs computers --- src/webdata/IndexWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index ac4275f..4899998 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -16,7 +16,7 @@ public class IndexWriter { private int[][] tokenBuffer; // Array of termID, docID pairs. Regular array to sort in-place private int tokenBufferPointer; private int tokenFilesNumber = 0; - +// test private String dir; private static final String PRODUCT_INDEX_FILE = "product_index.txt"; From f427cdb7b03ac3c1c766f273d63ab4f48456ca95 Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 24 May 2021 11:55:42 +0300 Subject: [PATCH 23/55] time --- src/webdata/IndexWriter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index ac4275f..fbcc65e 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -88,8 +88,8 @@ private void createDicts(String inputFile){ e.printStackTrace(); } + long startTime = new Date().getTime(); this.clearBuffer(); - DataLoader dataLoader = null; DataParser dataParser = new DataParser(); try { @@ -117,7 +117,8 @@ private void createDicts(String inputFile){ e.printStackTrace(); System.exit(1); } - + long endTime = new Date().getTime(); + System.out.println("Reading data & writing files took " + ((endTime - startTime) / 1000) + " Seconds"); Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); From 4ca2e6d76cd685ba75c2f87047295b048c6fe2a2 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 24 May 2021 13:27:00 +0300 Subject: [PATCH 24/55] Added time info --- src/webdata/ExternalMergeSort.java | 1 + src/webdata/IndexWriter.java | 37 +++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 6d190f3..da6adbb 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -49,6 +49,7 @@ public void sort(){ this.removeDir(dir + folderName + iteration); // remove the temp dir in which the files of this iteration were stored numFiles = savedFiles; savedFiles = 0; + System.out.println("Number of files in iteration: " + iteration + "is: " + numFiles); iteration++; } File sorted = new File(dir + folderName + iteration + "/1"); diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 4899998..57663c9 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -13,10 +13,10 @@ public class IndexWriter { private TreeMap> productIds; private TreeMap> reviewIds; - private int[][] tokenBuffer; // Array of termID, docID pairs. 
Regular array to sort in-place + private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; + ; // Array of termID, docID pairs. Regular array to sort in-place private int tokenBufferPointer; private int tokenFilesNumber = 0; -// test private String dir; private static final String PRODUCT_INDEX_FILE = "product_index.txt"; @@ -24,10 +24,10 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int M = 50000; + private static final int M = 100000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 100000; // todo: remove before submission! + int NUM_REVIEWS = 1000000; // todo: remove before submission! /** @@ -38,11 +38,24 @@ public void write(String inputFile, String dir) { this.dir = dir; createDir(); createDicts(inputFile); + long startTime = new Date().getTime(); createProductIndex(); + long endTime = new Date().getTime(); + System.out.println("Create Product Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + + startTime = new Date().getTime(); createReviewIndex(); + endTime = new Date().getTime(); + System.out.println("Create Review Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + productIds = null; reviewIds = null; // Clears memory? + + startTime = new Date().getTime(); createTokenIndex(); + endTime = new Date().getTime(); + System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + } /** @@ -99,6 +112,7 @@ private void createDicts(String inputFile){ System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } + long startTime = new Date().getTime(); int i=1; for (String s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); @@ -110,6 +124,7 @@ private void createDicts(String inputFile){ // todo: remove this part - is used only to test with specific number of reviews if (i > NUM_REVIEWS) { break;} } + this.sortBuffer(); try { this.saveBuffer(); @@ -118,16 +133,23 @@ private void createDicts(String inputFile){ System.exit(1); } + long endTime = new Date().getTime(); + System.out.println("Data Loading And Saving Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + this.tokenBuffer = null; // free the token buffer space Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); // for (int j = 1; j <= tokenFilesNumber; j++) { // System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); // System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); // } - + startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); + System.out.println("Number of files before merging: " + tokenFilesNumber); ems.sort(); + endTime = new Date().getTime(); + System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + // System.out.println(isFileSorted(dir + "/1", cmp)); } @@ -245,7 +267,6 @@ private void saveBuffer() throws IOException { } private void clearBuffer() { - tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; 
tokenBufferPointer = 0; } @@ -352,8 +373,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { -// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; - String inputFile = "./1000.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); From 42812521fc7b5cb1ae955654242ff409b8aaeffd Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 24 May 2021 15:37:27 +0300 Subject: [PATCH 25/55] saveInvertedIndex time improvement --- src/webdata/IndexWriter.java | 19 ++++++++++++++++++- src/webdata/ProductIndex.java | 16 ++++++++++++---- src/webdata/TokensIndex.java | 34 +++++++++++++++++++++++++++------- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 57663c9..5a596e0 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -27,7 +27,7 @@ public class IndexWriter { private static final int M = 100000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 1000000; // todo: remove before submission! + int NUM_REVIEWS = 10000; // todo: remove before submission! /** @@ -320,14 +320,31 @@ private void createProductIndex() { */ private void createTokenIndex(){ LinkedList tokens = new LinkedList<>(tokenDict.keySet()); + long startTime = new Date().getTime(); Collections.sort(tokens); + long endTime = new Date().getTime(); + System.out.println("Token Index After Sort: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + tokenDict = null; + startTime = new Date().getTime(); + int k = 256; KFront kf = new KFront(true); kf.createKFront(k, tokens); + endTime = new Date().getTime(); + System.out.println("Token Index After KFront: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + + startTime = new Date().getTime(); TokensIndex tIdx = new TokensIndex(k, this.dir); tIdx.insertData(kf.getTable(), kf.getConcatString(), dir + "/1"); + endTime = new Date().getTime(); + System.out.println("Token Index Inserting Data: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + + startTime = new Date().getTime(); saveToDir(TOKEN_INDEX_FILE, tIdx); + endTime = new Date().getTime(); + System.out.println("Token Index Saving File: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + } /** diff --git a/src/webdata/ProductIndex.java b/src/webdata/ProductIndex.java index 2e32a7f..6e6f14a 100644 --- a/src/webdata/ProductIndex.java +++ b/src/webdata/ProductIndex.java @@ -1,6 +1,7 @@ package webdata; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -36,11 +37,13 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException private ArrayList data; private String dictString; + private int dictBytes; private int k; public ProductIndex(int k) { - data = new ArrayList<>(); - dictString = null; + this.data = new ArrayList<>(); + this.dictString = null; + this.dictBytes = 0; this.k = k; } @@ -63,6 +66,7 @@ public void insertData(List> inData, String concatString) { offset = offset % k; data.add(pf); } + this.dictBytes = 
this.dictString.getBytes(StandardCharsets.UTF_8).length; } /** @@ -122,14 +126,18 @@ public int search(String str) { private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { k = inputFile.readInt(); - dictString = inputFile.readUTF(); + dictBytes = inputFile.readInt(); + dictString = new String(inputFile.readNBytes(dictBytes), StandardCharsets.UTF_8); +// dictString = inputFile.readUTF(); data = (ArrayList) inputFile.readObject(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeInt(k); - outputFile.writeUTF(dictString); + outputFile.writeInt(this.dictBytes); + outputFile.writeBytes(this.dictString); +// outputFile.writeUTF(dictString); outputFile.writeObject(data); } diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index bc565c8..1e1469b 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -4,10 +4,7 @@ import java.io.*; import java.math.BigInteger; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; +import java.util.*; public class TokensIndex implements Serializable { public class TokenInfo implements Serializable{ @@ -52,6 +49,8 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { private int k; private String dir; private RandomAccessFile invertedIndexFile; + private long invertedAll = 0; + private long invertedSave = 0; public TokensIndex(int k, String dir) { this.data = new ArrayList<>(); @@ -85,11 +84,14 @@ public void insertData(List> tokensData, String concatString, Stri dictString = concatString; PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; + long cumAll = 0; + long cumSave = 0; int[] curPair = pl.readPair(); // This should correspond to the first token for (int i=0; i< tokensData.size(); i++){ + long startTime = new Date().getTime(); List tokenData = tokensData.get(i); TokenInfo token = new TokenInfo(); - LinkedList invertedIdx = new LinkedList<>(); + ArrayList invertedIdx = new ArrayList<>(); invertedIdx.add(curPair[1]); invertedIdx.add(1); @@ -98,7 +100,7 @@ public void insertData(List> tokensData, String concatString, Stri int[] nextPair = pl.readPair(); while (nextPair != null && nextPair[0] == curPair[0]){ if (nextPair[1] == curPair[1]) { // Token repetition inside the same doc - int docFreq = invertedIdx.removeLast(); + int docFreq = invertedIdx.remove(invertedIdx.size()-1); invertedIdx.add(docFreq + 1); } else { invertedIdx.add(nextPair[1]); @@ -111,6 +113,10 @@ public void insertData(List> tokensData, String concatString, Stri } curPair = nextPair; // Save the pair for the next token + long endTime = new Date().getTime(); + cumAll += (endTime - startTime); + + startTime = new Date().getTime(); try { token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); } catch (IOException e) { @@ -118,6 +124,9 @@ public void insertData(List> tokensData, String concatString, Stri System.exit(1); } saveInvertedIndex(invertedIdx); + endTime = new Date().getTime(); + cumSave += (endTime - startTime); + numTokens += token.collectionFrequency; token.length = tokenData.get(TOKEN_LENGTH).shortValue(); if (offset == 0){ @@ -129,6 +138,10 @@ public void insertData(List> tokensData, String concatString, Stri offset = offset % k; this.data.add(token); } + System.out.println("CumAll: " + cumAll); + System.out.println("CumSave: " + cumSave); + System.out.println("InvertedAll: " + invertedAll); + 
System.out.println("InvertedSave: " + invertedSave); this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } @@ -155,17 +168,24 @@ private List subListVals(List inputList, String type){ private void saveInvertedIndex(List valsList) { try { // change the reviewIds (odd indices) to a difference list (except for the first id): + long start = new Date().getTime(); + for (int i = valsList.size()-2; i>0; i = i - 2){ valsList.set(i, valsList.get(i) - valsList.get(i-2)); } - + long end = new Date().getTime(); + invertedAll += (end-start); StringBuilder stringCodes = new StringBuilder(); for (int num : valsList) { String code = Encoding.deltaEncode(num); stringCodes.append(code); } byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + + start = new Date().getTime(); this.invertedIndexFile.write(codeBytes); + end = new Date().getTime(); + invertedSave += (end-start); } catch (Exception e){ System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); From 1981b1e526524c682081be9f3760dc57665e0d75 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 24 May 2021 18:54:15 +0300 Subject: [PATCH 26/55] Before changing saveInvertedIndex to save integers --- src/webdata/IndexWriter.java | 6 +---- src/webdata/TokensIndex.java | 50 ++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 5a596e0..4ef8fcb 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -27,7 +27,7 @@ public class IndexWriter { private static final int M = 100000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 10000; // todo: remove before submission! + int NUM_REVIEWS = 100000; // todo: remove before submission! /** @@ -340,11 +340,7 @@ private void createTokenIndex(){ endTime = new Date().getTime(); System.out.println("Token Index Inserting Data: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - startTime = new Date().getTime(); saveToDir(TOKEN_INDEX_FILE, tIdx); - endTime = new Date().getTime(); - System.out.println("Token Index Saving File: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - } /** diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 1e1469b..71de7b5 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -49,7 +49,8 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { private int k; private String dir; private RandomAccessFile invertedIndexFile; - private long invertedAll = 0; + private long invertedDiff = 0; + private long invertedEncode = 0; private long invertedSave = 0; public TokensIndex(int k, String dir) { @@ -59,14 +60,14 @@ public TokensIndex(int k, String dir) { this.numTokens = 0; this.k = k; this.dir = dir; - createRandomAccessFile(); + createOutputFile(); } /** * Create a new RandomAccessFile to write the tokens inverted index into. * If such a file already exists, first remove it. 
*/ - private void createRandomAccessFile(){ + private void createOutputFile(){ try { File file = new File(this.dir + "/" + TOKEN_INVERTED_INDEX_FILE); if (file.exists()){ @@ -84,10 +85,14 @@ public void insertData(List> tokensData, String concatString, Stri dictString = concatString; PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; - long cumAll = 0; - long cumSave = 0; + long insert = 0; + long insertSave = 0; + int invertedPtr = 0; int[] curPair = pl.readPair(); // This should correspond to the first token for (int i=0; i< tokensData.size(); i++){ + if (i % (tokensData.size()/10) == 0){ + System.out.println("Finished " + i + " tokens. Time: " + (insert + insertSave)); + } long startTime = new Date().getTime(); List tokenData = tokensData.get(i); TokenInfo token = new TokenInfo(); @@ -114,7 +119,7 @@ public void insertData(List> tokensData, String concatString, Stri curPair = nextPair; // Save the pair for the next token long endTime = new Date().getTime(); - cumAll += (endTime - startTime); + insert += (endTime - startTime); startTime = new Date().getTime(); try { @@ -123,9 +128,11 @@ public void insertData(List> tokensData, String concatString, Stri e.printStackTrace(); System.exit(1); } - saveInvertedIndex(invertedIdx); +// token.invertedIndexPtr = invertedPtr; +// saveInvertedIndex(invertedIdx); + endTime = new Date().getTime(); - cumSave += (endTime - startTime); + insertSave += (endTime - startTime); numTokens += token.collectionFrequency; token.length = tokenData.get(TOKEN_LENGTH).shortValue(); @@ -137,10 +144,15 @@ public void insertData(List> tokensData, String concatString, Stri offset++; offset = offset % k; this.data.add(token); + + token = null; + invertedIdx = null; + tokenData = null; } - System.out.println("CumAll: " + cumAll); - System.out.println("CumSave: " + cumSave); - System.out.println("InvertedAll: " + invertedAll); + System.out.println("insert: " + insert); + System.out.println("insertSave: " + insertSave); + System.out.println("InvertedDiff: " + invertedDiff); + System.out.println("InvertedEncode: " + invertedEncode); System.out.println("InvertedSave: " + invertedSave); this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } @@ -165,7 +177,7 @@ private List subListVals(List inputList, String type){ * Encodes the integers given in the integer list using delta encoding, and saves them in the invertedIndexFile. * @param valsList a list with number that should be encoded and saved in the inverted index file. 
*/ - private void saveInvertedIndex(List valsList) { + private int saveInvertedIndex(List valsList) { try { // change the reviewIds (odd indices) to a difference list (except for the first id): long start = new Date().getTime(); @@ -174,23 +186,29 @@ private void saveInvertedIndex(List valsList) { valsList.set(i, valsList.get(i) - valsList.get(i-2)); } long end = new Date().getTime(); - invertedAll += (end-start); + invertedDiff += (end-start); + + start = new Date().getTime(); StringBuilder stringCodes = new StringBuilder(); for (int num : valsList) { String code = Encoding.deltaEncode(num); stringCodes.append(code); } byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); - +// byte[] codeBytes = new byte[1000]; + end = new Date().getTime(); + invertedEncode += (end-start); start = new Date().getTime(); - this.invertedIndexFile.write(codeBytes); + this.invertedIndexFile.write(codeBytes, 0, codeBytes.length); end = new Date().getTime(); invertedSave += (end-start); - } catch (Exception e){ + return codeBytes.length; + } catch (Exception e) { System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); System.exit(1); } + return 0; } /** From 9e035c36852da120e43ccd5691bdf95fbd93458d Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 24 May 2021 20:03:02 +0300 Subject: [PATCH 27/55] Buffered RandomAccessFile --- src/webdata/TokensIndex.java | 66 ++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 71de7b5..6fa361a 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -89,6 +89,14 @@ public void insertData(List> tokensData, String concatString, Stri long insertSave = 0; int invertedPtr = 0; int[] curPair = pl.readPair(); // This should correspond to the first token + FileOutputStream fis = null; + ObjectOutputStream bis = null; + try { + fis = new FileOutputStream(this.invertedIndexFile.getFD()); + bis = new ObjectOutputStream(fis); + } catch (IOException e) { + e.printStackTrace(); + } for (int i=0; i< tokensData.size(); i++){ if (i % (tokensData.size()/10) == 0){ System.out.println("Finished " + i + " tokens. Time: " + (insert + insertSave)); @@ -130,7 +138,11 @@ public void insertData(List> tokensData, String concatString, Stri } // token.invertedIndexPtr = invertedPtr; // saveInvertedIndex(invertedIdx); - + try { + bis.writeObject(invertedIdx); + } catch (IOException e) { + e.printStackTrace(); + } endTime = new Date().getTime(); insertSave += (endTime - startTime); @@ -177,38 +189,48 @@ private List subListVals(List inputList, String type){ * Encodes the integers given in the integer list using delta encoding, and saves them in the invertedIndexFile. * @param valsList a list with number that should be encoded and saved in the inverted index file. 
*/ - private int saveInvertedIndex(List valsList) { + private void saveInvertedIndex(List valsList) { try { // change the reviewIds (odd indices) to a difference list (except for the first id): long start = new Date().getTime(); - for (int i = valsList.size()-2; i>0; i = i - 2){ - valsList.set(i, valsList.get(i) - valsList.get(i-2)); - } - long end = new Date().getTime(); - invertedDiff += (end-start); - - start = new Date().getTime(); - StringBuilder stringCodes = new StringBuilder(); - for (int num : valsList) { - String code = Encoding.deltaEncode(num); - stringCodes.append(code); - } - byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); +// for (int i = valsList.size()-2; i>0; i = i - 2){ +// valsList.set(i, valsList.get(i) - valsList.get(i-2)); +// } +// long end = new Date().getTime(); +// invertedDiff += (end-start); +// +// start = new Date().getTime(); +// StringBuilder stringCodes = new StringBuilder(); +// for (int num : valsList) { +// String code = Encoding.deltaEncode(num); +// stringCodes.append(code); +// } +// byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); // byte[] codeBytes = new byte[1000]; - end = new Date().getTime(); +// for (int val: valsList){ +// this.invertedIndexFile.writeInt(val); +// } + FileOutputStream fis = new FileOutputStream(this.invertedIndexFile.getFD()); + ObjectOutputStream bis = new ObjectOutputStream(fis); + bis.writeObject(valsList); + + long end = new Date().getTime(); invertedEncode += (end-start); - start = new Date().getTime(); - this.invertedIndexFile.write(codeBytes, 0, codeBytes.length); - end = new Date().getTime(); - invertedSave += (end-start); - return codeBytes.length; + + + +// start = new Date().getTime(); +// this.invertedIndexFile.write(codeBytes, 0, codeBytes.length); +// end = new Date().getTime(); +// invertedSave += (end-start); +// return codeBytes.length; } catch (Exception e) { System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); System.exit(1); } - return 0; +// return 0; } /** From 20b8d41ee2d680c9c8246bbd62e87ab415fba0cc Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 12:20:30 +0300 Subject: [PATCH 28/55] reviewIndex optimization start --- src/webdata/IndexWriter.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 4ef8fcb..53ada44 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -27,7 +27,7 @@ public class IndexWriter { private static final int M = 100000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 100000; // todo: remove before submission! + int NUM_REVIEWS = 1000000; // todo: remove before submission! /** @@ -52,7 +52,7 @@ public void write(String inputFile, String dir) { reviewIds = null; // Clears memory? 
startTime = new Date().getTime(); - createTokenIndex(); +// createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); @@ -349,10 +349,12 @@ private void createTokenIndex(){ private void createReviewIndex() { // Revise the review dictionary to the correct structure & change productIDs to product index LinkedList> dictValues = new LinkedList<>(); + long start = new Date().getTime(); for (int review : reviewIds.keySet()) { ArrayList vals = reviewIds.get(review); ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); new_vals.set(ReviewIndex.PRODUCTID_INDEX, productIds.headMap(vals.get(0)).size()); +// new_vals.set(ReviewIndex.PRODUCTID_INDEX, 0); String[] helpf = vals.get(2).split("/"); new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); @@ -360,10 +362,19 @@ private void createReviewIndex() { new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); dictValues.add(new_vals); } + long end = new Date().getTime(); + System.out.println("Reviews data: " + (end-start)); + + start = new Date().getTime(); ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(dictValues); + end = new Date().getTime(); + System.out.println("Insert data: " + (end - start)); + start = new Date().getTime(); saveToDir(REVIEW_INDEX_FILE, rIndex); + end = new Date().getTime(); + System.out.println("Save Data: " + (end-start)); } /** From 4377a53a053f670aca1274034153bae4840f20d1 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 12:47:28 +0300 Subject: [PATCH 29/55] Changed reviewIndex to use hashMap --- src/webdata/IndexWriter.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 53ada44..4c5d8f3 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -27,7 +27,7 @@ public class IndexWriter { private static final int M = 100000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 1000000; // todo: remove before submission! + int NUM_REVIEWS = 100000; // todo: remove before submission! 
/** @@ -349,12 +349,17 @@ private void createTokenIndex(){ private void createReviewIndex() { // Revise the review dictionary to the correct structure & change productIDs to product index LinkedList> dictValues = new LinkedList<>(); + HashMap productDict = new HashMap<>(productIds.size()); + int i = 0; + for (String productId: productIds.keySet()){ + productDict.put(productId, i); + i++; + } long start = new Date().getTime(); for (int review : reviewIds.keySet()) { ArrayList vals = reviewIds.get(review); ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); - new_vals.set(ReviewIndex.PRODUCTID_INDEX, productIds.headMap(vals.get(0)).size()); -// new_vals.set(ReviewIndex.PRODUCTID_INDEX, 0); + new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); String[] helpf = vals.get(2).split("/"); new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); From e405f225fac5009610e798a4e4b38e7d86cbd53e Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 14:51:58 +0300 Subject: [PATCH 30/55] Improved tokenIndex, before changing all LinkedLists --- src/webdata/IndexWriter.java | 12 +-------- src/webdata/KFront.java | 4 +-- src/webdata/TokensIndex.java | 51 ++++++++++++++++++------------------ 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 4c5d8f3..957bdcc 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -52,7 +52,7 @@ public void write(String inputFile, String dir) { reviewIds = null; // Clears memory? startTime = new Date().getTime(); -// createTokenIndex(); + createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); @@ -355,7 +355,6 @@ private void createReviewIndex() { productDict.put(productId, i); i++; } - long start = new Date().getTime(); for (int review : reviewIds.keySet()) { ArrayList vals = reviewIds.get(review); ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); @@ -367,19 +366,10 @@ private void createReviewIndex() { new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); dictValues.add(new_vals); } - long end = new Date().getTime(); - System.out.println("Reviews data: " + (end-start)); - - start = new Date().getTime(); ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(dictValues); - end = new Date().getTime(); - System.out.println("Insert data: " + (end - start)); - start = new Date().getTime(); saveToDir(REVIEW_INDEX_FILE, rIndex); - end = new Date().getTime(); - System.out.println("Save Data: " + (end-start)); } /** diff --git a/src/webdata/KFront.java b/src/webdata/KFront.java index 6132471..f37fd2d 100644 --- a/src/webdata/KFront.java +++ b/src/webdata/KFront.java @@ -13,7 +13,7 @@ public class KFront { private boolean saveLength; public KFront() { - table = new LinkedList<>(); + table = new ArrayList<>(); concatString = null; saveLength = false; } @@ -25,7 +25,7 @@ public KFront(List> outputTable) { } public KFront(boolean saveLength) { - table = new LinkedList<>(); + table = new ArrayList<>(); concatString = null; this.saveLength = saveLength; } diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 6fa361a..e941991 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -85,6 +85,7 @@ public void insertData(List> tokensData, String concatString, 
Stri dictString = concatString; PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; + long insert = 0; long insertSave = 0; int invertedPtr = 0; @@ -137,12 +138,12 @@ public void insertData(List> tokensData, String concatString, Stri System.exit(1); } // token.invertedIndexPtr = invertedPtr; -// saveInvertedIndex(invertedIdx); - try { - bis.writeObject(invertedIdx); - } catch (IOException e) { - e.printStackTrace(); - } + saveInvertedIndex(invertedIdx); +// try { +// bis.writeObject(invertedIdx); +// } catch (IOException e) { +// e.printStackTrace(); +// } endTime = new Date().getTime(); insertSave += (endTime - startTime); @@ -194,32 +195,30 @@ private void saveInvertedIndex(List valsList) { // change the reviewIds (odd indices) to a difference list (except for the first id): long start = new Date().getTime(); -// for (int i = valsList.size()-2; i>0; i = i - 2){ -// valsList.set(i, valsList.get(i) - valsList.get(i-2)); -// } -// long end = new Date().getTime(); -// invertedDiff += (end-start); -// -// start = new Date().getTime(); -// StringBuilder stringCodes = new StringBuilder(); -// for (int num : valsList) { -// String code = Encoding.deltaEncode(num); -// stringCodes.append(code); -// } -// byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + for (int i = valsList.size()-2; i>0; i = i - 2){ + valsList.set(i, valsList.get(i) - valsList.get(i-2)); + } + long end = new Date().getTime(); + invertedDiff += (end-start); + + start = new Date().getTime(); + StringBuilder stringCodes = new StringBuilder(); + for (int num : valsList) { + String code = Encoding.deltaEncode(num); + stringCodes.append(code); + } + byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + this.invertedIndexFile.write(codeBytes); // byte[] codeBytes = new byte[1000]; // for (int val: valsList){ // this.invertedIndexFile.writeInt(val); // } - FileOutputStream fis = new FileOutputStream(this.invertedIndexFile.getFD()); - ObjectOutputStream bis = new ObjectOutputStream(fis); - bis.writeObject(valsList); +// FileOutputStream fis = new FileOutputStream(this.invertedIndexFile.getFD()); +// ObjectOutputStream bis = new ObjectOutputStream(fis); +// bis.writeObject(valsList); - long end = new Date().getTime(); + end = new Date().getTime(); invertedEncode += (end-start); - - - // start = new Date().getTime(); // this.invertedIndexFile.write(codeBytes, 0, codeBytes.length); // end = new Date().getTime(); From 95aafb64f980d05decb8209d5688c9c7acfa7054 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 15:12:00 +0300 Subject: [PATCH 31/55] Removed LinkedLists --- src/webdata/DataLoader.java | 3 ++- src/webdata/DataParser.java | 2 +- src/webdata/IndexWriter.java | 6 +++--- src/webdata/TokensIndex.java | 38 ++++++++---------------------------- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java index da0df2e..430d5d1 100644 --- a/src/webdata/DataLoader.java +++ b/src/webdata/DataLoader.java @@ -4,6 +4,7 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -36,7 +37,7 @@ public String readSingleReview() { } public List readMultipleReviews(int num) { - LinkedList ret = new LinkedList<>(); + ArrayList ret = new ArrayList<>(); for (int i = 0; i < num; i++) { ret.add(readSingleReview()); } diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 
5ec8687..7f8172a 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -57,7 +57,7 @@ public void setText(String text) { // } public List parseData(List rawReviews){ - LinkedList allReviews = new LinkedList<>(); + ArrayList allReviews = new ArrayList<>(); for (String review: rawReviews){ allReviews.add(parseReview(review)); } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 957bdcc..c924eec 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -300,7 +300,7 @@ private void addReviewId(DataParser.Review review, int reviewId, int length) { * Creates and saves to the disk the product index, i.e. all the information that is related to products. */ private void createProductIndex() { - LinkedList ids = new LinkedList<>(productIds.keySet()); + ArrayList ids = new ArrayList<>(productIds.keySet()); ArrayList> vals = new ArrayList<>(productIds.values()); int k = 8; KFront kf = new KFront(); @@ -319,7 +319,7 @@ private void createProductIndex() { * The index is created using the k-1-in-k front coding method. */ private void createTokenIndex(){ - LinkedList tokens = new LinkedList<>(tokenDict.keySet()); + ArrayList tokens = new ArrayList<>(tokenDict.keySet()); long startTime = new Date().getTime(); Collections.sort(tokens); long endTime = new Date().getTime(); @@ -348,7 +348,7 @@ private void createTokenIndex(){ */ private void createReviewIndex() { // Revise the review dictionary to the correct structure & change productIDs to product index - LinkedList> dictValues = new LinkedList<>(); + ArrayList> dictValues = new ArrayList<>(); HashMap productDict = new HashMap<>(productIds.size()); int i = 0; for (String productId: productIds.keySet()){ diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index e941991..ce610d5 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -88,16 +88,8 @@ public void insertData(List> tokensData, String concatString, Stri long insert = 0; long insertSave = 0; - int invertedPtr = 0; int[] curPair = pl.readPair(); // This should correspond to the first token - FileOutputStream fis = null; - ObjectOutputStream bis = null; - try { - fis = new FileOutputStream(this.invertedIndexFile.getFD()); - bis = new ObjectOutputStream(fis); - } catch (IOException e) { - e.printStackTrace(); - } + for (int i=0; i< tokensData.size(); i++){ if (i % (tokensData.size()/10) == 0){ System.out.println("Finished " + i + " tokens. 
Time: " + (insert + insertSave)); @@ -137,13 +129,7 @@ public void insertData(List> tokensData, String concatString, Stri e.printStackTrace(); System.exit(1); } -// token.invertedIndexPtr = invertedPtr; saveInvertedIndex(invertedIdx); -// try { -// bis.writeObject(invertedIdx); -// } catch (IOException e) { -// e.printStackTrace(); -// } endTime = new Date().getTime(); insertSave += (endTime - startTime); @@ -208,28 +194,20 @@ private void saveInvertedIndex(List valsList) { stringCodes.append(code); } byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); - this.invertedIndexFile.write(codeBytes); -// byte[] codeBytes = new byte[1000]; -// for (int val: valsList){ -// this.invertedIndexFile.writeInt(val); -// } -// FileOutputStream fis = new FileOutputStream(this.invertedIndexFile.getFD()); -// ObjectOutputStream bis = new ObjectOutputStream(fis); -// bis.writeObject(valsList); - end = new Date().getTime(); invertedEncode += (end-start); -// start = new Date().getTime(); -// this.invertedIndexFile.write(codeBytes, 0, codeBytes.length); -// end = new Date().getTime(); -// invertedSave += (end-start); -// return codeBytes.length; + + start = new Date().getTime(); + this.invertedIndexFile.write(codeBytes); + end = new Date().getTime(); + invertedSave += (end-start); + + } catch (Exception e) { System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); System.exit(1); } -// return 0; } /** From b10d7a94b08f4068d2854c6ad1c6f602bacaf361 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 16:54:39 +0300 Subject: [PATCH 32/55] TokensIndex without encoding --- src/webdata/TokensIndex.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index ce610d5..78ce5d5 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -188,12 +188,18 @@ private void saveInvertedIndex(List valsList) { invertedDiff += (end-start); start = new Date().getTime(); - StringBuilder stringCodes = new StringBuilder(); +// StringBuilder stringCodes = new StringBuilder(); +// for (int num : valsList) { +// String code = Encoding.deltaEncode(num); +// stringCodes.append(code); +// } +// byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos); for (int num : valsList) { - String code = Encoding.deltaEncode(num); - stringCodes.append(code); + dos.writeInt(num); } - byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + byte[] codeBytes = baos.toByteArray(); end = new Date().getTime(); invertedEncode += (end-start); From c85587d909810188a25fb6b0bc4b14b3f2bca94c Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 25 May 2021 18:53:45 +0300 Subject: [PATCH 33/55] Basic Test are working. before investigating data loader & parser --- src/webdata/Encoding.java | 2 +- src/webdata/IndexReader.java | 16 ++++++++++++++-- src/webdata/IndexWriter.java | 4 ++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/webdata/Encoding.java b/src/webdata/Encoding.java index 817927b..bb17448 100644 --- a/src/webdata/Encoding.java +++ b/src/webdata/Encoding.java @@ -149,7 +149,7 @@ public static int[] groupVarintDecode(byte[] encoding) { /** * Convert the given list of id-1, num-appearances-1, id-2, num-appearances-2... where the ids are given by their - * differences to a list where every id entry are the full id number. 
+ * differences to a list where every id entry is the full id number. */ public static List diffToIds(List vals){ for (int i = 2; i < vals.size() - 1; i = i + 2){ diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index bf6ff58..ddeddff 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -1,7 +1,10 @@ package webdata; import java.io.*; +import java.nio.ByteBuffer; +import java.nio.IntBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Enumeration; @@ -164,9 +167,13 @@ public Enumeration getReviewsWithToken(String token) { e.printStackTrace(); System.exit(1); } - ArrayList vals = new ArrayList(Encoding.deltaDecode(dest).subList(0, numReviews)); - Encoding.diffToIds(vals); + ArrayList vals = new ArrayList<>(); + for (int i = 0; i < dest.length; i = i +4){ + byte[] numBytes = Arrays.copyOfRange(dest, i, i+4); + vals.add(ByteBuffer.wrap(numBytes).getInt()); + } + Encoding.diffToIds(vals); return Collections.enumeration(vals); } @@ -203,4 +210,9 @@ public Enumeration getProductReviews(String productId) { } return Collections.enumeration(reviews); } + +// public static void main(String[] args) { +// IndexReader indexReader = new IndexReader("./Data_index"); +// indexReader.getReviewsWithToken("0"); +// } } \ No newline at end of file diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index c924eec..b64ee2a 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -392,8 +392,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./1000.txt"; +// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; + String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); From 5c80f65e3fb031d831935b88562b0e1177e56e15 Mon Sep 17 00:00:00 2001 From: nirnts Date: Wed, 26 May 2021 10:21:52 +0300 Subject: [PATCH 34/55] groupvarint encoding for inverted index --- src/webdata/Encoding.java | 79 +++++++++++++++++++++++++++++++----- src/webdata/IndexReader.java | 2 +- src/webdata/TokensIndex.java | 18 ++++---- 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/src/webdata/Encoding.java b/src/webdata/Encoding.java index 817927b..2b16167 100644 --- a/src/webdata/Encoding.java +++ b/src/webdata/Encoding.java @@ -1,6 +1,7 @@ package webdata; import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -13,18 +14,21 @@ public class Encoding { * Encode the given number using gamma encoding. * The encoded output is a string representing the bytes of the number. */ - public static String gammaEncode(int num) { + public static void gammaEncode(int num, StringBuilder s) { String offset = Integer.toBinaryString(num + 1); - return "1".repeat(offset.length() - 1) + "0" + offset.substring(1); + s.append("1".repeat(offset.length() - 1)); + s.append("0"); + s.append(offset.substring(1)); } /** * Encode the given number using delta encoding. * The encoded output is a string representing the bytes of the number. 
*/ - public static String deltaEncode(int num) { + public static void deltaEncode(int num, StringBuilder s) { String offset = Integer.toBinaryString(num + 1); - return gammaEncode(offset.length() - 1) + offset.substring(1); + gammaEncode(offset.length() - 1, s); + s.append(offset.substring(1)); } /** @@ -79,13 +83,17 @@ public static ArrayList deltaDecode(byte[] code) { */ public static byte[] toByteArray(String encoding) { // Pad 0s to the nearest multiple of 8 - String padded = encoding + "0".repeat((int) Math.ceil((float) encoding.length() / 8) * 8 - encoding.length()); + StringBuilder s = new StringBuilder(); + s.append(encoding); + s.append("0".repeat((int) Math.ceil((float) encoding.length() / 8) * 8 - encoding.length())); + String padded = s.toString(); byte[] ret = new BigInteger(padded, 2).toByteArray(); - if (ret.length * 8 == padded.length() + 8) { - return Arrays.copyOfRange(ret, 1, ret.length); - } else { - return ret; - } +// if (ret.length * 8 == padded.length() + 8) { +// return Arrays.copyOfRange(ret, 1, ret.length); +// } else { +// return ret; +// } + return new byte[5]; } /** @@ -113,7 +121,7 @@ public static byte[] groupVarintEncode(int[] nums) { byte[] numAsBytes = ByteBuffer.allocate(4).putInt(nums[i]).array(); byte numLength = -1; for (int j = 0; j < numAsBytes.length; j++) { - if (numAsBytes[j] != 0) { + if (numAsBytes[j] != 0 || numLength >= 0) { out.write(numAsBytes[j]); numLength++; } else if (j == numAsBytes.length - 1 & numLength == -1) { @@ -128,6 +136,30 @@ public static byte[] groupVarintEncode(int[] nums) { return output; } + public static byte[] groupVarEncodeMultiple(List nums) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int i; + for (i=0; i + 3 < nums.size(); i=i+4) { + try { + baos.write(groupVarintEncode(new int[]{nums.get(i), nums.get(i + 1), nums.get(i + 2), nums.get(i + 3)})); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } + int[] remainder = new int[4]; + for (int j=0;j < nums.size() - i; j++) { + remainder[j] = nums.get(i+j); + } + try { + baos.write(groupVarintEncode(remainder)); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return baos.toByteArray(); + } + /** * Decode the given byte array to numbers, using Group-Varing-Encoding. */ @@ -147,6 +179,31 @@ public static int[] groupVarintDecode(byte[] encoding) { return output; } + public static ArrayList groupVarDecodeMultiple(byte[] encoding) { + ArrayList ret = new ArrayList<>(); + int bytesRead = 0; + while (bytesRead < encoding.length) { + byte lengths = encoding[bytesRead]; + bytesRead++; + for (int i = 0; i < 4; i++) { + int bytesToRead = 1 + (lengths >> (2 * (3 - i))) & 3; + byte[] o = new byte[bytesToRead]; + for (int b = 0; b < bytesToRead; b++) { + o[b] = encoding[bytesRead + b]; + } + bytesRead += bytesToRead; + ret.add(new BigInteger(1, o).intValue()); + } + } + for (int j=0; j < 4; j++) { + if (ret.get(ret.size() - 1) != 0) { + break; + } + ret.remove(ret.size() - 1); + } + return ret; + } + /** * Convert the given list of id-1, num-appearances-1, id-2, num-appearances-2... where the ids are given by their * differences to a list where every id entry are the full id number. 
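The groupVarEncodeMultiple/groupVarDecodeMultiple helpers added above apply group-varint coding to the gap-encoded posting lists. For reference, a minimal self-contained sketch of the underlying technique follows: one header byte whose four 2-bit fields hold (byte length - 1) for each of four values, followed by the minimal big-endian bytes of the values. The class and method names (GroupVarintSketch, encode4, decode4) are illustrative only, and the exact byte layout is not guaranteed to match this patch's groupVarintEncode.

import java.nio.ByteBuffer;
import java.util.Arrays;

public class GroupVarintSketch {

    // Encode exactly four non-negative ints: one header byte with four 2-bit
    // fields holding (byteLength - 1) per value, then the value bytes.
    static byte[] encode4(int[] nums) {
        ByteBuffer buf = ByteBuffer.allocate(17);   // 1 header byte + at most 16 value bytes
        buf.put((byte) 0);                          // header placeholder
        int header = 0;
        for (int i = 0; i < 4; i++) {
            int len = Math.max(1, (32 - Integer.numberOfLeadingZeros(nums[i]) + 7) / 8);
            header |= (len - 1) << (2 * (3 - i));
            for (int b = len - 1; b >= 0; b--) {    // minimal big-endian bytes
                buf.put((byte) (nums[i] >>> (8 * b)));
            }
        }
        buf.put(0, (byte) header);
        return Arrays.copyOf(buf.array(), buf.position());
    }

    static int[] decode4(byte[] enc) {
        int[] out = new int[4];
        int header = enc[0] & 0xFF;
        int pos = 1;
        for (int i = 0; i < 4; i++) {
            int len = 1 + ((header >>> (2 * (3 - i))) & 3);
            int val = 0;
            for (int b = 0; b < len; b++) {
                val = (val << 8) | (enc[pos++] & 0xFF);
            }
            out[i] = val;
        }
        return out;
    }

    public static void main(String[] args) {
        int[] group = {3, 1, 70000, 255};
        System.out.println(Arrays.toString(decode4(encode4(group))));
    }
}

Running main prints [3, 1, 70000, 255], i.e. the group of four values survives the encode/decode round trip in 7 bytes instead of 16.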
diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index bf6ff58..b0ba2c1 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -164,7 +164,7 @@ public Enumeration getReviewsWithToken(String token) { e.printStackTrace(); System.exit(1); } - ArrayList vals = new ArrayList(Encoding.deltaDecode(dest).subList(0, numReviews)); + ArrayList vals = Encoding.groupVarDecodeMultiple(dest); Encoding.diffToIds(vals); return Collections.enumeration(vals); diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 78ce5d5..9f86f55 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -188,18 +188,14 @@ private void saveInvertedIndex(List valsList) { invertedDiff += (end-start); start = new Date().getTime(); -// StringBuilder stringCodes = new StringBuilder(); -// for (int num : valsList) { -// String code = Encoding.deltaEncode(num); -// stringCodes.append(code); + byte[] codeBytes = Encoding.groupVarEncodeMultiple(valsList); +// byte[] codeBytes = new byte[10]; +// ArrayList tst = Encoding.groupVarDecodeMultiple(codeBytes); +// for (int j=0;j "); +// } // } -// byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos); - for (int num : valsList) { - dos.writeInt(num); - } - byte[] codeBytes = baos.toByteArray(); end = new Date().getTime(); invertedEncode += (end-start); From 2f70bc318f0b1bfb69bc67923a0d358d8b196faa Mon Sep 17 00:00:00 2001 From: nirnts Date: Wed, 26 May 2021 11:38:59 +0300 Subject: [PATCH 35/55] changed stringInfo to int --- src/webdata/TokensIndex.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 9f86f55..c69ff37 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -8,7 +8,7 @@ public class TokensIndex implements Serializable { public class TokenInfo implements Serializable{ - private short stringInfo; // This is either a pointer to the concatenated string, or a prefix size. + private int stringInfo; // This is either a pointer to the concatenated string, or a prefix size. 
private short frequency; private short collectionFrequency; private short length; @@ -19,7 +19,7 @@ public class TokenInfo implements Serializable{ public int getInvertedIdxPtr(){ return invertedIndexPtr;} private void readObject(ObjectInputStream inputFile) throws IOException, ClassNotFoundException { - stringInfo = inputFile.readShort(); + stringInfo = inputFile.readInt(); frequency = inputFile.readShort(); collectionFrequency = inputFile.readShort(); length = inputFile.readShort(); @@ -27,7 +27,7 @@ private void readObject(ObjectInputStream inputFile) throws IOException, ClassNo } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeShort(stringInfo); + outputFile.writeInt(stringInfo); outputFile.writeShort(frequency); outputFile.writeShort(collectionFrequency); outputFile.writeShort(length); @@ -136,9 +136,9 @@ public void insertData(List> tokensData, String concatString, Stri numTokens += token.collectionFrequency; token.length = tokenData.get(TOKEN_LENGTH).shortValue(); if (offset == 0){ - token.stringInfo = tokenData.get(POINTER_INDEX).shortValue(); + token.stringInfo = tokenData.get(POINTER_INDEX); } else { - token.stringInfo = tokenData.get(PREFIX_INDEX).shortValue(); + token.stringInfo = tokenData.get(PREFIX_INDEX); } offset++; offset = offset % k; From 8339097a772c76f5877480388fba0fbf2b91522f Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 27 May 2021 00:54:00 +0300 Subject: [PATCH 36/55] Converting short->int --- src/webdata/IndexReader.java | 6 +++--- src/webdata/IndexWriter.java | 21 +++++++++++++++++++-- src/webdata/ProductIndex.java | 10 +++++----- src/webdata/TokensIndex.java | 24 ++++++++++++------------ 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index 674e15e..c70d72c 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -147,10 +147,10 @@ public Enumeration getReviewsWithToken(String token) { if (currentTokenIdx == -1){ return enumerator; } - int tokenInvertedIdxPtr = tokenIndex.get(currentTokenIdx).getInvertedIdxPtr(); + long tokenInvertedIdxPtr = tokenIndex.get(currentTokenIdx).getInvertedIdxPtr(); int numReviews = tokenIndex.get(currentTokenIdx).getFrequency() * 2; byte[] dest = null; - int nextInvertedIdxPtr; + long nextInvertedIdxPtr; try { RandomAccessFile file = new RandomAccessFile(this.dir + "/" + TOKEN_INVERTED_INDEX_FILE, "r"); if (currentTokenIdx + 1 getReviewsWithToken(String token) { } else { nextInvertedIdxPtr = (int) file.length(); } - int bytesToRead = nextInvertedIdxPtr - tokenInvertedIdxPtr; + int bytesToRead = (int) (nextInvertedIdxPtr - tokenInvertedIdxPtr); dest = new byte[bytesToRead]; file.seek(tokenInvertedIdxPtr); file.read(dest); diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index b64ee2a..697b4df 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -14,6 +14,7 @@ public class IndexWriter { private TreeMap> reviewIds; private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; +// private ArrayList> tokenBuffer = new ArrayList>(); ; // Array of termID, docID pairs. 
Regular array to sort in-place private int tokenBufferPointer; private int tokenFilesNumber = 0; @@ -24,10 +25,10 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int M = 100000; + private static final int M = 25000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 100000; // todo: remove before submission! + int NUM_REVIEWS = 10000000; // todo: remove before submission! /** @@ -114,16 +115,22 @@ private void createDicts(String inputFile){ } long startTime = new Date().getTime(); int i=1; + int readTokens = 0; for (String s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i); int length = addReviewText(review.getText(), i); addReviewId(review, i, length); + readTokens += length; i++; // todo: remove this part - is used only to test with specific number of reviews if (i > NUM_REVIEWS) { break;} + if (i % 100000 == 0) { + System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); + } } + System.out.println("Done Reading"); this.sortBuffer(); try { @@ -238,7 +245,9 @@ private int addReviewText(String reviewText, int reviewIndex){ tokenBuffer[tokenBufferPointer][0] = termId; tokenBuffer[tokenBufferPointer][1] = reviewIndex; tokenBufferPointer++; +// tokenBuffer.add(new ArrayList<>(Arrays.asList(termId, reviewIndex))); if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ +// if (tokenBuffer.size() == TOKEN_BUFFER_SIZE){ this.sortBuffer(); try { this.saveBuffer(); @@ -247,13 +256,17 @@ private int addReviewText(String reviewText, int reviewIndex){ System.exit(1); } this.clearBuffer(); +// this.tokenBuffer.clear(); + } } return reviewLength; } private void sortBuffer() { + System.out.println("In sort"); Arrays.sort(tokenBuffer,0, tokenBufferPointer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); +// tokenBuffer.sort(Comparator.comparing(a -> invertedTokenDict.get(a.get(0)))); } private void saveBuffer() throws IOException { @@ -263,6 +276,10 @@ private void saveBuffer() throws IOException { tokenBufferWriter.writeInt(tokenBuffer[i][0]); tokenBufferWriter.writeInt(tokenBuffer[i][1]); } +// for (int i = 0; i < tokenBuffer.size(); i++) { +// tokenBufferWriter.writeInt(tokenBuffer.get(i).get(0)); +// tokenBufferWriter.writeInt(tokenBuffer.get(i).get(1)); +// } tokenBufferWriter.close(); } diff --git a/src/webdata/ProductIndex.java b/src/webdata/ProductIndex.java index 6e6f14a..5a732c9 100644 --- a/src/webdata/ProductIndex.java +++ b/src/webdata/ProductIndex.java @@ -9,20 +9,20 @@ public class ProductIndex implements Serializable { private class ProductInfo implements Serializable{ - private short stringInfo; // This is either a pointer to the concatenated string, or a prefix size. + private int stringInfo; // This is either a pointer to the concatenated string, or a prefix size. 
private int reviewId; private short spanLength; private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - stringInfo = inputFile.readShort(); + stringInfo = inputFile.readInt(); reviewId = inputFile.readInt(); spanLength = inputFile.readShort(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeShort(stringInfo); + outputFile.writeInt(stringInfo); outputFile.writeInt(reviewId); outputFile.writeShort(spanLength); } @@ -58,9 +58,9 @@ public void insertData(List> inData, String concatString) { pf.reviewId = entry.get(REVIEWID_INDEX); pf.spanLength = entry.get(SPANLENGTH_INDEX).shortValue(); if (offset == 0) { - pf.stringInfo = entry.get(POINTER_INDEX).shortValue(); + pf.stringInfo = entry.get(POINTER_INDEX); } else { - pf.stringInfo = entry.get(PREFIXL_INDEX).shortValue(); + pf.stringInfo = entry.get(PREFIXL_INDEX); } offset++; offset = offset % k; diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index c69ff37..a4fde58 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -9,29 +9,29 @@ public class TokensIndex implements Serializable { public class TokenInfo implements Serializable{ private int stringInfo; // This is either a pointer to the concatenated string, or a prefix size. - private short frequency; - private short collectionFrequency; + private int frequency; + private int collectionFrequency; private short length; - private int invertedIndexPtr; + private long invertedIndexPtr; - public short getFrequency(){ return frequency;} - public short getCollectionFrequency(){ return collectionFrequency;} - public int getInvertedIdxPtr(){ return invertedIndexPtr;} + public int getFrequency(){ return frequency;} + public int getCollectionFrequency(){ return collectionFrequency;} + public long getInvertedIdxPtr(){ return invertedIndexPtr;} private void readObject(ObjectInputStream inputFile) throws IOException, ClassNotFoundException { stringInfo = inputFile.readInt(); - frequency = inputFile.readShort(); - collectionFrequency = inputFile.readShort(); + frequency = inputFile.readInt(); + collectionFrequency = inputFile.readInt(); length = inputFile.readShort(); - invertedIndexPtr = inputFile.readInt(); + invertedIndexPtr = inputFile.readLong(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeInt(stringInfo); - outputFile.writeShort(frequency); - outputFile.writeShort(collectionFrequency); + outputFile.writeInt(frequency); + outputFile.writeInt(collectionFrequency); outputFile.writeShort(length); - outputFile.writeInt(invertedIndexPtr); + outputFile.writeLong(invertedIndexPtr); } } From 4137867df8d083b339f70628f3608b0722512685 Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 30 May 2021 10:16:37 +0300 Subject: [PATCH 37/55] External Merge Sort logs --- src/webdata/ExternalMergeSort.java | 49 +++++++++++++++++++++++++++++- src/webdata/IndexWriter.java | 9 +++--- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index da6adbb..e51a4c2 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -49,7 +49,7 @@ public void sort(){ this.removeDir(dir + folderName + iteration); // remove the temp dir in which the files of this iteration were stored numFiles = savedFiles; savedFiles = 0; - System.out.println("Number of files in iteration: " + iteration + "is: " + numFiles); + System.out.println("Number 
of files in iteration: " + iteration + " is: " + numFiles); iteration++; } File sorted = new File(dir + folderName + iteration + "/1"); @@ -77,6 +77,12 @@ private class SingleMerge{ private int outputPtr; private ObjectOutputStream mergedOutput; + // todo: remove these: + private int extractMinPolling; + private int extractMinSave; + private int extractMinClear; + private int extractMinLoad; + private SingleMerge(int start, int end) throws IOException { this.numPairsInDeque = ((AVAILABLE_BLOCKS - 1) / (end-start+1)) * pairsInBlock; @@ -89,15 +95,41 @@ private SingleMerge(int start, int end) throws IOException { this.fileReaders.add(new ObjectInputStream(fileIn)); this.fileDeques.add(new ArrayDeque(this.numPairsInDeque)); } + + // todo: remove these + this.extractMinPolling = 0; + this.extractMinSave = 0; + this.extractMinClear = 0; + this.extractMinLoad = 0; + } private void merge() throws IOException { this.clearOutputBlock(); + long start = new Date().getTime(); this.loadAll(); + long end = new Date().getTime(); + System.out.println("SingleMerge-loadAll: " + (end-start)); + int getMin = 0; + int extractMin = 0; while (!this.areAllDequesEmpty()){ + start = new Date().getTime(); int minIndex = this.getMin(); + end = new Date().getTime(); + getMin += (end - start); + + start = new Date().getTime(); this.extractMin(minIndex); + end = new Date().getTime(); + extractMin += (end - start); } + System.out.println("SingleMerge getMin: " + getMin); + System.out.println("SingleMerge extractMin: " + extractMin); + System.out.println("SingleMerge extractMinPolling: " + this.extractMinPolling); + System.out.println("SingleMerge extractMinSave: " + this.extractMinSave); + System.out.println("SingleMerge extractMinClear: " + this.extractMinClear); + System.out.println("SingleMerge extractMinLoad: " + this.extractMinLoad); + this.saveOutputBlock(); // needed in case the block wasn't full mergedOutput.close(); savedFiles++; @@ -108,16 +140,31 @@ private void merge() throws IOException { * If the deque is empty, load the next elements in the file given in minIndex. */ private void extractMin(int minIndex) throws IOException { + long start = new Date().getTime(); int[] minPair = fileDeques.get(minIndex).pollFirst(); + long end = new Date().getTime(); + this.extractMinPolling += (end-start); + this.outputBlock[this.outputPtr] = minPair[0]; this.outputBlock[this.outputPtr + 1] = minPair[1]; this.outputPtr += 2; if (this.outputPtr == pairsInBlock * 2){ + start = new Date().getTime(); this.saveOutputBlock(); + end = new Date().getTime(); + this.extractMinSave += (end-start); + start = new Date().getTime(); this.clearOutputBlock(); + end = new Date().getTime(); + this.extractMinClear += (end-start); + } if (fileDeques.get(minIndex).isEmpty() && fileReaders.get(minIndex) != null){ + start = new Date().getTime(); this.loadData(minIndex, numPairsInDeque); + end = new Date().getTime(); + this.extractMinLoad += (end-start); + } } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 697b4df..ec597d9 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -26,9 +26,10 @@ public class IndexWriter { private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; private static final int M = 25000; +// private static final int M = 25; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. 
- int NUM_REVIEWS = 10000000; // todo: remove before submission! + int NUM_REVIEWS = 1000000; // todo: remove before submission! /** @@ -56,6 +57,7 @@ public void write(String inputFile, String dir) { createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + // TODO: remove the merged file that was created (./Data_Index/1) } @@ -409,14 +411,13 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { -// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; - String inputFile = "./1000.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); long endTime = new Date().getTime(); System.out.println("Indexing Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - System.out.println("here"); } } \ No newline at end of file From e3535cb42056da32daa1c8198c5ac96decf7879f Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 30 May 2021 15:22:11 +0300 Subject: [PATCH 38/55] Analysis code --- src/webdata/Analysis.java | 61 ++++++++++++++++++++++++++++++++++++ src/webdata/IndexWriter.java | 6 ++-- 2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 src/webdata/Analysis.java diff --git a/src/webdata/Analysis.java b/src/webdata/Analysis.java new file mode 100644 index 0000000..634a621 --- /dev/null +++ b/src/webdata/Analysis.java @@ -0,0 +1,61 @@ +package webdata; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Random; + +public class Analysis { + private IndexReader indexReader; + private TokensIndex tokensIndex; + private ArrayList randomTokens; + private long getReviewsWithTokenTime; + private long getTokenFrequencytTime; + + public Analysis(IndexReader indexReader){ + this.indexReader = indexReader; + this.tokensIndex = indexReader.tokenIndex; + this.randomTokens = new ArrayList<>(); + this.getReviewsWithTokenTime = 0; + this.getTokenFrequencytTime = 0; + + getRandomTokens(100); + measureGetReviewsWithToken(); + measureTokenFrequencyTime(); + } + + private void measureGetReviewsWithToken() { + long start = new Date().getTime(); + for (String token: this.randomTokens){ + indexReader.getReviewsWithToken(token); + } + long end = new Date().getTime(); + this.getReviewsWithTokenTime = (end - start); + } + + private void measureTokenFrequencyTime() { + long start = new Date().getTime(); + for (String token: this.randomTokens){ + indexReader.getTokenFrequency(token); + } + long end = new Date().getTime(); + this.getTokenFrequencytTime = (end - start); + } + + /** + * Get n random tokens from the index. 
+ */ + public void getRandomTokens(int n){ + Random random = new Random(); + for (int i=0; i < n; i++){ + int randIndex = random.nextInt(this.tokensIndex.data.size()); // get random index + this.randomTokens.add(tokensIndex.getWordAt(randIndex)); + } + } + + public static void main(String[] args) { + IndexReader indexReader = new IndexReader("./Data_index"); + Analysis analysis = new Analysis(indexReader); + System.out.println("getReviewsWithToken runtime: " + analysis.getReviewsWithTokenTime + "(ms)"); + System.out.println("getTokenFrequency runtime: " + analysis.getTokenFrequencytTime + "(ms)"); + } +} \ No newline at end of file diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index ec597d9..b05ba9a 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -58,6 +58,8 @@ public void write(String inputFile, String dir) { endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); // TODO: remove the merged file that was created (./Data_Index/1) + File mergedDataFile = new File(dir + "/1"); + mergedDataFile.delete(); } @@ -411,8 +413,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./1000.txt"; +// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; + String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); From 9c3fd416aea8c684426124edb44ae6daf707eaae Mon Sep 17 00:00:00 2001 From: darkushin Date: Sun, 30 May 2021 17:59:35 +0300 Subject: [PATCH 39/55] Text Creator tests --- src/webdata/DataParser.java | 2 +- src/webdata/IndexWriter.java | 42 +++++++++++++---- src/webdata/TextCreator.java | 88 ++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 src/webdata/TextCreator.java diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 7f8172a..0b542b8 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -77,7 +77,7 @@ public Review parseReview(String review){ String field = fields.get(i); List fieldValue = Arrays.asList(field.split(": ")); if (fieldValue.get(0).equals("text")) { - parsedReview.setText(String.join(":", fieldValue.subList(1, fieldValue.size()))); + parsedReview.setText(String.join(": ", fieldValue.subList(1, fieldValue.size()))); } else if (fieldValue.get(0).equals("helpfulness")) { parsedReview.setHelpfulness(fieldValue.get(1)); } else if (fieldValue.get(0).equals("score")) { diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index b05ba9a..8f04b8d 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -29,7 +29,7 @@ public class IndexWriter { // private static final int M = 25; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 1000000; // todo: remove before submission! + int NUM_REVIEWS = 10000000; // todo: remove before submission! 
/** @@ -39,6 +39,7 @@ public class IndexWriter { public void write(String inputFile, String dir) { this.dir = dir; createDir(); + testParser(inputFile); createDicts(inputFile); long startTime = new Date().getTime(); createProductIndex(); @@ -57,12 +58,39 @@ public void write(String inputFile, String dir) { createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - // TODO: remove the merged file that was created (./Data_Index/1) File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); } + public void testParser(String inputFile){ + DataLoader dataLoader = null; + DataParser dataParser = new DataParser(); + try { + dataLoader = new DataLoader(inputFile); + } catch (IOException e) { + e.printStackTrace(); + System.out.println("Error occurred while reading the reviews input file."); + System.exit(1); + } + int i=1; + int readTokens = 0; + for (String s: dataLoader) { + DataParser.Review review = dataParser.parseReview(s); + int length = addReviewText(review.getText(), i); + readTokens += length; + i++; + if (i > NUM_REVIEWS) { break;} + if (i % 100000 == 0) { + System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); + } + } + System.out.println("TOTAL: " + i + " reviews and " + readTokens + " tokens"); + System.out.println("Done Reading"); + + } + + /** * Delete all index files by removing the given directory */ @@ -150,18 +178,12 @@ private void createDicts(String inputFile){ this.tokenBuffer = null; // free the token buffer space Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); -// for (int j = 1; j <= tokenFilesNumber; j++) { -// System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); -// System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); -// } startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); System.out.println("Number of files before merging: " + tokenFilesNumber); ems.sort(); endTime = new Date().getTime(); System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - -// System.out.println(isFileSorted(dir + "/1", cmp)); } // TODO: for debugging. 
Remove this later @@ -413,8 +435,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { -// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; - String inputFile = "./1000.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); diff --git a/src/webdata/TextCreator.java b/src/webdata/TextCreator.java new file mode 100644 index 0000000..6bb6c88 --- /dev/null +++ b/src/webdata/TextCreator.java @@ -0,0 +1,88 @@ +package webdata; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class TextCreator { + public String dir; + private BufferedReader br; + private StringBuilder stringBuffer; + + + public TextCreator(String inputFile) throws FileNotFoundException { +// this.dir = dir; + br = new BufferedReader(new FileReader(inputFile)); + stringBuffer = new StringBuilder(); + loadText(); +// createDir(); +// saveText(inputFile); + } + + private void loadText(){ + String line; + int lineNum = 0; + int numAppear = 0; + try { + while((line = br.readLine()) != null) { + lineNum++; + if (line.contains("review/text")) { + line = line.toLowerCase(Locale.ROOT); + if (line.contains("labeled")){ + String[] tokens = line.split("[^a-zA-Z0-9]"); +// System.out.println(line); + for (String token: tokens){ + if (token.equals("labeled")){ + numAppear++; + } + } + } + } + if (lineNum % 10000000 == 0){ + System.out.println("Read: " + lineNum + " lines"); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println("Num Appearances = " + numAppear); + } + + + private void saveText(String inputFile) { + DataLoader dataLoader = null; + DataParser dataParser = new DataParser(); + try { + dataLoader = new DataLoader(inputFile); + } catch (IOException e) { + e.printStackTrace(); + System.out.println("Error occurred while reading the reviews input file."); + System.exit(1); + } + for (String s : dataLoader) { + DataParser.Review review = dataParser.parseReview(s); + } + } + + private void createDir(){ + Path path = Path.of(this.dir); + try { + Files.createDirectories(path); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void main(String[] args) throws FileNotFoundException { + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./100.txt"; + TextCreator textCreator = new TextCreator(inputFile); + } +} From ee93588fe2fff95f7eecb9c8e1e10291ace00342 Mon Sep 17 00:00:00 2001 From: nirnts Date: Sun, 30 May 2021 23:17:34 +0300 Subject: [PATCH 40/55] DataParser + Loader changes --- src/webdata/DataLoader.java | 39 ++++++++-------- src/webdata/DataParser.java | 31 ++++++++++++- src/webdata/IndexWriter.java | 87 ++++++++++++++++++++---------------- src/webdata/TextCreator.java | 28 ++++++------ 4 files changed, 111 insertions(+), 74 deletions(-) diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java index 430d5d1..c177b06 100644 --- a/src/webdata/DataLoader.java +++ b/src/webdata/DataLoader.java @@ -6,45 +6,44 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -public class DataLoader 
implements Iterable { +public class DataLoader implements Iterable> { private BufferedReader br; - private StringBuilder stringBuffer; + private ArrayList reviewStrings; public DataLoader(String inputFile) throws FileNotFoundException { br = new BufferedReader(new FileReader(inputFile)); - stringBuffer = new StringBuilder(); + reviewStrings = new ArrayList<>(); } - public String readSingleReview() { + public ArrayList readSingleReview() { String line; try { while((line = br.readLine()) != null) { - if (line.contains("product/productId") && stringBuffer.length() != 0) { - String ret = stringBuffer.toString(); - stringBuffer = new StringBuilder(line); + if (line.contains("product/productId") && reviewStrings.size() != 0) { + ArrayList ret = reviewStrings; + reviewStrings = new ArrayList(); + reviewStrings.add(line); return ret; } - stringBuffer.append(line); + reviewStrings.add(line); } } catch (IOException e) { e.printStackTrace(); System.exit(1); } - return stringBuffer.toString(); + return reviewStrings; } - public List readMultipleReviews(int num) { - ArrayList ret = new ArrayList<>(); - for (int i = 0; i < num; i++) { - ret.add(readSingleReview()); - } - return ret; - } +// public List readMultipleReviews(int num) { +// ArrayList ret = new ArrayList<>(); +// for (int i = 0; i < num; i++) { +// ret.add(readSingleReview()); +// } +// return ret; +// } - public Iterator iterator() { + public Iterator> iterator() { return new Iterator<>() { @Override public boolean hasNext(){ @@ -59,7 +58,7 @@ public boolean hasNext(){ } @Override - public String next() { + public ArrayList next() { return readSingleReview(); } diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index 0b542b8..b8ce136 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -69,7 +69,7 @@ public List parseData(List rawReviews){ * given review, i.e. productId, score, helpfulness and text. 
*/ public Review parseReview(String review){ - List fields = Arrays.asList(review.split("review/")); + ArrayList fields = new ArrayList<>(Arrays.asList(review.split("review/"))); Review parsedReview = new Review(); parsedReview.setProductId(fields.get(0).split(": ")[1].split("product/")[0]); @@ -86,5 +86,34 @@ public Review parseReview(String review){ } return parsedReview; } + + public Review parseReview(ArrayList review){ + Review parsedReview = new Review(); + StringBuilder text = new StringBuilder(); + boolean readingText = false; + for (String line : review){ + if (readingText) { + text.append(line); + continue; + } + int prefix = line.indexOf("/"); + int delim = line.indexOf(":"); + if (prefix == -1 || delim == -1 || delim < prefix) { + continue; + } + String field = line.substring(prefix + 1, delim); + switch (field) { + case "text" -> { + text.append(line.substring(delim + 2)); + readingText = true; + } + case "productId" -> parsedReview.setProductId(line.substring(delim + 2)); + case "helpfulness" -> parsedReview.setHelpfulness(line.substring(delim + 2)); + case "score" -> parsedReview.setScore(line.substring(delim + 2)); + } + } + parsedReview.setText(text.toString()); + return parsedReview; + } } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 8f04b8d..e079d7f 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -11,7 +11,7 @@ public class IndexWriter { private HashMap tokenDict; // token: tokenId private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; - private TreeMap> reviewIds; + private LinkedList> reviewIds; private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; // private ArrayList> tokenBuffer = new ArrayList>(); @@ -39,7 +39,7 @@ public class IndexWriter { public void write(String inputFile, String dir) { this.dir = dir; createDir(); - testParser(inputFile); +// testParser(inputFile); createDicts(inputFile); long startTime = new Date().getTime(); createProductIndex(); @@ -58,37 +58,38 @@ public void write(String inputFile, String dir) { createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + // TODO: remove the merged file that was created (./Data_Index/1) File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); } - public void testParser(String inputFile){ - DataLoader dataLoader = null; - DataParser dataParser = new DataParser(); - try { - dataLoader = new DataLoader(inputFile); - } catch (IOException e) { - e.printStackTrace(); - System.out.println("Error occurred while reading the reviews input file."); - System.exit(1); - } - int i=1; - int readTokens = 0; - for (String s: dataLoader) { - DataParser.Review review = dataParser.parseReview(s); - int length = addReviewText(review.getText(), i); - readTokens += length; - i++; - if (i > NUM_REVIEWS) { break;} - if (i % 100000 == 0) { - System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); - } - } - System.out.println("TOTAL: " + i + " reviews and " + readTokens + " tokens"); - System.out.println("Done Reading"); - - } +// public void testParser(String inputFile){ +// DataLoader dataLoader = null; +// DataParser dataParser = new DataParser(); +// try { +// dataLoader = new DataLoader(inputFile); +// } catch (IOException e) { +// e.printStackTrace(); +// System.out.println("Error occurred while reading the reviews input file."); +// System.exit(1); +// } +// int i=1; 
+// int readTokens = 0; +// for (String s: dataLoader) { +// DataParser.Review review = dataParser.parseReview(s); +// int length = addReviewText(review.getText(), i); +// readTokens += length; +// i++; +// if (i > NUM_REVIEWS) { break;} +// if (i % 100000 == 0) { +// System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); +// } +// } +// System.out.println("TOTAL: " + i + " reviews and " + readTokens + " tokens"); +// System.out.println("Done Reading"); +// +// } /** @@ -124,7 +125,7 @@ private void createDir(){ private void createDicts(String inputFile){ productIds = new TreeMap<>(); tokenDict = new HashMap<>(); - reviewIds = new TreeMap<>(); + reviewIds = new LinkedList<>(); invertedTokenDict = new ArrayList<>(); // todo: remove the directory creation from here! @@ -148,7 +149,7 @@ private void createDicts(String inputFile){ long startTime = new Date().getTime(); int i=1; int readTokens = 0; - for (String s: dataLoader){ + for (ArrayList s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i); int length = addReviewText(review.getText(), i); @@ -178,12 +179,18 @@ private void createDicts(String inputFile){ this.tokenBuffer = null; // free the token buffer space Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); +// for (int j = 1; j <= tokenFilesNumber; j++) { +// System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); +// System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); +// } startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); System.out.println("Number of files before merging: " + tokenFilesNumber); ems.sort(); endTime = new Date().getTime(); System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); + +// System.out.println(isFileSorted(dir + "/1", cmp)); } // TODO: for debugging. Remove this later @@ -331,12 +338,15 @@ private void addProductId(String productId, int reviewId) { * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. 
*/ private void addReviewId(DataParser.Review review, int reviewId, int length) { - reviewIds.put(reviewId, new ArrayList<>()); + ArrayList vals = new ArrayList<>(); + // 0 - productId, 1 - score, 2 - helpfulness, 3 - length - reviewIds.get(reviewId).add(review.getProductId()); - reviewIds.get(reviewId).add(review.getScore()); - reviewIds.get(reviewId).add(review.getHelpfulness()); - reviewIds.get(reviewId).add(String.valueOf(length)); + vals.add(review.getProductId()); + vals.add(review.getScore()); + vals.add(review.getHelpfulness()); + vals.add(String.valueOf(length)); + + reviewIds.add(vals); } /** @@ -398,8 +408,7 @@ private void createReviewIndex() { productDict.put(productId, i); i++; } - for (int review : reviewIds.keySet()) { - ArrayList vals = reviewIds.get(review); + for (ArrayList vals : reviewIds) { ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); String[] helpf = vals.get(2).split("/"); @@ -435,8 +444,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./1000.txt"; +// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; + String inputFile = "./1M.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); diff --git a/src/webdata/TextCreator.java b/src/webdata/TextCreator.java index 6bb6c88..c46f5e4 100644 --- a/src/webdata/TextCreator.java +++ b/src/webdata/TextCreator.java @@ -56,20 +56,20 @@ private void loadText(){ } - private void saveText(String inputFile) { - DataLoader dataLoader = null; - DataParser dataParser = new DataParser(); - try { - dataLoader = new DataLoader(inputFile); - } catch (IOException e) { - e.printStackTrace(); - System.out.println("Error occurred while reading the reviews input file."); - System.exit(1); - } - for (String s : dataLoader) { - DataParser.Review review = dataParser.parseReview(s); - } - } +// private void saveText(String inputFile) { +// DataLoader dataLoader = null; +// DataParser dataParser = new DataParser(); +// try { +// dataLoader = new DataLoader(inputFile); +// } catch (IOException e) { +// e.printStackTrace(); +// System.out.println("Error occurred while reading the reviews input file."); +// System.exit(1); +// } +// for (String s : dataLoader) { +// DataParser.Review review = dataParser.parseReview(s); +// } +// } private void createDir(){ Path path = Path.of(this.dir); From 6a77a7786eaf1b1d0b687122694fee0b7e79f59f Mon Sep 17 00:00:00 2001 From: nirnts Date: Sun, 30 May 2021 23:25:43 +0300 Subject: [PATCH 41/55] again --- src/webdata/DataParser.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index b8ce136..ebd8b85 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -92,7 +92,8 @@ public Review parseReview(ArrayList review){ StringBuilder text = new StringBuilder(); boolean readingText = false; for (String line : review){ - if (readingText) { + if (readingText && !line.equals("")) { + text.append(" "); text.append(line); continue; } From 6f44fcc0470d66d00750b8eff1dd0fb981731370 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 09:09:52 +0300 Subject: [PATCH 42/55] Updated Data Parsing --- src/webdata/IndexWriter.java | 30 ++++++++++++++---------------- src/webdata/TextCreator.java | 26 +++++++++++++++++--------- 2 
files changed, 31 insertions(+), 25 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index e079d7f..7fabc47 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -30,6 +30,7 @@ public class IndexWriter { private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. int NUM_REVIEWS = 10000000; // todo: remove before submission! +// private int numLabeled; /** @@ -39,6 +40,7 @@ public class IndexWriter { public void write(String inputFile, String dir) { this.dir = dir; createDir(); +// numLabeled = 0; // testParser(inputFile); createDicts(inputFile); long startTime = new Date().getTime(); @@ -58,7 +60,6 @@ public void write(String inputFile, String dir) { createTokenIndex(); endTime = new Date().getTime(); System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - // TODO: remove the merged file that was created (./Data_Index/1) File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); @@ -76,7 +77,7 @@ public void write(String inputFile, String dir) { // } // int i=1; // int readTokens = 0; -// for (String s: dataLoader) { +// for (ArrayList s: dataLoader) { // DataParser.Review review = dataParser.parseReview(s); // int length = addReviewText(review.getText(), i); // readTokens += length; @@ -88,10 +89,9 @@ public void write(String inputFile, String dir) { // } // System.out.println("TOTAL: " + i + " reviews and " + readTokens + " tokens"); // System.out.println("Done Reading"); +// System.out.println("labeled appearances: " + numLabeled); // // } - - /** * Delete all index files by removing the given directory */ @@ -164,6 +164,8 @@ private void createDicts(String inputFile){ } } System.out.println("Done Reading"); + System.out.println("TOTAL: Read " + i + " reviews and " + readTokens + " tokens"); + this.sortBuffer(); try { @@ -179,18 +181,12 @@ private void createDicts(String inputFile){ this.tokenBuffer = null; // free the token buffer space Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); -// for (int j = 1; j <= tokenFilesNumber; j++) { -// System.out.println("File " + j + " sorted: " + isFileSorted(dir + "/iteration_1/" + j, cmp)); -// System.out.println("File " + j + " count: " + countNumsInFile(dir + "/iteration_1/" + j)); -// } startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); System.out.println("Number of files before merging: " + tokenFilesNumber); ems.sort(); endTime = new Date().getTime(); System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - -// System.out.println(isFileSorted(dir + "/1", cmp)); } // TODO: for debugging. 
Remove this later @@ -267,12 +263,14 @@ private long countNumsInFile(String fileName) { private int addReviewText(String reviewText, int reviewIndex){ String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens int reviewLength = 0; - for (String token: tokens){ - if (!token.matches("[a-zA-Z0-9]+")){ - continue; - } + String[] cleanTokens = Arrays.stream(tokens).filter(value -> value != null && value.length() > 0).toArray(size -> new String[size]); + + for (String token: cleanTokens){ reviewLength += 1; token = token.toLowerCase(); +// if (token.equals("labeled")){ +// numLabeled++; +// } int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[tokenBufferPointer][0] = termId; @@ -444,8 +442,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { -// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; - String inputFile = "./1M.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./100.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); diff --git a/src/webdata/TextCreator.java b/src/webdata/TextCreator.java index c46f5e4..17eb94f 100644 --- a/src/webdata/TextCreator.java +++ b/src/webdata/TextCreator.java @@ -6,6 +6,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; +import java.util.Date; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -30,20 +32,24 @@ private void loadText(){ String line; int lineNum = 0; int numAppear = 0; + long start = new Date().getTime(); try { while((line = br.readLine()) != null) { lineNum++; if (line.contains("review/text")) { - line = line.toLowerCase(Locale.ROOT); - if (line.contains("labeled")){ - String[] tokens = line.split("[^a-zA-Z0-9]"); +// line = line.toLowerCase(Locale.ROOT); +// if (line.contains("labeled")){ + String[] tokens = line.split("[^a-zA-Z0-9]"); // System.out.println(line); - for (String token: tokens){ - if (token.equals("labeled")){ - numAppear++; - } - } - } + String[] removedNull = Arrays.stream(tokens).filter(value -> value != null && value.length() > 0).toArray(size -> new String[size]); +// for (String token: tokens){ +// if (!token.matches("[a-zA-Z0-9]+")){ +// continue; +// } + numAppear += removedNull.length; +// } +// } + } if (lineNum % 10000000 == 0){ System.out.println("Read: " + lineNum + " lines"); @@ -53,6 +59,8 @@ private void loadText(){ e.printStackTrace(); } System.out.println("Num Appearances = " + numAppear); + long end = new Date().getTime(); + System.out.println("Time: " + (end-start)); } From d0e2e3cbb0bd3ba9ac2336b3e7b8491c53c903e6 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 10:14:16 +0300 Subject: [PATCH 43/55] Cleaned Code, with time measuring --- src/webdata/DataLoader.java | 8 --- src/webdata/DataParser.java | 7 --- src/webdata/ExternalMergeSort.java | 79 +----------------------------- src/webdata/IndexWriter.java | 67 ++----------------------- src/webdata/TokensIndex.java | 41 ---------------- 5 files changed, 4 insertions(+), 198 deletions(-) diff --git a/src/webdata/DataLoader.java b/src/webdata/DataLoader.java index c177b06..fd6c9f3 100644 --- a/src/webdata/DataLoader.java +++ b/src/webdata/DataLoader.java @@ -35,14 +35,6 @@ public 
ArrayList readSingleReview() { return reviewStrings; } -// public List readMultipleReviews(int num) { -// ArrayList ret = new ArrayList<>(); -// for (int i = 0; i < num; i++) { -// ret.add(readSingleReview()); -// } -// return ret; -// } - public Iterator> iterator() { return new Iterator<>() { @Override diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index ebd8b85..f3c79d6 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -44,18 +44,11 @@ public void setText(String text) { } } -// public static final List INTEREST_FIELDS = Arrays.asList("productId", "score", "helpfulness", "text"); - - /** * Given product review data, parses the data and creates a new list where each entry i contains hashmap with the fields * of the review, i.e: productId->value, score->value, helpfulness->value, text->value. * inputFile is the path to the file containing the review data */ -// public DataParser(String inputFile) throws IOException { -// allReviews.add(parse_review(review.toString())); // add the last review -// } - public List parseData(List rawReviews){ ArrayList allReviews = new ArrayList<>(); for (String review: rawReviews){ diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index e51a4c2..62b2e5b 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -14,7 +14,7 @@ public class ExternalMergeSort { private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. private int savedFiles; // number of files that were saved in the current iteration. - private int AVAILABLE_BLOCKS = 50000; + private int AVAILABLE_BLOCKS = 20000; ExternalMergeSort(Comparator cmp, int numFiles, int pairsInBlock, String dir){ this.cmp = cmp; @@ -49,7 +49,6 @@ public void sort(){ this.removeDir(dir + folderName + iteration); // remove the temp dir in which the files of this iteration were stored numFiles = savedFiles; savedFiles = 0; - System.out.println("Number of files in iteration: " + iteration + " is: " + numFiles); iteration++; } File sorted = new File(dir + folderName + iteration + "/1"); @@ -77,13 +76,6 @@ private class SingleMerge{ private int outputPtr; private ObjectOutputStream mergedOutput; - // todo: remove these: - private int extractMinPolling; - private int extractMinSave; - private int extractMinClear; - private int extractMinLoad; - - private SingleMerge(int start, int end) throws IOException { this.numPairsInDeque = ((AVAILABLE_BLOCKS - 1) / (end-start+1)) * pairsInBlock; this.mergedOutput = new ObjectOutputStream(new FileOutputStream(dir + folderName + (iteration+1) + "/" + (savedFiles+1))); @@ -95,41 +87,15 @@ private SingleMerge(int start, int end) throws IOException { this.fileReaders.add(new ObjectInputStream(fileIn)); this.fileDeques.add(new ArrayDeque(this.numPairsInDeque)); } - - // todo: remove these - this.extractMinPolling = 0; - this.extractMinSave = 0; - this.extractMinClear = 0; - this.extractMinLoad = 0; - } private void merge() throws IOException { this.clearOutputBlock(); - long start = new Date().getTime(); this.loadAll(); - long end = new Date().getTime(); - System.out.println("SingleMerge-loadAll: " + (end-start)); - int getMin = 0; - int extractMin = 0; while (!this.areAllDequesEmpty()){ - start = new Date().getTime(); int minIndex = this.getMin(); - end = new Date().getTime(); - getMin += (end - start); - - start = new Date().getTime(); this.extractMin(minIndex); - end = new Date().getTime(); - 
extractMin += (end - start); } - System.out.println("SingleMerge getMin: " + getMin); - System.out.println("SingleMerge extractMin: " + extractMin); - System.out.println("SingleMerge extractMinPolling: " + this.extractMinPolling); - System.out.println("SingleMerge extractMinSave: " + this.extractMinSave); - System.out.println("SingleMerge extractMinClear: " + this.extractMinClear); - System.out.println("SingleMerge extractMinLoad: " + this.extractMinLoad); - this.saveOutputBlock(); // needed in case the block wasn't full mergedOutput.close(); savedFiles++; @@ -140,31 +106,16 @@ private void merge() throws IOException { * If the deque is empty, load the next elements in the file given in minIndex. */ private void extractMin(int minIndex) throws IOException { - long start = new Date().getTime(); int[] minPair = fileDeques.get(minIndex).pollFirst(); - long end = new Date().getTime(); - this.extractMinPolling += (end-start); - this.outputBlock[this.outputPtr] = minPair[0]; this.outputBlock[this.outputPtr + 1] = minPair[1]; this.outputPtr += 2; if (this.outputPtr == pairsInBlock * 2){ - start = new Date().getTime(); this.saveOutputBlock(); - end = new Date().getTime(); - this.extractMinSave += (end-start); - start = new Date().getTime(); this.clearOutputBlock(); - end = new Date().getTime(); - this.extractMinClear += (end-start); - } if (fileDeques.get(minIndex).isEmpty() && fileReaders.get(minIndex) != null){ - start = new Date().getTime(); this.loadData(minIndex, numPairsInDeque); - end = new Date().getTime(); - this.extractMinLoad += (end-start); - } } @@ -191,28 +142,6 @@ private void loadAll() throws IOException { /** Load numbBlocks from the file given by index i to the matching deque*/ private void loadData(int i, int numPairs) throws IOException { -// // TODO: Code for reading -blocks- (not pairs). Remove if not used -// int blocksRead = 0; -// int pairsRead = 0; -// while (blocksRead < numBlocks) { -// int[] pair = new int[2]; -// try { -// pair[0] = fileReaders.get(i).readInt(); -// pair[1] = fileReaders.get(i).readInt(); -// } catch (EOFException e){ -// // Reached end of file. -// fileReaders.get(i).close(); -// fileReaders.set(i, null); -// break; -// } -// fileDeques.get(i).add(pair); -// pairsRead++; -// if (pairsRead == pairsInBlock) { -// pairsRead = 0; -// blocksRead++; -// } -// } - for (int j = 0; j < numPairs; j++) { int[] pair = new int[2]; try { @@ -247,11 +176,5 @@ private void saveOutputBlock() throws IOException { this.mergedOutput.writeInt(this.outputBlock[i]); } } - - - - } - - } diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 7fabc47..38b3374 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -14,8 +14,7 @@ public class IndexWriter { private LinkedList> reviewIds; private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; -// private ArrayList> tokenBuffer = new ArrayList>(); - ; // Array of termID, docID pairs. Regular array to sort in-place + // Array of termID, docID pairs. Regular array to sort in-place private int tokenBufferPointer; private int tokenFilesNumber = 0; private String dir; @@ -26,11 +25,9 @@ public class IndexWriter { private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; private static final int M = 25000; -// private static final int M = 25; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. 
int NUM_REVIEWS = 10000000; // todo: remove before submission! -// private int numLabeled; /** @@ -40,8 +37,6 @@ public class IndexWriter { public void write(String inputFile, String dir) { this.dir = dir; createDir(); -// numLabeled = 0; -// testParser(inputFile); createDicts(inputFile); long startTime = new Date().getTime(); createProductIndex(); @@ -65,33 +60,6 @@ public void write(String inputFile, String dir) { } -// public void testParser(String inputFile){ -// DataLoader dataLoader = null; -// DataParser dataParser = new DataParser(); -// try { -// dataLoader = new DataLoader(inputFile); -// } catch (IOException e) { -// e.printStackTrace(); -// System.out.println("Error occurred while reading the reviews input file."); -// System.exit(1); -// } -// int i=1; -// int readTokens = 0; -// for (ArrayList s: dataLoader) { -// DataParser.Review review = dataParser.parseReview(s); -// int length = addReviewText(review.getText(), i); -// readTokens += length; -// i++; -// if (i > NUM_REVIEWS) { break;} -// if (i % 100000 == 0) { -// System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); -// } -// } -// System.out.println("TOTAL: " + i + " reviews and " + readTokens + " tokens"); -// System.out.println("Done Reading"); -// System.out.println("labeled appearances: " + numLabeled); -// -// } /** * Delete all index files by removing the given directory */ @@ -159,9 +127,6 @@ private void createDicts(String inputFile){ // todo: remove this part - is used only to test with specific number of reviews if (i > NUM_REVIEWS) { break;} - if (i % 100000 == 0) { - System.out.println("Read " + i + " reviews and " + readTokens + " tokens"); - } } System.out.println("Done Reading"); System.out.println("TOTAL: Read " + i + " reviews and " + readTokens + " tokens"); @@ -183,7 +148,6 @@ private void createDicts(String inputFile){ startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); - System.out.println("Number of files before merging: " + tokenFilesNumber); ems.sort(); endTime = new Date().getTime(); System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); @@ -268,17 +232,12 @@ private int addReviewText(String reviewText, int reviewIndex){ for (String token: cleanTokens){ reviewLength += 1; token = token.toLowerCase(); -// if (token.equals("labeled")){ -// numLabeled++; -// } int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict tokenBuffer[tokenBufferPointer][0] = termId; tokenBuffer[tokenBufferPointer][1] = reviewIndex; tokenBufferPointer++; -// tokenBuffer.add(new ArrayList<>(Arrays.asList(termId, reviewIndex))); if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ -// if (tokenBuffer.size() == TOKEN_BUFFER_SIZE){ this.sortBuffer(); try { this.saveBuffer(); @@ -287,17 +246,13 @@ private int addReviewText(String reviewText, int reviewIndex){ System.exit(1); } this.clearBuffer(); -// this.tokenBuffer.clear(); - } } return reviewLength; } private void sortBuffer() { - System.out.println("In sort"); Arrays.sort(tokenBuffer,0, tokenBufferPointer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); -// tokenBuffer.sort(Comparator.comparing(a -> invertedTokenDict.get(a.get(0)))); } private void saveBuffer() throws IOException { @@ -307,10 +262,6 @@ private void saveBuffer() throws IOException { 
tokenBufferWriter.writeInt(tokenBuffer[i][0]); tokenBufferWriter.writeInt(tokenBuffer[i][1]); } -// for (int i = 0; i < tokenBuffer.size(); i++) { -// tokenBufferWriter.writeInt(tokenBuffer.get(i).get(0)); -// tokenBufferWriter.writeInt(tokenBuffer.get(i).get(1)); -// } tokenBufferWriter.close(); } @@ -371,26 +322,14 @@ private void createProductIndex() { */ private void createTokenIndex(){ ArrayList tokens = new ArrayList<>(tokenDict.keySet()); - long startTime = new Date().getTime(); Collections.sort(tokens); - long endTime = new Date().getTime(); - System.out.println("Token Index After Sort: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); tokenDict = null; - startTime = new Date().getTime(); - int k = 256; KFront kf = new KFront(true); kf.createKFront(k, tokens); - endTime = new Date().getTime(); - System.out.println("Token Index After KFront: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - - startTime = new Date().getTime(); TokensIndex tIdx = new TokensIndex(k, this.dir); tIdx.insertData(kf.getTable(), kf.getConcatString(), dir + "/1"); - endTime = new Date().getTime(); - System.out.println("Token Index Inserting Data: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - saveToDir(TOKEN_INDEX_FILE, tIdx); } @@ -442,8 +381,8 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./100.txt"; +// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; + String inputFile = "./1000.txt"; String dir = "./Data_Index"; long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index a4fde58..4f481d8 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -49,9 +49,6 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { private int k; private String dir; private RandomAccessFile invertedIndexFile; - private long invertedDiff = 0; - private long invertedEncode = 0; - private long invertedSave = 0; public TokensIndex(int k, String dir) { this.data = new ArrayList<>(); @@ -86,15 +83,9 @@ public void insertData(List> tokensData, String concatString, Stri PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; - long insert = 0; - long insertSave = 0; int[] curPair = pl.readPair(); // This should correspond to the first token for (int i=0; i< tokensData.size(); i++){ - if (i % (tokensData.size()/10) == 0){ - System.out.println("Finished " + i + " tokens. 
Time: " + (insert + insertSave)); - } - long startTime = new Date().getTime(); List tokenData = tokensData.get(i); TokenInfo token = new TokenInfo(); ArrayList invertedIdx = new ArrayList<>(); @@ -119,10 +110,6 @@ public void insertData(List> tokensData, String concatString, Stri } curPair = nextPair; // Save the pair for the next token - long endTime = new Date().getTime(); - insert += (endTime - startTime); - - startTime = new Date().getTime(); try { token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); } catch (IOException e) { @@ -130,8 +117,6 @@ public void insertData(List> tokensData, String concatString, Stri System.exit(1); } saveInvertedIndex(invertedIdx); - endTime = new Date().getTime(); - insertSave += (endTime - startTime); numTokens += token.collectionFrequency; token.length = tokenData.get(TOKEN_LENGTH).shortValue(); @@ -148,11 +133,6 @@ public void insertData(List> tokensData, String concatString, Stri invertedIdx = null; tokenData = null; } - System.out.println("insert: " + insert); - System.out.println("insertSave: " + insertSave); - System.out.println("InvertedDiff: " + invertedDiff); - System.out.println("InvertedEncode: " + invertedEncode); - System.out.println("InvertedSave: " + invertedSave); this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } @@ -179,32 +159,11 @@ private List subListVals(List inputList, String type){ private void saveInvertedIndex(List valsList) { try { // change the reviewIds (odd indices) to a difference list (except for the first id): - long start = new Date().getTime(); - for (int i = valsList.size()-2; i>0; i = i - 2){ valsList.set(i, valsList.get(i) - valsList.get(i-2)); } - long end = new Date().getTime(); - invertedDiff += (end-start); - - start = new Date().getTime(); byte[] codeBytes = Encoding.groupVarEncodeMultiple(valsList); -// byte[] codeBytes = new byte[10]; -// ArrayList tst = Encoding.groupVarDecodeMultiple(codeBytes); -// for (int j=0;j "); -// } -// } - end = new Date().getTime(); - invertedEncode += (end-start); - - start = new Date().getTime(); this.invertedIndexFile.write(codeBytes); - end = new Date().getTime(); - invertedSave += (end-start); - - } catch (Exception e) { System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); From 19e6879956a88774a61c3978dcea912ef4f8c0d9 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 10:28:40 +0300 Subject: [PATCH 44/55] Changed switch-case in DataParser --- src/webdata/DataParser.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/webdata/DataParser.java b/src/webdata/DataParser.java index f3c79d6..61366e1 100644 --- a/src/webdata/DataParser.java +++ b/src/webdata/DataParser.java @@ -96,14 +96,15 @@ public Review parseReview(ArrayList review){ continue; } String field = line.substring(prefix + 1, delim); - switch (field) { - case "text" -> { - text.append(line.substring(delim + 2)); - readingText = true; - } - case "productId" -> parsedReview.setProductId(line.substring(delim + 2)); - case "helpfulness" -> parsedReview.setHelpfulness(line.substring(delim + 2)); - case "score" -> parsedReview.setScore(line.substring(delim + 2)); + if (field.equals("text")){ + text.append(line.substring(delim + 2)); + readingText = true; + } else if (field.equals("productId")) { + parsedReview.setProductId(line.substring(delim + 2)); + } else if (field.equals("helpfulness")) { + parsedReview.setHelpfulness(line.substring(delim + 2)); + } else if (field.equals("score")) { + 
parsedReview.setScore(line.substring(delim + 2)); } } parsedReview.setText(text.toString()); From 7287e07583f72dff06074bbe25214053140ac859 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 11:27:33 +0300 Subject: [PATCH 45/55] Cleaned code for submission, before running tests again --- src/webdata/IndexReader.java | 8 --- src/webdata/IndexWriter.java | 96 +---------------------------------- src/webdata/ProductIndex.java | 2 - src/webdata/TextCreator.java | 96 ----------------------------------- src/webdata/TokensIndex.java | 17 ------- 5 files changed, 2 insertions(+), 217 deletions(-) delete mode 100644 src/webdata/TextCreator.java diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index c70d72c..8514d96 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -1,10 +1,7 @@ package webdata; import java.io.*; -import java.nio.ByteBuffer; -import java.nio.IntBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Enumeration; @@ -206,9 +203,4 @@ public Enumeration getProductReviews(String productId) { } return Collections.enumeration(reviews); } - -// public static void main(String[] args) { -// IndexReader indexReader = new IndexReader("./Data_index"); -// indexReader.getReviewsWithToken("0"); -// } } \ No newline at end of file diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 38b3374..9c3901a 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -38,26 +38,13 @@ public void write(String inputFile, String dir) { this.dir = dir; createDir(); createDicts(inputFile); - long startTime = new Date().getTime(); createProductIndex(); - long endTime = new Date().getTime(); - System.out.println("Create Product Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - - startTime = new Date().getTime(); createReviewIndex(); - endTime = new Date().getTime(); - System.out.println("Create Review Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - productIds = null; - reviewIds = null; // Clears memory? - - startTime = new Date().getTime(); + reviewIds = null; // Clear memory createTokenIndex(); - endTime = new Date().getTime(); - System.out.println("Create Token Index: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); - } /** @@ -96,7 +83,6 @@ private void createDicts(String inputFile){ reviewIds = new LinkedList<>(); invertedTokenDict = new ArrayList<>(); - // todo: remove the directory creation from here! 
try { Files.createDirectories(Path.of(this.dir + ExternalMergeSort.folderName + "1")); } catch (IOException e) { @@ -114,24 +100,18 @@ private void createDicts(String inputFile){ System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } - long startTime = new Date().getTime(); + // todo: remove i int i=1; - int readTokens = 0; for (ArrayList s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i); int length = addReviewText(review.getText(), i); addReviewId(review, i, length); - readTokens += length; i++; // todo: remove this part - is used only to test with specific number of reviews if (i > NUM_REVIEWS) { break;} } - System.out.println("Done Reading"); - System.out.println("TOTAL: Read " + i + " reviews and " + readTokens + " tokens"); - - this.sortBuffer(); try { this.saveBuffer(); @@ -139,85 +119,13 @@ private void createDicts(String inputFile){ e.printStackTrace(); System.exit(1); } - - long endTime = new Date().getTime(); - System.out.println("Data Loading And Saving Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); - this.tokenBuffer = null; // free the token buffer space Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); - startTime = new Date().getTime(); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); ems.sort(); - endTime = new Date().getTime(); - System.out.println("Merging Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); } - // TODO: for debugging. Remove this later - private boolean isFileSorted(String fileName, Comparator cmp) { - FileInputStream fileIn = null; - ObjectInputStream ois = null; - long tot = 0; - try { - fileIn = new FileInputStream(fileName); - ois = new ObjectInputStream(fileIn); - int prev = ois.readInt(); - int prevDocId = ois.readInt(); - tot++; - while (true) { - int cur = ois.readInt(); - int docId = ois.readInt(); - if (cmp.compare(prev, cur) > 0) { - System.out.println("Terms not sorted. Occured in " + tot); - } else if ((cmp.compare(prev, cur) == 0) && (prevDocId > docId)) { - System.out.println("DocIds not sorted. Occured in " + tot); - } - prev = cur; - prevDocId = docId; - tot++; - } - } catch (EOFException ex) { - System.out.println("Read " + tot + " pairs."); - try { - ois.close(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - return true; - } catch (IOException ex) { - ex.printStackTrace(); - System.exit(1); - } - return true; - } - private long countNumsInFile(String fileName) { - FileInputStream fileIn; - ObjectInputStream ois = null; - long tot = 0; - try { - fileIn = new FileInputStream(fileName); - ois = new ObjectInputStream(fileIn); - while (true) { - ois.readInt(); - tot++; - } - } catch (EOFException ex) { - try { - ois.close(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - return tot; - } catch (IOException ex) { - ex.printStackTrace(); - System.exit(1); - } - return tot; - } - - /** * Split the given text of the i-th review into tokens and add them to the tokens dictionary. * @param reviewText the text of the review that should be added. 
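For reference, the tokenize-and-buffer step that the cleaned-up IndexWriter above still performs can be illustrated with a minimal standalone sketch. The class and field layout below is illustrative only (the real addReviewText writes into a fixed int[][] buffer that is sorted and spilled to disk once full):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

class TokenPairSketch {
    private final HashMap<String, Integer> tokenDict = new HashMap<>();     // token -> termId
    private final ArrayList<String> invertedTokenDict = new ArrayList<>();  // termId -> token
    private final List<int[]> pairs = new ArrayList<>();                    // {termId, reviewId}

    int addReviewText(String reviewText, int reviewId) {
        int length = 0;
        for (String raw : reviewText.split("[^a-zA-Z0-9]")) {
            if (raw.isEmpty()) {
                continue;                     // split() leaves empty strings between separators
            }
            String token = raw.toLowerCase();
            int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size());
            if (termId == invertedTokenDict.size()) {
                invertedTokenDict.add(token); // first time this term is seen
            }
            pairs.add(new int[]{termId, reviewId});
            length++;
        }
        return length;                        // tokens in this review, kept for the review index
    }
}

Sorting the spilled runs by term string (via a comparator over invertedTokenDict) and merging them is what ExternalMergeSort then takes care of.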
diff --git a/src/webdata/ProductIndex.java b/src/webdata/ProductIndex.java index 5a732c9..b79f5a7 100644 --- a/src/webdata/ProductIndex.java +++ b/src/webdata/ProductIndex.java @@ -128,7 +128,6 @@ private void readObject(ObjectInputStream inputFile) throws ClassNotFoundExcepti k = inputFile.readInt(); dictBytes = inputFile.readInt(); dictString = new String(inputFile.readNBytes(dictBytes), StandardCharsets.UTF_8); -// dictString = inputFile.readUTF(); data = (ArrayList) inputFile.readObject(); } @@ -137,7 +136,6 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException outputFile.writeInt(k); outputFile.writeInt(this.dictBytes); outputFile.writeBytes(this.dictString); -// outputFile.writeUTF(dictString); outputFile.writeObject(data); } diff --git a/src/webdata/TextCreator.java b/src/webdata/TextCreator.java deleted file mode 100644 index 17eb94f..0000000 --- a/src/webdata/TextCreator.java +++ /dev/null @@ -1,96 +0,0 @@ -package webdata; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Date; -import java.util.Locale; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - - -public class TextCreator { - public String dir; - private BufferedReader br; - private StringBuilder stringBuffer; - - - public TextCreator(String inputFile) throws FileNotFoundException { -// this.dir = dir; - br = new BufferedReader(new FileReader(inputFile)); - stringBuffer = new StringBuilder(); - loadText(); -// createDir(); -// saveText(inputFile); - } - - private void loadText(){ - String line; - int lineNum = 0; - int numAppear = 0; - long start = new Date().getTime(); - try { - while((line = br.readLine()) != null) { - lineNum++; - if (line.contains("review/text")) { -// line = line.toLowerCase(Locale.ROOT); -// if (line.contains("labeled")){ - String[] tokens = line.split("[^a-zA-Z0-9]"); -// System.out.println(line); - String[] removedNull = Arrays.stream(tokens).filter(value -> value != null && value.length() > 0).toArray(size -> new String[size]); -// for (String token: tokens){ -// if (!token.matches("[a-zA-Z0-9]+")){ -// continue; -// } - numAppear += removedNull.length; -// } -// } - - } - if (lineNum % 10000000 == 0){ - System.out.println("Read: " + lineNum + " lines"); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - System.out.println("Num Appearances = " + numAppear); - long end = new Date().getTime(); - System.out.println("Time: " + (end-start)); - } - - -// private void saveText(String inputFile) { -// DataLoader dataLoader = null; -// DataParser dataParser = new DataParser(); -// try { -// dataLoader = new DataLoader(inputFile); -// } catch (IOException e) { -// e.printStackTrace(); -// System.out.println("Error occurred while reading the reviews input file."); -// System.exit(1); -// } -// for (String s : dataLoader) { -// DataParser.Review review = dataParser.parseReview(s); -// } -// } - - private void createDir(){ - Path path = Path.of(this.dir); - try { - Files.createDirectories(path); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static void main(String[] args) throws FileNotFoundException { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./100.txt"; - TextCreator textCreator = new TextCreator(inputFile); - } -} diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 
4f481d8..ace06cf 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -82,7 +82,6 @@ public void insertData(List> tokensData, String concatString, Stri dictString = concatString; PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; - int[] curPair = pl.readPair(); // This should correspond to the first token for (int i=0; i< tokensData.size(); i++){ @@ -136,22 +135,6 @@ public void insertData(List> tokensData, String concatString, Stri this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } - /** - * Create a sub list of the given list containing only the odd/even elements in the array - * @param inputList the list that should be sliced - * @param type can be `odd` or `even` - * @return a List of integers containing only the elements in odd/even indices of the input array - */ - private List subListVals(List inputList, String type){ - int first = 0; - List subList = new ArrayList<>(); - if (type.equals("even")){ first = 1; } - for (int i = first; i < inputList.size(); i = i + 2){ - subList.add(inputList.get(i)); - } - return subList; - } - /** * Encodes the integers given in the integer list using delta encoding, and saves them in the invertedIndexFile. * @param valsList a list with number that should be encoded and saved in the inverted index file. From c12c01844a2ec7ec28d7f9d15fdab9efa77b846c Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 16:53:48 +0300 Subject: [PATCH 46/55] Submission code --- src/webdata/Encoding.java | 100 ----------------------------------- src/webdata/IndexWriter.java | 9 +--- src/webdata/TokensIndex.java | 2 +- 3 files changed, 3 insertions(+), 108 deletions(-) diff --git a/src/webdata/Encoding.java b/src/webdata/Encoding.java index 0a7addf..3af1e0c 100644 --- a/src/webdata/Encoding.java +++ b/src/webdata/Encoding.java @@ -9,106 +9,6 @@ import java.util.List; public class Encoding { - - /** - * Encode the given number using gamma encoding. - * The encoded output is a string representing the bytes of the number. - */ - public static void gammaEncode(int num, StringBuilder s) { - String offset = Integer.toBinaryString(num + 1); - s.append("1".repeat(offset.length() - 1)); - s.append("0"); - s.append(offset.substring(1)); - } - - /** - * Encode the given number using delta encoding. - * The encoded output is a string representing the bytes of the number. - */ - public static void deltaEncode(int num, StringBuilder s) { - String offset = Integer.toBinaryString(num + 1); - gammaEncode(offset.length() - 1, s); - s.append(offset.substring(1)); - } - - /** - * Decode the given string, which represents a binary sequence using gamma code. - */ - public static ArrayList gammaDecode(String encoding) { - ArrayList output = new ArrayList<>(); - int bitsRead = 0; - while (bitsRead < encoding.length()) { - int length = encoding.substring(bitsRead).indexOf('0'); // Find the first 0 - int offsetLoc = bitsRead + length + 1; - output.add(Integer.parseInt("1" + encoding.substring(offsetLoc, offsetLoc + length), 2) - 1); - bitsRead = offsetLoc + length; - } - return output; - } - - /** - * Decode the given string, which represents a binary sequence using delta code. 
- */ - public static ArrayList deltaDecode(String encoding) { - ArrayList output = new ArrayList<>(); - int bitsRead = 0; - while (bitsRead < encoding.length()) { - int length = encoding.substring(bitsRead).indexOf('0'); // Find the first 0 - int offsetLoc = bitsRead + length + 1; - int actualLength = Integer.parseInt("1" + encoding.substring(offsetLoc, offsetLoc + length), 2); - bitsRead = offsetLoc + length; - - output.add(Integer.parseInt("1" + encoding.substring(bitsRead, bitsRead + actualLength - 1), 2) - 1); - bitsRead += actualLength - 1; - } - return output; - } - - /** - * Decode the given byte array, using gamma code. - */ - public static ArrayList gammaDecode(byte[] code) { - return gammaDecode(byteToString(code)); - } - - /** - * Decode the given byte array, using delta code. - */ - public static ArrayList deltaDecode(byte[] code) { - return deltaDecode(byteToString(code)); - } - - /** - * Convert the given string representing a bit sequence of numbers to a byte array. - */ - public static byte[] toByteArray(String encoding) { - // Pad 0s to the nearest multiple of 8 - StringBuilder s = new StringBuilder(); - s.append(encoding); - s.append("0".repeat((int) Math.ceil((float) encoding.length() / 8) * 8 - encoding.length())); - String padded = s.toString(); - byte[] ret = new BigInteger(padded, 2).toByteArray(); -// if (ret.length * 8 == padded.length() + 8) { -// return Arrays.copyOfRange(ret, 1, ret.length); -// } else { -// return ret; -// } - return new byte[5]; - } - - /** - * Convert the given byte array to a string representing the bits of the byte array. - */ - public static String byteToString(byte[] encoding) { - StringBuilder s = new StringBuilder(); - for (byte b : encoding) { - String binary = Integer.toBinaryString(Byte.toUnsignedInt(b)); - s.append("0".repeat(8 - binary.length())); // toBinaryString removes leading 0's - s.append(binary); - } - return s.toString(); - } - /** * Encode the given list of numbers using Group-Varint-Encoding. The first byte of the resulting byte array * holds the number of bytes required to decode each of the next four numbers. 
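The Group-Varint scheme that Encoding keeps after this cleanup can be sketched for a single group of four non-negative integers as follows. The selector-byte layout assumed here (2 bits per value, value i in bits 2i..2i+1, big-endian bodies) is for illustration only and need not match the exact byte order used by groupVarintEncode:

import java.io.ByteArrayOutputStream;

final class GroupVarintSketch {
    // Encode four non-negative ints: one selector byte whose 2-bit fields hold
    // (byteLength - 1) of each value, followed by the minimal big-endian bytes.
    static byte[] encode(int[] four) {
        assert four.length == 4;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int selector = 0;
        byte[][] bodies = new byte[4][];
        for (int i = 0; i < 4; i++) {
            int v = four[i];
            int len = 1;
            if ((v >>> 8) != 0) len = 2;
            if ((v >>> 16) != 0) len = 3;
            if ((v >>> 24) != 0) len = 4;
            selector |= (len - 1) << (2 * i);
            byte[] b = new byte[len];
            for (int j = 0; j < len; j++) {
                b[j] = (byte) (v >>> (8 * (len - 1 - j)));
            }
            bodies[i] = b;
        }
        out.write(selector);
        for (byte[] b : bodies) out.write(b, 0, b.length);
        return out.toByteArray();
    }

    // Decode one group: read the selector, then rebuild each value from its bytes.
    static int[] decode(byte[] code) {
        int[] four = new int[4];
        int selector = code[0] & 0xFF;
        int pos = 1;
        for (int i = 0; i < 4; i++) {
            int len = ((selector >>> (2 * i)) & 0x3) + 1;
            int v = 0;
            for (int j = 0; j < len; j++) {
                v = (v << 8) | (code[pos++] & 0xFF);
            }
            four[i] = v;
        }
        return four;
    }
}

Because saveInvertedIndex rewrites the review ids in a posting list as differences before encoding, most values tend to fall into the 1- or 2-byte buckets, which is where group varint saves space over fixed 4-byte integers.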
diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 9c3901a..005b7d6 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -4,8 +4,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.Date; - public class IndexWriter { private HashMap tokenDict; // token: tokenId @@ -289,13 +287,10 @@ private void saveToDir(String name, Object obj) { } public static void main(String[] args) { -// String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; - String inputFile = "./1000.txt"; + String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; +// String inputFile = "./1000.txt"; String dir = "./Data_Index"; - long startTime = new Date().getTime(); IndexWriter indexWriter = new IndexWriter(); indexWriter.write(inputFile, dir); - long endTime = new Date().getTime(); - System.out.println("Indexing Time: " + (endTime-startTime) + " Milliseconds = " + ((endTime - startTime) / 1000) + " Seconds"); } } \ No newline at end of file diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index ace06cf..e423c76 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -110,7 +110,7 @@ public void insertData(List> tokensData, String concatString, Stri curPair = nextPair; // Save the pair for the next token try { - token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); + token.invertedIndexPtr = this.invertedIndexFile.getFilePointer(); } catch (IOException e) { e.printStackTrace(); System.exit(1); From 42ea768443a2f38101c07acfecc32c6cfb28cae0 Mon Sep 17 00:00:00 2001 From: darkushin Date: Mon, 31 May 2021 22:11:41 +0300 Subject: [PATCH 47/55] Final Submission --- src/webdata/ExternalMergeSort.java | 1 - src/webdata/IndexWriter.java | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java index 62b2e5b..1800f7b 100644 --- a/src/webdata/ExternalMergeSort.java +++ b/src/webdata/ExternalMergeSort.java @@ -37,7 +37,6 @@ public void sort(){ for (int i = 0; i < Math.ceil((float) numFiles / (AVAILABLE_BLOCKS - 1)); i++) { int end = Math.min(numFiles, (i + 1) * (AVAILABLE_BLOCKS - 1)); try { - // TODO: Handle case when start == end? SingleMerge sm = new SingleMerge(i * (AVAILABLE_BLOCKS - 1) + 1, end); sm.merge(); } catch (IOException e) { diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 005b7d6..c77deb9 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -25,9 +25,6 @@ public class IndexWriter { private static final int M = 25000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - int NUM_REVIEWS = 10000000; // todo: remove before submission! 
- - /** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data @@ -98,7 +95,6 @@ private void createDicts(String inputFile){ System.out.println("Error occurred while reading the reviews input file."); System.exit(1); } - // todo: remove i int i=1; for (ArrayList s: dataLoader){ DataParser.Review review = dataParser.parseReview(s); @@ -106,9 +102,6 @@ private void createDicts(String inputFile){ int length = addReviewText(review.getText(), i); addReviewId(review, i, length); i++; - - // todo: remove this part - is used only to test with specific number of reviews - if (i > NUM_REVIEWS) { break;} } this.sortBuffer(); try { @@ -285,12 +278,4 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } - - public static void main(String[] args) { - String inputFile = "/Users/darkushin/Downloads/Movies_&_TV.txt"; -// String inputFile = "./1000.txt"; - String dir = "./Data_Index"; - IndexWriter indexWriter = new IndexWriter(); - indexWriter.write(inputFile, dir); - } } \ No newline at end of file From 6ffc156ad08c741f91817494ffd02a3a922f8f70 Mon Sep 17 00:00:00 2001 From: Daniel Arkushin Date: Sun, 20 Jun 2021 17:01:09 +0300 Subject: [PATCH 48/55] Ex2 fix --- src/webdata/IndexWriter.java | 63 +++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index c77deb9..c2aaf8d 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -9,7 +9,7 @@ public class IndexWriter { private HashMap tokenDict; // token: tokenId private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; - private LinkedList> reviewIds; +// private LinkedList> reviewIds; private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; // Array of termID, docID pairs. Regular array to sort in-place @@ -22,9 +22,11 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int M = 25000; + private static final int M = 5000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. 
+ private static final int NUM_REVIEWS = 10000000; + /** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data @@ -34,9 +36,13 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); createProductIndex(); - createReviewIndex(); + try{ + createReviewIndex(); + } catch (Exception e){ + e.printStackTrace(); + System.exit(1); + } productIds = null; - reviewIds = null; // Clear memory createTokenIndex(); File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); @@ -73,9 +79,19 @@ private void createDir(){ * @param inputFile the file containing all reviews */ private void createDicts(String inputFile){ + ObjectOutputStream reviewOutput = null; + try { + BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(this.dir + "/reviewIds")); + reviewOutput = new ObjectOutputStream(out); + }catch (IOException e) { + System.out.println("Error occurred while saving the index file: reviewIds"); + e.printStackTrace(); + System.exit(1); + } + productIds = new TreeMap<>(); tokenDict = new HashMap<>(); - reviewIds = new LinkedList<>(); +// reviewIds = new LinkedList<>(); invertedTokenDict = new ArrayList<>(); try { @@ -100,8 +116,18 @@ private void createDicts(String inputFile){ DataParser.Review review = dataParser.parseReview(s); addProductId(review.getProductId(), i); int length = addReviewText(review.getText(), i); - addReviewId(review, i, length); + addReviewId(review, reviewOutput, length); i++; + if (i % 100000 == 0){ + System.out.println("Num Reviews: " + i); + System.out.println("Total Memory: " + Runtime.getRuntime().totalMemory() / (float)(1000000) + " MB" + " (MAX: " + Runtime.getRuntime().maxMemory()/ (float)(1000000) + " MB" + ")"); + System.out.println("Used Memory: " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / (float)(1000000) + " MB"); + System.out.println("Free Memory: " + Runtime.getRuntime().freeMemory() / (float)(1000000) + " MB"); + } + + if (i == NUM_REVIEWS) { + break; + } } this.sortBuffer(); try { @@ -185,7 +211,7 @@ private void addProductId(String productId, int reviewId) { /** * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. */ - private void addReviewId(DataParser.Review review, int reviewId, int length) { + private void addReviewId(DataParser.Review review, ObjectOutputStream reviewOutput, int length) { ArrayList vals = new ArrayList<>(); // 0 - productId, 1 - score, 2 - helpfulness, 3 - length @@ -193,8 +219,13 @@ private void addReviewId(DataParser.Review review, int reviewId, int length) { vals.add(review.getScore()); vals.add(review.getHelpfulness()); vals.add(String.valueOf(length)); - - reviewIds.add(vals); + try { + reviewOutput.writeObject(vals); + reviewOutput.reset(); + } catch (IOException e) { + e.printStackTrace(); + } +// reviewIds.add(vals); } /** @@ -235,7 +266,9 @@ private void createTokenIndex(){ /** * Creates and saves to the disk the review index which hold all information related to reviews. 
*/ - private void createReviewIndex() { + private void createReviewIndex() throws IOException, ClassNotFoundException { + ObjectInputStream reviewIds = new ObjectInputStream(new FileInputStream(this.dir + "/reviewIds")); + // Revise the review dictionary to the correct structure & change productIDs to product index ArrayList> dictValues = new ArrayList<>(); HashMap productDict = new HashMap<>(productIds.size()); @@ -244,7 +277,8 @@ private void createReviewIndex() { productDict.put(productId, i); i++; } - for (ArrayList vals : reviewIds) { + while (reviewIds.available() != 0) { + ArrayList vals = (ArrayList) reviewIds.readObject(); ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); String[] helpf = vals.get(2).split("/"); @@ -278,4 +312,11 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } +// +// public static void main(String[] args) { +// String inputFile = "/cs/+/course/webdata/Movies_&_TV.txt"; +// String dir = "/tmp/Data_Index"; +// IndexWriter indexWriter = new IndexWriter(); +// indexWriter.write(inputFile, dir); +// } } \ No newline at end of file From b8cc0812a975d169ac38fdf20460f02a07ac0eac Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 21 Jun 2021 16:59:21 +0300 Subject: [PATCH 49/55] fixed reviewdict bug --- src/webdata/IndexWriter.java | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index c2aaf8d..d5dcf21 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -36,13 +36,13 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); createProductIndex(); + invertedTokenDict = null; try{ createReviewIndex(); - } catch (Exception e){ + } catch (Exception e) { e.printStackTrace(); System.exit(1); } - productIds = null; createTokenIndex(); File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); @@ -137,6 +137,12 @@ private void createDicts(String inputFile){ System.exit(1); } this.tokenBuffer = null; // free the token buffer space + + try { + reviewOutput.close(); + } catch (IOException e) { + e.printStackTrace(); + } Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); @@ -277,8 +283,14 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { productDict.put(productId, i); i++; } - while (reviewIds.available() != 0) { - ArrayList vals = (ArrayList) reviewIds.readObject(); + productIds = null; + while (true) { + ArrayList vals = null; + try { + vals = (ArrayList) reviewIds.readObject(); + } catch (EOFException ex) { + break; + } ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); String[] helpf = vals.get(2).split("/"); @@ -288,6 +300,8 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); dictValues.add(new_vals); } + reviewIds.close(); + productDict = null; ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(dictValues); @@ -312,11 +326,11 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } -// -// public static void main(String[] args) { -// String inputFile = "/cs/+/course/webdata/Movies_&_TV.txt"; -// String dir = "/tmp/Data_Index"; -// IndexWriter 
indexWriter = new IndexWriter(); -// indexWriter.write(inputFile, dir); -// } + + public static void main(String[] args) { + String inputFile = "./1000.txt"; + String dir = "./Data_Index"; + IndexWriter indexWriter = new IndexWriter(); + indexWriter.write(inputFile, dir); + } } \ No newline at end of file From 42089eef4a08bbcf5681cf194c851b93a17063df Mon Sep 17 00:00:00 2001 From: nirnts Date: Mon, 21 Jun 2021 18:19:58 +0300 Subject: [PATCH 50/55] commented memory improvements --- src/webdata/IndexWriter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index d5dcf21..d8fb2ef 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -36,7 +36,7 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); createProductIndex(); - invertedTokenDict = null; +// invertedTokenDict = null; // TODO: remove? (1) try{ createReviewIndex(); } catch (Exception e) { @@ -283,7 +283,7 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { productDict.put(productId, i); i++; } - productIds = null; +// productIds = null; // TODO: remove? (2) while (true) { ArrayList vals = null; try { @@ -301,7 +301,7 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { dictValues.add(new_vals); } reviewIds.close(); - productDict = null; +// productDict = null; // TODO: remove? (3) ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(dictValues); From c00346e9c345eca9bc997b50b6e442766ee6f3b7 Mon Sep 17 00:00:00 2001 From: darkushin Date: Tue, 22 Jun 2021 22:12:58 +0300 Subject: [PATCH 51/55] not working changes --- src/webdata/IndexWriter.java | 54 +++++++++++++++++++++++------------- src/webdata/ReviewIndex.java | 30 ++++++-------------- 2 files changed, 44 insertions(+), 40 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index d8fb2ef..7a33f35 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -22,11 +22,11 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int M = 5000; + private static final int M = 20000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. private static final int NUM_REVIEWS = 10000000; - +// todo: remove the reviewIds file after index creation! 
/** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data @@ -46,6 +46,8 @@ public void write(String inputFile, String dir) { createTokenIndex(); File mergedDataFile = new File(dir + "/1"); mergedDataFile.delete(); + File reviewIds = new File(dir + "/reviewIds"); + reviewIds.delete(); } /** @@ -276,7 +278,9 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { ObjectInputStream reviewIds = new ObjectInputStream(new FileInputStream(this.dir + "/reviewIds")); // Revise the review dictionary to the correct structure & change productIDs to product index - ArrayList> dictValues = new ArrayList<>(); +// ArrayList> dictValues = new ArrayList<>(); + ArrayList data = new ArrayList<>(); + HashMap productDict = new HashMap<>(productIds.size()); int i = 0; for (String productId: productIds.keySet()){ @@ -284,6 +288,7 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { i++; } // productIds = null; // TODO: remove? (2) + ReviewIndex rIndex = new ReviewIndex(); while (true) { ArrayList vals = null; try { @@ -291,20 +296,31 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { } catch (EOFException ex) { break; } - ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); - new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); + ReviewIndex.ReviewInfo rI = rIndex.new ReviewInfo(); + int[] info = new int[4]; + byte score = (byte) (int) Float.parseFloat(vals.get(1)); + info[ReviewIndex.PRODUCTID_INDEX] = productDict.get(vals.get(0)); String[] helpf = vals.get(2).split("/"); - new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); - new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); - new_vals.set(ReviewIndex.REVIEWLENGTH_INDEX, Integer.parseInt(vals.get(3))); - new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); - dictValues.add(new_vals); + info[ReviewIndex.HELPFNUM_INDEX] = Integer.parseInt(helpf[0]); + info[ReviewIndex.HELPFDNOM_INDEX] = Integer.parseInt(helpf[1]); + info[ReviewIndex.REVIEWLENGTH_INDEX] = Integer.parseInt(vals.get(3)); + rI.encodedInfo = Encoding.groupVarintEncode(info); + rI.score = score; + data.add(rI); + + +// new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); +// String[] helpf = vals.get(2).split("/"); +// new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); +// new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); +// new_vals.set(ReviewIndex.REVIEWLENGTH_INDEX, Integer.parseInt(vals.get(3))); +// new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); +// dictValues.add(new_vals); } reviewIds.close(); // productDict = null; // TODO: remove? 
(3) - ReviewIndex rIndex = new ReviewIndex(); - rIndex.insertData(dictValues); - +// ReviewIndex rIndex = new ReviewIndex(); + rIndex.insertData(data); saveToDir(REVIEW_INDEX_FILE, rIndex); } @@ -327,10 +343,10 @@ private void saveToDir(String name, Object obj) { } } - public static void main(String[] args) { - String inputFile = "./1000.txt"; - String dir = "./Data_Index"; - IndexWriter indexWriter = new IndexWriter(); - indexWriter.write(inputFile, dir); - } +// public static void main(String[] args) { +// String inputFile = "./1000.txt"; +// String dir = "./Data_Index"; +// IndexWriter indexWriter = new IndexWriter(); +// indexWriter.write(inputFile, dir); +// } } \ No newline at end of file diff --git a/src/webdata/ReviewIndex.java b/src/webdata/ReviewIndex.java index 8d0cbb9..45ca444 100644 --- a/src/webdata/ReviewIndex.java +++ b/src/webdata/ReviewIndex.java @@ -5,19 +5,19 @@ import java.util.List; public class ReviewIndex implements Serializable{ - private class ReviewInfo implements Serializable { - private byte[] encodedInfo; - private byte score; + public class ReviewInfo implements Serializable { + public byte[] encodedInfo; + public byte score; private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - encodedInfo = (byte[]) inputFile.readObject(); + encodedInfo = (byte[]) inputFile.readUnshared(); score = inputFile.readByte(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeObject(encodedInfo); + outputFile.writeUnshared(encodedInfo); outputFile.writeByte(score); } } @@ -33,20 +33,8 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException /** * insert the given data into the list containing all the information of reviews. */ - public void insertData(List> inData) { - data = new ArrayList<>(); - for (List entry : inData) { - ReviewInfo rI = new ReviewInfo(); - int[] info = new int[4]; - byte score = (byte) entry.get(4).intValue(); - info[PRODUCTID_INDEX] = entry.get(PRODUCTID_INDEX); - info[HELPFNUM_INDEX] = entry.get(HELPFNUM_INDEX); - info[HELPFDNOM_INDEX] = entry.get(HELPFDNOM_INDEX); - info[REVIEWLENGTH_INDEX] = entry.get(REVIEWLENGTH_INDEX); - rI.encodedInfo = Encoding.groupVarintEncode(info); - rI.score = score; - data.add(rI); - } + public void insertData(ArrayList inData) { + this.data = inData; } /** @@ -86,11 +74,11 @@ public int getNumReview(){ private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - data = (ArrayList) inputFile.readObject(); + data = (ArrayList) inputFile.readUnshared(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeObject(this.data); + outputFile.writeUnshared(this.data); } } From 6e384a6c4a736ed6a96cf06a49230e73461a1dda Mon Sep 17 00:00:00 2001 From: nirnts Date: Wed, 23 Jun 2021 17:59:09 +0300 Subject: [PATCH 52/55] reviewindex save/load --- src/webdata/IndexReader.java | 11 ++++++----- src/webdata/IndexWriter.java | 15 ++++++++------- src/webdata/ReviewIndex.java | 25 +++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index 8514d96..f0bcab9 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -43,11 +43,12 @@ private void loadIndices(String dir){ in.close(); fileIn.close(); - fileIn = new FileInputStream(dir + "/" + REVIEW_INDEX_FILE); - in = new ObjectInputStream(fileIn); - reviewIndex = (ReviewIndex) in.readObject(); - 
in.close(); - fileIn.close(); +// fileIn = new FileInputStream(dir + "/" + REVIEW_INDEX_FILE); +// in = new ObjectInputStream(fileIn); +// reviewIndex = (ReviewIndex) in.readObject(); +// in.close(); +// fileIn.close(); + reviewIndex.load(dir + "/" + REVIEW_INDEX_FILE); } catch (IOException | ClassNotFoundException e) { System.out.println("Error occurred while loading an index file."); diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 7a33f35..51f9f24 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -321,7 +321,8 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { // productDict = null; // TODO: remove? (3) // ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(data); - saveToDir(REVIEW_INDEX_FILE, rIndex); +// saveToDir(REVIEW_INDEX_FILE, rIndex); + rIndex.save(this.dir + "/" + REVIEW_INDEX_FILE); } /** @@ -343,10 +344,10 @@ private void saveToDir(String name, Object obj) { } } -// public static void main(String[] args) { -// String inputFile = "./1000.txt"; -// String dir = "./Data_Index"; -// IndexWriter indexWriter = new IndexWriter(); -// indexWriter.write(inputFile, dir); -// } + public static void main(String[] args) { + String inputFile = "./1000.txt"; + String dir = "./Data_Index"; + IndexWriter indexWriter = new IndexWriter(); + indexWriter.write(inputFile, dir); + } } \ No newline at end of file diff --git a/src/webdata/ReviewIndex.java b/src/webdata/ReviewIndex.java index 45ca444..c2a0ae3 100644 --- a/src/webdata/ReviewIndex.java +++ b/src/webdata/ReviewIndex.java @@ -81,4 +81,29 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeUnshared(this.data); } + + public void save(String outputFile) throws IOException { + ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile))); + for (ReviewInfo rI : data) { + oos.writeObject(rI); + oos.reset(); + } + oos.close(); + } + + public void load(String inputFile) throws IOException, ClassNotFoundException { + data = new ArrayList<>(); + ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(inputFile))); + while (true) { + ReviewInfo rI = null; + try { + rI = (ReviewInfo) ois.readObject(); + } catch (EOFException ex) { + break; + } + data.add(rI); + ois.reset(); + } + ois.close(); + } } From 3728348e3b233e43cf03b6f3f68cb8b88db2b53b Mon Sep 17 00:00:00 2001 From: nirnts Date: Wed, 23 Jun 2021 18:12:37 +0300 Subject: [PATCH 53/55] reviewIndex save/load #2 --- src/webdata/IndexReader.java | 1 + src/webdata/ReviewIndex.java | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index f0bcab9..56e55d5 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -48,6 +48,7 @@ private void loadIndices(String dir){ // reviewIndex = (ReviewIndex) in.readObject(); // in.close(); // fileIn.close(); + reviewIndex = new ReviewIndex(); reviewIndex.load(dir + "/" + REVIEW_INDEX_FILE); } catch (IOException | ClassNotFoundException e) { diff --git a/src/webdata/ReviewIndex.java b/src/webdata/ReviewIndex.java index c2a0ae3..4167ab2 100644 --- a/src/webdata/ReviewIndex.java +++ b/src/webdata/ReviewIndex.java @@ -102,7 +102,6 @@ public void load(String inputFile) throws IOException, ClassNotFoundException { break; } data.add(rI); - ois.reset(); } ois.close(); } From b43ef58fb51ae976e212ec80d878d1e8252c1c8c Mon Sep 17 00:00:00 2001 From: darkushin 
Date: Thu, 24 Jun 2021 10:03:14 +0200 Subject: [PATCH 54/55] working code - before cleanup --- src/webdata/IndexWriter.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 51f9f24..5ef601d 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -344,10 +344,10 @@ private void saveToDir(String name, Object obj) { } } - public static void main(String[] args) { - String inputFile = "./1000.txt"; - String dir = "./Data_Index"; - IndexWriter indexWriter = new IndexWriter(); - indexWriter.write(inputFile, dir); - } +// public static void main(String[] args) { +// String inputFile = "./1000.txt"; +// String dir = "./Data_Index"; +// IndexWriter indexWriter = new IndexWriter(); +// indexWriter.write(inputFile, dir); +// } } \ No newline at end of file From 5baa5b48b95f64cddd67aa44d97414a2eea7b742 Mon Sep 17 00:00:00 2001 From: darkushin Date: Thu, 24 Jun 2021 14:59:45 +0200 Subject: [PATCH 55/55] Ex2 - fixed code --- src/webdata/IndexReader.java | 5 --- src/webdata/IndexWriter.java | 85 +++++++----------------------------- src/webdata/ReviewIndex.java | 8 ++-- 3 files changed, 19 insertions(+), 79 deletions(-) diff --git a/src/webdata/IndexReader.java b/src/webdata/IndexReader.java index 56e55d5..1f59912 100644 --- a/src/webdata/IndexReader.java +++ b/src/webdata/IndexReader.java @@ -43,11 +43,6 @@ private void loadIndices(String dir){ in.close(); fileIn.close(); -// fileIn = new FileInputStream(dir + "/" + REVIEW_INDEX_FILE); -// in = new ObjectInputStream(fileIn); -// reviewIndex = (ReviewIndex) in.readObject(); -// in.close(); -// fileIn.close(); reviewIndex = new ReviewIndex(); reviewIndex.load(dir + "/" + REVIEW_INDEX_FILE); diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java index 5ef601d..aa74eab 100644 --- a/src/webdata/IndexWriter.java +++ b/src/webdata/IndexWriter.java @@ -9,7 +9,6 @@ public class IndexWriter { private HashMap tokenDict; // token: tokenId private ArrayList invertedTokenDict; // tokenId: token private TreeMap> productIds; -// private LinkedList> reviewIds; private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; // Array of termID, docID pairs. Regular array to sort in-place @@ -22,11 +21,9 @@ public class IndexWriter { private static final String TOKEN_INDEX_FILE = "token_index.txt"; private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; private static final int PAIRS_IN_BLOCK = 1000; - private static final int M = 20000; + private static final int M = 15000; private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. - private static final int NUM_REVIEWS = 10000000; -// todo: remove the reviewIds file after index creation! /** * Given product review data, creates an on disk index * inputFile is the path to the file containing the review data @@ -36,18 +33,13 @@ public void write(String inputFile, String dir) { createDir(); createDicts(inputFile); createProductIndex(); -// invertedTokenDict = null; // TODO: remove? 
(1) - try{ - createReviewIndex(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } + try{ createReviewIndex(); + } catch (Exception e) { e.printStackTrace(); + System.exit(1);} + productIds = null; createTokenIndex(); - File mergedDataFile = new File(dir + "/1"); - mergedDataFile.delete(); - File reviewIds = new File(dir + "/reviewIds"); - reviewIds.delete(); + new File(dir + "/1").delete(); + new File(dir + "/reviewIds").delete(); } /** @@ -82,18 +74,12 @@ private void createDir(){ */ private void createDicts(String inputFile){ ObjectOutputStream reviewOutput = null; - try { - BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(this.dir + "/reviewIds")); - reviewOutput = new ObjectOutputStream(out); - }catch (IOException e) { - System.out.println("Error occurred while saving the index file: reviewIds"); - e.printStackTrace(); - System.exit(1); - } + try { reviewOutput = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(this.dir + "/reviewIds"))); + } catch (IOException e) { e.printStackTrace(); + System.exit(1);} productIds = new TreeMap<>(); tokenDict = new HashMap<>(); -// reviewIds = new LinkedList<>(); invertedTokenDict = new ArrayList<>(); try { @@ -120,16 +106,6 @@ private void createDicts(String inputFile){ int length = addReviewText(review.getText(), i); addReviewId(review, reviewOutput, length); i++; - if (i % 100000 == 0){ - System.out.println("Num Reviews: " + i); - System.out.println("Total Memory: " + Runtime.getRuntime().totalMemory() / (float)(1000000) + " MB" + " (MAX: " + Runtime.getRuntime().maxMemory()/ (float)(1000000) + " MB" + ")"); - System.out.println("Used Memory: " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / (float)(1000000) + " MB"); - System.out.println("Free Memory: " + Runtime.getRuntime().freeMemory() / (float)(1000000) + " MB"); - } - - if (i == NUM_REVIEWS) { - break; - } } this.sortBuffer(); try { @@ -139,12 +115,8 @@ private void createDicts(String inputFile){ System.exit(1); } this.tokenBuffer = null; // free the token buffer space - - try { - reviewOutput.close(); - } catch (IOException e) { - e.printStackTrace(); - } + try { reviewOutput.close(); + } catch (IOException e) { e.printStackTrace();} Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); @@ -230,10 +202,7 @@ private void addReviewId(DataParser.Review review, ObjectOutputStream reviewOutp try { reviewOutput.writeObject(vals); reviewOutput.reset(); - } catch (IOException e) { - e.printStackTrace(); - } -// reviewIds.add(vals); + } catch (IOException e) { e.printStackTrace();} } /** @@ -278,7 +247,6 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { ObjectInputStream reviewIds = new ObjectInputStream(new FileInputStream(this.dir + "/reviewIds")); // Revise the review dictionary to the correct structure & change productIDs to product index -// ArrayList> dictValues = new ArrayList<>(); ArrayList data = new ArrayList<>(); HashMap productDict = new HashMap<>(productIds.size()); @@ -287,15 +255,11 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { productDict.put(productId, i); i++; } -// productIds = null; // TODO: remove? 
(2) ReviewIndex rIndex = new ReviewIndex(); while (true) { ArrayList vals = null; - try { - vals = (ArrayList) reviewIds.readObject(); - } catch (EOFException ex) { - break; - } + try { vals = (ArrayList) reviewIds.readObject(); + } catch (EOFException ex) { break;} ReviewIndex.ReviewInfo rI = rIndex.new ReviewInfo(); int[] info = new int[4]; byte score = (byte) (int) Float.parseFloat(vals.get(1)); @@ -307,21 +271,9 @@ private void createReviewIndex() throws IOException, ClassNotFoundException { rI.encodedInfo = Encoding.groupVarintEncode(info); rI.score = score; data.add(rI); - - -// new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0))); -// String[] helpf = vals.get(2).split("/"); -// new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); -// new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); -// new_vals.set(ReviewIndex.REVIEWLENGTH_INDEX, Integer.parseInt(vals.get(3))); -// new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); -// dictValues.add(new_vals); } reviewIds.close(); -// productDict = null; // TODO: remove? (3) -// ReviewIndex rIndex = new ReviewIndex(); rIndex.insertData(data); -// saveToDir(REVIEW_INDEX_FILE, rIndex); rIndex.save(this.dir + "/" + REVIEW_INDEX_FILE); } @@ -343,11 +295,4 @@ private void saveToDir(String name, Object obj) { System.exit(1); } } - -// public static void main(String[] args) { -// String inputFile = "./1000.txt"; -// String dir = "./Data_Index"; -// IndexWriter indexWriter = new IndexWriter(); -// indexWriter.write(inputFile, dir); -// } } \ No newline at end of file diff --git a/src/webdata/ReviewIndex.java b/src/webdata/ReviewIndex.java index 4167ab2..15f6832 100644 --- a/src/webdata/ReviewIndex.java +++ b/src/webdata/ReviewIndex.java @@ -11,13 +11,13 @@ public class ReviewInfo implements Serializable { private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - encodedInfo = (byte[]) inputFile.readUnshared(); + encodedInfo = (byte[]) inputFile.readObject(); score = inputFile.readByte(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeUnshared(encodedInfo); + outputFile.writeObject(encodedInfo); outputFile.writeByte(score); } } @@ -74,12 +74,12 @@ public int getNumReview(){ private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - data = (ArrayList) inputFile.readUnshared(); + data = (ArrayList) inputFile.readObject(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeUnshared(this.data); + outputFile.writeObject(this.data); } public void save(String outputFile) throws IOException {
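        // save() writes each ReviewInfo with writeObject() followed by reset(), so the
        // ObjectOutputStream clears its handle table after every record rather than keeping a
        // back-reference to everything already written; load() mirrors this by reading objects
        // one at a time until an EOFException marks the end of the file.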