diff --git a/src/webdata/Analysis.java b/src/webdata/Analysis.java new file mode 100644 index 0000000..634a621 --- /dev/null +++ b/src/webdata/Analysis.java @@ -0,0 +1,61 @@ +package webdata; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Random; + +public class Analysis { + private IndexReader indexReader; + private TokensIndex tokensIndex; + private ArrayList randomTokens; + private long getReviewsWithTokenTime; + private long getTokenFrequencytTime; + + public Analysis(IndexReader indexReader){ + this.indexReader = indexReader; + this.tokensIndex = indexReader.tokenIndex; + this.randomTokens = new ArrayList<>(); + this.getReviewsWithTokenTime = 0; + this.getTokenFrequencytTime = 0; + + getRandomTokens(100); + measureGetReviewsWithToken(); + measureTokenFrequencyTime(); + } + + private void measureGetReviewsWithToken() { + long start = new Date().getTime(); + for (String token: this.randomTokens){ + indexReader.getReviewsWithToken(token); + } + long end = new Date().getTime(); + this.getReviewsWithTokenTime = (end - start); + } + + private void measureTokenFrequencyTime() { + long start = new Date().getTime(); + for (String token: this.randomTokens){ + indexReader.getTokenFrequency(token); + } + long end = new Date().getTime(); + this.getTokenFrequencytTime = (end - start); + } + + /** + * Get n random tokens from the index. 
/**
 * Streams raw reviews out of a review data file, one review at a time.
 *
 * <p>A review is the run of lines starting at a line that contains {@code product/productId} and
 * ending just before the next such line (or at end of file). Each review is returned as the list
 * of its lines, in file order.
 */
public class DataLoader implements Iterable<ArrayList<String>> {
    private final BufferedReader br;
    /** Lines collected so far for the review currently being read. */
    private ArrayList<String> reviewStrings;
    /** Set once the reader has hit end of file; the reader is closed at that point. */
    private boolean exhausted;

    /**
     * @param inputFile path of the review data file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public DataLoader(String inputFile) throws FileNotFoundException {
        br = new BufferedReader(new FileReader(inputFile));
        reviewStrings = new ArrayList<>();
        exhausted = false;
    }

    /**
     * Reads lines up to the start of the next review and returns the lines of the finished one.
     *
     * @return the lines of the next review; an empty list once the input is used up
     */
    public ArrayList<String> readSingleReview() {
        if (!exhausted) {
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    if (line.contains("product/productId") && !reviewStrings.isEmpty()) {
                        // This line already belongs to the following review: keep it buffered.
                        ArrayList<String> finished = reviewStrings;
                        reviewStrings = new ArrayList<>();
                        reviewStrings.add(line);
                        return finished;
                    }
                    reviewStrings.add(line);
                }
                exhausted = true;
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        }
        // Hand out the last review exactly once; later calls get an empty list.
        ArrayList<String> last = reviewStrings;
        reviewStrings = new ArrayList<>();
        return last;
    }

    /** Returns an iterator over the remaining reviews; all iterators share this loader's position. */
    @Override
    public Iterator<ArrayList<String>> iterator() {
        return new Iterator<ArrayList<String>>() {
            @Override
            public boolean hasNext() {
                // A buffered header line is a pending review even if the reader is already at EOF.
                if (!reviewStrings.isEmpty()) {
                    return true;
                }
                if (exhausted) {
                    return false;
                }
                try {
                    br.mark(1);
                    int c = br.read();
                    br.reset();
                    return c != -1;
                } catch (IOException e) {
                    return false;
                }
            }

            @Override
            public ArrayList<String> next() {
                if (!hasNext()) {
                    throw new java.util.NoSuchElementException();
                }
                return readSingleReview();
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }
}
/**
 * Turns raw review text into {@link Review} objects holding the fields the index needs:
 * productId, score, helpfulness and text.
 */
public class DataParser {
    /** The indexed fields of one review; every value is kept as the raw string from the input. */
    public class Review {
        private String text;
        private String productId;
        private String score;
        private String helpfulness;

        public String getText() {
            return text;
        }

        public String getProductId() {
            return productId;
        }

        public String getHelpfulness() {
            return helpfulness;
        }

        public String getScore() {
            return score;
        }

        public void setHelpfulness(String helpfulness) {
            this.helpfulness = helpfulness;
        }

        public void setProductId(String productId) {
            this.productId = productId;
        }

        public void setScore(String score) {
            this.score = score;
        }

        public void setText(String text) {
            this.text = text;
        }
    }

    /**
     * Parses every raw review in the given list.
     *
     * @param rawReviews reviews, each one a single string with all of its fields concatenated
     * @return one {@link Review} per input entry, in the same order
     */
    public List<Review> parseData(List<String> rawReviews) {
        ArrayList<Review> allReviews = new ArrayList<>();
        for (String review : rawReviews) {
            allReviews.add(parseReview(review));
        }
        return allReviews;
    }

    /**
     * Parses a review given as one string in which the fields follow each other directly, e.g.
     * {@code "product/productId: B001review/score: 5.0review/text: ..."}.
     * A field that has no value is left unset ({@code null}) instead of raising an exception.
     *
     * @param review the raw review string
     * @return the parsed review
     */
    public Review parseReview(String review) {
        ArrayList<String> fields = new ArrayList<>(Arrays.asList(review.split("review/")));
        Review parsedReview = new Review();

        String[] idParts = fields.get(0).split(": ");
        if (idParts.length > 1) {
            parsedReview.setProductId(idParts[1].split("product/")[0]);
        }
        for (int i = 1; i < fields.size(); i++) {
            List<String> fieldValue = Arrays.asList(fields.get(i).split(": "));
            String name = fieldValue.get(0);
            if (name.equals("text")) {
                // The text itself may contain ": ", so glue the pieces back together.
                parsedReview.setText(String.join(": ", fieldValue.subList(1, fieldValue.size())));
            } else if (fieldValue.size() > 1) {
                if (name.equals("helpfulness")) {
                    parsedReview.setHelpfulness(fieldValue.get(1));
                } else if (name.equals("score")) {
                    parsedReview.setScore(fieldValue.get(1));
                }
            }
        }
        return parsedReview;
    }

    /**
     * Parses a review given as its list of lines, each of the form {@code "<prefix>/<field>: <value>"}.
     * Once the text field has started, every following non-empty line is appended to the text,
     * separated by a single space.
     *
     * @param review the lines of one review
     * @return the parsed review; its text is the empty string when no text field was found
     */
    public Review parseReview(ArrayList<String> review) {
        Review parsedReview = new Review();
        StringBuilder text = new StringBuilder();
        boolean readingText = false;
        for (String line : review) {
            if (readingText && !line.equals("")) {
                text.append(" ");
                text.append(line);
                continue;
            }
            int prefix = line.indexOf("/");
            int delim = line.indexOf(":");
            if (prefix == -1 || delim == -1 || delim < prefix) {
                continue; // not a "<prefix>/<field>: <value>" line
            }
            String field = line.substring(prefix + 1, delim);
            if (field.equals("text")) {
                text.append(valueAfter(line, delim));
                readingText = true;
            } else if (field.equals("productId")) {
                parsedReview.setProductId(valueAfter(line, delim));
            } else if (field.equals("helpfulness")) {
                parsedReview.setHelpfulness(valueAfter(line, delim));
            } else if (field.equals("score")) {
                parsedReview.setScore(valueAfter(line, delim));
            }
        }
        parsedReview.setText(text.toString());
        return parsedReview;
    }

    /**
     * Returns what follows the two-character ": " separator that starts at delim, or the empty
     * string when the line ends before that (a field with an empty value).
     */
    private static String valueAfter(String line, int delim) {
        return line.substring(Math.min(line.length(), delim + 2));
    }
}
- */ - public static ArrayList deltaDecode(String encoding) { - ArrayList output = new ArrayList<>(); - int bitsRead = 0; - while (bitsRead < encoding.length()) { - int length = encoding.substring(bitsRead).indexOf('0'); // Find the first 0 - int offsetLoc = bitsRead + length + 1; - int actualLength = Integer.parseInt("1" + encoding.substring(offsetLoc, offsetLoc + length), 2); - bitsRead = offsetLoc + length; - - output.add(Integer.parseInt("1" + encoding.substring(bitsRead, bitsRead + actualLength - 1), 2) - 1); - bitsRead += actualLength - 1; - } - return output; - } - - /** - * Decode the given byte array, using gamma code. - */ - public static ArrayList gammaDecode(byte[] code) { - return gammaDecode(byteToString(code)); - } - - /** - * Decode the given byte array, using delta code. - */ - public static ArrayList deltaDecode(byte[] code) { - return deltaDecode(byteToString(code)); - } - - /** - * Convert the given string representing a bit sequence of numbers to a byte array. - */ - public static byte[] toByteArray(String encoding) { - // Pad 0s to the nearest multiple of 8 - String padded = encoding + "0".repeat((int) Math.ceil((float) encoding.length() / 8) * 8 - encoding.length()); - byte[] ret = new BigInteger(padded, 2).toByteArray(); - if (ret.length * 8 == padded.length() + 8) { - return Arrays.copyOfRange(ret, 1, ret.length); - } else { - return ret; - } - } - - /** - * Convert the given byte array to a string representing the bits of the byte array. - */ - public static String byteToString(byte[] encoding) { - StringBuilder s = new StringBuilder(); - for (byte b : encoding) { - String binary = Integer.toBinaryString(Byte.toUnsignedInt(b)); - s.append("0".repeat(8 - binary.length())); // toBinaryString removes leading 0's - s.append(binary); - } - return s.toString(); - } - /** * Encode the given list of numbers using Group-Varint-Encoding. 
The first byte of the resulting byte array * holds the number of bytes required to decode each of the next four numbers. @@ -113,7 +21,7 @@ public static byte[] groupVarintEncode(int[] nums) { byte[] numAsBytes = ByteBuffer.allocate(4).putInt(nums[i]).array(); byte numLength = -1; for (int j = 0; j < numAsBytes.length; j++) { - if (numAsBytes[j] != 0) { + if (numAsBytes[j] != 0 || numLength >= 0) { out.write(numAsBytes[j]); numLength++; } else if (j == numAsBytes.length - 1 & numLength == -1) { @@ -128,6 +36,30 @@ public static byte[] groupVarintEncode(int[] nums) { return output; } + public static byte[] groupVarEncodeMultiple(List nums) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int i; + for (i=0; i + 3 < nums.size(); i=i+4) { + try { + baos.write(groupVarintEncode(new int[]{nums.get(i), nums.get(i + 1), nums.get(i + 2), nums.get(i + 3)})); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } + int[] remainder = new int[4]; + for (int j=0;j < nums.size() - i; j++) { + remainder[j] = nums.get(i+j); + } + try { + baos.write(groupVarintEncode(remainder)); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return baos.toByteArray(); + } + /** * Decode the given byte array to numbers, using Group-Varing-Encoding. 
*/ @@ -147,9 +79,34 @@ public static int[] groupVarintDecode(byte[] encoding) { return output; } + public static ArrayList groupVarDecodeMultiple(byte[] encoding) { + ArrayList ret = new ArrayList<>(); + int bytesRead = 0; + while (bytesRead < encoding.length) { + byte lengths = encoding[bytesRead]; + bytesRead++; + for (int i = 0; i < 4; i++) { + int bytesToRead = 1 + (lengths >> (2 * (3 - i))) & 3; + byte[] o = new byte[bytesToRead]; + for (int b = 0; b < bytesToRead; b++) { + o[b] = encoding[bytesRead + b]; + } + bytesRead += bytesToRead; + ret.add(new BigInteger(1, o).intValue()); + } + } + for (int j=0; j < 4; j++) { + if (ret.get(ret.size() - 1) != 0) { + break; + } + ret.remove(ret.size() - 1); + } + return ret; + } + /** * Convert the given list of id-1, num-appearances-1, id-2, num-appearances-2... where the ids are given by their - * differences to a list where every id entry are the full id number. + * differences to a list where every id entry is the full id number. */ public static List diffToIds(List vals){ for (int i = 2; i < vals.size() - 1; i = i + 2){ diff --git a/src/webdata/ExternalMergeSort.java b/src/webdata/ExternalMergeSort.java new file mode 100644 index 0000000..1800f7b --- /dev/null +++ b/src/webdata/ExternalMergeSort.java @@ -0,0 +1,179 @@ +package webdata; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +public class ExternalMergeSort { + private Comparator cmp; + public static String folderName = "/iteration_"; + private int numFiles; // current number of files to merge + private int pairsInBlock; + private String dir; + private int iteration; // number of merges performed (including current iteration). 1 means we are currently in the first iteration. + private int savedFiles; // number of files that were saved in the current iteration. 
/**
 * External merge sort over files of (termId, docId) int pairs.
 *
 * <p>The input is numFiles already sorted files named 1..numFiles inside
 * {@code dir + folderName + "1"}, each written with an {@link ObjectOutputStream} as a plain
 * sequence of ints. Every pass merges groups of up to AVAILABLE_BLOCKS - 1 files into the next
 * iteration directory and deletes the previous one. The single remaining file is finally moved
 * to {@code dir + "/1"}.
 */
public class ExternalMergeSort {
    public static String folderName = "/iteration_";
    private static final int AVAILABLE_BLOCKS = 20000;

    private final Comparator<Integer> cmp;
    private final int pairsInBlock;
    private final String dir;
    private int numFiles;   // current number of files to merge
    private int iteration;  // number of the pass being performed; 1 means the first pass
    private int savedFiles; // number of files written so far in the current pass

    /**
     * @param cmp          order of the first element (term id) of each pair
     * @param numFiles     number of sorted input files in the first iteration directory
     * @param pairsInBlock number of pairs held by one output block
     * @param dir          directory containing the iteration directories
     */
    ExternalMergeSort(Comparator<Integer> cmp, int numFiles, int pairsInBlock, String dir) {
        this.cmp = cmp;
        this.numFiles = numFiles;
        this.pairsInBlock = pairsInBlock;
        this.dir = dir;
        this.iteration = 1;
        this.savedFiles = 0;
    }

    /** Runs merge passes until one file is left, then moves it to {@code dir + "/1"}. */
    public void sort() {
        final int filesPerMerge = AVAILABLE_BLOCKS - 1;
        while (numFiles > 1) {
            try {
                Files.createDirectories(Path.of(dir + folderName + (iteration + 1)));
                for (int start = 1; start <= numFiles; start += filesPerMerge) {
                    int end = Math.min(numFiles, start + filesPerMerge - 1);
                    new SingleMerge(start, end).merge();
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }

            this.removeDir(dir + folderName + iteration); // inputs of this pass are no longer needed
            numFiles = savedFiles;
            savedFiles = 0;
            iteration++;
        }
        Path sorted = Path.of(dir + folderName + iteration, "1");
        if (Files.exists(sorted)) {
            try {
                // Must succeed before the directory is removed, or the sorted data would be lost.
                Files.move(sorted, Path.of(dir, "1"), java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        }
        removeDir(dir + folderName + iteration);
    }

    /** Deletes the files directly inside the given directory and then the directory itself. */
    private void removeDir(String dir) {
        File dirToRemove = new File(dir);
        File[] contents = dirToRemove.listFiles();
        if (contents != null) {
            for (File file : contents) {
                file.delete();
            }
        }
        dirToRemove.delete();
    }

    /** Holds all the state required to merge one group of files into a single output file. */
    private class SingleMerge {
        private final ArrayList<ObjectInputStream> fileReaders; // null entry = file fully read
        private final ArrayList<Deque<int[]>> fileDeques;       // pairs loaded per input file
        private final int numPairsInDeque;
        private final ObjectOutputStream mergedOutput;
        private int[] outputBlock;
        private int outputPtr;

        /** Opens input files start..end (inclusive) of the current pass and the next output file. */
        private SingleMerge(int start, int end) throws IOException {
            int fileCount = end - start + 1;
            this.numPairsInDeque = ((AVAILABLE_BLOCKS - 1) / fileCount) * pairsInBlock;
            this.fileReaders = new ArrayList<>(fileCount);
            this.fileDeques = new ArrayList<>(fileCount);
            this.mergedOutput = new ObjectOutputStream(new BufferedOutputStream(
                    new FileOutputStream(dir + folderName + (iteration + 1) + "/" + (savedFiles + 1))));
            try {
                for (int i = start; i <= end; i++) {
                    FileInputStream fileIn = new FileInputStream(dir + folderName + iteration + "/" + i);
                    this.fileReaders.add(new ObjectInputStream(new BufferedInputStream(fileIn)));
                    this.fileDeques.add(new ArrayDeque<>(this.numPairsInDeque));
                }
            } catch (IOException e) {
                closeAll(); // do not leak the streams opened so far
                throw e;
            }
        }

        /** Merges all input files into the output file, closing every stream afterwards. */
        private void merge() throws IOException {
            try {
                this.clearOutputBlock();
                this.loadAll();
                int minIndex;
                while ((minIndex = this.getMin()) != -1) {
                    this.extractMin(minIndex);
                }
                this.saveOutputBlock(); // needed in case the block wasn't full
            } finally {
                closeAll();
            }
            savedFiles++;
        }

        /**
         * Moves the first pair of deque minIndex to the output block.
         * A full block is written to the output file and cleared.
         * An emptied deque is refilled from its file, if that file has not ended yet.
         */
        private void extractMin(int minIndex) throws IOException {
            int[] minPair = fileDeques.get(minIndex).pollFirst();
            this.outputBlock[this.outputPtr] = minPair[0];
            this.outputBlock[this.outputPtr + 1] = minPair[1];
            this.outputPtr += 2;
            if (this.outputPtr == pairsInBlock * 2) {
                this.saveOutputBlock();
                this.clearOutputBlock();
            }
            if (fileDeques.get(minIndex).isEmpty() && fileReaders.get(minIndex) != null) {
                this.loadData(minIndex, numPairsInDeque);
            }
        }

        /**
         * Returns the index of the deque whose first pair is smallest, or -1 when every deque is
         * empty. On ties the lowest index wins, which keeps the merge stable.
         */
        private int getMin() {
            int minIndex = -1;
            for (int i = 0; i < fileDeques.size(); i++) {
                if (fileDeques.get(i).isEmpty()) {
                    continue;
                }
                if (minIndex == -1
                        || cmp.compare(fileDeques.get(minIndex).getFirst()[0], fileDeques.get(i).getFirst()[0]) > 0) {
                    minIndex = i;
                }
            }
            return minIndex;
        }

        private void loadAll() throws IOException {
            for (int i = 0; i < this.fileReaders.size(); i++) {
                this.loadData(i, this.numPairsInDeque);
            }
        }

        /** Loads up to numPairs pairs from file i into its deque; closes the file at its end. */
        private void loadData(int i, int numPairs) throws IOException {
            for (int j = 0; j < numPairs; j++) {
                int[] pair = new int[2];
                try {
                    pair[0] = fileReaders.get(i).readInt();
                    pair[1] = fileReaders.get(i).readInt();
                } catch (EOFException e) {
                    // Reached end of file.
                    fileReaders.get(i).close();
                    fileReaders.set(i, null);
                    break;
                }
                fileDeques.get(i).add(pair);
            }
        }

        private void clearOutputBlock() {
            outputBlock = new int[pairsInBlock * 2];
            outputPtr = 0;
        }

        private void saveOutputBlock() throws IOException {
            for (int i = 0; i < this.outputPtr; i++) {
                this.mergedOutput.writeInt(this.outputBlock[i]);
            }
        }

        /** Closes every stream that is still open; rethrows the first failure, if any. */
        private void closeAll() throws IOException {
            IOException failure = null;
            for (int i = 0; i < fileReaders.size(); i++) {
                ObjectInputStream reader = fileReaders.get(i);
                if (reader == null) {
                    continue;
                }
                try {
                    reader.close();
                } catch (IOException e) {
                    if (failure == null) {
                        failure = e;
                    }
                }
                fileReaders.set(i, null);
            }
            try {
                mergedOutput.close();
            } catch (IOException e) {
                if (failure == null) {
                    failure = e;
                }
            }
            if (failure != null) {
                throw failure;
            }
        }
    }
}
token) { if (currentTokenIdx == -1){ return enumerator; } - int tokenInvertedIdxPtr = tokenIndex.get(currentTokenIdx).getInvertedIdxPtr(); + long tokenInvertedIdxPtr = tokenIndex.get(currentTokenIdx).getInvertedIdxPtr(); int numReviews = tokenIndex.get(currentTokenIdx).getFrequency() * 2; byte[] dest = null; - int nextInvertedIdxPtr; + long nextInvertedIdxPtr; try { RandomAccessFile file = new RandomAccessFile(this.dir + "/" + TOKEN_INVERTED_INDEX_FILE, "r"); if (currentTokenIdx + 1 getReviewsWithToken(String token) { } else { nextInvertedIdxPtr = (int) file.length(); } - int bytesToRead = nextInvertedIdxPtr - tokenInvertedIdxPtr; + int bytesToRead = (int) (nextInvertedIdxPtr - tokenInvertedIdxPtr); dest = new byte[bytesToRead]; file.seek(tokenInvertedIdxPtr); file.read(dest); @@ -164,7 +161,7 @@ public Enumeration getReviewsWithToken(String token) { e.printStackTrace(); System.exit(1); } - ArrayList vals = new ArrayList(Encoding.deltaDecode(dest).subList(0, numReviews)); + ArrayList vals = Encoding.groupVarDecodeMultiple(dest); Encoding.diffToIds(vals); return Collections.enumeration(vals); diff --git a/src/webdata/IndexWriter.java b/src/webdata/IndexWriter.java new file mode 100644 index 0000000..aa74eab --- /dev/null +++ b/src/webdata/IndexWriter.java @@ -0,0 +1,298 @@ +package webdata; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +public class IndexWriter { + private HashMap tokenDict; // token: tokenId + private ArrayList invertedTokenDict; // tokenId: token + private TreeMap> productIds; + + private int[][] tokenBuffer = new int[TOKEN_BUFFER_SIZE][2]; + // Array of termID, docID pairs. 
Regular array to sort in-place + private int tokenBufferPointer; + private int tokenFilesNumber = 0; + private String dir; + + private static final String PRODUCT_INDEX_FILE = "product_index.txt"; + private static final String REVIEW_INDEX_FILE = "review_index.txt"; + private static final String TOKEN_INDEX_FILE = "token_index.txt"; + private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; + private static final int PAIRS_IN_BLOCK = 1000; + private static final int M = 15000; + private static final int TOKEN_BUFFER_SIZE = PAIRS_IN_BLOCK * (M - 1); // Number of -pairs- in memory. Should be PAIRS_IN_BLOCK * (M-1) or something. + + /** + * Given product review data, creates an on disk index + * inputFile is the path to the file containing the review data + */ + public void write(String inputFile, String dir) { + this.dir = dir; + createDir(); + createDicts(inputFile); + createProductIndex(); + try{ createReviewIndex(); + } catch (Exception e) { e.printStackTrace(); + System.exit(1);} + productIds = null; + createTokenIndex(); + new File(dir + "/1").delete(); + new File(dir + "/reviewIds").delete(); + } + + /** + * Delete all index files by removing the given directory + */ + public void removeIndex(String dir) { + File dirToRemove = new File(dir); + File[] contents = dirToRemove.listFiles(); + if (contents != null) { + for (File file : contents) { + file.delete(); + } + } + dirToRemove.delete(); + } + + /** + * Create a new directory in the path specified in the instance initialization. + */ + private void createDir(){ + Path path = Path.of(this.dir); + try { + Files.createDirectories(path); + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Create temporary dictionaries that will store all information, before saving the indices to the disk. 
+ * @param inputFile the file containing all reviews + */ + private void createDicts(String inputFile){ + ObjectOutputStream reviewOutput = null; + try { reviewOutput = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(this.dir + "/reviewIds"))); + } catch (IOException e) { e.printStackTrace(); + System.exit(1);} + + productIds = new TreeMap<>(); + tokenDict = new HashMap<>(); + invertedTokenDict = new ArrayList<>(); + + try { + Files.createDirectories(Path.of(this.dir + ExternalMergeSort.folderName + "1")); + } catch (IOException e) { + e.printStackTrace(); + } + + this.clearBuffer(); + + DataLoader dataLoader = null; + DataParser dataParser = new DataParser(); + try { + dataLoader = new DataLoader(inputFile); + } catch (IOException e) { + e.printStackTrace(); + System.out.println("Error occurred while reading the reviews input file."); + System.exit(1); + } + int i=1; + for (ArrayList s: dataLoader){ + DataParser.Review review = dataParser.parseReview(s); + addProductId(review.getProductId(), i); + int length = addReviewText(review.getText(), i); + addReviewId(review, reviewOutput, length); + i++; + } + this.sortBuffer(); + try { + this.saveBuffer(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + this.tokenBuffer = null; // free the token buffer space + try { reviewOutput.close(); + } catch (IOException e) { e.printStackTrace();} + Comparator cmp = Comparator.comparing(a -> invertedTokenDict.get(a)); + + ExternalMergeSort ems = new ExternalMergeSort(cmp, tokenFilesNumber, PAIRS_IN_BLOCK, dir); + ems.sort(); + } + + /** + * Split the given text of the i-th review into tokens and add them to the tokens dictionary. + * @param reviewText the text of the review that should be added. + * @param reviewIndex the number of the given review. + * @return the number of tokens in the given review text. 
+ */ + private int addReviewText(String reviewText, int reviewIndex){ + String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens + int reviewLength = 0; + String[] cleanTokens = Arrays.stream(tokens).filter(value -> value != null && value.length() > 0).toArray(size -> new String[size]); + + for (String token: cleanTokens){ + reviewLength += 1; + token = token.toLowerCase(); + int termId = tokenDict.computeIfAbsent(token, k -> tokenDict.size()); + if (termId == invertedTokenDict.size()) { invertedTokenDict.add(token);} // if a new token was added, add it also to the invertedTokenDict + tokenBuffer[tokenBufferPointer][0] = termId; + tokenBuffer[tokenBufferPointer][1] = reviewIndex; + tokenBufferPointer++; + if (tokenBufferPointer == TOKEN_BUFFER_SIZE){ + this.sortBuffer(); + try { + this.saveBuffer(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + this.clearBuffer(); + } + } + return reviewLength; + } + + private void sortBuffer() { + Arrays.sort(tokenBuffer,0, tokenBufferPointer, Comparator.comparing(a -> invertedTokenDict.get(a[0]))); + } + + private void saveBuffer() throws IOException { + this.tokenFilesNumber++; + ObjectOutputStream tokenBufferWriter = new ObjectOutputStream(new FileOutputStream(dir + ExternalMergeSort.folderName + "1/" + tokenFilesNumber)); + for (int i = 0; i < tokenBufferPointer; i++) { + tokenBufferWriter.writeInt(tokenBuffer[i][0]); + tokenBufferWriter.writeInt(tokenBuffer[i][1]); + } + tokenBufferWriter.close(); + } + + private void clearBuffer() { + tokenBufferPointer = 0; + } + + /** + * Update the productId dictionary by adding to it the given product. If the product already exists, it adds review + * id to the reviews that are matching to this product. 
+ */ + private void addProductId(String productId, int reviewId) { + if (!productIds.containsKey(productId)) { + productIds.put(productId, new ArrayList<>(Arrays.asList(reviewId, 0))); + } + else { + ArrayList product = productIds.get(productId); + product.set(1, product.get(1) + 1); + } + } + + /** + * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. + */ + private void addReviewId(DataParser.Review review, ObjectOutputStream reviewOutput, int length) { + ArrayList vals = new ArrayList<>(); + + // 0 - productId, 1 - score, 2 - helpfulness, 3 - length + vals.add(review.getProductId()); + vals.add(review.getScore()); + vals.add(review.getHelpfulness()); + vals.add(String.valueOf(length)); + try { + reviewOutput.writeObject(vals); + reviewOutput.reset(); + } catch (IOException e) { e.printStackTrace();} + } + + /** + * Creates and saves to the disk the product index, i.e. all the information that is related to products. + */ + private void createProductIndex() { + ArrayList ids = new ArrayList<>(productIds.keySet()); + ArrayList> vals = new ArrayList<>(productIds.values()); + int k = 8; + KFront kf = new KFront(); + kf.createKFront(k, ids); + for (int i = 0; i < vals.size(); i++) { + kf.getTable().get(i).addAll(vals.get(i)); + } + + ProductIndex pIndex = new ProductIndex(k); + pIndex.insertData(kf.getTable(), kf.getConcatString()); + saveToDir(PRODUCT_INDEX_FILE, pIndex); + } + + /** + * Creates the index file for the tokens in the collection. + * The index is created using the k-1-in-k front coding method. 
+ */ + private void createTokenIndex(){ + ArrayList tokens = new ArrayList<>(tokenDict.keySet()); + Collections.sort(tokens); + + tokenDict = null; + int k = 256; + KFront kf = new KFront(true); + kf.createKFront(k, tokens); + TokensIndex tIdx = new TokensIndex(k, this.dir); + tIdx.insertData(kf.getTable(), kf.getConcatString(), dir + "/1"); + saveToDir(TOKEN_INDEX_FILE, tIdx); + } + + /** + * Creates and saves to the disk the review index which hold all information related to reviews. + */ + private void createReviewIndex() throws IOException, ClassNotFoundException { + ObjectInputStream reviewIds = new ObjectInputStream(new FileInputStream(this.dir + "/reviewIds")); + + // Revise the review dictionary to the correct structure & change productIDs to product index + ArrayList data = new ArrayList<>(); + + HashMap productDict = new HashMap<>(productIds.size()); + int i = 0; + for (String productId: productIds.keySet()){ + productDict.put(productId, i); + i++; + } + ReviewIndex rIndex = new ReviewIndex(); + while (true) { + ArrayList vals = null; + try { vals = (ArrayList) reviewIds.readObject(); + } catch (EOFException ex) { break;} + ReviewIndex.ReviewInfo rI = rIndex.new ReviewInfo(); + int[] info = new int[4]; + byte score = (byte) (int) Float.parseFloat(vals.get(1)); + info[ReviewIndex.PRODUCTID_INDEX] = productDict.get(vals.get(0)); + String[] helpf = vals.get(2).split("/"); + info[ReviewIndex.HELPFNUM_INDEX] = Integer.parseInt(helpf[0]); + info[ReviewIndex.HELPFDNOM_INDEX] = Integer.parseInt(helpf[1]); + info[ReviewIndex.REVIEWLENGTH_INDEX] = Integer.parseInt(vals.get(3)); + rI.encodedInfo = Encoding.groupVarintEncode(info); + rI.score = score; + data.add(rI); + } + reviewIds.close(); + rIndex.insertData(data); + rIndex.save(this.dir + "/" + REVIEW_INDEX_FILE); + } + + /** + * Save the given object to disk under the given name. The file is saved to the dir that was passed to the + * SlowWrite() function. 
+ */ + private void saveToDir(String name, Object obj) { + FileOutputStream fileOut = null; + try { + fileOut = new FileOutputStream(this.dir + "/" + name); + ObjectOutputStream out = new ObjectOutputStream(fileOut); + out.writeObject(obj); + out.close(); + fileOut.close(); + } catch (IOException e) { + System.out.println("Error occurred while saving the index file: " + name); + e.printStackTrace(); + System.exit(1); + } + } +} \ No newline at end of file diff --git a/src/webdata/KFront.java b/src/webdata/KFront.java index 6132471..f37fd2d 100644 --- a/src/webdata/KFront.java +++ b/src/webdata/KFront.java @@ -13,7 +13,7 @@ public class KFront { private boolean saveLength; public KFront() { - table = new LinkedList<>(); + table = new ArrayList<>(); concatString = null; saveLength = false; } @@ -25,7 +25,7 @@ public KFront(List> outputTable) { } public KFront(boolean saveLength) { - table = new LinkedList<>(); + table = new ArrayList<>(); concatString = null; this.saveLength = saveLength; } diff --git a/src/webdata/PairsLoader.java b/src/webdata/PairsLoader.java new file mode 100644 index 0000000..df2dec5 --- /dev/null +++ b/src/webdata/PairsLoader.java @@ -0,0 +1,32 @@ +package webdata; + +import java.io.EOFException; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.ObjectInputStream; + +public class PairsLoader { + ObjectInputStream ois = null; + + public PairsLoader(String file) { + try { + ois = new ObjectInputStream(new FileInputStream(file)); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public int[] readPair() { + int[] pair = new int[2]; + try { + pair[0] = ois.readInt(); + pair[1] = ois.readInt(); + } catch (EOFException e) { + return null; + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + return pair; + } +} diff --git a/src/webdata/ProductIndex.java b/src/webdata/ProductIndex.java index 2e32a7f..b79f5a7 100644 --- a/src/webdata/ProductIndex.java +++ b/src/webdata/ProductIndex.java @@ 
-1,6 +1,7 @@ package webdata; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -8,20 +9,20 @@ public class ProductIndex implements Serializable { private class ProductInfo implements Serializable{ - private short stringInfo; // This is either a pointer to the concatenated string, or a prefix size. + private int stringInfo; // This is either a pointer to the concatenated string, or a prefix size. private int reviewId; private short spanLength; private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { - stringInfo = inputFile.readShort(); + stringInfo = inputFile.readInt(); reviewId = inputFile.readInt(); spanLength = inputFile.readShort(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeShort(stringInfo); + outputFile.writeInt(stringInfo); outputFile.writeInt(reviewId); outputFile.writeShort(spanLength); } @@ -36,11 +37,13 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException private ArrayList data; private String dictString; + private int dictBytes; private int k; public ProductIndex(int k) { - data = new ArrayList<>(); - dictString = null; + this.data = new ArrayList<>(); + this.dictString = null; + this.dictBytes = 0; this.k = k; } @@ -55,14 +58,15 @@ public void insertData(List> inData, String concatString) { pf.reviewId = entry.get(REVIEWID_INDEX); pf.spanLength = entry.get(SPANLENGTH_INDEX).shortValue(); if (offset == 0) { - pf.stringInfo = entry.get(POINTER_INDEX).shortValue(); + pf.stringInfo = entry.get(POINTER_INDEX); } else { - pf.stringInfo = entry.get(PREFIXL_INDEX).shortValue(); + pf.stringInfo = entry.get(PREFIXL_INDEX); } offset++; offset = offset % k; data.add(pf); } + this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } /** @@ -122,14 +126,16 @@ public int search(String str) { private void readObject(ObjectInputStream inputFile) throws 
ClassNotFoundException, IOException { k = inputFile.readInt(); - dictString = inputFile.readUTF(); + dictBytes = inputFile.readInt(); + dictString = new String(inputFile.readNBytes(dictBytes), StandardCharsets.UTF_8); data = (ArrayList) inputFile.readObject(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeInt(k); - outputFile.writeUTF(dictString); + outputFile.writeInt(this.dictBytes); + outputFile.writeBytes(this.dictString); outputFile.writeObject(data); } diff --git a/src/webdata/ReviewIndex.java b/src/webdata/ReviewIndex.java index 8d0cbb9..15f6832 100644 --- a/src/webdata/ReviewIndex.java +++ b/src/webdata/ReviewIndex.java @@ -5,9 +5,9 @@ import java.util.List; public class ReviewIndex implements Serializable{ - private class ReviewInfo implements Serializable { - private byte[] encodedInfo; - private byte score; + public class ReviewInfo implements Serializable { + public byte[] encodedInfo; + public byte score; private void readObject(ObjectInputStream inputFile) throws ClassNotFoundException, IOException { @@ -33,20 +33,8 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException /** * insert the given data into the list containing all the information of reviews. 
*/ - public void insertData(List> inData) { - data = new ArrayList<>(); - for (List entry : inData) { - ReviewInfo rI = new ReviewInfo(); - int[] info = new int[4]; - byte score = (byte) entry.get(4).intValue(); - info[PRODUCTID_INDEX] = entry.get(PRODUCTID_INDEX); - info[HELPFNUM_INDEX] = entry.get(HELPFNUM_INDEX); - info[HELPFDNOM_INDEX] = entry.get(HELPFDNOM_INDEX); - info[REVIEWLENGTH_INDEX] = entry.get(REVIEWLENGTH_INDEX); - rI.encodedInfo = Encoding.groupVarintEncode(info); - rI.score = score; - data.add(rI); - } + public void insertData(ArrayList inData) { + this.data = inData; } /** @@ -93,4 +81,28 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeObject(this.data); } + + public void save(String outputFile) throws IOException { + ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile))); + for (ReviewInfo rI : data) { + oos.writeObject(rI); + oos.reset(); + } + oos.close(); + } + + public void load(String inputFile) throws IOException, ClassNotFoundException { + data = new ArrayList<>(); + ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(inputFile))); + while (true) { + ReviewInfo rI = null; + try { + rI = (ReviewInfo) ois.readObject(); + } catch (EOFException ex) { + break; + } + data.add(rI); + } + ois.close(); + } } diff --git a/src/webdata/SlowIndexWriter.java b/src/webdata/SlowIndexWriter.java deleted file mode 100644 index 0833cb2..0000000 --- a/src/webdata/SlowIndexWriter.java +++ /dev/null @@ -1,219 +0,0 @@ -package webdata; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.*; - -public class SlowIndexWriter { - private TreeMap> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review. 
- private TreeMap> productIds; - private TreeMap> reviewIds; - private String dir; - - private static final String PRODUCT_INDEX_FILE = "product_index.txt"; - private static final String REVIEW_INDEX_FILE = "review_index.txt"; - private static final String TOKEN_INDEX_FILE = "token_index.txt"; - private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; - - - /** - * Given product review data, creates an on disk index - * inputFile is the path to the file containing the review data - */ - public void slowWrite(String inputFile, String dir) { - this.dir = dir; - createDicts(inputFile); - createDir(); - createProductIndex(); - createTokenIndex(); - createReviewIndex(); - } - - /** - * Delete all index files by removing the given directory - */ - public void removeIndex(String dir) { - File dirToRemove = new File(dir); - File[] contents = dirToRemove.listFiles(); - if (contents != null) { - for (File file : contents) { - file.delete(); - } - } - dirToRemove.delete(); - } - - /** - * Create a new directory in the path specified in the instance initialization. - */ - private void createDir(){ - Path path = Path.of(this.dir); - try { - Files.createDirectories(path); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Create temporary dictionaries that will store all information, before saving the indices to the disk. 
- * @param inputFile - */ - private void createDicts(String inputFile){ - productIds = new TreeMap<>(); - tokenDict = new TreeMap<>(); - reviewIds = new TreeMap<>(); - - DataParser dataParser = null; - try { - dataParser = new DataParser(inputFile); - } catch (IOException e) { - System.out.println("Error occurred while reading the reviews input file."); - System.exit(1); - } - - for (int i = 0; i < dataParser.allReviews.size(); i++) { - addProductId(dataParser.allReviews.get(i).get("productId"), i + 1); - int length = addReviewText(dataParser.allReviews.get(i).get("text"), i + 1); - addReviewId(dataParser.allReviews.get(i), i, length); - } - } - - /** - * Split the given text of the i-th review into tokens and add them to the tokens dictionary. - * @param reviewText the text of the review that should be added. - * @param reviewIndex the number of the given review. - * @return the number of tokens in the given review text. - */ - private int addReviewText(String reviewText, int reviewIndex){ - String[] tokens = reviewText.split("[^a-zA-Z0-9]"); // split to alphanumeric tokens - int reviewLength = 0; - for (String token: tokens){ - if (!token.matches("[a-zA-Z0-9]+")){ - continue; - } - reviewLength += 1; - token = token.toLowerCase(); - if (tokenDict.containsKey(token)){ // token already exists, update its entry - List tokenInfo = tokenDict.get(token); - // check if the current review was already added to the token's review list. If yes, increase the # appearances of the token, else add it with # appearance = 1. 
- if (tokenInfo.get(tokenInfo.size()-2) == reviewIndex){ - tokenInfo.set(tokenInfo.size()-1 ,tokenInfo.get(tokenInfo.size()-1) + 1); - } else { // token appears first time in the given review - tokenInfo.add(reviewIndex); - tokenInfo.add(1); - } - } - else{ // token seen for the first time, add a new entry for it - tokenDict.put(token, new ArrayList<>(Arrays.asList(reviewIndex, 1))); - } - } - return reviewLength; - } - - /** - * Update the productId dictionary by adding to it the given product. If the product already exists, it adds review - * id to the reviews that are matching to this product. - */ - private void addProductId(String productId, int reviewId) { - if (!productIds.containsKey(productId)) { - productIds.put(productId, new ArrayList<>(Arrays.asList(reviewId, 0))); - } - else { - ArrayList product = productIds.get(productId); - product.set(1, product.get(1) + 1); - } - } - - /** - * Adds all the information that is relevant to the given reviewId to the reviewIds dictionary. - */ - private void addReviewId(HashMap review, int reviewId, int length) { - reviewIds.put(reviewId, new ArrayList<>()); - // 0 - productId, 1 - score, 2 - helpfulness, 3 - length - for (String field : DataParser.INTEREST_FIELDS) { - if (field.equals("text")) { continue; } - reviewIds.get(reviewId).add(review.get(field)); - } - reviewIds.get(reviewId).add(String.valueOf(length)); - } - - /** - * Creates and saves to the disk the product index, i.e. all the information that is related to products. 
- */ - private void createProductIndex() { - LinkedList ids = new LinkedList<>(productIds.keySet()); - ArrayList> vals = new ArrayList<>(productIds.values()); - int k = 8; - KFront kf = new KFront(); - kf.createKFront(k, ids); - for (int i = 0; i < vals.size(); i++) { - kf.getTable().get(i).addAll(vals.get(i)); - } - - ProductIndex pIndex = new ProductIndex(k); - pIndex.insertData(kf.getTable(), kf.getConcatString()); - saveToDir(PRODUCT_INDEX_FILE, pIndex); - } - - /** - * Creates the index file for the tokens in the collection. - * The index is created using the k-1-in-k front coding method. - */ - private void createTokenIndex(){ - LinkedList tokens = new LinkedList<>(tokenDict.keySet()); - ArrayList> vals = new ArrayList<>(tokenDict.values()); - int k = 8; - - KFront kf = new KFront(true); - kf.createKFront(k, tokens); - - TokensIndex tIdx = new TokensIndex(k, this.dir); - tIdx.insertData(kf.getTable(), vals, kf.getConcatString()); - - saveToDir(TOKEN_INDEX_FILE, tIdx); - } - - /** - * Creates and saves to the disk the review index which hold all information related to reviews. 
- */ - private void createReviewIndex() { - // Revise the review dictionary to the correct structure & change productIDs to product index - LinkedList> dictValues = new LinkedList<>(); - for (int review : reviewIds.keySet()) { - ArrayList vals = reviewIds.get(review); - ArrayList new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0)); - new_vals.set(ReviewIndex.PRODUCTID_INDEX, productIds.headMap(vals.get(0)).size()); - String[] helpf = vals.get(2).split("/"); - new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0])); - new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1])); - new_vals.set(ReviewIndex.REVIEWLENGTH_INDEX, Integer.parseInt(vals.get(3))); - new_vals.set(ReviewIndex.SCORE_INDEX, (int) Float.parseFloat(vals.get(1))); - dictValues.add(new_vals); - } - ReviewIndex rIndex = new ReviewIndex(); - rIndex.insertData(dictValues); - - saveToDir(REVIEW_INDEX_FILE, rIndex); - } - - /** - * Save the given object to disk under the given name. The file is saved to the dir that was passed to the - * SlowWrite() function. 
- */ - private void saveToDir(String name, Object obj) { - FileOutputStream fileOut = null; - try { - fileOut = new FileOutputStream(this.dir + "/" + name); - ObjectOutputStream out = new ObjectOutputStream(fileOut); - out.writeObject(obj); - out.close(); - fileOut.close(); - } catch (IOException e) { - System.out.println("Error occurred while saving the index file: " + name); - e.printStackTrace(); - System.exit(1); - } - } -} \ No newline at end of file diff --git a/src/webdata/TokensIndex.java b/src/webdata/TokensIndex.java index 7c85d92..e423c76 100644 --- a/src/webdata/TokensIndex.java +++ b/src/webdata/TokensIndex.java @@ -3,36 +3,35 @@ import java.io.IOException; import java.io.*; import java.math.BigInteger; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.nio.charset.StandardCharsets; +import java.util.*; public class TokensIndex implements Serializable { public class TokenInfo implements Serializable{ - private short stringInfo; // This is either a pointer to the concatenated string, or a prefix size. - private short frequency; - private short collectionFrequency; + private int stringInfo; // This is either a pointer to the concatenated string, or a prefix size. 
+ private int frequency; + private int collectionFrequency; private short length; - private int invertedIndexPtr; + private long invertedIndexPtr; - public short getFrequency(){ return frequency;} - public short getCollectionFrequency(){ return collectionFrequency;} - public int getInvertedIdxPtr(){ return invertedIndexPtr;} + public int getFrequency(){ return frequency;} + public int getCollectionFrequency(){ return collectionFrequency;} + public long getInvertedIdxPtr(){ return invertedIndexPtr;} private void readObject(ObjectInputStream inputFile) throws IOException, ClassNotFoundException { - stringInfo = inputFile.readShort(); - frequency = inputFile.readShort(); - collectionFrequency = inputFile.readShort(); + stringInfo = inputFile.readInt(); + frequency = inputFile.readInt(); + collectionFrequency = inputFile.readInt(); length = inputFile.readShort(); - invertedIndexPtr = inputFile.readInt(); + invertedIndexPtr = inputFile.readLong(); } private void writeObject(ObjectOutputStream outputFile) throws IOException { - outputFile.writeShort(stringInfo); - outputFile.writeShort(frequency); - outputFile.writeShort(collectionFrequency); + outputFile.writeInt(stringInfo); + outputFile.writeInt(frequency); + outputFile.writeInt(collectionFrequency); outputFile.writeShort(length); - outputFile.writeInt(invertedIndexPtr); + outputFile.writeLong(invertedIndexPtr); } } @@ -43,8 +42,9 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { private static final String TOKEN_INVERTED_INDEX_FILE = "token_inverted_index.txt"; - private ArrayList data; + public ArrayList data; private String dictString; + private int dictBytes; private int numTokens; // the total number of tokens in the collection, including repetitions private int k; private String dir; @@ -53,17 +53,18 @@ private void writeObject(ObjectOutputStream outputFile) throws IOException { public TokensIndex(int k, String dir) { this.data = new ArrayList<>(); this.dictString = null; + 
this.dictBytes = 0; this.numTokens = 0; this.k = k; this.dir = dir; - createRandomAccessFile(); + createOutputFile(); } /** * Create a new RandomAccessFile to write the tokens inverted index into. * If such a file already exists, first remove it. */ - private void createRandomAccessFile(){ + private void createOutputFile(){ try { File file = new File(this.dir + "/" + TOKEN_INVERTED_INDEX_FILE); if (file.exists()){ @@ -77,56 +78,61 @@ private void createRandomAccessFile(){ } } - /** - * Insert the given information of token properties into the index format that should be saved. - * @param tokensData the data of the token containing its pointer/prefix length and token length as created in the KFront class. - * @param tokensVals a list of reviewId-num appearances of reviews containing every token and the number the token appeared in every review. - * @param concatString the concatenated string of all tokens in the collection, created by the KFront class. - */ - public void insertData(List> tokensData, ArrayList> tokensVals, String concatString){ + public void insertData(List> tokensData, String concatString, String pairsFilename) { dictString = concatString; + PairsLoader pl = new PairsLoader(pairsFilename); int offset = 0; + int[] curPair = pl.readPair(); // This should correspond to the first token + for (int i=0; i< tokensData.size(); i++){ List tokenData = tokensData.get(i); - List tokenVal = tokensVals.get(i); TokenInfo token = new TokenInfo(); - token.length = tokenData.get(TOKEN_LENGTH).shortValue(); - token.frequency = (short) (tokenVal.size() / 2); - token.collectionFrequency = (short) subListVals(tokenVal, "even").stream().mapToInt(Integer::intValue).sum(); - numTokens += token.getCollectionFrequency(); + ArrayList invertedIdx = new ArrayList<>(); + + invertedIdx.add(curPair[1]); + invertedIdx.add(1); + token.frequency++; + token.collectionFrequency++; + int[] nextPair = pl.readPair(); + while (nextPair != null && nextPair[0] == curPair[0]){ + if 
(nextPair[1] == curPair[1]) { // Token repetition inside the same doc + int docFreq = invertedIdx.remove(invertedIdx.size()-1); + invertedIdx.add(docFreq + 1); + } else { + invertedIdx.add(nextPair[1]); + invertedIdx.add(1); + token.frequency++; + } + token.collectionFrequency++; + curPair = nextPair; + nextPair = pl.readPair(); + } + curPair = nextPair; // Save the pair for the next token + try { - token.invertedIndexPtr = (int) this.invertedIndexFile.getFilePointer(); + token.invertedIndexPtr = this.invertedIndexFile.getFilePointer(); } catch (IOException e) { - System.out.println("Error occurred while accessing the token_inverted_index file"); e.printStackTrace(); System.exit(1); } - saveInvertedIndex(tokenVal); + saveInvertedIndex(invertedIdx); + + numTokens += token.collectionFrequency; + token.length = tokenData.get(TOKEN_LENGTH).shortValue(); if (offset == 0){ - token.stringInfo = tokenData.get(POINTER_INDEX).shortValue(); + token.stringInfo = tokenData.get(POINTER_INDEX); } else { - token.stringInfo = tokenData.get(PREFIX_INDEX).shortValue(); + token.stringInfo = tokenData.get(PREFIX_INDEX); } offset++; offset = offset % k; this.data.add(token); - } - } - /** - * Create a sub list of the given list containing only the odd/even elements in the array - * @param inputList the list that should be sliced - * @param type can be `odd` or `even` - * @return a List of integers containing only the elements in odd/even indices of the input array - */ - private List subListVals(List inputList, String type){ - int first = 0; - List subList = new ArrayList<>(); - if (type.equals("even")){ first = 1; } - for (int i = first; i < inputList.size(); i = i + 2){ - subList.add(inputList.get(i)); + token = null; + invertedIdx = null; + tokenData = null; } - return subList; + this.dictBytes = this.dictString.getBytes(StandardCharsets.UTF_8).length; } /** @@ -139,15 +145,9 @@ private void saveInvertedIndex(List valsList) { for (int i = valsList.size()-2; i>0; i = i - 2){ 
valsList.set(i, valsList.get(i) - valsList.get(i-2)); } - - StringBuilder stringCodes = new StringBuilder(); - for (int num : valsList) { - String code = Encoding.deltaEncode(num); - stringCodes.append(code); - } - byte[] codeBytes = Encoding.toByteArray(stringCodes.toString()); + byte[] codeBytes = Encoding.groupVarEncodeMultiple(valsList); this.invertedIndexFile.write(codeBytes); - } catch (Exception e){ + } catch (Exception e) { System.out.println("Error occurred while saving invertedIndex bytes"); e.printStackTrace(); System.exit(1); @@ -232,7 +232,8 @@ public int search(String str) { private void readObject(ObjectInputStream inputFile) throws IOException, ClassNotFoundException { k = inputFile.readInt(); - dictString = inputFile.readUTF(); + dictBytes = inputFile.readInt(); + dictString = new String(inputFile.readNBytes(dictBytes), StandardCharsets.UTF_8); numTokens = inputFile.readInt(); data = (ArrayList) inputFile.readObject(); @@ -240,7 +241,8 @@ private void readObject(ObjectInputStream inputFile) throws IOException, ClassNo private void writeObject(ObjectOutputStream outputFile) throws IOException { outputFile.writeInt(this.k); - outputFile.writeUTF(this.dictString); + outputFile.writeInt(this.dictBytes); + outputFile.writeBytes(this.dictString); outputFile.writeInt(this.numTokens); outputFile.writeObject(this.data); }