Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
b24ee90
Ex2 first commit
darkushin May 3, 2021
aa21bcc
DataLoader added.
nirnits May 3, 2021
aa14759
updated data parser
darkushin May 3, 2021
d679760
dl change
nirnits May 4, 2021
be529b3
addReviewText before saving files
darkushin May 4, 2021
c9b9b66
sortBuffer + saveBuffer
nirnits May 5, 2021
c3c382e
Before sorting tokens alphanumerically
darkushin May 9, 2021
3eb81ec
Added ExternalMergeSort - before implementing
darkushin May 9, 2021
bfdfa1c
ExternalMergeSort
darkushin May 10, 2021
dc57512
ExternalMergeSort - before debugging
darkushin May 11, 2021
c66a861
ExternalMergeSort - train debugging
darkushin May 11, 2021
ed15c2b
Beginning of TokenIndex
darkushin May 14, 2021
3cbe607
prepareTokenDict
darkushin May 15, 2021
12d27f2
other dicts
nirnits May 17, 2021
f681df9
Sorting works?
nirnits May 17, 2021
87b6bd5
EM sort should be working (almost?)
nirnits May 17, 2021
1b19874
sort improvements
nirnits May 17, 2021
baad054
Token Index is (almost?) done.
nirnits May 18, 2021
3a5e5dc
Code after debugging, before cleaning
darkushin May 21, 2021
da01cbe
Cleaned code, before optimization
darkushin May 22, 2021
d8c4a18
Updated dictString writing in TokensIndex
darkushin May 24, 2021
5e6ebbe
test cs computers
May 24, 2021
f427cdb
time
nirnits May 24, 2021
53040fc
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 24, 2021
4ca2e6d
Added time info
darkushin May 24, 2021
8912132
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 24, 2021
4281252
saveInvertedIndex time improvement
darkushin May 24, 2021
bde646d
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 24, 2021
1981b1e
Before changing saveInvertedIndex to save integers
darkushin May 24, 2021
9e035c3
Buffered RandomAccessFile
darkushin May 24, 2021
62ab1a6
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 24, 2021
20b8d41
reviewIndex optimization start
darkushin May 25, 2021
e2beb22
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 25, 2021
4377a53
Changed reviewIndex to use hashMap
darkushin May 25, 2021
e405f22
Improved tokenIndex, before changing all LinkedLists
darkushin May 25, 2021
95aafb6
Removed LinkedLists
darkushin May 25, 2021
b10d7a9
TokensIndex without encoding
darkushin May 25, 2021
52b165a
Merge branch 'Ex2' of https://github.com/darkushin/Web-Infromation-Re…
nirnits May 25, 2021
c85587d
Basic Test are working. before investigating data loader & parser
darkushin May 25, 2021
5c80f65
groupvarint encoding for inverted index
nirnits May 26, 2021
1a45ac9
encoding merge
nirnits May 26, 2021
2f70bc3
changed stringInfo to int
nirnits May 26, 2021
8339097
Converting short->int
darkushin May 26, 2021
4137867
External Merge Sort logs
darkushin May 30, 2021
e3535cb
Analysis code
darkushin May 30, 2021
9c3fd41
Text Creator tests
darkushin May 30, 2021
ee93588
DataParser + Loader changes
nirnits May 30, 2021
6a77a77
again
nirnits May 30, 2021
6f44fcc
Updated Data Parsing
darkushin May 31, 2021
d0e2e3c
Cleaned Code, with time measuring
darkushin May 31, 2021
19e6879
Changed switch-case in DataParser
darkushin May 31, 2021
7287e07
Cleaned code for submission, before running tests again
darkushin May 31, 2021
c12c018
Submission code
darkushin May 31, 2021
42ea768
Final Submission
darkushin May 31, 2021
6ffc156
Ex2 fix
Jun 20, 2021
b8cc081
fixed reviewdict bug
nirnits Jun 21, 2021
42089ee
commented memory improvements
nirnits Jun 21, 2021
c00346e
not working changes
darkushin Jun 22, 2021
6e384a6
reviewindex save/load
nirnits Jun 23, 2021
3728348
reviewIndex save/load #2
nirnits Jun 23, 2021
b43ef58
working code - before cleanup
darkushin Jun 24, 2021
5baa5b4
Ex2 - fixed code
darkushin Jun 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions src/webdata/Analysis.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package webdata;

import java.util.ArrayList;
import java.util.Date;
import java.util.Random;

/**
 * Small benchmarking harness for an {@code IndexReader}.
 *
 * <p>On construction it draws 100 random tokens from the reader's token index and
 * records how long it takes to call {@code getReviewsWithToken} and
 * {@code getTokenFrequency} once for each of them. The results of those calls are
 * discarded; only the elapsed time (in milliseconds) is kept.
 */
public class Analysis {
    /** Number of nanoseconds in one millisecond, used to report timings in ms. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    private final IndexReader indexReader;
    private final TokensIndex tokensIndex;
    /** Tokens sampled by {@link #getRandomTokens(int)}; the same token may appear more than once. */
    private final ArrayList<String> randomTokens;
    /** Total time of the {@code getReviewsWithToken} pass, in milliseconds. */
    private long getReviewsWithTokenTime;
    /** Total time of the {@code getTokenFrequency} pass, in milliseconds. */
    private long getTokenFrequencyTime;

    /**
     * Samples tokens from the given reader's token index and runs both measurements.
     *
     * @param indexReader the reader whose lookups are timed
     */
    public Analysis(IndexReader indexReader) {
        this.indexReader = indexReader;
        this.tokensIndex = indexReader.tokenIndex;
        this.randomTokens = new ArrayList<>();
        this.getReviewsWithTokenTime = 0;
        this.getTokenFrequencyTime = 0;

        getRandomTokens(100);
        measureGetReviewsWithToken();
        measureTokenFrequencyTime();
    }

    /** Times one {@code getReviewsWithToken} call per sampled token and stores the total in ms. */
    private void measureGetReviewsWithToken() {
        // nanoTime is monotonic, so the difference is not affected by wall-clock adjustments.
        long start = System.nanoTime();
        for (String token : this.randomTokens) {
            indexReader.getReviewsWithToken(token);
        }
        long elapsedNanos = System.nanoTime() - start;
        this.getReviewsWithTokenTime = elapsedNanos / NANOS_PER_MILLI;
    }

    /** Times one {@code getTokenFrequency} call per sampled token and stores the total in ms. */
    private void measureTokenFrequencyTime() {
        long start = System.nanoTime();
        for (String token : this.randomTokens) {
            indexReader.getTokenFrequency(token);
        }
        long elapsedNanos = System.nanoTime() - start;
        this.getTokenFrequencyTime = elapsedNanos / NANOS_PER_MILLI;
    }

    /**
     * Get n random tokens from the index and append them to the sample list.
     *
     * <p>Indices are drawn independently, so duplicates are possible. If the token
     * index is empty nothing is sampled (a zero bound would make
     * {@code Random.nextInt} throw).
     *
     * @param n how many tokens to draw; non-positive values draw nothing
     */
    public void getRandomTokens(int n) {
        int tokenCount = this.tokensIndex.data.size();
        if (tokenCount <= 0) {
            return;
        }
        Random random = new Random();
        for (int i = 0; i < n; i++) {
            int randIndex = random.nextInt(tokenCount); // uniform in [0, tokenCount)
            this.randomTokens.add(tokensIndex.getWordAt(randIndex));
        }
    }

    /**
     * Runs the analysis against the index stored in {@code ./Data_index} and prints both timings.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        IndexReader indexReader = new IndexReader("./Data_index");
        Analysis analysis = new Analysis(indexReader);
        System.out.println("getReviewsWithToken runtime: " + analysis.getReviewsWithTokenTime + "(ms)");
        System.out.println("getTokenFrequency runtime: " + analysis.getTokenFrequencyTime + "(ms)");
    }
}
63 changes: 63 additions & 0 deletions src/webdata/DataLoader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package webdata;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Reads a raw review file and hands it out one review at a time, each review being
 * the list of text lines that belong to it.
 *
 * <p>A new review starts at every line containing {@code product/productId}. To detect
 * that boundary the loader has to read the first line of the next review, so that line
 * is kept in an internal buffer until the next call.
 *
 * <p>All iterators returned by {@link #iterator()} share the same underlying reader and
 * buffer; the file can be traversed only once.
 */
public class DataLoader implements Iterable<ArrayList<String>> {
    private final BufferedReader br;
    /** Lines collected so far for the review that will be returned next. */
    private ArrayList<String> reviewStrings;
    /** Set once the end of the file has been reached and the reader has been closed. */
    private boolean exhausted;

    /**
     * Opens the given review file for reading.
     *
     * @param inputFile path of the file containing the raw reviews
     * @throws FileNotFoundException if the file cannot be opened
     */
    public DataLoader(String inputFile) throws FileNotFoundException {
        br = new BufferedReader(new FileReader(inputFile));
        reviewStrings = new ArrayList<>();
        exhausted = false;
    }

    /**
     * Reads and returns the lines of the next review.
     *
     * <p>On an I/O error the stack trace is printed and the JVM exits with status 1.
     *
     * @return the lines of the next review; an empty list once every review has been returned
     */
    public ArrayList<String> readSingleReview() {
        if (!exhausted) {
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    if (line.contains("product/productId") && !reviewStrings.isEmpty()) {
                        // This line opens the NEXT review: return what we have and keep
                        // the line buffered as the start of the following review.
                        ArrayList<String> completed = reviewStrings;
                        reviewStrings = new ArrayList<>();
                        reviewStrings.add(line);
                        return completed;
                    }
                    reviewStrings.add(line);
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
            // End of file: nothing more will ever be read, so release the file handle.
            exhausted = true;
            closeQuietly();
        }
        // Hand over whatever is buffered and reset, so the last review is returned only once.
        ArrayList<String> last = reviewStrings;
        reviewStrings = new ArrayList<>();
        return last;
    }

    /** Closes the reader, ignoring failures: by the time this is called all data was already read. */
    private void closeQuietly() {
        try {
            br.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close after a complete read.
        }
    }

    /**
     * Returns an iterator over the remaining reviews of the file.
     *
     * @return an iterator whose {@code next()} delegates to {@link #readSingleReview()}
     */
    @Override
    public Iterator<ArrayList<String>> iterator() {
        return new Iterator<ArrayList<String>>() {
            @Override
            public boolean hasNext() {
                // A buffered first line of the next review counts as remaining data even
                // if the stream itself is already at end of file.
                if (!reviewStrings.isEmpty()) {
                    return true;
                }
                if (exhausted) {
                    return false;
                }
                try {
                    // Peek one character without consuming it.
                    br.mark(1);
                    int peeked = br.read();
                    br.reset();
                    return peeked != -1;
                } catch (IOException e) {
                    return false;
                }
            }

            @Override
            public ArrayList<String> next() {
                return readSingleReview();
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }
}
116 changes: 87 additions & 29 deletions src/webdata/DataParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,110 @@


/**
 * Turns raw product-review text into {@link Review} objects holding the fields of
 * interest: productId, score, helpfulness and text.
 *
 * <p>Two input shapes are supported: a whole review collapsed into a single string
 * ({@link #parseReview(String)}) and a review given as its individual lines
 * ({@link #parseReview(ArrayList)}).
 */
public class DataParser {

    /** Plain holder for the parsed fields of one review. Fields that were not found stay {@code null}. */
    public class Review {
        private String text;
        private String productId;
        private String score;
        private String helpfulness;

        public String getText() {
            return text;
        }

        public String getProductId() {
            return productId;
        }

        public String getHelpfulness() {
            return helpfulness;
        }

        public String getScore() {
            return score;
        }

        public void setHelpfulness(String helpfulness) {
            this.helpfulness = helpfulness;
        }

        public void setProductId(String productId) {
            this.productId = productId;
        }

        public void setScore(String score) {
            this.score = score;
        }

        public void setText(String text) {
            this.text = text;
        }
    }

    /**
     * Parses every raw review in the given list.
     *
     * @param rawReviews reviews, each collapsed into a single string
     * @return one {@link Review} per input string, in the same order
     */
    public List<Review> parseData(List<String> rawReviews) {
        ArrayList<Review> allReviews = new ArrayList<>();
        for (String review : rawReviews) {
            allReviews.add(parseReview(review));
        }
        return allReviews;
    }

    /**
     * Given a single review, parse the review and return a Review object, containing all relevant information from the
     * given review, i.e. productId, score, helpfulness and text.
     *
     * <p>The string is split on the literal {@code review/}; the first piece is expected to
     * carry the product id and every later piece a {@code name: value} field. Pieces that do
     * not have that shape are skipped instead of raising an index error.
     *
     * @param review the whole review as one string
     * @return the parsed review
     */
    public Review parseReview(String review) {
        ArrayList<String> fields = new ArrayList<>(Arrays.asList(review.split("review/")));
        Review parsedReview = new Review();

        // First piece looks like "product/productId: <id>", possibly followed by other "product/" fields.
        String[] head = fields.get(0).split(": ");
        if (head.length > 1) {
            String[] idParts = head[1].split("product/");
            if (idParts.length > 0) {
                parsedReview.setProductId(idParts[0]);
            }
        }

        for (int i = 1; i < fields.size(); i++) {
            List<String> fieldValue = Arrays.asList(fields.get(i).split(": "));
            String fieldName = fieldValue.get(0);
            if (fieldName.equals("text")) {
                // The text itself may contain ": ", so glue the remaining pieces back together.
                parsedReview.setText(String.join(": ", fieldValue.subList(1, fieldValue.size())));
            } else if (fieldValue.size() < 2) {
                // Field without a value: nothing to store.
                continue;
            } else if (fieldName.equals("helpfulness")) {
                parsedReview.setHelpfulness(fieldValue.get(1));
            } else if (fieldName.equals("score")) {
                parsedReview.setScore(fieldValue.get(1));
            }
        }
        return parsedReview;
    }

    /**
     * Parses a review given as its individual lines.
     *
     * <p>Each field line has the form {@code prefix/name: value}. Once the {@code text}
     * field has been seen, every following non-empty line is treated as a continuation of
     * the text and appended with a single separating space. Lines that are not field lines
     * are ignored.
     *
     * @param review the lines of one review
     * @return the parsed review; its text is the empty string when no text field was found
     */
    public Review parseReview(ArrayList<String> review) {
        Review parsedReview = new Review();
        StringBuilder text = new StringBuilder();
        boolean readingText = false;
        for (String line : review) {
            if (readingText && !line.equals("")) {
                text.append(" ");
                text.append(line);
                continue;
            }
            int prefix = line.indexOf("/");
            int delim = line.indexOf(":");
            if (prefix == -1 || delim == -1 || delim < prefix) {
                continue;
            }
            String field = line.substring(prefix + 1, delim);
            if (field.equals("text")) {
                text.append(valueAfter(line, delim));
                readingText = true;
            } else if (field.equals("productId")) {
                parsedReview.setProductId(valueAfter(line, delim));
            } else if (field.equals("helpfulness")) {
                parsedReview.setHelpfulness(valueAfter(line, delim));
            } else if (field.equals("score")) {
                parsedReview.setScore(valueAfter(line, delim));
            }
        }
        parsedReview.setText(text.toString());
        return parsedReview;
    }

    /**
     * Returns the value part of a {@code name: value} line: everything after the colon at
     * {@code delim}, minus one separating space if present. Unlike a fixed
     * {@code substring(delim + 2)} this is safe when nothing follows the colon.
     */
    private static String valueAfter(String line, int delim) {
        int start = delim + 1;
        if (start < line.length() && line.charAt(start) == ' ') {
            start++;
        }
        return line.substring(start);
    }
}

Loading