Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions src/webdata/DataLoader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package webdata;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Streams a review file one review at a time.
 *
 * <p>A review is the run of consecutive lines that starts at a line containing
 * {@code product/productId} and ends right before the next such line (or at end of file).
 * Because the start of the next review is only recognised after its first line has been
 * read, that line is kept in an internal buffer between calls.
 *
 * <p>The loader owns a single reader, so every iterator obtained from {@link #iterator()}
 * shares the same position in the file. Instances are not safe for concurrent use.
 */
public class DataLoader implements Iterable<ArrayList<String>>, AutoCloseable {
    /** Marker contained in the first line of every review record. */
    private static final String REVIEW_START = "product/productId";

    private final BufferedReader br;

    /** Lines already consumed from the reader that belong to the next review to be returned. */
    private ArrayList<String> reviewStrings;

    /**
     * Opens the given review file for reading.
     *
     * @param inputFile path of the file containing the raw review data
     * @throws FileNotFoundException if the file does not exist or cannot be opened
     */
    public DataLoader(String inputFile) throws FileNotFoundException {
        br = new BufferedReader(new FileReader(inputFile));
        reviewStrings = new ArrayList<>();
    }

    /**
     * Reads the lines of the next review.
     *
     * <p>An I/O error while reading is treated as fatal: the stack trace is printed and the
     * JVM exits with status 1.
     *
     * @return the lines of the next review, in file order; an empty list once the input is
     *         exhausted
     */
    public ArrayList<String> readSingleReview() {
        String line;
        try {
            while ((line = br.readLine()) != null) {
                if (line.contains(REVIEW_START) && !reviewStrings.isEmpty()) {
                    // The header of the following review closes the current one.
                    return takeBuffered(line);
                }
                reviewStrings.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
        // End of file: hand out whatever is buffered and leave an empty buffer behind so the
        // same review is never returned twice.
        return takeBuffered(null);
    }

    /**
     * Detaches the buffered lines and starts a fresh buffer.
     *
     * @param firstLineOfNext line that already belongs to the following review, or
     *                        {@code null} if there is none
     * @return the lines that were buffered before the call
     */
    private ArrayList<String> takeBuffered(String firstLineOfNext) {
        ArrayList<String> finished = reviewStrings;
        reviewStrings = new ArrayList<>();
        if (firstLineOfNext != null) {
            reviewStrings.add(firstLineOfNext);
        }
        return finished;
    }

    /**
     * Returns an iterator over the remaining reviews. All iterators share this loader's
     * reader, so the file can be traversed only once.
     */
    @Override
    public Iterator<ArrayList<String>> iterator() {
        return new Iterator<>() {
            @Override
            public boolean hasNext() {
                // A buffered header line means one more review is pending even when the
                // underlying reader has already reached end of file.
                if (!reviewStrings.isEmpty()) {
                    return true;
                }
                try {
                    // Peek a single character without consuming it.
                    br.mark(1);
                    int peeked = br.read();
                    br.reset();
                    return peeked != -1;
                } catch (IOException e) {
                    // NOTE(review): an I/O failure (including a closed reader) is reported
                    // as end of input here, while readSingleReview() treats it as fatal.
                    return false;
                }
            }

            @Override
            public ArrayList<String> next() {
                if (!hasNext()) {
                    throw new java.util.NoSuchElementException();
                }
                return readSingleReview();
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Closes the underlying reader. Afterwards {@code hasNext()} reports {@code false} once
     * the buffered lines have been handed out.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        br.close();
    }
}
116 changes: 87 additions & 29 deletions src/webdata/DataParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,110 @@


/**
 * Turns raw review records into {@link Review} objects holding only the fields the index
 * needs: product id, score, helpfulness and text.
 *
 * <p>Two input shapes are supported: a review as a list of lines
 * ({@link #parseReview(ArrayList)}) and a review as one string in which the lines were
 * concatenated without separators ({@link #parseReview(String)}).
 */
public class DataParser {
    /** Separator between a field name and its value. */
    private static final String KEY_VALUE_SEPARATOR = ": ";
    /** Prefix of every review-level field name. */
    private static final String REVIEW_PREFIX = "review/";
    /** Prefix of every product-level field name. */
    private static final String PRODUCT_PREFIX = "product/";
    /** Start of the text field inside a single-string review. */
    private static final String TEXT_FIELD = REVIEW_PREFIX + "text" + KEY_VALUE_SEPARATOR;

    /**
     * The relevant fields of one review. Every value is kept as the raw string found in the
     * input; a field that was not present stays {@code null} unless noted by the parser.
     */
    public static class Review {
        private String text;
        private String productId;
        private String score;
        private String helpfulness;

        public String getText() {
            return text;
        }

        public String getProductId() {
            return productId;
        }

        public String getHelpfulness() {
            return helpfulness;
        }

        public String getScore() {
            return score;
        }

        public void setHelpfulness(String helpfulness) {
            this.helpfulness = helpfulness;
        }

        public void setProductId(String productId) {
            this.productId = productId;
        }

        public void setScore(String score) {
            this.score = score;
        }

        public void setText(String text) {
            this.text = text;
        }
    }

    /**
     * Parses every raw review string of the given list.
     *
     * @param rawReviews reviews in the single-string form accepted by
     *                   {@link #parseReview(String)}
     * @return one {@link Review} per input entry, in the same order
     */
    public List<Review> parseData(List<String> rawReviews) {
        List<Review> allReviews = new ArrayList<>(rawReviews.size());
        for (String review : rawReviews) {
            allReviews.add(parseReview(review));
        }
        return allReviews;
    }

    /**
     * Parses a review given as a single string, i.e.
     * {@code product/productId: <id>product/...review/<field>: <value>...review/text: <text>}.
     *
     * <p>Everything after the first {@code review/text: } marker is taken as the text, so a
     * text that itself contains {@code review/} is not cut short or mistaken for other fields.
     * Fields that are missing or have no {@code ": "} separator are left {@code null}.
     *
     * @param review the raw review string
     * @return the parsed review
     */
    public Review parseReview(String review) {
        Review parsedReview = new Review();

        String header = review;
        int textStart = review.indexOf(TEXT_FIELD);
        if (textStart != -1) {
            parsedReview.setText(review.substring(textStart + TEXT_FIELD.length()));
            header = review.substring(0, textStart);
        }

        String[] fields = header.split(REVIEW_PREFIX);

        // fields[0] holds the product part: "product/productId: <id>product/title: ...".
        String[] product = fields[0].split(KEY_VALUE_SEPARATOR, 2);
        if (product.length == 2) {
            int nextProductField = product[1].indexOf(PRODUCT_PREFIX);
            parsedReview.setProductId(
                    nextProductField == -1 ? product[1] : product[1].substring(0, nextProductField));
        }

        for (int i = 1; i < fields.length; i++) {
            String[] keyValue = fields[i].split(KEY_VALUE_SEPARATOR, 2);
            if (keyValue.length < 2) {
                continue;
            }
            if (keyValue[0].equals("helpfulness")) {
                parsedReview.setHelpfulness(keyValue[1]);
            } else if (keyValue[0].equals("score")) {
                parsedReview.setScore(keyValue[1]);
            }
        }
        return parsedReview;
    }

    /**
     * Parses a review given as its lines, each of the form {@code <group>/<field>: <value>}.
     *
     * <p>Once the {@code text} field is reached, every following non-empty line is treated
     * as a continuation of the text and appended after a single space. The text of the
     * returned review is never {@code null}; it is empty when no text field was found.
     *
     * @param review the lines of one review
     * @return the parsed review
     */
    public Review parseReview(ArrayList<String> review) {
        Review parsedReview = new Review();
        StringBuilder text = new StringBuilder();
        boolean readingText = false;
        for (String line : review) {
            if (readingText) {
                if (!line.isEmpty()) {
                    text.append(' ').append(line);
                }
                continue;
            }
            int prefix = line.indexOf('/');
            int delim = line.indexOf(':');
            if (prefix == -1 || delim == -1 || delim < prefix) {
                continue; // not a "<group>/<field>: <value>" line
            }
            String field = line.substring(prefix + 1, delim);
            switch (field) {
                case "text":
                    text.append(valueAfter(line, delim));
                    readingText = true;
                    break;
                case "productId":
                    parsedReview.setProductId(valueAfter(line, delim));
                    break;
                case "helpfulness":
                    parsedReview.setHelpfulness(valueAfter(line, delim));
                    break;
                case "score":
                    parsedReview.setScore(valueAfter(line, delim));
                    break;
                default:
                    break; // field is not needed by the index
            }
        }
        parsedReview.setText(text.toString());
        return parsedReview;
    }

    /**
     * Returns the value that follows the colon at {@code delim}, skipping the single space
     * that normally separates them. Safe for lines that end right after the colon.
     */
    private static String valueAfter(String line, int delim) {
        int start = delim + 1;
        if (start < line.length() && line.charAt(start) == ' ') {
            start++;
        }
        return line.substring(start);
    }
}

61 changes: 37 additions & 24 deletions src/webdata/SlowIndexWriter.java → src/webdata/IndexWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
import java.nio.file.Path;
import java.util.*;

public class SlowIndexWriter {
public class IndexWriter {
private TreeMap<String, ArrayList<Integer>> tokenDict; // keys are tokens, values are a list where odd cells are review ids including this token and even cells are the times the token appeared in the review.
private TreeMap<String, ArrayList<Integer>> productIds;
private TreeMap<Integer, ArrayList<String>> reviewIds;
private LinkedList<ArrayList<String>> reviewIds;
private String dir;

private static final String PRODUCT_INDEX_FILE = "product_index.txt";
Expand All @@ -21,10 +21,10 @@ public class SlowIndexWriter {
* Given product review data, creates an on disk index
* inputFile is the path to the file containing the review data
*/
public void slowWrite(String inputFile, String dir) {
public void write(String inputFile, String dir) {
this.dir = dir;
createDicts(inputFile);
createDir();
createDicts(inputFile);
createProductIndex();
createTokenIndex();
createReviewIndex();
Expand Down Expand Up @@ -63,20 +63,26 @@ private void createDir(){
private void createDicts(String inputFile){
productIds = new TreeMap<>();
tokenDict = new TreeMap<>();
reviewIds = new TreeMap<>();
reviewIds = new LinkedList<>();

DataParser dataParser = null;
DataLoader dataLoader = null;
DataParser dataParser = new DataParser();
try {
dataParser = new DataParser(inputFile);
dataLoader = new DataLoader(inputFile);
} catch (IOException e) {
e.printStackTrace();
System.out.println("Error occurred while reading the reviews input file.");
System.exit(1);
}

for (int i = 0; i < dataParser.allReviews.size(); i++) {
addProductId(dataParser.allReviews.get(i).get("productId"), i + 1);
int length = addReviewText(dataParser.allReviews.get(i).get("text"), i + 1);
addReviewId(dataParser.allReviews.get(i), i, length);
int i=1;
int readTokens = 0;
for (ArrayList<String> s: dataLoader){
DataParser.Review review = dataParser.parseReview(s);
addProductId(review.getProductId(), i);
int length = addReviewText(review.getText(), i);
addReviewId(review, i, length);
readTokens += length;
i++;
}
}

Expand Down Expand Up @@ -129,14 +135,16 @@ private void addProductId(String productId, int reviewId) {
/**
* Adds all the information that is relevant to the given reviewId to the reviewIds dictionary.
*/
private void addReviewId(HashMap<String, String> review, int reviewId, int length) {
reviewIds.put(reviewId, new ArrayList<>());
private void addReviewId(DataParser.Review review, int reviewId, int length) {
ArrayList<String> vals = new ArrayList<>();

// 0 - productId, 1 - score, 2 - helpfulness, 3 - length
for (String field : DataParser.INTEREST_FIELDS) {
if (field.equals("text")) { continue; }
reviewIds.get(reviewId).add(review.get(field));
}
reviewIds.get(reviewId).add(String.valueOf(length));
vals.add(review.getProductId());
vals.add(review.getScore());
vals.add(review.getHelpfulness());
vals.add(String.valueOf(length));

reviewIds.add(vals);
}

/**
Expand Down Expand Up @@ -180,11 +188,16 @@ private void createTokenIndex(){
*/
private void createReviewIndex() {
// Revise the review dictionary to the correct structure & change productIDs to product index
LinkedList<List<Integer>> dictValues = new LinkedList<>();
for (int review : reviewIds.keySet()) {
ArrayList<String> vals = reviewIds.get(review);
ArrayList<List<Integer>> dictValues = new ArrayList<>();
HashMap<String, Integer> productDict = new HashMap<>(productIds.size());
int i = 0;
for (String productId: productIds.keySet()){
productDict.put(productId, i);
i++;
}
for (ArrayList<String> vals : reviewIds) {
ArrayList<Integer> new_vals = new ArrayList<>(List.of(0, 0, 0, 0, 0));
new_vals.set(ReviewIndex.PRODUCTID_INDEX, productIds.headMap(vals.get(0)).size());
new_vals.set(ReviewIndex.PRODUCTID_INDEX, productDict.get(vals.get(0)));
String[] helpf = vals.get(2).split("/");
new_vals.set(ReviewIndex.HELPFNUM_INDEX, Integer.parseInt(helpf[0]));
new_vals.set(ReviewIndex.HELPFDNOM_INDEX, Integer.parseInt(helpf[1]));
Expand All @@ -200,7 +213,7 @@ private void createReviewIndex() {

/**
* Save the given object to disk under the given name. The file is saved to the dir that was passed to the
* SlowWrite() function.
* write() function.
*/
private void saveToDir(String name, Object obj) {
FileOutputStream fileOut = null;
Expand Down
Loading