Main Mapreduce program - release - v1.0

piyush26c · piyush26c · commit b05a0b6df62e · 2023-12-13T06:06:32.000Z
diff --git a/main/.DS_Store b/main/.DS_Store
diff --git a/main/Dockerfile b/main/Dockerfile
@@ -0,0 +1,6 @@
+# Container image for the Streamlit front end (app.py).
+FROM python:3.8
+# The Streamlit server started by ENTRYPOINT listens on this port.
+EXPOSE 8080
+WORKDIR /app
+# Copy the whole build context into /app.
+# NOTE(review): this commit also adds a service-account JSON file under main/;
+# if main/ is the build context, that file is baked into the image. Confirm
+# this is intended.
+COPY . ./
+RUN pip install -r requirements.txt
+# Serve app.py with Streamlit on all interfaces, port 8080.
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
diff --git a/main/app.py b/main/app.py
@@ -0,0 +1,161 @@
+import streamlit as st
+import re
+import json
+import json
+import pandas as pd
+from google.cloud import storage
+import pandas as pd
+import time
+from datetime import datetime, timezone
+
+
+# Import CSS: emit a <link> tag that points the browser at style.css.
+# NOTE(review): this only works if "style.css" is reachable at that relative
+# URL from the rendered page; confirm the stylesheet is actually served,
+# otherwise none of its rules take effect.
+st.markdown('<link rel="stylesheet" href="style.css">', unsafe_allow_html=True)
+
+def is_file_updated_recently(threshold_seconds):
+    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
+    eccmr_final_result_bucket_name = 'eccmr_final_result_bucket'
+    eccmr_final_result_bucket = client.get_bucket(eccmr_final_result_bucket_name)
+    file_name = "final_results.json"
+    file_path = f"{file_name}"
+    blob = eccmr_final_result_bucket.blob(file_path) 
+
+    # Check if the file exists
+    if not blob.exists():
+        print(f"File '{file_path}' does not exist.")
+        return False
+       
+    # Retrieve the metadata
+    blob.reload()
+     # Check if 'updated' is not None
+    if blob.updated is None:
+        print(f"File '{file_path}' has no 'updated' timestamp.")
+        return False
+    
+    # Make both datetimes timezone-aware
+    updated_time = blob.updated.replace(tzinfo=timezone.utc)
+    current_time = datetime.utcnow().replace(tzinfo=timezone.utc)
+    time_difference = current_time - updated_time
+
+    return time_difference.total_seconds() <= threshold_seconds
+
+
+def is_valid_input(text):
+    return bool(re.match(r'^[^\s\n]+$', text))
+
+def find_results(input_text):
+    if not is_valid_input(input_text):
+        st.warning("Invalid input! Please avoid spaces and newline characters.")
+        return None
+
+    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
+    eccmr_final_result_bucket_name = 'eccmr_final_result_bucket'
+    eccmr_final_result_bucket = client.get_bucket(eccmr_final_result_bucket_name)
+    file_name = "final_results.json"
+    file_path = f"{file_name}"
+    # print('file_path:', file_path)
+    blob = eccmr_final_result_bucket.blob(file_path)
+    if not blob.exists():
+        print("result file does not exists.")
+        return None
+
+    # Download the content of the file as text
+    content_text = blob.download_as_text()
+    json_object = json.loads(content_text)
+
+    # preprocess the input word
+    input_text = input_text.strip().lower()
+    input_text = re.sub(r'[^a-zA-Z0-9\s]', '', input_text)
+    print("processed input text:", input_text)
+    df = None
+
+    if input_text in json_object.keys():
+        df = pd.DataFrame(list(json_object[input_text].items()), columns=['Document', 'Count'])
+        # Sorting the DataFrame by the second column in descending order
+        df = df.sort_values(by='Count', ascending=False)
+        # Reindexing the sorted DataFrame
+        df = df.reset_index(drop=True)
+
+    return df
+
+def save_uploaded_file(uploaded_file):
+    if uploaded_file is not None:
+        file_path = uploaded_file.name
+        with open(file_path, "wb") as f:
+            f.write(uploaded_file.getvalue())
+        
+        # uploading file in eccrm_dataset_bucket
+        client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
+        eccrm_dataset_bucket_name = 'eccrm_dataset_bucket'
+        eccrm_dataset_bucket = client.get_bucket(eccrm_dataset_bucket_name)
+        
+        # Define the file name and destination folder in the bucket
+        file_name = uploaded_file.name.split("/")[-1]
+        folder_name = "dataset"
+        destination_blob_name = f"{folder_name}/{file_name}"
+        
+        # Upload the file to GCS
+        blob = eccrm_dataset_bucket.blob(destination_blob_name)
+        blob.upload_from_filename(file_path)
+
+        return f"File uploaded successfully."
+    return None
+
+def main():
+    st.title("Map Reduce Assignment (Engineering Cloud Computing)")
+    tabs = ["Find Occurances", "Upload File"]
+    choice = st.sidebar.selectbox("Select Tab:", tabs)
+    
+    if choice == tabs[0]:
+        st.header(tabs[0])
+
+        # Tab 1 Content
+        input_text = st.text_area("Enter Text:", placeholder="Enter the word ...", help="Enter word that you want to search in corpus and its occurances in each document")
+
+        if st.button("Find"):
+            if is_valid_input(input_text):
+                with st.spinner("Finding results..."):
+                    result_df = find_results(input_text)
+                if result_df is not None:
+                    st.dataframe(result_df, use_container_width=True)
+                else:
+                    st.info("Word not present in corpus.")
+            else:
+                st.warning("Invalid input! Please avoid spaces and newline characters.")
+
+    elif choice == tabs[1]:
+        st.header(tabs[1])
+
+        # Tab 2 Content
+        uploaded_file_key = "uploaded_file_key"
+        uploaded_file = st.file_uploader("Choose a .txt file:", type=["txt"])
+        if uploaded_file is not None:
+            if st.button("Upload"):
+                with st.spinner("Uploading file..."):
+                    upload_result = save_uploaded_file(uploaded_file)
+                st.success(upload_result)
+
+                with st.spinner("Updating the inverted index..."):
+                    loop_interval = 1
+                    threshold_seconds = 20
+                    while True:
+                        if is_file_updated_recently(threshold_seconds):
+                            break
+                        time.sleep(loop_interval)
+                st.success("Indexing complete. Thank you for waiting.")
+                # Display countdown message and refresh the page
+                refresh_in = 6
+                countdown_placeholder = st.empty()
+                for i in range(refresh_in, 0, -1):
+                    countdown_placeholder.text(f"Refreshing the page in {i} seconds.")
+                    time.sleep(1)
+                st.experimental_rerun()
+
+    # Adding the "Made with love (emoji) Piyush Chaudhari" message at the bottom
+    st.markdown(
+        '<div style="position: fixed; bottom: 10px; left: 50%; transform: translateX(-50%);">'
+        'Made with ❤️ Piyush Rajendra Chaudhari</div>',
+        unsafe_allow_html=True
+    )
+# Build the UI only when this file is executed as the main program.
+if __name__ == "__main__":
+    main()
diff --git a/main/piyush-chaudhari-fall2023-9ae1ed20a7f3.json b/main/piyush-chaudhari-fall2023-9ae1ed20a7f3.json
@@ -0,0 +1,13 @@
+{
+  "type": "service_account",
+  "project_id": "piyush-chaudhari-fall2023",
+  "private_key_id": "",
+  "private_key": "",
+  "client_email": "googlecloudstorage@piyush-chaudhari-fall2023.iam.gserviceaccount.com",
+  "client_id": "",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://oauth2.googleapis.com/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/googlecloudstorage%40piyush-chaudhari-fall2023.iam.gserviceaccount.com",
+  "universe_domain": "googleapis.com"
+}
diff --git a/main/requirements.txt b/main/requirements.txt
@@ -0,0 +1,18 @@
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+google-api-core==2.15.0
+google-auth==2.25.2
+google-cloud-core==2.4.1
+google-cloud-storage==2.13.0
+google-crc32c==1.5.0
+google-resumable-media==2.6.0
+googleapis-common-protos==1.62.0
+idna==3.6
+protobuf==4.25.1
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+requests==2.31.0
+rsa==4.9
+urllib3==2.1.0
+streamlit==1.29.0
diff --git a/main/style.css b/main/style.css
@@ -0,0 +1,50 @@
+/*
+ * Stylesheet for the Streamlit app.
+ * NOTE(review): selectors such as .css-vfskoc and .st-* look like generated
+ * class names; verify they still match the rendered DOM before relying on them.
+ */
+
+/* Page-wide background and text colour. */
+body {
+    background-color: #f4f7fa;
+    color: #333;
+}
+
+/* Blue background for the targeted sidebar element. */
+.sidebar .css-vfskoc {
+    background-color: #007bff;
+}
+
+/* White, bordered, rounded card with a soft shadow. */
+.st-d0 .st-cg .st-bp {
+    background-color: #ffffff;
+    border: 1px solid #c8d1e0;
+    border-radius: 5px;
+    padding: 15px;
+    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+}
+
+/* Blue accent text. */
+.st-cc {
+    color: #007bff;
+}
+
+/* Blue buttons with white text. */
+.st-cg button {
+    background-color: #007bff;
+    color: #ffffff;
+}
+
+/* Blue accent text and links. */
+.st-ct {
+    color: #007bff;
+}
+
+.st-ct a {
+    color: #007bff;
+}
+
+/* Space above buttons inside the card. */
+.st-d0 .st-cg .st-bp button {
+    margin-top: 10px;
+}
+
+/* Fixed-position box centred in the viewport, drawn above other content. */
+.st-bb {
+    position: fixed;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    z-index: 1000;
+    background: rgba(255, 255, 255, 0.8);
+    border-radius: 10px;
+    padding: 20px;
+    text-align: center;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+}