|
| 1 | +import streamlit as st |
| 2 | +import re |
| 3 | +import json |
| 4 | +import json |
| 5 | +import pandas as pd |
| 6 | +from google.cloud import storage |
| 7 | +import pandas as pd |
| 8 | +import time |
| 9 | +from datetime import datetime, timezone |
| 10 | + |
| 11 | + |
# Inject a <link> tag so the rendered page references the external style.css.
# unsafe_allow_html is required, otherwise the tag is shown as escaped text.
# NOTE(review): the relative href is resolved by the browser against the app
# URL, not by Python -- confirm style.css is actually served at that location.
st.markdown(
    '<link rel="stylesheet" href="style.css">',
    unsafe_allow_html=True,
)
| 14 | + |
def is_file_updated_recently(threshold_seconds):
    """Report whether the final-results file in GCS was modified recently.

    Looks up ``final_results.json`` in the ``eccmr_final_result_bucket``
    bucket and compares its ``updated`` timestamp with the current UTC time.

    Args:
        threshold_seconds: Maximum age, in seconds, for the file to still
            count as recently updated.

    Returns:
        True when the file exists, carries an ``updated`` timestamp, and that
        timestamp is at most ``threshold_seconds`` old; False otherwise.
    """
    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
    eccmr_final_result_bucket = client.get_bucket('eccmr_final_result_bucket')
    file_path = "final_results.json"

    # get_blob() returns None for a missing object and otherwise loads the
    # metadata in the same request, so there is no window between an
    # existence check and a separate reload in which the object can vanish.
    blob = eccmr_final_result_bucket.get_blob(file_path)
    if blob is None:
        print(f"File '{file_path}' does not exist.")
        return False

    if blob.updated is None:
        print(f"File '{file_path}' has no 'updated' timestamp.")
        return False

    # Only label the timestamp as UTC when it is naive; an already-aware value
    # must keep its own offset or the computed age would be wrong.
    updated_time = blob.updated
    if updated_time.tzinfo is None:
        updated_time = updated_time.replace(tzinfo=timezone.utc)

    # datetime.utcnow() is deprecated; build an aware "now" directly.
    current_time = datetime.now(timezone.utc)
    time_difference = current_time - updated_time

    return time_difference.total_seconds() <= threshold_seconds
| 41 | + |
| 42 | + |
def is_valid_input(text):
    """Return True when *text* is one non-empty token without any whitespace.

    Args:
        text: The raw user input to validate.

    Returns:
        True if ``text`` is a non-empty string made up solely of
        non-whitespace characters; False otherwise (including for values that
        are not strings).
    """
    if not isinstance(text, str):
        return False
    # fullmatch is used instead of match + '$' because '$' also matches just
    # before a trailing newline, which let inputs such as "word\n" through.
    return bool(re.fullmatch(r'\S+', text))
| 45 | + |
def _normalize_query(word):
    """Trim and lower-case *word*, then drop every character that is not an ASCII letter, digit, or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', word.strip().lower())


def _occurrences_to_frame(occurrences):
    """Turn a mapping into a Document/Count table sorted by Count, highest first."""
    frame = pd.DataFrame(list(occurrences.items()), columns=['Document', 'Count'])
    frame = frame.sort_values(by='Count', ascending=False)
    # Renumber the rows so the index follows the sorted order.
    return frame.reset_index(drop=True)


def find_results(input_text):
    """Look up a word in the results file stored in GCS.

    Downloads ``final_results.json`` from the ``eccmr_final_result_bucket``
    bucket, normalizes the query word, and returns the entry stored under it.
    The JSON is expected to map each word to an object whose items become the
    Document/Count rows.

    Args:
        input_text: The word to search for; it must pass ``is_valid_input``.

    Returns:
        A DataFrame with ``Document`` and ``Count`` columns sorted by
        ``Count`` in descending order, or None when the input is invalid, the
        results file is missing, or the word is not a key in the file.
    """
    if not is_valid_input(input_text):
        st.warning("Invalid input! Please avoid spaces and newline characters.")
        return None

    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
    eccmr_final_result_bucket = client.get_bucket('eccmr_final_result_bucket')
    blob = eccmr_final_result_bucket.blob("final_results.json")
    if not blob.exists():
        print("result file does not exists.")
        return None

    # Download the content of the file as text and parse it.
    json_object = json.loads(blob.download_as_text())

    # Apply the normalization to the query before using it as a lookup key.
    input_text = _normalize_query(input_text)
    print("processed input text:", input_text)

    if input_text not in json_object:
        return None
    return _occurrences_to_frame(json_object[input_text])
| 80 | + |
def save_uploaded_file(uploaded_file):
    """Upload a user-provided file to the ``dataset/`` folder of the dataset bucket.

    The bytes are sent to GCS directly from memory. Nothing is written to the
    local disk, so no copy is left behind and the client-supplied file name
    can never overwrite a local file.

    Args:
        uploaded_file: An object exposing ``name`` and ``getvalue()`` (as
            returned by ``st.file_uploader``), or None.

    Returns:
        A success message string after the upload, or None when
        ``uploaded_file`` is None.
    """
    if uploaded_file is None:
        return None

    import mimetypes  # local import: only this function needs it

    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')
    eccrm_dataset_bucket = client.get_bucket('eccrm_dataset_bucket')

    # Keep only the last path component of the supplied name.
    file_name = uploaded_file.name.split("/")[-1]
    destination_blob_name = f"dataset/{file_name}"

    # Derive the content type from the file name, falling back to a generic
    # binary type when the extension is unknown.
    content_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"

    blob = eccrm_dataset_bucket.blob(destination_blob_name)
    blob.upload_from_string(uploaded_file.getvalue(), content_type=content_type)

    return "File uploaded successfully."
| 103 | + |
def _render_find_tab(title):
    """Draw the search tab: a text box, a Find button, and the result table."""
    st.header(title)
    input_text = st.text_area("Enter Text:", placeholder="Enter the word ...", help="Enter word that you want to search in corpus and its occurances in each document")

    if not st.button("Find"):
        return
    if not is_valid_input(input_text):
        st.warning("Invalid input! Please avoid spaces and newline characters.")
        return

    with st.spinner("Finding results..."):
        result_df = find_results(input_text)
        if result_df is not None:
            st.dataframe(result_df, use_container_width=True)
        else:
            st.info("Word not present in corpus.")


def _wait_for_index_update(threshold_seconds=20, poll_interval=1, max_wait_seconds=600):
    """Poll until the results file looks freshly written; return False on timeout."""
    deadline = time.monotonic() + max_wait_seconds
    while not is_file_updated_recently(threshold_seconds):
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)
    return True


def _render_upload_tab(title):
    """Draw the upload tab: upload a .txt file, wait for re-indexing, then rerun the app."""
    st.header(title)
    uploaded_file = st.file_uploader("Choose a .txt file:", type=["txt"])
    # The Upload button is only shown once a file has been chosen.
    if uploaded_file is None or not st.button("Upload"):
        return

    with st.spinner("Uploading file..."):
        upload_result = save_uploaded_file(uploaded_file)
        st.success(upload_result)

    with st.spinner("Updating the inverted index..."):
        index_updated = _wait_for_index_update()
    if not index_updated:
        # Bounded wait: without it a results file that is never rewritten
        # would keep this session spinning forever.
        st.error("Timed out waiting for the inverted index to update. Please try again later.")
        return

    st.success("Indexing complete. Thank you for waiting.")
    # Display countdown message and refresh the page.
    refresh_in = 6
    countdown_placeholder = st.empty()
    for i in range(refresh_in, 0, -1):
        countdown_placeholder.text(f"Refreshing the page in {i} seconds.")
        time.sleep(1)
    # Prefer st.rerun where it exists; fall back to the older experimental
    # name on Streamlit versions that only provide that one.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def main():
    """Render the app: title, sidebar tab selector, the selected tab, and a footer."""
    st.title("Map Reduce Assignment (Engineering Cloud Computing)")
    tabs = ["Find Occurances", "Upload File"]
    choice = st.sidebar.selectbox("Select Tab:", tabs)

    if choice == tabs[0]:
        _render_find_tab(tabs[0])
    elif choice == tabs[1]:
        _render_upload_tab(tabs[1])

    # Footer credit pinned to the bottom centre of the page.
    st.markdown(
        '<div style="position: fixed; bottom: 10px; left: 50%; transform: translateX(-50%);">'
        'Made with ❤️ Piyush Rajendra Chaudhari</div>',
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()
0 commit comments