diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml new file mode 100644 index 00000000..c7d3a2ca --- /dev/null +++ b/.github/workflows/docker-tutorial-image.yml @@ -0,0 +1,35 @@ +name: Docker image for tutorial + +on: + push: + branches: [ "tutorial" ] + +jobs: + build: + runs-on: self-hosted + + permissions: + contents: read + packages: write + + steps: + # Step 1: Checkout the repository + - name: Checkout Code + uses: actions/checkout@v4 + + # Step 2: Log in to GitHub Container Registry + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Step 3: Build and Push Docker Image + - name: Build and Push Docker Image + uses: docker/build-push-action@v4 + with: + context: . + file: ./Dockerfile.ksc2025 + push: true + tags: ghcr.io/psal-postech/torchsim_ksc2025:latest diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 32d6543c..fe8a4a7d 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -33,8 +33,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_add.py test_transcendental: @@ -54,8 +54,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transcendental.py test_activation: @@ -75,8 +75,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_activation.py test_batchnorm: @@ -96,8 +96,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_batchnorm.py test_bmm: @@ -117,8 +117,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_bmm.py test_cnn: @@ -138,8 +138,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size 
}}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cnn.py test_conv2d: @@ -159,8 +159,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py test_matmul: @@ -180,8 +180,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_matmul.py test_reduce: @@ -201,8 +201,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_reduce.py test_softmax: @@ -222,8 +222,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_softmax.py test_transpose2D: @@ -243,8 +243,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose2D.py test_view3D_2D: @@ -264,8 +264,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_view3D_2D.py test_layernorm: @@ -285,8 +285,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_layernorm.py test_mlp: @@ -306,8 +306,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_mlp.py test_resnet: @@ -327,8 +327,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - 
-e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py - name: Run test_resnet50.py @@ -337,8 +337,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50 test_transformer: @@ -358,8 +358,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transformer.py test_transpose3D: @@ -379,8 +379,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose3D.py test_sparsity: @@ -400,8 +400,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sparsity.py test_pool: @@ -421,8 +421,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_pool.py test_perceptron: @@ -442,8 +442,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_single_perceptron.py test_fusion: @@ -463,8 +463,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py - name: Run test_matmul_activation.py @@ -473,8 +473,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ 
inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py - name: Run test_matmul_scalar.py @@ -483,8 +483,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py - name: Run test_matmul_reduction.py @@ -493,8 +493,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py - name: Run test_bmm_reduction.py @@ -503,8 +503,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py - name: Run test_prologue_fusion.py @@ -513,8 +513,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py - name: Run test_transformer_fusion.py @@ -523,8 +523,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py - name: Run test_conv_fusion.py @@ -533,8 +533,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py test_moe: @@ -554,8 +554,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/MoE/test_moe.py test_mistral: @@ -575,8 +575,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 
PyTorchSim/tests/Mixtral_8x7B/test_attention.py test_vit: @@ -596,8 +596,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_vit.py test_diffusion: @@ -617,8 +617,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Diffusion/test_diffusion.py test_indirect: @@ -638,8 +638,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_indirect_access.py test_scheduler: @@ -659,8 +659,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py test_accuracy: @@ -683,8 +683,8 @@ jobs: docker run --rm \ -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ -e TORCHSIM_DUMP_PATH=/dump \ - -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ - -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} bash -c \ "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" diff --git a/.gitignore b/.gitignore index 9decced5..b42d5f6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ __pycache__/ TOGSim/build/ .vscode +*.txt +*.ipynb_checkpoints +output +togsim_results/* +outputs/* +experiments/artifact/logs/* \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 577c45e9..4d57b987 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -15,7 +15,7 @@ def hash_prefix(hash_value): return hash_value[1:12] def get_write_path(src_code): - return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip()))) + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip()))) def dump_metadata(args, arg_attributes, path): meta_path = os.path.join(path, "meta.txt") @@ -27,19 +27,6 @@ def dump_metadata(args, arg_attributes, path): file.write(f'{arg_name}=({arg_attribute[0]}, {arg.dtype}, {arg.shape})\n') return -def llvm_compile_command(input, output): - opt_output = f"{input[:-3]}_opt.ll" - return [re.sub(r"[ \n]+", " ", - f""" - {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/opt 
--load-pass-plugin={extension_config.CONFIG_TORCHSIM_CUSTOM_PASS_PATH}/libLowerGemminiPass.so -S -march=riscv64 --passes=LowerGemminiPass {input} -o {opt_output} - """, - ).strip(), - re.sub(r"[ \n]+", " ", - f""" - {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc -march=riscv64 -mattr=+m,+f,+d,+a,+c,+v -O2 {opt_output} -o {output} - """, - ).strip()] - def mlir_compile_command(filename, vectorlane_size, vlen=256): return [re.sub(r"[ \n]+", " ", f""" @@ -165,7 +152,7 @@ def load(cls, source_code, else: link_option = "" # Generate LLVM kernel calller and binary for validation - if extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: + if extension_config.pytorchsim_functional_mode: # Use custom malloc to avoid size error new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free" cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen) @@ -182,7 +169,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, arg_attributes) + val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes) val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) @@ -213,7 +200,7 @@ def load(cls, source_code, print("Error output:", e.output) assert(0) - if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: + if not extension_config.pytorchsim_timing_mode: return key # Generate MLIR kernel calller and binary for cycle calculation @@ -280,26 +267,26 @@ def dummy_simulator(*args, **kwargs): lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) with lock: # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not autotune and (extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE or validate): + if not autotune and (extension_config.pytorchsim_functional_mode or validate): funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, - cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS, silent_mode=silent_mode) - if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: + silent_mode=silent_mode) + if not extension_config.pytorchsim_timing_mode: return onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - backsim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) - backsim.vectorlane_size = vectorlane_size - attribute_path = backsim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path = backsim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) + TOGSim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) + TOGSim.vectorlane_size = vectorlane_size + attribute_path = TOGSim.create_attribute_file(attribute_path, args, loop_size=loop_size) + result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) result = 
TOGSimulator.get_result_from_file(result_path) return result @@ -310,23 +297,20 @@ def dryrun_simulator(*args, **kwargs): lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) with lock: # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not extension_config.CONFIG_TORCHSIM_TIMING_MODE: - return # Todo. Support valude dependent mode for graph mode - if False: # extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE: + if False: # extension_config.pytorchsim_functional_mode: funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, - vectorlane_size=vectorlane_size, spad_info=spad_info, - cleanup=extension_config.CONFIG_CLEANUP_DUMP_ARGS) + vectorlane_size=vectorlane_size, spad_info=spad_info) return result_path, runtime_path, None - is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) and not autotune + is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) and not autotune target_simulator = dryrun_simulator if is_dryrun else dummy_simulator target_simulator.arg_attributes = arg_attributes target_simulator.future = future diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 3d6fbb76..5a4b8937 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -1,126 +1,89 @@ import os import sys -import tempfile import importlib +import json + +CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") +CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") + +CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) +CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) def __getattr__(name): + # TOGSim config + config_path = os.environ.get('TOGSIM_CONFIG', + default=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + if name == "CONFIG_TOGSIM_CONFIG": + return config_path + config_json = json.load(open(config_path, 'r')) # Hardware info config - if name == "CONFIG_VECTOR_LANE": - return int(os.environ.get("TORCHSIM_VECTOR_LANE", default=128)) - if name == "CONFIG_VECTOR_LANE_STRIDE": - return int(os.environ.get("TORCHSIM_VECTOR_LANE_STRIDE", default=2)) + if name == "vpu_num_lanes": + return config_json["vpu_num_lanes"] if name == "CONFIG_SPAD_INFO": return { "spad_vaddr" : 0xD0000000, "spad_paddr" : 0x2000000000, - "spad_size" : int(os.environ.get("TORCHSIM_SPAD_SIZE", default=128)) << 10 # Note: spad size per lane + "spad_size" : config_json["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane } + if name == "CONFIG_PRECISION": - return 4 # 32bit + return 4 # 32bit if name == "CONFIG_NUM_CORES": - return 1 - if name == "CONFIG_VLEN": - return 256 # 256bits / 32bits = 8 [elements] - - # Tile size config - if name == "CONFIG_TORCHSIM_DIR": - return os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - - if name == "CONFIG_TORCHSIM_DUMP_PATH": - return os.environ.get('TORCHSIM_DUMP_PATH', default = 
f"{tempfile.gettempdir()}/torchinductor") - if name == "CONFIG_TORCHSIM_DUMP_FILE": - return int(os.environ.get('TORCHSIM_DUMP_FILE', default=True)) - if name == "CONFIG_TORCHSIM_FUNCTIONAL_MODE": - return int(os.environ.get('TORCHSIM_FUNCTIONAL_MODE', default=True)) - if name == "CONFIG_TORCHSIM_TIMING_MODE": - return int(os.environ.get("TORCHSIM_TIMING_MODE", True)) - if name == "CONFIG_CLEANUP_DUMP_ARGS": - return int(os.environ.get('CLEANUP_DUMP_ARGS', default=False)) - - # LLVM PATH - if name == "CONFIG_TORCHSIM_LLVM_PATH": - return os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") - if name == "CONFIG_TORCHSIM_CUSTOM_PASS_PATH": - return os.environ.get('TORCHSIM_CUSTOM_PASS_PATH', - default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/GemminiLowerPass/build") - if name == "CONFIG_TORCHSIM_DUMP_MLIR_IR": - return int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) - if name == "CONFIG_TORCHSIM_DUMP_LLVM_IR": - return int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) - - # TOGSim config - if name == "CONFIG_TOGSIM_CONFIG": - return os.environ.get('TORCHSIM_CONFIG', - default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") - if name == "CONFIG_TOGSIM_EAGER_MODE": - return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) - if name == "CONFIG_TOGSIM_DRYRUN": - return int(os.environ.get('TOGSIM_DRYRUN', default=False)) - if name == "CONFIG_TOGSIM_DEBUG_LEVEL": - return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + return config_json["num_cores"] + if name == "vpu_vector_length_bits": + return config_json["vpu_vector_length_bits"] + + if name == "pytorchsim_functional_mode": + return config_json['pytorchsim_functional_mode'] + if name == "pytorchsim_timing_mode": + return config_json['pytorchsim_timing_mode'] + + # Mapping strategy + if name == "codegen_mapping_strategy": + codegen_mapping_strategy = config_json["codegen_mapping_strategy"] + assert(codegen_mapping_strategy in ["heuristic", "autotune", "external-then-heuristic", "external-then-autotune"]), "Invalid mapping strategy!" 
+ return codegen_mapping_strategy + + if name == "codegen_external_mapping_file": + return config_json["codegen_external_mapping_file"] + + # Autotune config + if name == "codegen_autotune_max_retry": + return config_json["codegen_autotune_max_retry"] + if name == "codegen_autotune_template_topk": + return config_json["codegen_autotune_template_topk"] - # GEM5 config - if name == "CONFIG_GEM5_PATH": - return os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") - if name == "CONFIG_GEM5_SCRIPT_PATH": - return os.environ.get('GEM5_SCRIPT_PATH', - default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/gem5_script/script_systolic.py") - - # AUTOTUNE config - if name == "CONFIG_AUTOTUNE": - return int(os.environ.get('AUTOTUNE', default=False)) - if name == "CONFIG_AUTOTUNE_TEMPLATE": - return int(os.environ.get('AUTOTUNE_TEMPLATE', default=False)) - if name == "CONFIG_MAX_AUTOTUNE_TRY": - return int(os.environ.get('MAX_AUTOTUNE_TRY', default=10)) - if name == "CONFIG_AUTOTUNE_TEMPLATE_TOPK": - return int(os.environ.get('AUTOTUNE_TEMPLATE_TOPK', default=4)) - - # For block sparse - if name == "CONFIG_BLOCK_SPARSE": - return int(os.environ.get('BLOCK_SPARSE', default=0)) - - # For GEMM tile size - if name == "CONFIG_MANUAL_TILE_SIZE": - return int(os.environ.get('TORCHSIM_MANUAL_TILE_SIZE', default=False)) - if name == "CONFIG_TILE_M": - return int(os.getenv("TORCHSIM_TILE_M", __getattr__("CONFIG_VECTOR_LANE"))) - if name == "CONFIG_TILE_N": - return int(os.getenv("TORCHSIM_TILE_N", __getattr__("CONFIG_VECTOR_LANE"))) - if name == "CONFIG_TILE_K": - return int(os.getenv("TORCHSIM_TILE_K", __getattr__("CONFIG_VECTOR_LANE"))) - - if name == "CONFIG_SUBTILE": - return int(os.environ.get('TORCHSIM_SUBTILE', default=True)) - if name == "CONFIG_MANUAL_SUBTILE_SIZE": - return int(os.environ.get('TORCHSIM_MANUAL_SUBTILE_SIZE', default=False)) - if name == "CONFIG_SUBTILE_M": - return int(os.environ.get('TORCHSIM_SUBTILE_M', default=__getattr__("CONFIG_VECTOR_LANE"))) - if name == "CONFIG_SUBTILE_N": - return int(os.environ.get('TORCHSIM_SUBTILE_N', default=__getattr__("CONFIG_VECTOR_LANE"))) - if name == "CONFIG_SUBTILE_K": - return int(os.environ.get('TORCHSIM_SUBTILE_K', default=__getattr__("CONFIG_VECTOR_LANE"))) - - if name == "CONFIG_GEMM_CHEATSHEET_PATH": - return os.environ.get('TORCHSIM_GEMM_CHEATSHEET_PATH', - default=f"{__getattr__('CONFIG_TORCHSIM_DIR')}/validation/gemm_tpuv3_cheatsheet.json") # Compiler Optimization - if name == "CONFIG_COMPILER_OPTIMIZATION": - return os.environ.get('TORCHSIM_COMPILER_OPTIMIZATION', default="all") # options: all, none, custom + if name == "codegen_compiler_optimization": + return config_json["codegen_compiler_optimization"] + # Advanced fusion options if name == "CONFIG_FUSION": - return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "fusion" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + return True if (__getattr__("codegen_compiler_optimization") == "all" or "fusion" in __getattr__("codegen_compiler_optimization")) else False if name == "CONFIG_FUSION_REDUCTION_EPILOGUE": - return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "reduction_epliogue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + return True if (__getattr__("codegen_compiler_optimization") == "all" or "reduction_epliogue" in __getattr__("codegen_compiler_optimization")) else False if name == "CONFIG_FUSION_REDUCTION_REDUCTION": - return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or 
"reduction_reduction" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + return True if (__getattr__("codegen_compiler_optimization") == "all" or "reduction_reduction" in __getattr__("codegen_compiler_optimization")) else False if name == "CONFIG_FUSION_PROLOGUE": - return True if ((__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all") or ("prologue" in __getattr__("CONFIG_COMPILER_OPTIMIZATION"))) else False + return True if ((__getattr__("codegen_compiler_optimization") == "all") or ("prologue" in __getattr__("codegen_compiler_optimization"))) else False if name == "CONFIG_SINGLE_BATCH_CONV": - return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "single_batch_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + return True if (__getattr__("codegen_compiler_optimization") == "all" or "single_batch_conv" in __getattr__("codegen_compiler_optimization")) else False if name == "CONFIG_MULTI_TILE_CONV": - return True if (__getattr__("CONFIG_COMPILER_OPTIMIZATION") == "all" or "multi_tile_conv" in __getattr__("CONFIG_COMPILER_OPTIMIZATION")) else False + return True if (__getattr__("codegen_compiler_optimization") == "all" or "multi_tile_conv" in __getattr__("codegen_compiler_optimization")) else False + if name == "CONFIG_SUBTILE": + return True if (__getattr__("codegen_compiler_optimization") == "all" or "subtile" in __getattr__("codegen_compiler_optimization")) else False + + if name == "CONFIG_TOGSIM_DEBUG_LEVEL": + return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + if name == "CONFIG_TORCHSIM_DUMP_PATH": + return os.environ.get('TORCHSIM_DUMP_PATH', default = CONFIG_TORCHSIM_DIR) + if name == "CONFIG_TORCHSIM_LOG_PATH": + return os.environ.get('TORCHSIM_DUMP_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) + + if name == "CONFIG_TOGSIM_EAGER_MODE": + return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) # SRAM Buffer allocation plan def load_plan_from_module(module_path): diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 167544f2..786e7398 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -46,7 +46,7 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) + is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) if is_dryrun: return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" @@ -276,9 +276,9 @@ def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_single_c1_simple_noc.json' - backsim = TOGSimulator(togsim_path, stonne_config_path) - result_path = backsim.simulation(onnx_path) + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.json' + TOGSim = TOGSimulator(togsim_path, stonne_config_path) + result_path = TOGSim.simulation(onnx_path) TOGSimulator.get_result_from_file(result_path) # Load result data diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index e52d6cff..988408ea 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -21,7 +21,7 @@ def hash_prefix(hash_value): return hash_value[1:12] def get_write_path(src_code): - 
return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(src_code.strip()))) + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip()))) @dataclasses.dataclass class MLIRBenchmarkRequest(): @@ -58,7 +58,7 @@ def make_run_fn( # Check already cached result. write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "togsim_result/0") + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result/0") if os.path.exists(result_path): result = TOGSimulator.get_result_from_file(result_path) def cached_run_fn(*args, **kwargs): diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index c24260ce..6650f429 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -426,7 +426,14 @@ def exp(operand, *args, var_info=None, **kwargs): @staticmethod def exp2(operand, *args, var_info=None, **kwargs): - raise NotImplementedError() + # Hands-on part: implement exp2 using math.exp2 + # var_info = {operand: [tile_size, dtype]} + # Ex) var_info[operand] = [8, "f32"] + + ln2 = math.log(2) + coeff = ops.constant(ln2, "f32") + operand = ops.mul(operand, coeff) + return ops.exp(operand), var_info[operand] @staticmethod def erf(operand, *args, var_info=None, **kwargs): @@ -1572,7 +1579,8 @@ def make_choices(self, nodes, kernel_name): current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) search_space.add(current_tile_sz) - print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + if extension_config.CONFIG_DEBUG_MODE: + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) @@ -1614,7 +1622,8 @@ def make_choices(self, nodes, kernel_name): # Add this choice search_space.add(current_tile_sz) - print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + if extension_config.CONFIG_DEBUG_MODE: + print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) @@ -1625,7 +1634,7 @@ def make_choices(self, nodes, kernel_name): def autotune(self, *args): def get_cycle(choice): bench_runner = choice[0] - for n_try in range(extension_config.CONFIG_MAX_AUTOTUNE_TRY): # TODO: make simple + for n_try in range(extension_config.codegen_autotune_max_retry): # TODO: make simple try: out = bench_runner() return out[-1] @@ -1641,7 +1650,8 @@ def 
get_cycle(choice): max_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - self._log_autotune_result(choices[max_idx], results[max_idx]) + if extension_config.CONFIG_DEBUG_MODE: + self._log_autotune_result(choices[max_idx], results[max_idx]) optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] return optimal_src_code, loop_size @@ -1661,7 +1671,7 @@ def run_bench(self, nodes, kernel_name, src_code): "spad_info": self.spad_info, "vlen" : self.vlen, "arg_attributes" : arg_attributes, - "validate" : extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + "validate" : extension_config.pytorchsim_functional_mode, "autotune" : True, }, source_code=src_code, @@ -1680,7 +1690,7 @@ def _log_autotune_result(self, best_choice, best_cycle): def codegen_nodes(self, nodes, kernel_name): src_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) - if extension_config.CONFIG_AUTOTUNE and extension_config.CONFIG_TORCHSIM_TIMING_MODE: + if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode: optimal_src_code = self.autotune(nodes, kernel_name)[0] if optimal_src_code is not None: return optimal_src_code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index c655dde3..4d33eea4 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -567,11 +567,11 @@ def set_tile_info(self, tile_desc : MLIRMultiDimTile): class BaseMLIRHardwareInfo(): def __init__(self): # Default HW setting - self.vector_lane = extension_config.CONFIG_VECTOR_LANE + self.vector_lane = extension_config.vpu_num_lanes self.spad_info = extension_config.CONFIG_SPAD_INFO self.precision = extension_config.CONFIG_PRECISION self.num_cores = extension_config.CONFIG_NUM_CORES - self.vlen = extension_config.CONFIG_VLEN + self.vlen = extension_config.vpu_vector_length_bits class BaseMLIRKernel(common.Kernel, BaseMLIRHardwareInfo): newvar_prefix = "%" @@ -700,7 +700,7 @@ def extract_dividers(self, implicit_ops): def compute_tile_size(self, nodes, vars, reduction_vars): vlane_split_axis = len(vars) - 1 - vlane_stride = extension_config.CONFIG_VECTOR_LANE_STRIDE + vlane_stride = 2 # Set minimum vlane stride # Set initial tile size & vector lane mapping if self.kernel_group.tile_desc is None: diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 77826730..a1a9d935 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -93,7 +93,7 @@ def outer_func_render(self, kernel_name, input_args): OUTPUT=Y, PADDING_H=self.padding[0], PADDING_W=self.padding[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_FUNCTIONAL_MODE, + VALIDATION_MODE=extension_config.pytorchsim_functional_mode, TOGSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 6271b548..bbc63b45 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -297,31 +297,24 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): return X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read) def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): - # Check cheat sheet - cheatsheet_path = 
extension_config.CONFIG_GEMM_CHEATSHEET_PATH data = {} - if extension_config.CONFIG_GEMM_CHEATSHEET_PATH is not None: - path = Path(cheatsheet_path) - if path.is_file(): - with path.open("r") as f: - data = json.load(f) - - gemm_shape = f"{M}_{K}_{N}" - if extension_config.CONFIG_MANUAL_TILE_SIZE: + gemm_shape = f"{M}_{N}_{K}" + if "external" in extension_config.codegen_mapping_strategy: # case 1: use manual tile size - TILE_M = extension_config.CONFIG_TILE_M - TILE_N = extension_config.CONFIG_TILE_N - TILE_K = extension_config.CONFIG_TILE_K - tile_candidates = [[TILE_M, TILE_N, TILE_K]] - elif gemm_shape in data: - # case 2: cached tile size + path = Path(extension_config.codegen_external_mapping_file) + with path.open("r") as f: + data = json.load(f) + if gemm_shape in data: tile_info = data[gemm_shape] - TILE_M = tile_info["TILE_M"] - TILE_N = tile_info["TILE_N"] - TILE_K = tile_info["TILE_K"] - tile_candidates = [[TILE_M, TILE_N, TILE_K]] + if len(tile_info) == 3: + TILE_M, TILE_N, TILE_K = tile_info.values() + tile_candidates = [[TILE_M, TILE_N, TILE_K]] + elif len(tile_info) == 6: + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info.values() + full_tile_candidates = [[TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K]] + return full_tile_candidates else: - # case 3: use gemm_combination_mapping + # case 2: use heuristic mapping min_tile = (n_extra_node + n_prologue_node) == 0 tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) @@ -332,24 +325,18 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no full_tile_candidates = [] for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): - # Calculate Sub Tile Size for fine-grained DMA + # Case 1: calculate sub tile size for fine-grained DMA if extension_config.CONFIG_SUBTILE: - # Case 1: adjust selective fine-grained DMA (SFG-DMA) SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane if (TILE_M == M and TILE_N == N and TILE_N <= 512): SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane else: # Avoid Row Conflict of weights SUB_TILE_N = TILE_N SUB_TILE_K = TILE_K - # Case 2: use manual sub tile size (FG-DMA) - if extension_config.CONFIG_MANUAL_SUBTILE_SIZE: - SUB_TILE_M = extension_config.CONFIG_SUBTILE_M - SUB_TILE_N = extension_config.CONFIG_SUBTILE_N - SUB_TILE_K = extension_config.CONFIG_SUBTILE_K - # Case 3: None Subtile + # Case 2: None Subtile else: SUB_TILE_M = TILE_M SUB_TILE_N = TILE_N SUB_TILE_K = TILE_K - full_tile_candidates.append([TILE_M,TILE_N,TILE_K, SUB_TILE_M,SUB_TILE_N,SUB_TILE_K]) + full_tile_candidates.append([TILE_M,TILE_N,TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K]) return full_tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index af59d88f..ebf0c80e 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -110,7 +110,7 @@ def convolution( mlir_template = MLIRConvSingleBatchTemplate([x, weight, bias], layout, **kwargs) elif BATCH == 1 and stride[0] != 1 and extension_config.CONFIG_SINGLE_BATCH_CONV: mlir_template = MLIRConvSingleBatchStridedTemplate([x, weight, bias], layout, **kwargs) - elif I_C < extension_config.CONFIG_VECTOR_LANE // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. 
+ elif I_C < extension_config.vpu_num_lanes // 8 and extension_config.CONFIG_MULTI_TILE_CONV: # 8 is hard-coded for now. This should be changed to a better heuristic. mlir_template = MLIRConvMultiTileTemplate([x, weight, bias], layout, **kwargs) else: mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 38603319..23be941c 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -257,7 +257,7 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size codecache_def.writeline(f"spad_info={spad_info},") codecache_def.writeline(f"origins={origins},") codecache_def.writeline("arg_attributes=arg_attributes,") - codecache_def.writeline(f"vlen={extension_config.CONFIG_VLEN})") + codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})") wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) return kernel_name diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index df3621eb..e493464a 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -491,7 +491,9 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): choices = [] for tile_info in tile_candidates: - print(f"[Auto-tune] Trying tile size: {list(tile_info)}") + if extension_config.CONFIG_DEBUG_MODE: + # Compute Tile M, N, K DMA Tile M, N, K + print(f"[Auto-tune] Trying tile size: {list(tile_info)}") src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) choices.append((bench_runner, src_code, tile_info, self.loop_size)) @@ -506,7 +508,7 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): - if extension_config.CONFIG_AUTOTUNE_TEMPLATE and len(tile_candidates): + if "autotune" in extension_config.codegen_mapping_strategy and len(tile_candidates): src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) self.loop_size = loop_size else: @@ -1230,7 +1232,7 @@ def make_kernel_render( template=self, kwargs=kwargs ) - tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.CONFIG_AUTOTUNE_TEMPLATE_TOPK] + tile_candidates = self.get_tile_candidates(**kwargs)[:extension_config.codegen_autotune_template_topk] return kernel, tile_candidates, render return MLIRTemplateCaller( diff --git a/README.md b/README.md index dbfdf2e8..1b6b744c 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7 [Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128 [Gem5Simulator] Simulation is still running... 
[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw -[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 +[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0 [TOGSimulator] Simulation is still running.. [TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0" ---------------------------- @@ -147,7 +147,7 @@ Simulation consists of three steps If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below. ```bash -export TORCHSIM_FUNCTIONAL_MODE=False +export pytorchsim_functional_mode=False ``` Log contains memory & core stats. ```bash @@ -195,7 +195,7 @@ import torch from torchvision.models import resnet18 from test_transformer import EncoderBlock base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request @@ -329,12 +329,10 @@ Last but not least, you must set `l2d_type` and `l2d_config` in the [TOGSim conf You can configure these options using environment variables. ```bash -export TORCHSIM_VECTOR_LANE=128 # vector lane size -export TORCHSIM_VECTOR_LANE_STRIDE=2 # vector lane stride for DMA +export vpu_num_lanes=128 # vector lane size +export vpu_num_lanes_STRIDE=2 # vector lane stride for DMA export TORCHSIM_DIR=/workspace/PyTorchSim # home directory -export BLOCK_SPARSE=0 # If you want to use block sparse workload, turn it on - # Plan which tensor allocated in TPUv4's CMEM export SRAM_BUFFER_PLAN_PATH=/workspace/PyTorchSim/tpuv4/gemm_plan.py @@ -344,7 +342,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ## TOGSim Configuration ![NPU_Core](./docs/npu_core.jpg) -`TOGSim/configs` directory contains example NPU configuration files in the JSON format. +`configs` directory contains example NPU configuration files in the JSON format. ``` "num_cores" : 2, // Number of NPU cores "core_freq_mhz" : 940, // Core's frequency (MHz) @@ -377,7 +375,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. 
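On the Python side, the lookup added in `extension_config.py` resolves the same path; a minimal sketch, with the defaults taken from this diff:

```python
import os

# Mirrors PyTorchSimFrontend/extension_config.py after this change: the
# environment variable overrides the default config shipped under configs/.
TORCHSIM_DIR = os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")
config_path = os.environ.get(
    "TOGSIM_CONFIG",
    f"{TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json",
)
print(config_path)
```

Or export it from the shell: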
```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 0b633fa9..ffe8e4fc 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -159,7 +159,6 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.tog_simulator = tog_simulator # Dry run for compile and create generator - os.environ["TOGSIM_DRYRUN"] = "1" os.environ["TOGSIM_EAGER_MODE"] = "1" @staticmethod @@ -222,7 +221,7 @@ def is_all_idle(self): return all([self.is_partition_idle(i) for i in range(self.num_partion)]) def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "togsim_result", req_model.model_name) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_LOG_PATH, "togsim_result", req_model.model_name) os.makedirs(result_path, exist_ok=True) index = str(len(os.listdir(result_path))) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index c586c2fd..322d9b12 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -6,6 +6,7 @@ import sys import json import time +import datetime import threading from pathlib import Path @@ -161,10 +162,11 @@ def show_progress(): print("") dir_path = os.path.join(os.path.dirname(target_binary), "m5out") - gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, extension_config.CONFIG_GEM5_SCRIPT_PATH, "-c", target_binary, "--vlane", str(vectorlane_size)] + gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py") + gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, gem5_script_path, "-c", target_binary, "--vlane", str(vectorlane_size)] try: # Create progress thread - is_dryrun = int(os.environ.get('TOGSIM_DRYRUN', default=False)) or silent_mode + is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode if not is_dryrun: if extension_config.CONFIG_DEBUG_MODE: print("[Gem5] cmd> ", " ".join(gem5_cmd)) @@ -242,13 +244,15 @@ def show_progress(): print("[TOGSim] Error output:", e.output) assert 0 # Save result to result_path - result_path = os.path.join(os.path.dirname(model_path), "togsim_result") + result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH os.makedirs(result_path, exist_ok=True) - file_name = str(len(os.listdir(result_path))) + file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+".log" result_path = os.path.join(result_path, file_name) with open(result_path, "w") as f: f.write(result.decode()) - print(f'[TOGSim] Simulation of "{model_path}" is stored to "{result_path}"') + if not silent_mode or extension_config.CONFIG_DEBUG_MODE: + model_path_log = f' of "{model_path}" ' if extension_config.CONFIG_DEBUG_MODE else " " + print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') return result_path def interactive_simulation(self): @@ -286,7 +290,7 @@ def wait(self): def send_command(self, command): if self.process: try: - if not extension_config.CONFIG_TOGSIM_DRYRUN: + if extension_config.CONFIG_TORCHSIM_DEBUG_MODE: print(command, flush=True) self.process.stdin.write(command + '\n') self.process.stdin.flush() @@ -398,6 +402,7 @@ def 
get_result_from_file(result_path): dram_channel_bw = {} avg_dram_bw = None simulation_time = None + total_cycle = None # Read and find total stat position with open(result_path, "r") as f: @@ -452,6 +457,6 @@ def get_result_from_file(result_path): return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 1257891c..00000000 --- a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 700, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycless": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 16 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 2207f2b9..00000000 --- a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :700, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 7, - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 16 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json deleted file mode 100644 index 045407b7..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json deleted file mode 100644 index d8f95d70..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 2, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : 
"../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json deleted file mode 100644 index a5fa9585..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 4, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json b/TOGSim/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index cf560171..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycless": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 8da61d72..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json deleted file mode 100644 index c5f429f9..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "core_type" : ["ws_mesh","ws_mesh"], - "num_cores" : 2, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycless": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json deleted file mode 100644 index 254520be..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 2, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 
100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json b/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json deleted file mode 100644 index e39867a7..00000000 --- a/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 100000, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 4, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycless": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency" : 1, - "icnt_freq_mhz" : 1000 -} \ No newline at end of file diff --git a/TOGSim/include/Interconnect.h b/TOGSim/include/Interconnect.h index e6b325d0..3f0aaff7 100644 --- a/TOGSim/include/Interconnect.h +++ b/TOGSim/include/Interconnect.h @@ -3,6 +3,8 @@ #include "DMA.h" #include "booksim2/Interconnect.hpp" #include +#include +#include #include namespace fs = std::filesystem; @@ -69,6 +71,7 @@ class Booksim2Interconnect : public Interconnect { virtual mem_fetch* top(uint32_t nid) override; virtual void pop(uint32_t nid) override; virtual void print_stats() override; + void print_config(std::string config_path); private: uint32_t _ctrl_size; diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h index 4d9defd1..39fa310e 100644 --- a/TOGSim/include/Simulator.h +++ b/TOGSim/include/Simulator.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "Common.h" #include "Core.h" #include "SparseCore.h" diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index b5c092b3..9a6b7798 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -16,6 +16,8 @@ T get_config_value(json config, std::string key) { SimulationConfig initialize_config(json config) { SimulationConfig parsed_config; + // print json + spdlog::info("TOGSim Config: {}", config.dump(2)); /* Core configs */ parsed_config.num_cores = config["num_cores"]; @@ -111,8 +113,8 @@ SimulationConfig initialize_config(json config) { throw std::runtime_error(fmt::format("Not implemented icnt type {} ", (std::string)config["icnt_type"])); parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; - if (config.contains("icnt_latency")) - parsed_config.icnt_latency = config["icnt_latency"]; + if (config.contains("icnt_latency_cycles")) + parsed_config.icnt_latency = config["icnt_latency_cycles"]; if (config.contains("booksim_config_path")) parsed_config.icnt_config_path = config["booksim_config_path"]; if (config.contains("icnt_stats_print_period_cycles")) diff --git a/TOGSim/src/Interconnect.cc b/TOGSim/src/Interconnect.cc index ab2d5d89..096efe3d 100644 --- a/TOGSim/src/Interconnect.cc +++ b/TOGSim/src/Interconnect.cc @@ -83,11 +83,22 @@ Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) { std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string(); - spdlog::info("Config path : {}", _config_path); + spdlog::info("Booksim 2 config path : {}", _config_path); + print_config(_config_path); _booksim = std::make_unique(_config_path, _n_nodes); _ctrl_size = 8; } +void Booksim2Interconnect::print_config(std::string config_path) { + std::ifstream config_file(config_path); + 
std::string line; + spdlog::info("Booksim2 Configuration: "); + while (std::getline(config_file, line)) { + std::cout << line << std::endl; + } + config_file.close(); +} + bool Booksim2Interconnect::running() { return false; } diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index 41a2c7a5..857923c5 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -17,7 +17,7 @@ Simulator::Simulator(SimulationConfig config) _noc_node_per_core = config.icnt_injection_ports_per_core; char* onnxim_path_env = std::getenv("TORCHSIM_DIR"); std::string onnxim_path = onnxim_path_env != NULL? - std::string(onnxim_path_env) + "/TOGSim" : std::string("./"); + std::string(onnxim_path_env): std::string("./"); // Create core objects _cores.resize(_n_cores); @@ -42,7 +42,10 @@ Simulator::Simulator(SimulationConfig config) .append("configs") .append(config.dram_config_path) .string(); - spdlog::info("[Config/DRAM] Ramulator2 config: {}", ramulator_config); + spdlog::info("[Config/DRAM] Ramulator2 config path: {}", ramulator_config); + YAML::Node dram_config = YAML::LoadFile(ramulator_config); + spdlog::info("Ramulator2 config: "); + std::cout << dram_config << std::endl; config.dram_config_path = ramulator_config; _dram = std::make_unique(config, &_core_cycles); } else { diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 1af11257..77c1bae7 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -9,7 +9,7 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -const char* env_value = std::getenv("TOGSIM_DRYRUN"); +const char* env_value = std::getenv("TOGSIM_EAGER_MODE"); bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { diff --git a/TOGSim/configs/booksim2_configs/anynet.icnt b/configs/booksim2_configs/anynet.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/anynet.icnt rename to configs/booksim2_configs/anynet.icnt diff --git a/TOGSim/configs/booksim2_configs/anynet_file b/configs/booksim2_configs/anynet_file similarity index 100% rename from TOGSim/configs/booksim2_configs/anynet_file rename to configs/booksim2_configs/anynet_file diff --git a/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt b/configs/booksim2_configs/chiplet_32_32_2.icnt similarity index 77% rename from TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt rename to configs/booksim2_configs/chiplet_32_32_2.icnt index 3102fecc..d677c3ed 100644 --- a/TOGSim/configs/booksim2_configs/chiplet_32_32_2.icnt +++ b/configs/booksim2_configs/chiplet_32_32_2.icnt @@ -2,7 +2,7 @@ use_map = 0 flit_size = 32 topology = anynet -network_file = /workspace/PyTorchSim/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net +network_file = /workspace/PyTorchSim/configs/booksim2_configs/chiplet_32_32_2.net routing_function = min subnets = 1 routing_delay = 4 diff --git a/TOGSim/configs/booksim2_configs/chiplet_32_32_2.net b/configs/booksim2_configs/chiplet_32_32_2.net similarity index 100% rename from TOGSim/configs/booksim2_configs/chiplet_32_32_2.net rename to configs/booksim2_configs/chiplet_32_32_2.net diff --git a/TOGSim/configs/booksim2_configs/fly_c16_m16.icnt b/configs/booksim2_configs/fly_c16_m16.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c16_m16.icnt rename to configs/booksim2_configs/fly_c16_m16.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c16_m32.icnt 
b/configs/booksim2_configs/fly_c16_m32.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c16_m32.icnt rename to configs/booksim2_configs/fly_c16_m32.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c16_m8.icnt b/configs/booksim2_configs/fly_c16_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c16_m8.icnt rename to configs/booksim2_configs/fly_c16_m8.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c1_m1.icnt b/configs/booksim2_configs/fly_c1_m1.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c1_m1.icnt rename to configs/booksim2_configs/fly_c1_m1.icnt diff --git a/configs/booksim2_configs/fly_c1_m16.icnt b/configs/booksim2_configs/fly_c1_m16.icnt new file mode 100644 index 00000000..48df18d1 --- /dev/null +++ b/configs/booksim2_configs/fly_c1_m16.icnt @@ -0,0 +1,18 @@ +[config] +use_map = 0 +flit_size = 32 +topology = fly +k = 17 +n = 1 +routing_function = dest_tag +subnets = 1 + +vc_buf_size = 256 +input_buffer_size = 256 +ejection_buffer_size = 256 +boundary_buffer_size = 256 +wait_for_tail_credit = 0 +vc_allocator = islip +sw_allocator = islip +alloc_iters = 1 +deadlock_warn_timeout = 10000 \ No newline at end of file diff --git a/TOGSim/configs/booksim2_configs/fly_c1_m2.icnt b/configs/booksim2_configs/fly_c1_m2.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c1_m2.icnt rename to configs/booksim2_configs/fly_c1_m2.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c1_m8.icnt b/configs/booksim2_configs/fly_c1_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c1_m8.icnt rename to configs/booksim2_configs/fly_c1_m8.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c2_m32.icnt b/configs/booksim2_configs/fly_c2_m32.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c2_m32.icnt rename to configs/booksim2_configs/fly_c2_m32.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c2_m8.icnt b/configs/booksim2_configs/fly_c2_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c2_m8.icnt rename to configs/booksim2_configs/fly_c2_m8.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c32_m32.icnt b/configs/booksim2_configs/fly_c32_m32.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c32_m32.icnt rename to configs/booksim2_configs/fly_c32_m32.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c32_m4.icnt b/configs/booksim2_configs/fly_c32_m4.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c32_m4.icnt rename to configs/booksim2_configs/fly_c32_m4.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c32_m8.icnt b/configs/booksim2_configs/fly_c32_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c32_m8.icnt rename to configs/booksim2_configs/fly_c32_m8.icnt diff --git a/configs/booksim2_configs/fly_c4_m16.icnt b/configs/booksim2_configs/fly_c4_m16.icnt new file mode 100644 index 00000000..559855a3 --- /dev/null +++ b/configs/booksim2_configs/fly_c4_m16.icnt @@ -0,0 +1,17 @@ +[config] +use_map = 0 +flit_size = 64 +topology = fly +k = 20 +n = 1 +routing_function = dest_tag +subnets = 1 + +vc_buf_size = 64 +input_buffer_size = 256 +ejection_buffer_size = 64 +boundary_buffer_size = 64 +wait_for_tail_credit = 0 +vc_allocator = islip +sw_allocator = islip +alloc_iters = 1 diff --git a/TOGSim/configs/booksim2_configs/fly_c4_m2.icnt b/configs/booksim2_configs/fly_c4_m2.icnt similarity index 100% rename from 
TOGSim/configs/booksim2_configs/fly_c4_m2.icnt rename to configs/booksim2_configs/fly_c4_m2.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c4_m32.icnt b/configs/booksim2_configs/fly_c4_m32.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c4_m32.icnt rename to configs/booksim2_configs/fly_c4_m32.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c4_m8.icnt b/configs/booksim2_configs/fly_c4_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c4_m8.icnt rename to configs/booksim2_configs/fly_c4_m8.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c64_m8.icnt b/configs/booksim2_configs/fly_c64_m8.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c64_m8.icnt rename to configs/booksim2_configs/fly_c64_m8.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt b/configs/booksim2_configs/fly_c64_m8_sif-age.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c64_m8_sif-age.icnt rename to configs/booksim2_configs/fly_c64_m8_sif-age.icnt diff --git a/TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt b/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/fly_c64_m8_sif-rr.icnt rename to configs/booksim2_configs/fly_c64_m8_sif-rr.icnt diff --git a/configs/booksim2_configs/fly_c8_m16.icnt b/configs/booksim2_configs/fly_c8_m16.icnt new file mode 100644 index 00000000..3061d718 --- /dev/null +++ b/configs/booksim2_configs/fly_c8_m16.icnt @@ -0,0 +1,17 @@ +[config] +use_map = 0 +flit_size = 64 +topology = fly +k = 24 +n = 1 +routing_function = dest_tag +subnets = 1 + +vc_buf_size = 64 +input_buffer_size = 256 +ejection_buffer_size = 64 +boundary_buffer_size = 64 +wait_for_tail_credit = 0 +vc_allocator = islip +sw_allocator = islip +alloc_iters = 1 diff --git a/TOGSim/configs/booksim2_configs/make_anynet_topology.py b/configs/booksim2_configs/make_anynet_topology.py similarity index 100% rename from TOGSim/configs/booksim2_configs/make_anynet_topology.py rename to configs/booksim2_configs/make_anynet_topology.py diff --git a/TOGSim/configs/booksim2_configs/mesh_sif-age.icnt b/configs/booksim2_configs/mesh_sif-age.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/mesh_sif-age.icnt rename to configs/booksim2_configs/mesh_sif-age.icnt diff --git a/TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt b/configs/booksim2_configs/mesh_sif-rr.icnt similarity index 100% rename from TOGSim/configs/booksim2_configs/mesh_sif-rr.icnt rename to configs/booksim2_configs/mesh_sif-rr.icnt diff --git a/TOGSim/configs/heterogeneous_c2_simple_noc.json b/configs/heterogeneous_c2_simple_noc.json similarity index 69% rename from TOGSim/configs/heterogeneous_c2_simple_noc.json rename to configs/heterogeneous_c2_simple_noc.json index 60f160a8..a68f38c2 100644 --- a/TOGSim/configs/heterogeneous_c2_simple_noc.json +++ b/configs/heterogeneous_c2_simple_noc.json @@ -4,10 +4,15 @@ "num_cores" : 2, "core_freq_mhz" : 940, "core_stats_print_period_cycles" : 10000, + "num_stonne_per_core" : 8, "num_stonne_port" : 64, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 16, @@ -17,7 +22,7 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 940, 
"icnt_injection_ports_per_core" : 16, @@ -25,5 +30,11 @@ "partition": { "core_0":0, "core_1":1 - } + }, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml similarity index 100% rename from TOGSim/configs/ramulator2_configs/DDR4.yaml rename to configs/ramulator2_configs/DDR4.yaml diff --git a/TOGSim/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml similarity index 100% rename from TOGSim/configs/ramulator2_configs/HBM2.yaml rename to configs/ramulator2_configs/HBM2.yaml diff --git a/TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml similarity index 100% rename from TOGSim/configs/ramulator2_configs/HBM2_TPUv3.yaml rename to configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/TOGSim/configs/ramulator_configs/ALDRAM-config.cfg b/configs/ramulator_configs/ALDRAM-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/ALDRAM-config.cfg rename to configs/ramulator_configs/ALDRAM-config.cfg diff --git a/TOGSim/configs/ramulator_configs/DDR3-config.cfg b/configs/ramulator_configs/DDR3-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/DDR3-config.cfg rename to configs/ramulator_configs/DDR3-config.cfg diff --git a/TOGSim/configs/ramulator_configs/DDR4-config.cfg b/configs/ramulator_configs/DDR4-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/DDR4-config.cfg rename to configs/ramulator_configs/DDR4-config.cfg diff --git a/TOGSim/configs/ramulator_configs/DSARP-config.cfg b/configs/ramulator_configs/DSARP-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/DSARP-config.cfg rename to configs/ramulator_configs/DSARP-config.cfg diff --git a/TOGSim/configs/ramulator_configs/GDDR5-config.cfg b/configs/ramulator_configs/GDDR5-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/GDDR5-config.cfg rename to configs/ramulator_configs/GDDR5-config.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config.cfg b/configs/ramulator_configs/HBM-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config.cfg rename to configs/ramulator_configs/HBM-config.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg rename to configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg b/configs/ramulator_configs/HBM-config_FCFS.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_FCFS.cfg rename to configs/ramulator_configs/HBM-config_FCFS.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/configs/ramulator_configs/HBM-config_FRFCFS.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_FRFCFS.cfg rename to configs/ramulator_configs/HBM-config_FRFCFS.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg rename to 
configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg rename to configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg rename to configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg diff --git a/TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg rename to configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg diff --git a/TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg b/configs/ramulator_configs/HBMx0.5ch-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBMx0.5ch-config.cfg rename to configs/ramulator_configs/HBMx0.5ch-config.cfg diff --git a/TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg b/configs/ramulator_configs/HBMx2ch-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/HBMx2ch-config.cfg rename to configs/ramulator_configs/HBMx2ch-config.cfg diff --git a/TOGSim/configs/ramulator_configs/LPDDR3-config.cfg b/configs/ramulator_configs/LPDDR3-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/LPDDR3-config.cfg rename to configs/ramulator_configs/LPDDR3-config.cfg diff --git a/TOGSim/configs/ramulator_configs/LPDDR4-config.cfg b/configs/ramulator_configs/LPDDR4-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/LPDDR4-config.cfg rename to configs/ramulator_configs/LPDDR4-config.cfg diff --git a/TOGSim/configs/ramulator_configs/PCM-config.cfg b/configs/ramulator_configs/PCM-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/PCM-config.cfg rename to configs/ramulator_configs/PCM-config.cfg diff --git a/TOGSim/configs/ramulator_configs/SALP-config.cfg b/configs/ramulator_configs/SALP-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/SALP-config.cfg rename to configs/ramulator_configs/SALP-config.cfg diff --git a/TOGSim/configs/ramulator_configs/STTMRAM-config.cfg b/configs/ramulator_configs/STTMRAM-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/STTMRAM-config.cfg rename to configs/ramulator_configs/STTMRAM-config.cfg diff --git a/TOGSim/configs/ramulator_configs/TLDRAM-config.cfg b/configs/ramulator_configs/TLDRAM-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/TLDRAM-config.cfg rename to configs/ramulator_configs/TLDRAM-config.cfg diff --git a/TOGSim/configs/ramulator_configs/WideIO-config.cfg b/configs/ramulator_configs/WideIO-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/WideIO-config.cfg rename to configs/ramulator_configs/WideIO-config.cfg diff --git a/TOGSim/configs/ramulator_configs/WideIO2-config.cfg b/configs/ramulator_configs/WideIO2-config.cfg similarity index 100% rename from TOGSim/configs/ramulator_configs/WideIO2-config.cfg rename to configs/ramulator_configs/WideIO2-config.cfg diff --git a/TOGSim/configs/stonne_big_c1_simple_noc.json b/configs/stonne_big_c1_simple_noc.json similarity index 95% rename from 
TOGSim/configs/stonne_big_c1_simple_noc.json rename to configs/stonne_big_c1_simple_noc.json index 5d563fbe..0a8ca3c2 100644 --- a/TOGSim/configs/stonne_big_c1_simple_noc.json +++ b/configs/stonne_big_c1_simple_noc.json @@ -16,7 +16,7 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 940, "icnt_injection_ports_per_core" : 16 } \ No newline at end of file diff --git a/TOGSim/configs/stonne_single_c1_simple_noc.json b/configs/stonne_single_c1_simple_noc.json similarity index 95% rename from TOGSim/configs/stonne_single_c1_simple_noc.json rename to configs/stonne_single_c1_simple_noc.json index 304e84b3..3421d4f1 100644 --- a/TOGSim/configs/stonne_single_c1_simple_noc.json +++ b/configs/stonne_single_c1_simple_noc.json @@ -16,7 +16,7 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 700, "icnt_injection_ports_per_core" : 8 } \ No newline at end of file diff --git a/TOGSim/configs/stonne_validation_c1_simple_noc.json b/configs/stonne_validation_c1_simple_noc.json similarity index 95% rename from TOGSim/configs/stonne_validation_c1_simple_noc.json rename to configs/stonne_validation_c1_simple_noc.json index 38d4244c..fb196dfb 100644 --- a/TOGSim/configs/stonne_validation_c1_simple_noc.json +++ b/configs/stonne_validation_c1_simple_noc.json @@ -17,7 +17,7 @@ "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 1000, "icnt_injection_ports_per_core" : 8 } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/configs/systolic_ws_128x128_c1_booksim_tpuv2.json similarity index 61% rename from TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json rename to configs/systolic_ws_128x128_c1_booksim_tpuv2.json index 58519aad..686827dc 100644 --- a/TOGSim/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.json @@ -3,6 +3,10 @@ "core_freq_mhz" : 700, "core_stats_print_period_cycles" : 10000, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" :700, "dram_channels": 16, @@ -15,5 +19,11 @@ "icnt_type" : "booksim2", "icnt_freq_mhz" : 700, "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt" + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.json b/configs/systolic_ws_128x128_c1_booksim_tpuv3.json new file mode 100644 index 00000000..1109dc0f --- /dev/null +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.json @@ -0,0 +1,32 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + 
"ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json new file mode 100644 index 00000000..22aedcf8 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json @@ -0,0 +1,31 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycless": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency_cycles" : 10, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json new file mode 100644 index 00000000..e8e489d9 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json @@ -0,0 +1,32 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 940, + "core_stats_print_period_cycles" : 10000, + "num_systolic_array_per_core" : 2, + + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" : 940, + "dram_channels": 16, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + + "icnt_type" : "simple", + "icnt_latency_cycles" : 10, + "icnt_freq_mhz" : 940, + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "heuristic", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json similarity index 50% rename from TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json rename to configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json index 34896fc7..980bfc73 100644 --- a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 8, @@ 
-13,7 +17,16 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16 + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json similarity index 54% rename from TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json rename to configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json index 59be9fd4..02bfd75c 100644 --- a/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 4, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" :1200, "dram_channels": 16, @@ -15,7 +19,16 @@ "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 1050, - "icnt_injection_ports_per_core" : 16 + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3.json similarity index 59% rename from TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json rename to configs/systolic_ws_128x128_c2_booksim_tpuv3.json index 271e7e1c..66566324 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 32, @@ -15,5 +19,14 @@ "icnt_type" : "booksim2", "icnt_freq_mhz" : 940, "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt" + "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt", + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } diff --git a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json similarity index 62% rename from TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json rename to configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json index 7382c4c8..8ef47e87 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json @@ -5,6 +5,10 
@@ "core_print_interval" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq" : 940, "dram_channels": 8, @@ -15,7 +19,7 @@ "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "booksim2", - "icnt_latency" : 1, + "icnt_latency_cycles" : 10, "icnt_freq" : 940, "icnt_injection_ports_per_core" : 16, "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", @@ -26,5 +30,14 @@ "partition": { "core_0":0, "core_1":0 - } + }, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json similarity index 58% rename from TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json rename to configs/systolic_ws_128x128_c2_chiplet_tpuv3.json index 6561ffc0..ecd671bf 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 32, @@ -17,5 +21,14 @@ "icnt_freq_mhz" : 1000, "icnt_injection_ports_per_core" : 16, "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_stats_print_period_cycles" : 10000 + "icnt_stats_print_period_cycles" : 10000, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json similarity index 60% rename from TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json rename to configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json index fad63cc3..168fbe3a 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 32, @@ -16,5 +20,14 @@ "icnt_type" : "booksim2", "icnt_freq_mhz" : 1000, "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt" + "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json new file mode 100644 
index 00000000..0a5f15b2 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json @@ -0,0 +1,31 @@ +{ + "num_cores" : 2, + "core_freq_mhz" : 700, + "core_stats_print_period_cycles" : 10000, + + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :700, + "dram_channels": 32, + "dram_req_size_byte": 32, + "dram_num_burst_length" : 2, + "dram_stats_print_period_cycles": 10000, + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", + + "icnt_type" : "simple", + "icnt_latency_cycles" : 10, + "icnt_freq_mhz" : 700, + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "heuristic", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json similarity index 50% rename from TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json rename to configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json index 76f51b40..f099b93d 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 32, @@ -13,7 +17,16 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16 + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "heuristic", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json similarity index 57% rename from TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json rename to configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json index 42e003c7..681ef884 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 2, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" : 940, "dram_channels": 32, @@ -13,7 +17,7 @@ "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 940, "icnt_injection_ports_per_core" : 16, @@ -21,5 +25,14 @@ "partition": { "core_0":0, "core_1":1 - } + }, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, 
+ "codegen_compiler_optimization" : "all" } diff --git a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json similarity index 54% rename from TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json rename to configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json index 44ec72fe..d09228a1 100644 --- a/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json @@ -4,6 +4,10 @@ "core_stats_print_period_cycles" : 10000, "num_systolic_array_per_core" : 4, + "vpu_num_lanes" : 128, + "vpu_spad_size_kb_per_lane" : 128, + "vpu_vector_length_bits" : 256, + "dram_type" : "ramulator2", "dram_freq_mhz" :1200, "dram_channels": 32, @@ -15,7 +19,16 @@ "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", "icnt_type" : "simple", - "icnt_latency" : 7, + "icnt_latency_cycles" : 10, "icnt_freq_mhz" : 1050, - "icnt_injection_ports_per_core" : 16 + "icnt_injection_ports_per_core" : 16, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" } \ No newline at end of file diff --git a/configs/systolic_ws_8x8_c1_booksim.json b/configs/systolic_ws_8x8_c1_booksim.json new file mode 100644 index 00000000..851664e6 --- /dev/null +++ b/configs/systolic_ws_8x8_c1_booksim.json @@ -0,0 +1,29 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 800, + "core_stats_print_period_cycles" : 100000, + + "vpu_num_lanes" : 8, + "vpu_spad_size_kb_per_lane" : 32, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "booksim2", + "icnt_freq_mhz" : 800, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} \ No newline at end of file diff --git a/configs/systolic_ws_8x8_c1_simple_noc.json b/configs/systolic_ws_8x8_c1_simple_noc.json new file mode 100644 index 00000000..2eb7e183 --- /dev/null +++ b/configs/systolic_ws_8x8_c1_simple_noc.json @@ -0,0 +1,30 @@ +{ + "num_cores" : 1, + "core_freq_mhz" : 800, + "core_stats_print_period_cycles" : 100000, + + "vpu_num_lanes" : 8, + "vpu_spad_size_kb_per_lane" : 32, + "vpu_vector_length_bits" : 256, + + "dram_type" : "ramulator2", + "dram_freq_mhz" :800, + "dram_channels": 1, + "dram_req_size_byte": 64, + "dram_num_burst_length" : 4, + "dram_stats_print_period_cycles": 100000, + "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", + + "icnt_type" : "simple", + "icnt_latency_cycles" : 10, + "icnt_freq_mhz" : 800, + + "pytorchsim_functional_mode" : 1, + "pytorchsim_timing_mode" : 1, + + "codegen_mapping_strategy" : "autotune", + "codegen_external_mapping_file" : "", + "codegen_autotune_max_retry": 10, + "codegen_autotune_template_topk": 4, + "codegen_compiler_optimization" : "all" +} \ No newline at end of file diff --git a/experiments/BERT.py b/experiments/BERT.py index c5bb454e..3311682c 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -36,7 +36,7 @@ def run_BERT(size, 
input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directory name sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def run_BERT(size, input_seq, config): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_BERT(size, input_seq, config) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index 28e6ad5e..99eed4ed 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 2b9625e9..9a19e9af 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,6 +1,6 @@ #!/bin/bash LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs -CONFIG_DIR="$TORCHSIM_DIR/TOGSim/configs" +CONFIG_DIR="$TORCHSIM_DIR/configs" SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 4055b355..fe872e02 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -26,7 +26,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/TOGSim/configs/$i" + config_path="$TORCHSIM_DIR/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 83b3798a..19613a34 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -27,7 +27,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/TOGSim/configs/$i" + config_path="$TORCHSIM_DIR/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index f1467614..6f3385f1 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -25,7 +25,7 @@ for i in "${config[@]}"; do echo "===== config=$i 
| model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/TOGSim/configs/$i" + config_path="$TORCHSIM_DIR/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index 2ed3ca2a..ca4cfa39 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do echo "===== config=$i | model=$ops =====" >> "$output_file" sum=0.0 count=0 - config_path="$TORCHSIM_DIR/TOGSim/configs/$i" + config_path="$TORCHSIM_DIR/configs/$i" for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" diff --git a/experiments/attention.py b/experiments/attention.py index 5a8c5f45..bbd2734e 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -50,7 +50,7 @@ def attention(query, key, value): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_attention(size, config) diff --git a/experiments/conv.py b/experiments/conv.py index c8ca9a37..f439c5e3 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -51,7 +51,7 @@ def custom_conv2d(a, b, bias): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file diff --git a/experiments/gemm.py b/experiments/gemm.py index 67dc4f79..e92200d1 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = 
os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -45,8 +45,8 @@ def custom_matmul(a, b): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() diff --git a/experiments/layernorm.py b/experiments/layernorm.py index 0beaac6c..74b6d286 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -42,7 +42,7 @@ def run_layernorm(size, config): os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_layernorm(size, config) diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 23d62e40..45311d59 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_resnet(batch, config) diff --git a/experiments/resnet50.py b/experiments/resnet50.py index 60a46071..4f03ea15 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path 
sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -43,7 +43,7 @@ def run_resnet(batch, config): os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_resnet(batch, config) diff --git a/experiments/softmax.py b/experiments/softmax.py index 532ef091..b47bd685 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() @@ -41,7 +41,7 @@ def run_softmax(size, config, dim=1): os.environ['TORCHSIM_DUMP_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'TORCHSIM_FUNCTIONAL_MODE' in os.environ: - del os.environ['TORCHSIM_FUNCTIONAL_MODE'] + if 'pytorchsim_functional_mode' in os.environ: + del os.environ['pytorchsim_functional_mode'] run_softmax(size, config) diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 22118b1e..5c2dc65c 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 2989e4fd..0d56ecae 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -19,11 +19,11 @@ GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/TOGSim/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" ) shift shift diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 94e00527..4f5dd3a6 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 
/workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/TOGSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index be30795b..d4f93d4d 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -72,7 +72,7 @@ def 
extract_simulation_stats(result_path): continue tog_path = os.path.join(path, "tile_graph.onnx") togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/TOGSim/configs/stonne_validation_c1_simple_noc.json' + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.json' backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) diff --git a/test_extension_backend.py b/test_extension_backend.py deleted file mode 100644 index 5e6427ef..00000000 --- a/test_extension_backend.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch._dynamo -import torch.utils.cpp_extension -from tests.test_add import test_vectoradd, test_vector_scalar_add -from tests.test_reduce import test_reduce_sum -from tests.test_transpose2D import test_Transpose2D, test_Transpose2D_2 -from tests.test_transpose3D import test_Transpose3D_1, test_Transpose3D_2, test_Transpose3D_3 -from tests.test_view3D_2D import test_view3D_2D -from tests.test_softmax import test_softmax -from tests.test_batchnorm import test_BatchNorm -from tests.test_layernorm import test_LayerNorm -from tests.test_conv2d import test_conv2d -from tests.test_matmul import test_matmul -from tests.test_bmm import test_BMM -from tests.test_cnn import test_CNN -from tests.test_transformer import test_EncoderBlock -from tests.test_resnet import test_resnet -from tests.test_mlp import test_mlp, test_mlp_inf -from tests.MoE.test_moe import test_moe -from tests.test_pool import test_avgpool, test_maxpool -from tests.Fusion.test_addmm_residual import test_addmm_residual -from tests.Fusion.test_matmul_scalar import test_matmul_scalar -from tests.Fusion.test_matmul_activation import test_matmul_activation - -if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() - #test_vectoradd(device, (47, 10)) - #test_vector_scalar_add(device, (10, 10)) - #test_reduce_sum(device, (32, 32), 1, keepdim=True) - #test_reduce_sum(device, (32, 32), 0, keepdim=True) - #test_reduce_sum(device, (512, 512), 1, keepdim=True) - #test_reduce_sum(device, (512, 512), 0, keepdim=True) - #test_Transpose2D(device, [64, 156]) - #test_Transpose2D_2(device, [16, 64]) - #test_Transpose3D_1(device, [62, 34, 256]) - #test_Transpose3D_2(device, [62, 34, 256]) - #test_Transpose3D_3(device, [62, 34, 256]) - #test_view3D_2D(device) - test_maxpool(device) - #test_avgpool(device) - #test_softmax(device, (256, 256), dim=1) - #test_BatchNorm(device) - #test_LayerNorm(device, (64, 128)) - #test_conv2d(device) - #test_matmul(device, 33, 45, 68) - #test_BMM(device) - #test_CNN(device) - #test_EncoderBlock(device) - #test_resnet(device) - #test_mlp(device) - #test_mlp_inf(device, batch_size=64, input_size=256, hidden_size=512, output_size=256, sparsity=0.97) - - # # Fusion Test - #test_matmul_scalar(device) - #test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="relu") - #test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") - #test_addmm_residual(device) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index c32b4364..030f548e 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") 
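The test updates in this stretch all re-point `togsim_config` at the new top-level `configs/` directory. In isolation, the construction pattern they share looks like the sketch below; `CONFIG_TORCHSIM_DIR` normally comes from `PyTorchSimFrontend.extension_config` and is hard-coded here only for illustration:

```python
import torch
from Scheduler.scheduler import Scheduler, SchedulerDNNModel

# Illustrative stand-in for extension_config.CONFIG_TORCHSIM_DIR.
CONFIG_TORCHSIM_DIR = "/workspace/PyTorchSim"

# Scheduler construction as used by these tests, with the config JSON
# resolved under <repo>/configs/ after this change.
scheduler = Scheduler(
    num_request_queue=1,
    max_batch=4,
    engine_select=Scheduler.FIFO_ENGINE,
    togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json",
)

# Models are then compiled against the simulator's custom device and
# registered by name, e.g. SchedulerDNNModel.register_model("resnet18", model).
device = scheduler.execution_engine.module.custom_device()
```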
#except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 557ea5d6..a0716e2d 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -26,7 +26,7 @@ def custom_matmul(a, b): K = args.K sparsity = args.sparsity mode = args.mode - config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}" print("M: ", M) print("N: ", N) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 91bf0ad8..4860de56 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,7 +7,7 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = f'{base_path}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index 5a34d161..53f9256d 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py index c7abf0ae..71594eb2 100644 --- a/tests/test_spmm_scheduler.py +++ b/tests/test_spmm_scheduler.py @@ -25,7 +25,7 @@ output_size = args.output_size w1_sparsity = args.w1_sparsity w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/TOGSim/configs/{args.config}" + config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}" print("batch_size: ", batch_size) print("input_size: ", input_size) diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb new file mode 100644 index 00000000..178974c1 --- /dev/null +++ b/tutorial/session1/CompilerOptimization.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compiler Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = 
os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GeMM + ReLU fusion (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n", + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat /root/workspace/PyTorchSim/outputs/20251202_060538/togsim_result.log | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disable fusion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "def gemm_relu(a, b):\n", + " return torch.relu(torch.matmul(a, b))\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", + "out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat /root/workspace/PyTorchSim/outputs/20251202_055530/togsim_result.log | grep \"Total execution cycle\"\n", + "!cat /root/workspace/PyTorchSim/outputs/20251202_055532/togsim_result.log | grep \"Total execution cycle\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb new file mode 100644 index 00000000..b38bfe6a --- /dev/null +++ b/tutorial/session1/DNNServing.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DNN Serving System" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision.models import resnet18\n", + "from Scheduler.scheduler import Scheduler, 
SchedulerDNNModel, Request\n", + "from PyTorchSimFrontend import extension_config\n", + "\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n", + "device = scheduler.execution_engine.module.custom_device()\n", + "\n", + "model = resnet18().eval()\n", + "input = torch.randn(1, 3, 224, 224).to(device=device)\n", + "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", + "\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", + "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", + "scheduler.add_request(request, request_time=0)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " with torch.no_grad():\n", + " scheduler.schedule()\n", + "\n", + "print(\"ResNet18 Simulation Done\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Generator" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", + "import torch\n", + "from torchvision.models import resnet18\n", + "\n", + "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "\n", + "lambda_requests = 10\n", + "max_time = 30\n", + "\n", + "target_model1 = resnet18().eval()\n", + "\n", + "# Init scheduler\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n", + "# Register compiled model\n", + "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", + "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "\n", + "# Generate time stamp\n", + "for request_time in poisson_request_generator(lambda_requests, max_time):\n", + " # Init input data\n", + " model_input1 = torch.randn(1, 3, 224, 224)\n", + "\n", + " # Init request\n", + " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "\n", + " # Add request to scheduler\n", + " print(\"[Request] ResNet18 request time: \", request_time, flush=True)\n", + " scheduler.add_request(new_request1, request_time=request_time)\n", + "\n", + "# Run scheduler\n", + "while not scheduler.is_finished():\n", + " scheduler.schedule()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 +} diff --git a/tutorial/session1/ExecutionMode.ipynb new file mode 100644 index 00000000..22e00bed --- /dev/null +++ b/tutorial/session1/ExecutionMode.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Execution Mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import sys\n", + "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "sys.path.append(base_dir)" ] }, { +
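The load-generator cell above drives arrivals from `poisson_request_generator(lambda_requests, max_time)`. For intuition, a generic Poisson arrival-time generator with that shape can be sketched as below; this is an illustrative stand-in, not the repository's implementation:

```python
import random

def poisson_arrival_times(rate: float, max_time: float):
    """Yield arrival timestamps of a Poisson process with `rate` events per
    unit time, up to max_time. Illustrative stand-in only; the repo's
    poisson_request_generator may differ in detail."""
    t = 0.0
    while True:
        # Inter-arrival gaps of a Poisson process are exponentially distributed.
        t += random.expovariate(rate)
        if t >= max_time:
            return
        yield t

# Roughly 10 requests per unit time over a 30-unit window, matching
# lambda_requests = 10 and max_time = 30 in the notebook cell.
for ts in poisson_arrival_times(rate=10, max_time=30):
    print(f"request at t={ts:.3f}")
```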
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional & Timing mode (Default)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from Scheduler.scheduler import PyTorchSimRunner\n", + "device = PyTorchSimRunner.setup_device().custom_device()\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functional only mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timing only mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n", + "\n", + "input = torch.randn(1024, 1024).to(device=device)\n", + "weight = torch.randn(1024, 1024).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n", + "\n", + "input = torch.randn(2048, 2048).to(device=device)\n", + "weight = torch.randn(2048, 2048).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat /root/workspace/PyTorchSim/outputs/20251202_160520/togsim_result.log | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.json\"\n", + "\n", + "input = torch.randn(2048, 2048).to(device=device)\n", + "weight = torch.randn(2048, 2048).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat /root/workspace/PyTorchSim/outputs/20251202_160547/togsim_result.log | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 
+ }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/session1/HelloPyTorchSim.ipynb b/tutorial/session1/HelloPyTorchSim.ipynb deleted file mode 100644 index dfb086a4..00000000 --- a/tutorial/session1/HelloPyTorchSim.ipynb +++ /dev/null @@ -1,1216 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hello, PyTorchSim!" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## One Touch Simulation\n", - "### Normal Matmul Code" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "\n", - "torch.manual_seed(0)\n", - "input = torch.randn(128, 128).to(device)\n", - "weight = torch.randn(128, 128).to(device)\n", - "\n", - "opt_fn = torch.compile(torch.matmul)\n", - "cpu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PyTorchSim Matmul Code" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", - "Building extension module npu...\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ninja: no work to do.\n", - "Wrapper Codegen Path = /tmp/torchinductor_root/ro/croutbd6yxrzgdstfcplx7yrpn2do5frwhyx2md5r7rvrubdhdgd.py\n", - "[Gem5] Gem5 is running... \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running.. 
\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0\"\n" - ] - } - ], - "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", - "\n", - "torch.manual_seed(0)\n", - "input = torch.randn(128, 128).to(device)\n", - "weight = torch.randn(128, 128).to(device)\n", - "\n", - "opt_fn = torch.compile(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", - " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", - " message = f\"|{name} Test Passed|\"\n", - " print(\"-\" * len(message))\n", - " print(message)\n", - " print(\"-\" * len(message))\n", - " else:\n", - " message = f\"|{name} Test Failed|\"\n", - " print(\"-\" * len(message))\n", - " print(message)\n", - " print(\"-\" * len(message))\n", - " print(\"npu out: \", npu_out.cpu())\n", - " print(\"cpu out: \", cpu_out)\n", - " exit(1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------\n", - "|MatMul Test Passed|\n", - "--------------------\n" - ] - } - ], - "source": [ - "test_result(\"MatMul\", npu_out, cpu_out)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# from Scheduler.scheduler import PyTorchSimRunner\n", - "# npu_device = PyTorchSimRunner.setup_device().custom_device()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Normal Backward Code" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "torch.manual_seed(0)\n", - "cpu_input = torch.randn(128, 128).to(device)\n", - "cpu_weight = torch.randn(128, 128).to(device)\n", - "cpu_target = torch.randn(128, 128).to(device)\n", - "cpu_input.requires_grad = True\n", - "cpu_weight.requires_grad = True\n", - "\n", - "opt_fn = torch.compile(torch.matmul)\n", - "cpu_out = opt_fn(cpu_input, cpu_weight)\n", - "\n", - "loss_fn = torch.nn.CrossEntropyLoss()\n", - "cpu_loss = loss_fn(cpu_out, cpu_target)\n", - "cpu_loss.backward()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PyTorchSim Backward Code" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrapper Codegen Path = /tmp/torchinductor_root/5i/c5isqyualxbaqsmuhsux7oubvkypfmh4kvamqvgref6z3ypnrpw5.py\n", - "[Gem5] Gem5 is running... \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running.. 
\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/19\"\n" - ] - }, - { - "ename": "RuntimeError", - "evalue": "0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m loss_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m 14\u001b[0m npu_loss \u001b[38;5;241m=\u001b[39m loss_fn(npu_out, npu_target)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mnpu_loss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:266\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 274\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mRuntimeError\u001b[0m: 0 <= device.index() && device.index() < static_cast(device_ready_queues_.size()) INTERNAL ASSERT FAILED at \"/opt/conda/conda-bld/pytorch_1704987394225/work/torch/csrc/autograd/engine.cpp\":1423, please report a bug to PyTorch. " - ] - } - ], - "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "npu_device = PyTorchSimRunner.setup_device().custom_device()\n", - "torch.manual_seed(0)\n", - "npu_input = torch.randn(128, 128).to(npu_device)\n", - "npu_weight = torch.randn(128, 128).to(npu_device)\n", - "npu_target = torch.randn(128, 128).to(npu_device)\n", - "npu_input.requires_grad = True\n", - "npu_weight.requires_grad = True\n", - "\n", - "opt_fn = torch.compile(torch.matmul)\n", - "npu_out = opt_fn(npu_input, npu_weight)\n", - "\n", - "loss_fn = torch.nn.CrossEntropyLoss()\n", - "npu_loss = loss_fn(npu_out, npu_target)\n", - "npu_loss.backward()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'test_result' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtest_result\u001b[49m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Input Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_input\u001b[38;5;241m.\u001b[39mgrad, cpu_input\u001b[38;5;241m.\u001b[39mgrad)\n\u001b[1;32m 2\u001b[0m test_result(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMatMul Weight Grad\u001b[39m\u001b[38;5;124m\"\u001b[39m, npu_weight\u001b[38;5;241m.\u001b[39mgrad, cpu_weight\u001b[38;5;241m.\u001b[39mgrad)\n", - "\u001b[0;31mNameError\u001b[0m: name 'test_result' is not defined" - ] - } - ], - "source": [ - "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", - "test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Mapping\n", - "\n", - "Default mapping is based on heuristic." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrapper Codegen Path = /tmp/torchinductor_root/5z/c5z4ur2k2svn2gaawn776ev3t6gsa7esgu36la63523cqpbbt56d.py\n", - "[Gem5] Gem5 is running.. \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n" - ] - } - ], - "source": [ - "import torch\n", - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 18:53:14.002] [info] Total execution cycle: 47158\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Manual Mapping\n", - "User can set tile size manually." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrapper Codegen Path = /tmp/torchinductor_root/mv/cmv6cp7oo3wwndv76iv3sib7r74tnbvodfwxi3rw33k7grlh3h4h.py\n", - "[Gem5] Gem5 is running. \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running... 
\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/75hiq5mugpq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0\"\n" - ] - } - ], - "source": [ - "torch._dynamo.reset()\n", - "\n", - "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"1\"\n", - "os.environ['TORCHSIM_TILE_M']=\"512\"\n", - "os.environ['TORCHSIM_TILE_N']=\"512\"\n", - "os.environ['TORCHSIM_TILE_K']=\"512\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 18:54:00.878] [info] Total execution cycle: 53704\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/75hiq5mugpq/togsim_result/0 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Autotune" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Auto-tune] Trying tile size: [1024, 1024, 256, 128, 1024, 256]\n", - "[Auto-tune] Trying tile size: [256, 1024, 1024, 128, 1024, 1024]\n", - "[Auto-tune] Trying tile size: [1024, 256, 1024, 128, 256, 1024]\n", - "[Auto-tune] Trying tile size: [1024, 1024, 128, 128, 1024, 128]\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/0\"\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/7j33rcic2qn/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/7j33rcic2qn/togsim_result/0\"\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/vsaamplubl5/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/vsaamplubl5/togsim_result/0\"\n", - "[Auto-tune] Optimal tile size: [1024, 1024, 128, 128, 1024, 128], cycles: 46423\n", - "Wrapper Codegen Path = /tmp/torchinductor_root/3b/c3bebp4b4rp73grbvhbaq4xdxny7f5m7fgqkgpflp2cjn3x5uugr.py\n", - "[Gem5] Gem5 is running.. \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. 
\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/x27ipc5avjg/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1\"\n" - ] - } - ], - "source": [ - "torch._dynamo.reset()\n", - "os.environ['TORCHSIM_MANUAL_TILE_SIZE']=\"0\"\n", - "os.environ['AUTOTUNE_TEMPLATE']=\"1\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 18:54:53.051] [info] Total execution cycle: 46422\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/x27ipc5avjg/togsim_result/1 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Execution Mode\n", - "### Functional & Timing mode (Default)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrapper Codegen Path = /tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py\n", - "[Gem5] Gem5 is running.. \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/4\"\n" - ] - } - ], - "source": [ - "torch._dynamo.reset()\n", - "os.environ['AUTOTUNE_TEMPLATE']=\"0\"\n", - "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", - "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Functional only mode" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Spike] Running Spike simulator\n" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"1\"\n", - "os.environ['TORCHSIM_TIMING_MODE']=\"0\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Timing only mode" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[23], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m weight 
\u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 7\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 8\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m 
\u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 
105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m 
file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_FUNCTIONAL_MODE']=\"0\"\n", - "os.environ['TORCHSIM_TIMING_MODE']=\"1\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TOGSim Configuration\n", - "### Single Core" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[22], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 
11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in 
\u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m 
sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = 
opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 18:32:01.843] [info] Total execution cycle: 47126\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/11 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multi-Core" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12\"\n" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_CONFIG']=\"/workspace/PyTorchSim/TOGSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 18:34:48.969] [info] Total execution cycle: 40736\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/12 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TOGSim log level\n", - "### log level info" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[21], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m weight \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m)\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 6\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(torch\u001b[38;5;241m.\u001b[39mmatmul)\n\u001b[0;32m----> 7\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 
489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:15\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap_inline\u001b[39m(fn):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124;03m Create an extra frame around fn that is not in skipfiles\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inner\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:17\u001b[0m, in \u001b[0;36mwrap_inline..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:901\u001b[0m, in \u001b[0;36maot_module_simplified..forward\u001b[0;34m(*runtime_args)\u001b[0m\n\u001b[1;32m 899\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(params_flat)\n\u001b[1;32m 900\u001b[0m full_args\u001b[38;5;241m.\u001b[39mextend(runtime_args)\n\u001b[0;32m--> 901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mcompiled_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:81\u001b[0m, in \u001b[0;36mmake_boxed_func..g\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mg\u001b[39m(args):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:94\u001b[0m, in \u001b[0;36mcreate_runtime_wrapper..runtime_wrapper\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;66;03m# When we have an inference graph, we run with torch.no_grad.\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;66;03m# It's possible to get an inference graph with inputs that require grad,\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;66;03m# in which case we want to make sure autograd is disabled\u001b[39;00m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;66;03m# (since e.g., inductor will generate aten.addmm.out calls which autograd will complain on)\u001b[39;00m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m---> 94\u001b[0m all_outs \u001b[38;5;241m=\u001b[39m call_func_at_runtime_with_args(\n\u001b[1;32m 95\u001b[0m compiled_fn,\n\u001b[1;32m 96\u001b[0m args,\n\u001b[1;32m 97\u001b[0m disable_amp\u001b[38;5;241m=\u001b[39mdisable_amp,\n\u001b[1;32m 98\u001b[0m )\n\u001b[1;32m 100\u001b[0m num_mutated_runtime_inps \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_mutated_inp_runtime_indices\n\u001b[1;32m 101\u001b[0m num_intermediate_bases \u001b[38;5;241m=\u001b[39m runtime_metadata\u001b[38;5;241m.\u001b[39mnum_intermediate_bases\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:105\u001b[0m, in \u001b[0;36mcall_func_at_runtime_with_args\u001b[0;34m(f, args, steal_args, disable_amp)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 105\u001b[0m out \u001b[38;5;241m=\u001b[39m normalize_as_list(\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# TODO: Please remove soon\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670\u001b[39;00m\n\u001b[1;32m 109\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour compiler for AOTAutograd is returning a function that doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt take boxed arguments. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSee https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:118\u001b[0m, in \u001b[0;36maot_dispatch_base..rng_functionalization_wrapper\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_fw\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:864\u001b[0m, in \u001b[0;36mCompiledFxGraph.__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs: List[Any]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 864\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_current_callable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/codecache.py:892\u001b[0m, in \u001b[0;36m_run_from_cache\u001b[0;34m(compiled_graph, inputs)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path\n\u001b[1;32m 885\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcompiled_artifact \u001b[38;5;241m=\u001b[39m PyCodeCache\u001b[38;5;241m.\u001b[39mload_by_key_path(\n\u001b[1;32m 886\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_key,\n\u001b[1;32m 887\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39martifact_path,\n\u001b[1;32m 888\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mcache_linemap,\n\u001b[1;32m 889\u001b[0m compiled_graph\u001b[38;5;241m.\u001b[39mconstants,\n\u001b[1;32m 890\u001b[0m )\u001b[38;5;241m.\u001b[39mcall\n\u001b[0;32m--> 892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiled_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompiled_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/tmp/torchinductor_root/kc/ckcse5nw4rmyxquz2urirolc7mt445hqc2dapbmurbqgix2gdunr.py:125\u001b[0m, in \u001b[0;36mcall\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 123\u001b[0m buf0 \u001b[38;5;241m=\u001b[39m empty((\u001b[38;5;241m1024\u001b[39m, \u001b[38;5;241m1024\u001b[39m), device\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnpu\u001b[39m\u001b[38;5;124m'\u001b[39m, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m 124\u001b[0m sram_plan_prefix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuf0\u001b[39m\u001b[38;5;124m'\u001b[39m, buf0)\n\u001b[0;32m--> 125\u001b[0m \u001b[43mmlir_kernel_6\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg0_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg1_1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuf0\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m 
sram_plan_postfix(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marg0_1\u001b[39m\u001b[38;5;124m'\u001b[39m, arg0_1)\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m arg0_1\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:285\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dummy_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m result_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_DUMP_PATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtmp\u001b[39m\u001b[38;5;124m\"\u001b[39m, hash_prefix(key))\n\u001b[1;32m 284\u001b[0m \u001b[38;5;66;03m# Dump arguments and meta data\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m \u001b[43mdump_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m runtime_path \u001b[38;5;241m=\u001b[39m FunctionalSimulator\u001b[38;5;241m.\u001b[39mget_runtime_dump_path(result_path)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m autotune \u001b[38;5;129;01mand\u001b[39;00m (extension_config\u001b[38;5;241m.\u001b[39mCONFIG_TORCHSIM_FUNCTIONAL_MODE \u001b[38;5;129;01mor\u001b[39;00m validate):\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:25\u001b[0m, in \u001b[0;36mdump_metadata\u001b[0;34m(args, arg_attributes, path)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(meta_path):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmeta_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ma\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m (arg_name, arg_attribute), arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(arg_attributes, args):\n\u001b[1;32m 27\u001b[0m file\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg_attribute[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00marg\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/root/workspace/PyTorchSim/tmp/4q4qv6gbpia/meta.txt'" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_DUMP_PATH']=\"/workspace/PyTorchSim\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### log level trace" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/workspace/PyTorchSim/tmp/4q4qv6gbpia/togsim_result/1\"\n" - ] - } - ], - "source": [ - "os.environ['BACKENDSIM_DEBUG_LEVEL']=\"trace\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Scheduler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from torchvision.models import resnet18\n", - "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", - "from PyTorchSimFrontend.extension_config import CONFIG_TORCHSIM_BACKEND_CONFIG\n", - "\n", - "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, backend_config=CONFIG_TORCHSIM_BACKEND_CONFIG)\n", - "device = scheduler.execution_engine.module.custom_device()\n", - "\n", - "model = resnet18().eval()\n", - "input = torch.randn(1, 3, 224, 224).to(device=device)\n", - "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", - "\n", - "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", - "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", - "scheduler.add_request(request, request_time=0)\n", - "\n", - "# Run scheduler\n", - "while not scheduler.is_finished():\n", - " with torch.no_grad():\n", - " scheduler.schedule()\n", - "\n", - "print(\"ResNet18 Simulation Done\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Generator" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 13:05:13.597] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", - "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 0: Partition 0\n", - "[2025-11-30 13:05:13.597] [info] [Config/Core] CPU 1: Partition 0\n", - "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 0: 700 MHz, Systolic array per core: 1\n", - "[2025-11-30 13:05:13.597] [info] [Config/Core] Core 1: 700 MHz, Systolic array per core: 1\n", - "[2025-11-30 13:05:13.597] [info] [Config/DRAM] Ramulator2 config: /root/workspace/PyTorchSim/PyTorchSimBackend/configs/../configs/ramulator2_configs/HBM2.yaml\n", - "[2025-11-30 13:05:13.597] [info] [Config/DRAM] DRAM Bandwidth 716 
GB/s, Freq: 700 MHz, Channels: 32, Request_size: 32B\n", - "[2025-11-30 13:05:13.597] [info] [Config/L2Cache] No L2 cache\n", - "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] Interconnect freq: 20000 MHz\n", - "[2025-11-30 13:05:13.673] [info] [Config/Interconnect] SimpleInerconnect selected\n", - "[0] BackendSim> [Reqest] Resnet18 request time: 0\n", - "[Request issue] partition: 0 batch size: 1\n", - "[Request-0 issue] partition: 0 arrival_time: 0 start_time: 0.0\n", - "Wrapper Codegen Path = /tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py\n", - "launch /root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx /tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0 0 0\n", - "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/tmp/torchinductor/tmp/w5hefiqdl3p/runtime_0001/attribute/0\"\n", - "[2025-11-30 13:05:22.114] [info] [LoadConfig] Success to open \"/root/workspace/PyTorchSim/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\"\n", - "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg0 address: 0xa3056c0\n", - "[2025-11-30 13:05:22.115] [info] [TOGParser/Attribute] Address Attribute key: arg1 address: 0xc4a3d40\n", - "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"systolic_size\": \"128\"\n", - "[2025-11-30 13:05:22.115] [info] [TOGParser] Register Metadata \"stonneGraph\": \"0\"\n", - "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Register graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 at 0\n", - "[2025-11-30 13:05:22.116] [info] [Scheduler 0] Tile Graph FIFO Scheduled\n", - "until -1\n", - "[2025-11-30 13:05:22.117] [info] HBM2-CH_0: BW utilization 0% (0 reads, 0 writes)\n", - "[2025-11-30 13:05:22.319] [info] [Scheduler 0] Graph path: /tmp/torchinductor/tmp/w5hefiqdl3p/tile_graph.onnx operation: primals_123 finish at 2424\n", - "[2025-11-30 13:05:22.319] [info] Total compute time 2424\n", - "cycle\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 33\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run scheduler\u001b[39;00m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m scheduler\u001b[38;5;241m.\u001b[39mis_finished():\n\u001b[0;32m---> 33\u001b[0m \u001b[43mscheduler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:475\u001b[0m, in \u001b[0;36mScheduler.schedule\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcurrent_cycle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackend_simulator\u001b[38;5;241m.\u001b[39mcycle()\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnext_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", - "File 
\u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:507\u001b[0m, in \u001b[0;36mScheduler.run\u001b[0;34m(self, until_time)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m until_time \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mis_any_idle(req_empty_info):\n\u001b[0;32m--> 507\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m req_empty_info \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_empty(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion)]\n\u001b[1;32m 509\u001b[0m \u001b[38;5;66;03m# if result is not -1, schedule new request\u001b[39;00m\n", - "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:484\u001b[0m, in \u001b[0;36mScheduler.run..execute_cycle\u001b[0;34m()\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mnum_partion):\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_engine\u001b[38;5;241m.\u001b[39mpartition_state[i] \u001b[38;5;241m==\u001b[39m PyTorchSimRunner\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[0;32m--> 484\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_cycle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m launch_ret_info\u001b[38;5;241m.\u001b[39mappend(ret)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_finish_request()\n", - "File \u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:254\u001b[0m, in \u001b[0;36mPyTorchSimRunner.launch_kernel\u001b[0;34m(self, current_cycle, partion_idx)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mPARTITION_IDLE:\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartition_state[partion_idx]\n\u001b[0;32m--> 254\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartion_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSELECT_NOTHING\n", - "File 
\u001b[0;32m~/workspace/PyTorchSim/Scheduler/scheduler.py:290\u001b[0m, in \u001b[0;36mFIFORunner.select_kernel\u001b[0;34m(self, partition_idx)\u001b[0m\n\u001b[1;32m 287\u001b[0m nested_gen \u001b[38;5;241m=\u001b[39m kernel(\u001b[38;5;241m*\u001b[39minputs)\n\u001b[1;32m 288\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnested_launch_model_dicts[partition_idx] \u001b[38;5;241m=\u001b[39m {req : nested_gen}\n\u001b[1;32m 289\u001b[0m kernel, inputs \u001b[38;5;241m=\u001b[39m \\\n\u001b[0;32m--> 290\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnested_launch_model_dicts\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_idx\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, inputs\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 293\u001b[0m \u001b[38;5;66;03m# Retry\u001b[39;00m\n", - "File \u001b[0;32m/tmp/torchinductor_root/qx/cqxp5xnkdgcdwmer5w6ftyf46iegefhyjclg6mkz2smhktj4tpcy.py:227\u001b[0m, in \u001b[0;36mConv2D_1_3_224_22464_3_7_7_2_2_3_3_1_1_3\u001b[0;34m(X, W, Y)\u001b[0m\n\u001b[1;32m 224\u001b[0m W \u001b[38;5;241m=\u001b[39m W\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mcontiguous() \u001b[38;5;66;03m# (O_C, I_C, K_H, K_W) -> (K_H, K_W, I_C, O_C)\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# Launch kernel\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[43mmlir_kernel_1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mW\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m (mlir_kernel_1, (X, W, Y))\n", - "File \u001b[0;32m~/workspace/PyTorchSim/PyTorchSimFrontend/extension_codecache.py:307\u001b[0m, in \u001b[0;36mCustomAsyncCompile.mlir..dryrun_simulator\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdryrun_simulator\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 307\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfilelock\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileLock\n\u001b[1;32m 309\u001b[0m lock_dir \u001b[38;5;241m=\u001b[39m get_lock_dir()\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/concurrent/futures/_base.py:453\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 453\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "import os\n", - "import torch\n", - "from torchvision.models import resnet18\n", - "\n", - "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", - "CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "\n", - "lambda_requests = 10\n", - "max_time = 30\n", - "\n", - "target_model1 = resnet18().eval()\n", - "\n", - "# Init scheduler\n", - "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, backend_config=f\"{CONFIG_TORCHSIM_DIR}/PyTorchSimBackend/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json\")\n", - "# Register compiled model\n", - "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", - "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", - "\n", - "# Generate request timestamps (Poisson arrivals at rate lambda_requests until max_time)\n", - "for request_time in poisson_request_generator(lambda_requests, max_time):\n", - "    # Init input data\n", - "    model_input1 = torch.randn(1, 3, 224, 224)\n", - "\n", - "    # Init request\n", - "    new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", - "\n", - "    # Add request to scheduler\n", - "    print(\"[Request] Resnet18 request time: \", request_time, flush=True)\n", - "    scheduler.add_request(new_request1, request_time=request_time)\n", - "\n", - "# Run scheduler\n", - "while not scheduler.is_finished():\n", - "    scheduler.schedule()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compiler Optimization\n", - "### GEMM + ReLU fusion" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wrapper Codegen Path = /tmp/torchinductor_root/vr/cvrlybtkuzkk6pmnlfxu7o55375z24tajmiow6mszaen5t4ra6zo.py\n", - "[Gem5] Gem5 is running. 
\n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/5o2xythi5z3/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/5o2xythi5z3/togsim_result/0\"\n" - ] - } - ], - "source": [ - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "def gemm_relu(a, b):\n", - " return torch.relu(torch.matmul(a, b))\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", - "out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cat: /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0: No such file or directory\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/5o2xythi5z3/backendsim_result/0 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Disable fusion" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/npu/build.ninja...\n", - "Building extension module npu...\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "Loading extension module npu...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ninja: no work to do.\n", - "Wrapper Codegen Path = /tmp/torchinductor_root/tl/ctlqjsvukam6d4kteerml7exwbt4paw7cjtjbxcwdlsd7e4koriq.py\n", - "[Gem5] Gem5 is running... \n", - "[Gem5] Gem5 is running.. \n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running. \n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/4q4qv6gbpia/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/4q4qv6gbpia/togsim_result/0\"\n", - "[Spike] Running Spike simulator\n", - "[TOGSim] TOGSim is running.. 
\n", - "[TOGSim] Simulation of \"/tmp/torchinductor/tmp/37dfo4nczcq/tile_graph.onnx\" is stored to \"/tmp/torchinductor/tmp/37dfo4nczcq/togsim_result/0\"\n" - ] - } - ], - "source": [ - "os.environ['TORCHSIM_COMPILER_OPTIMIZATION']=\"none\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "def gemm_relu(a, b):\n", - " return torch.relu(torch.matmul(a, b))\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n", - "out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-11-30 12:52:49.376] [info] Total execution cycle: 47164\n", - "[2025-11-30 12:52:52.444] [info] Total execution cycle: 58510\n" - ] - } - ], - "source": [ - "!cat /tmp/torchinductor/tmp/4q4qv6gbpia/backendsim_result/2 | grep \"Total execution cycle\"\n", - "!cat /tmp/torchinductor/tmp/37dfo4nczcq/backendsim_result/0 | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Single kernel mode (TODO: remove it?)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "No modifications detected for re-loaded extension module npu, skipping build step...\n", - "Loading extension module npu...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/sympy/core/assumptions.py:499\u001b[0m, in \u001b[0;36mmake_property..getit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_assumptions\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfact\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n", - "\u001b[0;31mKeyError\u001b[0m: 'extended_negative'", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m resnet18()\u001b[38;5;241m.\u001b[39mto(device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[1;32m 9\u001b[0m opt_fn \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(dynamic\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)(model)\n\u001b[0;32m---> 10\u001b[0m npu_out \u001b[38;5;241m=\u001b[39m \u001b[43mopt_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:489\u001b[0m, in \u001b[0;36m_TorchDynamoContext.__call__.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 487\u001b[0m dynamo_config_ctx\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__enter__\u001b[39m()\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 491\u001b[0m set_eval_frame(prior)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: 
ignore[misc]\u001b[39;00m\n\u001b[1;32m 1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1523\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:655\u001b[0m, in \u001b[0;36mcatch_errors_wrapper..catch_errors\u001b[0;34m(frame, cache_entry, frame_state)\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m hijacked_callback(frame, cache_entry, hooks, frame_state)\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_lock, _disable_current_modes():\n\u001b[0;32m--> 655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:727\u001b[0m, in \u001b[0;36mconvert_frame.._convert_frame\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 725\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 726\u001b[0m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 727\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43minner_convert\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_entry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 728\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mframes\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:383\u001b[0m, in \u001b[0;36mconvert_frame_assert.._convert_frame_assert\u001b[0;34m(frame, cache_entry, hooks, frame_state)\u001b[0m\n\u001b[1;32m 370\u001b[0m signpost_event(\n\u001b[1;32m 371\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_convert_frame_assert._compile\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 379\u001b[0m },\n\u001b[1;32m 380\u001b[0m )\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config\u001b[38;5;241m.\u001b[39mpatch(_patch_config_if_changed()):\n\u001b[0;32m--> 383\u001b[0m compiled_product \u001b[38;5;241m=\u001b[39m \u001b[43m_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_globals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_locals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf_builtins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiler_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mexport_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mframe_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframe_state\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcompile_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_product\n", - "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:646\u001b[0m, in \u001b[0;36m_compile\u001b[0;34m(code, globals, locals, builtins, compiler_fn, one_graph, export, export_constraints, hooks, cache_size, frame, frame_state, compile_id)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compile_context(CompileContext(compile_id)):\n\u001b[1;32m 645\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 646\u001b[0m guarded_code \u001b[38;5;241m=\u001b[39m \u001b[43mcompile_inner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mone_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m guarded_code\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 649\u001b[0m Unsupported,\n\u001b[1;32m 650\u001b[0m TorchRuntimeError,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 657\u001b[0m BisectValidationException,\n\u001b[1;32m 658\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:562\u001b[0m, in \u001b[0;36m_compile..compile_inner\u001b[0;34m(code, one_graph, hooks, transform)\u001b[0m\n\u001b[1;32m 560\u001b[0m CompileContext\u001b[38;5;241m.\u001b[39mget()\u001b[38;5;241m.\u001b[39mattempt \u001b[38;5;241m=\u001b[39m attempt\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 562\u001b[0m out_code \u001b[38;5;241m=\u001b[39m \u001b[43mtransform_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mRestartAnalysis \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1033\u001b[0m, in \u001b[0;36mtransform_code_object\u001b[0;34m(code, transformations, safe)\u001b[0m\n\u001b[1;32m 1030\u001b[0m instructions \u001b[38;5;241m=\u001b[39m 
cleaned_instructions(code, safe)\n\u001b[1;32m 1031\u001b[0m propagate_line_nums(instructions)\n\u001b[0;32m-> 1033\u001b[0m \u001b[43mtransformations\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstructions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcode_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m clean_and_assemble_instructions(instructions, keys, code_options)[\u001b[38;5;241m1\u001b[39m]\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:151\u001b[0m, in \u001b[0;36mpreserve_global_state.._fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m cleanup \u001b[38;5;241m=\u001b[39m setup_compile_debug()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m cleanup\u001b[38;5;241m.\u001b[39mclose()\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:527\u001b[0m, in \u001b[0;36m_compile..transform\u001b[0;34m(instructions, code_options)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 526\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m tracing(tracer\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mtracing_context), tracer\u001b[38;5;241m.\u001b[39mset_current_tx():\n\u001b[0;32m--> 527\u001b[0m \u001b[43mtracer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mUnspecializeRestartAnalysis:\n\u001b[1;32m 529\u001b[0m speculation_log\u001b[38;5;241m.\u001b[39mclear()\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2128\u001b[0m, in \u001b[0;36mInstructionTranslator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m-> 2128\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:818\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mpush_tx(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m (\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstruction_pointer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 817\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39mshould_exit\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 819\u001b[0m ):\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m BackendCompilerFailed:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:781\u001b[0m, in \u001b[0;36mInstructionTranslatorBase.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m unimplemented(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmissing: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minst\u001b[38;5;241m.\u001b[39mopname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 778\u001b[0m TracingContext\u001b[38;5;241m.\u001b[39mset_current_loc(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_filename, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlineno, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\n\u001b[1;32m 780\u001b[0m )\n\u001b[0;32m--> 781\u001b[0m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minst\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43minst\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inst\u001b[38;5;241m.\u001b[39mopname \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 784\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Unsupported:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2243\u001b[0m, in \u001b[0;36mInstructionTranslator.RETURN_VALUE\u001b[0;34m(self, inst)\u001b[0m\n\u001b[1;32m 2238\u001b[0m _step_logger()(\n\u001b[1;32m 2239\u001b[0m logging\u001b[38;5;241m.\u001b[39mINFO,\n\u001b[1;32m 2240\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorchdynamo done tracing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf_code\u001b[38;5;241m.\u001b[39mco_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (RETURN_VALUE)\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 2241\u001b[0m )\n\u001b[1;32m 2242\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE triggered compile\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2243\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_subgraph\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2245\u001b[0m \u001b[43m \u001b[49m\u001b[43mreason\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mGraphCompileReason\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2246\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreturn_value\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mframe_summary\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph_break\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 2247\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2248\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompile_return_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2249\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2250\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput\u001b[38;5;241m.\u001b[39madd_output_instructions([create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRETURN_VALUE\u001b[39m\u001b[38;5;124m\"\u001b[39m)])\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:919\u001b[0m, in \u001b[0;36mOutputGraph.compile_subgraph\u001b[0;34m(self, tx, partial_convert, reason, compile_return_value)\u001b[0m\n\u001b[1;32m 916\u001b[0m append_prefix_insts()\n\u001b[1;32m 917\u001b[0m \u001b[38;5;66;03m# optimization to generate better code in a common case\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_output_instructions(\n\u001b[0;32m--> 919\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile_and_call_fx_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mreversed\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstack_values\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[38;5;241m+\u001b[39m [create_instruction(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUNPACK_SEQUENCE\u001b[39m\u001b[38;5;124m\"\u001b[39m, arg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(stack_values))]\n\u001b[1;32m 921\u001b[0m )\n\u001b[1;32m 922\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 923\u001b[0m graph_output_var \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnew_var(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph_out\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1087\u001b[0m, in 
\u001b[0;36mOutputGraph.compile_and_call_fx_graph\u001b[0;34m(self, tx, rv, root)\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtracing_context\u001b[38;5;241m.\u001b[39mfake_mode \u001b[38;5;241m=\u001b[39m backend_fake_mode\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_global_state():\n\u001b[0;32m-> 1087\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_user_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1088\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m disable(compiled_fn)\n\u001b[1;32m 1090\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstats\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_graphs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1140\u001b[0m, in \u001b[0;36mOutputGraph.call_user_compiler\u001b[0;34m(self, gm)\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mverify_correctness:\n\u001b[1;32m 1139\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m WrapperBackend(compiler_fn)\n\u001b[0;32m-> 1140\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1141\u001b[0m _step_logger()(logging\u001b[38;5;241m.\u001b[39mINFO, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone compiler function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1142\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(compiled_fn), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompiler_fn did not return callable\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:117\u001b[0m, in \u001b[0;36mwrap_backend_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m compiled_gm \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compiled_gm\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/__init__.py:1662\u001b[0m, in \u001b[0;36m_TorchCompileInductorWrapper.__call__\u001b[0;34m(self, model_, inputs_)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_, inputs_):\n\u001b[1;32m 1660\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_inductor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompile_fx\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compile_fx\n\u001b[0;32m-> 1662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompile_fx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig_patches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1168\u001b[0m, in \u001b[0;36mcompile_fx\u001b[0;34m(model_, example_inputs_, inner_compile, config_patches, decompositions)\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m inference_compiler(unlifted_gm, example_inputs_)\n\u001b[1;32m 1165\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_fake_mode(fake_mode), torch\u001b[38;5;241m.\u001b[39m_guards\u001b[38;5;241m.\u001b[39mtracing(\n\u001b[1;32m 1166\u001b[0m tracing_context\n\u001b[1;32m 1167\u001b[0m ), compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m-> 1168\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43maot_autograd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mfw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43mbw_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbw_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_compiler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_compiler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecompositions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecompositions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_fn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_fn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_inference_input_mutations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs_\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:55\u001b[0m, in \u001b[0;36maot_autograd..compiler_fn\u001b[0;34m(gm, example_inputs)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# NB: NOT cloned!\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m enable_aot_logging(), patch_config:\n\u001b[0;32m---> 55\u001b[0m cg \u001b[38;5;241m=\u001b[39m \u001b[43maot_module_simplified\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m counters[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot_autograd\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disable(cg)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:887\u001b[0m, in \u001b[0;36maot_module_simplified\u001b[0;34m(mod, args, fw_compiler, bw_compiler, partition_fn, decompositions, keep_inference_input_mutations, inference_compiler)\u001b[0m\n\u001b[1;32m 871\u001b[0m aot_config \u001b[38;5;241m=\u001b[39m AOTConfig(\n\u001b[1;32m 872\u001b[0m fw_compiler\u001b[38;5;241m=\u001b[39mfw_compiler,\n\u001b[1;32m 873\u001b[0m bw_compiler\u001b[38;5;241m=\u001b[39mbw_compiler,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 883\u001b[0m no_tangents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 884\u001b[0m )\n\u001b[1;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m compiled_autograd\u001b[38;5;241m.\u001b[39mdisable():\n\u001b[0;32m--> 887\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_aot_dispatcher_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 888\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunctional_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 889\u001b[0m \u001b[43m \u001b[49m\u001b[43mfull_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 890\u001b[0m \u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 891\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;66;03m# TODO: There is something deeply wrong here; compiled_fn running with\u001b[39;00m\n\u001b[1;32m 894\u001b[0m \u001b[38;5;66;03m# the boxed calling convention, but aot_module_simplified somehow\u001b[39;00m\n\u001b[1;32m 895\u001b[0m \u001b[38;5;66;03m# historically returned a function that was not the boxed calling\u001b[39;00m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# convention. 
This should get fixed...\u001b[39;00m\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39mruntime_args):\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:600\u001b[0m, in \u001b[0;36mcreate_aot_dispatcher_function\u001b[0;34m(flat_fn, flat_args, aot_config)\u001b[0m\n\u001b[1;32m 597\u001b[0m compiler_fn \u001b[38;5;241m=\u001b[39m partial(aot_wrapper_dedupe, compiler_fn\u001b[38;5;241m=\u001b[39mcompiler_fn)\n\u001b[1;32m 598\u001b[0m \u001b[38;5;66;03m# You can put more passes here\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfake_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m aot_config\u001b[38;5;241m.\u001b[39mis_export:\n\u001b[1;32m 602\u001b[0m mutated_user_inp_locs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 603\u001b[0m idx \u001b[38;5;241m-\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 604\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m fw_metadata\u001b[38;5;241m.\u001b[39mmutated_inp_runtime_indices\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m idx \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m aot_config\u001b[38;5;241m.\u001b[39mnum_params_buffers\n\u001b[1;32m 606\u001b[0m ]\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:425\u001b[0m, in \u001b[0;36maot_wrapper_dedupe\u001b[0;34m(flat_fn, flat_args, aot_config, compiler_fn, fw_metadata)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ok:\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mleaf_flat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(leaf_flat_args, fw_metadata):\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 429\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\\\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;124;03mEncountered duplicate inputs that are mutated in the graph, but at least one input/output\u001b[39;00m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;124;03mto the graph is a tensor subclass. This is not supported today. You can try to\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124;03mremove the aliasing yourself as a workaround, or otherwise file an issue on github.\"\"\"\u001b[39;00m\n\u001b[1;32m 433\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:630\u001b[0m, in \u001b[0;36maot_wrapper_synthetic_base\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata, needs_autograd, compiler_fn)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[38;5;66;03m# Happy path: we don't need synthetic bases\u001b[39;00m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m synthetic_base_info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 630\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflat_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maot_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfw_metadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfw_metadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# export path: ban synthetic bases for now, add later if requested.\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m requires_subclass_dispatch(flat_args, fw_metadata):\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:295\u001b[0m, in \u001b[0;36maot_dispatch_autograd\u001b[0;34m(flat_fn, flat_args, aot_config, fw_metadata)\u001b[0m\n\u001b[1;32m 292\u001b[0m tracing_context\u001b[38;5;241m.\u001b[39mfw_metadata \u001b[38;5;241m=\u001b[39m inner_meta\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TracingContext\u001b[38;5;241m.\u001b[39mreport_output_strides() \u001b[38;5;28;01mas\u001b[39;00m fwd_output_strides:\n\u001b[0;32m--> 295\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m \u001b[43maot_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfw_compiler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfw_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madjusted_flat_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(compiled_fw_func, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_boxed_call\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 297\u001b[0m compiled_fw_func \u001b[38;5;241m=\u001b[39m make_boxed_func(compiled_fw_func)\n", - "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:1100\u001b[0m, in \u001b[0;36mcompile_fx..fw_compiler_base\u001b[0;34m(model, example_inputs, is_inference)\u001b[0m\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m orig_output_end_idx \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m num_model_outputs\n\u001b[1;32m 1094\u001b[0m user_visible_outputs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 1095\u001b[0m n\u001b[38;5;241m.\u001b[39mname\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m n \u001b[38;5;129;01min\u001b[39;00m model_outputs[original_output_start_index:orig_output_end_idx]\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(n, torch\u001b[38;5;241m.\u001b[39mfx\u001b[38;5;241m.\u001b[39mNode)\n\u001b[1;32m 1098\u001b[0m }\n\u001b[0;32m-> 1100\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_fixed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfixed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcudagraphs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcudagraphs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m \u001b[49m\u001b[43mgraph_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgraph_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_inference\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_inference\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43mboxed_forward_device_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforward_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_visible_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_visible_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py:83\u001b[0m, in \u001b[0;36mwrap_compiler_debug..debug_wrapper\u001b[0;34m(gm, example_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdynamo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 81\u001b[0m \u001b[38;5;66;03m# Call the compiler_fn - which is either aot_autograd or inductor\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;66;03m# with fake inputs\u001b[39;00m\n\u001b[0;32m---> 83\u001b[0m inner_compiled_fn \u001b[38;5;241m=\u001b[39m \u001b[43mcompiler_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 85\u001b[0m \u001b[38;5;66;03m# TODO: Failures here are troublesome because no real inputs,\u001b[39;00m\n\u001b[1;32m 86\u001b[0m \u001b[38;5;66;03m# need a different serialization strategy\u001b[39;00m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39mrepro_after \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maot\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py:305\u001b[0m, in \u001b[0;36mDebugContext.wrap..inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(fn)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m DebugContext():\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36mContextDecorator.__call__..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minner\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:320\u001b[0m, in \u001b[0;36mcompile_fx_inner\u001b[0;34m(gm, 
example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, boxed_forward_device_index, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 316\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m FxGraphCache\u001b[38;5;241m.\u001b[39mload(\n\u001b[1;32m 317\u001b[0m fx_codegen_and_compile, gm, example_inputs, graph_kwargs\n\u001b[1;32m 318\u001b[0m )\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m compiled_graph \u001b[38;5;241m=\u001b[39m \u001b[43mfx_codegen_and_compile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 321\u001b[0m \u001b[43m \u001b[49m\u001b[43mgm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgraph_kwargs\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[arg-type]\u001b[39;49;00m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 324\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFX codegen and compilation took \u001b[39m\u001b[38;5;132;01m%.3f\u001b[39;00m\u001b[38;5;124ms\u001b[39m\u001b[38;5;124m\"\u001b[39m, time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# Return the output strides to the caller via TracingContext\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:535\u001b[0m, in \u001b[0;36mfx_codegen_and_compile\u001b[0;34m(gm, example_inputs, cudagraphs, num_fixed, is_backward, graph_id, cpp_wrapper, aot_mode, is_inference, user_visible_outputs, layout_opt, extern_node_serializer)\u001b[0m\n\u001b[1;32m 519\u001b[0m graph \u001b[38;5;241m=\u001b[39m GraphLowering(\n\u001b[1;32m 520\u001b[0m gm,\n\u001b[1;32m 521\u001b[0m \u001b[38;5;66;03m# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 532\u001b[0m is_inference\u001b[38;5;241m=\u001b[39mis_inference,\n\u001b[1;32m 533\u001b[0m )\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m V\u001b[38;5;241m.\u001b[39mset_graph_handler(graph):\n\u001b[0;32m--> 535\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample_inputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 536\u001b[0m output_strides: List[Optional[Tuple[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mgraph_outputs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;66;03m# We'll put the output strides in the compiled graph so we\u001b[39;00m\n\u001b[1;32m 539\u001b[0m \u001b[38;5;66;03m# can later return them to the caller via TracingContext\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py:244\u001b[0m, in \u001b[0;36mdynamo_timed..dynamo_timed_inner..time_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mrecord_function(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (dynamo_timed)\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 243\u001b[0m t0 \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m--> 244\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 245\u001b[0m time_spent \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m t0\n\u001b[1;32m 246\u001b[0m compilation_time_metrics[key]\u001b[38;5;241m.\u001b[39mappend(time_spent)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:519\u001b[0m, in \u001b[0;36mGraphLowering.run\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;129m@dynamo_timed\u001b[39m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/fx/interpreter.py:138\u001b[0m, in \u001b[0;36mInterpreter.run\u001b[0;34m(self, initial_env, enable_io_processing, *args)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 138\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menv[node] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_node\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextra_traceback:\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:814\u001b[0m, in \u001b[0;36mGraphLowering.run_node\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 812\u001b[0m debug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout_constraints\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 813\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m layout_constraints[n\u001b[38;5;241m.\u001b[39mtarget](n, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 814\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m 
is_magic_method(n.target):\n    816     # TODO: this is sus, it probably should be handled in the\n    817     # lowerings themselves similarly to sym_size/sym-stride\n    818     debug(\"is_magic_method\")\n",
- "File /opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py:691, in GraphLowering.call_function(self, target, args, kwargs)\n    689 try:\n    690     log.debug(\" via %s\", lowerings[target])\n--> 691     out = lowerings[target](*args, **kwargs)\n    692     return out\n    693 except Exception as e:\n",
- "File ~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_lowering.py:117, in convolution(x, weight, bias, stride, padding, dilation, transposed, output_padding, groups)\n    115 else:\n    116     mlir_template = MLIRConvTemplate([x, weight, bias], layout, **kwargs)\n--> 117 return mlir_template.generate().output_node()\n",
- "File ~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_template.py:1189, in MLIRTemplate.generate(self, **kwargs)\n   1184 with patch.object(V.graph, \"get_dtype\", self._fake_get_dtype(self.output_node)):\n   1185     kernel = MLIRTemplateKernel(kernel_name=kernel_name, input_nodes=self.input_nodes, call_size=self.layout.size, kernel_group=None,\n   1186         outer_func_name=self.function_name if hasattr(self, 'function_name') else None,\n   1187         outer_func_render=self.outer_func_render if hasattr(self, 'outer_func_render') else None,\n   1188         kernel_arg_attributes=self.get_arg_attributes() if hasattr(self, 'get_arg_attributes') else None)\n-> 1189 code = self.render(kernel=kernel, **kwargs)\n   1191 kernel_hash_name = f\"mlir_{self.name}_{next(self.index_counter)}\"\n   1192 extra_args = []\n",
- "File ~/workspace/PyTorchSim/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py:238, in MLIRConvSingleBatchTemplate.render(self, kernel, template_buffer_node, epilogue_nodes, tile_info, **kwargs)\n    229 kernel.epilogue_info = dict(\n    230     output_node = self.output_node.name,\n    231     sram_var = \"output_buffer\",\n    (...)\n    235     dim_aliasing = {\"index0\":\"c0\", \"index1\":\"tile_n\", \"index2\":\"o_h\", \"index3\":\"tile_m\"}\n    236 )\n    237 kernel.exception_nodes[\"X\"] = {\"numel\" : (I_W+2*PADDING_W)*(I_H+2*PADDING_H)*I_C*BATCH}\n--> 238 code = self._template_from_string(conv_template).render(**kernel.render_options)\n    239 kernel.add_loop_info([kernel.render_options[\"K_H\"], kernel.render_options[\"K_W\"], kernel.render_options[\"O_H\"], kernel.render_options[\"O_W\"], kernel.render_options[\"BATCH\"], kernel.render_options[\"O_C\"], kernel.render_options[\"I_C\"]], [kernel.render_options[\"TILE_M\"], kernel.render_options[\"TILE_N\"], kernel.render_options[\"TILE_K\"]])\n    240 return code\n",
- "File /opt/conda/lib/python3.10/site-packages/jinja2/environment.py:1299, in Template.render(self, *args, **kwargs)\n   1296 ctx = self.new_context(dict(*args, **kwargs))\n   1298 try:\n-> 1299 return self.environment.concat(self.root_render_func(ctx))  # type: ignore\n   1300 except Exception:\n   1301     self.environment.handle_exception()\n",
- "File 