The config being used is this:

```
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 89
# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 248698
-gpgpu_TB_launch_latency 0
# Compute Capability
-gpgpu_compute_capability_major 8
-gpgpu_compute_capability_minor 9
# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0
# high level architecture configuration
-gpgpu_n_clusters 128
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 16
-gpgpu_n_sub_partition_per_mchannel 2
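# => 16 memory controllers x 2 sub-partitions each = 32 memory sub-partitions in total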
# clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 2235:2235:2235:5250.5
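# 2235 MHz -> 1/2.235 GHz ~= 0.4474 ns period; 5250.5 MHz -> ~= 0.1905 ns (matches the clock periods the simulator prints at startup, see the log below)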
# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 89
-gpgpu_shader_core_pipeline 1536:32
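# 1536 threads per core / 32-wide warps = 48 warps per SM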
-gpgpu_shader_cta 32
-gpgpu_simd_model 1
# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on the SFU unit
-ptx_opcode_latency_int 4,4,4,4,21
-ptx_opcode_initiation_int 2,2,2,2,2
-ptx_opcode_latency_fp 4,4,4,4,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 54,54,54,54,330
-ptx_opcode_initiation_dp 64,64,64,64,130
-ptx_opcode_latency_sfu 23
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64
-ptx_opcode_initiation_tensor 64
# sub core model: each scheduler has its own register file and EUs,
# i.e., schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# register banks
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2
# warp scheduling
-gpgpu_num_sched_per_core 4
-gpgpu_scheduler gto
# warp scheduler issue mode
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1
## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# In adaptive cache mode, the unused shared-memory capacity is reassigned to the L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,100
-gpgpu_unified_l1d_size 128
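# shmem carveout candidates in KB; under the adaptive scheme above, unified capacity not used for shared memory backs L1D (128 KB total)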
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:384:48,16:0,32
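# per the format above: S (sectored), 4 sets x 128 B lines x 64-way = 32 KB baseline L1D, grown adaptively toward the 128 KB unified size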
-gpgpu_l1_latency 44
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
# shared memory configuration
-gpgpu_shmem_size 102400
-gpgpu_shmem_sizeDefault 102400
-gpgpu_shmem_per_block 49152
-gpgpu_smem_latency 30
# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 89
# L2 cache
-gpgpu_cache:dl2 S:512:128:24,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2
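# policy 2 appears to select the IPOLY index function named in the assertion below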
# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1
# interconnection
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1
# memory partition latency config
-gpgpu_l2_rop_latency 247
-dram_latency 330
# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
# dram model config
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 16
-dram_data_command_freq_ratio 4
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
# Mem timing
-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=18:RCD=36:RAS=82:RP=36:RC=117:CL=36:WL=12:CDLR=15:WR=36:nbkgrp=4:CCDL=9:RTPL=6
-dram_dual_bus_interface 0
# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1
#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32
# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0
# power model configs, disabled until we create a real energy model
-power_simulation_enabled 0
# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0
```
And the error I keep running into is this:

```
**********************************************************
**********************************************************
l1_bw_32f_unroll_large-NO_ARGS--RTX4090-SASS-LINEAR-RR-32B-FCFS. Status=ASSERT, FUNC_TEST_FAILED, ABORTED
Last 10 line of /home/cab-prj/Prabin/december/accel-sim-framework/util/job_launching/../../sim_run_12.4/l1_bw_32f_unroll_large/NO_ARGS/RTX4090-SASS-LINEAR-RR-32B-FCFS/l1_bw_32f_unroll_large-NO_ARGS.accelsim-commit-d7f397a_modified_1.0_25-12-12-10-50-26gpgpu-sim_git-commit-b18ee397_modified_2.0..o34
------------------
launching memcpy command : MemcpyHtoD,0x00007f4e8d602000,528384
GPGPU-Sim uArch: performance model initialization complete.
*** Initializing Memory Statistics ***
GPGPU-Sim uArch: clock periods: 0.00000000044742729306:0.00000000044742729306:0.00000000044742729306:0.00000000019045805161
GPGPU-Sim uArch: clock freqs: 2235000000.000000:2235000000.000000:2235000000.000000:5250500000.000000
sub_partition_id_mask = 0000000000000800
addr_dec_mask[BURST] = 000000000000001f high:5 low:0
addr_dec_mask[COL] = 000000000000f61f high:16 low:0
addr_dec_mask[ROW] = 00000000fff80000 high:32 low:19
addr_dec_mask[BK] = 0000000000070800 high:19 low:11
------------------
Contents of /home/cab-prj/Prabin/december/accel-sim-framework/util/job_launching/../../sim_run_12.4/l1_bw_32f_unroll_large/NO_ARGS/RTX4090-SASS-LINEAR-RR-32B-FCFS/l1_bw_32f_unroll_large-NO_ARGS.accelsim-commit-d7f397a_modified_1.0_25-12-12-10-50-26gpgpu-sim_git-commit-b18ee397_modified_2.0..e34
------------------
accel-sim.out: /home/cab-prj/Prabin/december/accel-sim-framework/gpu-simulator/gpgpu-sim/src/gpgpu-sim/hashing.cc:88: unsigned int ipoly_hash_function(new_addr_type, unsigned int, unsigned int): Assertion `"\nmemory_partition_indexing error: The number of " "channels should be " "16, 32 or 64 for the hashing IPOLY index function. other banks " "numbers are not supported. Generate it by yourself! \n" && 0' failed.
/home/cab-prj/Prabin/december/accel-sim-framework/util/job_launching/../../sim_run_12.4/l1_bw_32f_unroll_large/NO_ARGS/RTX4090-SASS-LINEAR-RR-32B-FCFS/slurm.sim: line 54: 16238 Aborted (core dumped) /home/cab-prj/Prabin/december/accel-sim-framework/util/job_launching/../../sim_run_12.4/gpgpu-sim-builds/accelsim-commit-d7f397a_modified_1.0_25-12-12-10-50-26gpgpu-sim_git-commit-b18ee397_modified_2.0./accel-sim.out -config ./gpgpusim.config -trace ./traces/kernelslist.g
------------------
**********************************************************
```
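For what it's worth, here is my reading of the failing check, reconstructed purely from the assertion text above (the function name and the supported channel counts come from the error message; the surrounding code is a sketch, not the actual hashing.cc):

```cpp
#include <cassert>

// Sketch of the guard that appears to fire in hashing.cc, reconstructed from
// the assertion text. The real ipoly_hash_function computes an IPOLY hash of
// the address; this stub only mirrors the supported-channel-count check.
unsigned ipoly_guard(unsigned n_channels) {
  assert((n_channels == 16 || n_channels == 32 || n_channels == 64) &&
         "IPOLY index function only supports 16, 32 or 64 channels");
  return n_channels;
}

int main() {
  // From the config: -gpgpu_n_mem 16 and -gpgpu_n_sub_partition_per_mchannel 2,
  // i.e. 16 x 2 = 32 memory sub-partitions, which is nominally inside the
  // supported set {16, 32, 64}.
  unsigned n_mem = 16, sub_per_channel = 2;
  ipoly_guard(n_mem);                    // 16 -> would pass
  ipoly_guard(n_mem * sub_per_channel);  // 32 -> would pass
  return 0;
}
```

Both of the obvious counts (16 controllers, 32 sub-partitions) are in the supported set, so the value that actually reaches ipoly_hash_function at runtime must be something else, unless I am misreading the assertion.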
Is there any solution to this?