Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/NeuraDialect/Mapping/MappingState.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ struct MappingLoc {
}

bool operator<(const MappingLoc &other) const {
if (!resource || !other.resource) {
return resource < other.resource;
}
if (resource->getKind() != other.resource->getKind()) {
return resource->getKind() < other.resource->getKind();
}
Expand Down
9 changes: 8 additions & 1 deletion include/NeuraDialect/Mapping/mapping_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ getOpsInAlapLevels(const std::vector<Operation *> &sorted_ops,
const std::set<Operation *> &critical_ops);

// Flattens the level buckets into a vector of pairs (operation, level).
// Within each ALAP level, critical ops are prioritized before non-critical ops.
std::vector<std::pair<Operation *, int>> flatten_level_buckets(
const std::vector<std::vector<Operation *>> &level_buckets);
const std::vector<std::vector<Operation *>> &level_buckets,
const std::set<Operation *> &critical_ops);

// Gets the physical hops from the producers to the tile, which is used for
// estimating the award of a location for placement.
Expand Down Expand Up @@ -80,6 +82,11 @@ bool tryRouteBackwardMove(Operation *mov_op, MappingLoc src_loc,
// ctrl_mov users found.
llvm::SmallVector<Operation *> getCtrlMovUsers(Operation *op);

// Identifies operations on the critical path (i.e., operations with zero slack).
// `sorted_ops` is the list of operations to analyze (presumably in topological
// order, as the name suggests — TODO confirm against the caller).
// Returns a pair of:
//   - first:  the set of critical operations (critical_ops_set);
//   - second: a map from operation to its integer ASAP level (asap_level_map).
std::pair<std::set<Operation *>, llvm::DenseMap<Operation *, int>>
identifyCriticalPathOps(const std::vector<Operation *> &sorted_ops);

// Maps a materialized operation to the accelerator, and routes the dataflow
// from the producers to the given op.
bool placeAndRoute(Operation *op, const MappingLoc &target_loc,
Expand Down
101 changes: 94 additions & 7 deletions lib/NeuraDialect/Mapping/mapping_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ static const int AWARD_BACKWARD_PROXIMITY_SCALE = 1;
static const int AWARD_BASE_MULTIPLIER = 1;
static const int AWARD_CRITICAL_BONUS_DIV = 1;

// Congestion penalty coefficients (tunable).
// calculateAward picks one coefficient per op, scales it by the squared
// occupancy ratio of the candidate tile's in-links and of its out-links, and
// subtracts the sum from the candidate location's award.
static const int STRONG_CONGESTION_PENALTY = 60; // used for high fan-in ops (>=3 producers)
static const int WEAK_CONGESTION_PENALTY = 15; // used for low fan-in ops (<3 producers)

namespace mlir {
namespace neura {
OperationKind getOperationKindFromMlirOp(Operation *op) {
Expand Down Expand Up @@ -394,12 +398,53 @@ mlir::neura::getOpsInAlapLevels(const std::vector<Operation *> &sorted_ops,
}

std::vector<std::pair<Operation *, int>> mlir::neura::flatten_level_buckets(
const std::vector<std::vector<Operation *>> &level_buckets) {
const std::vector<std::vector<Operation *>> &level_buckets,
const std::set<Operation *> &critical_ops) {
std::vector<std::pair<Operation *, int>> result;

for (int level = 0; level < static_cast<int>(level_buckets.size()); ++level) {
for (Operation *op : level_buckets[level]) {
result.emplace_back(op, level);
// Collects ops with their current index to ensure stable sorting.
std::vector<std::pair<Operation *, int>> ops_with_index;
for (int i = 0; i < (int)level_buckets[level].size(); ++i) {
ops_with_index.push_back({level_buckets[level][i], i});
}

// Sorts with criticality as PRIMARY criterion within the same ALAP level.
// This addresses tancheng's feedback: critical ops should map before
// high-degree non-critical ops in the same level.
std::sort(ops_with_index.begin(), ops_with_index.end(),
[&critical_ops](const std::pair<Operation *, int> &a_pair,
const std::pair<Operation *, int> &b_pair) {
Operation *a = a_pair.first;
Operation *b = b_pair.first;

bool a_is_critical = critical_ops.count(a) > 0;
bool b_is_critical = critical_ops.count(b) > 0;

// Priority 1: Critical ops come first (within same ALAP level).
if (a_is_critical != b_is_critical)
return a_is_critical > b_is_critical;

// Priority 2: Degree (connectivity) - higher degree first.
int degree_a = a->getNumOperands();
int degree_b = b->getNumOperands();
for (Value res : a->getResults()) {
degree_a += std::distance(res.getUsers().begin(),
res.getUsers().end());
}
for (Value res : b->getResults()) {
degree_b += std::distance(res.getUsers().begin(),
res.getUsers().end());
}
if (degree_a != degree_b)
return degree_a > degree_b;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we also prioritize the ops on the critical path, even if they have a lower degree?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Operations on the critical path already receive the highest priority. In getOpsInAlapLevels, we first compute the ALAP schedule but then override the levels for all critical_ops using an ASAP schedule. Within the same level bucket, we chose "degree" as the primary sorting criterion to address the routing congestion issues.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, if one level bucket contains both a critical operation (C) and a non-critical operation (N), and N has a higher degree than C, then we prioritize N, correct? But shouldn't we map C first?

This comment was marked as resolved.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean you implemented what I suggested, which leads to worse II?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the problem. The previous mapping was unstable, so the reduced II was a coincidence. I am working on a fix.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done~


// Priority 3: Original index (stability tie-breaker).
return a_pair.second < b_pair.second;
});

for (const auto &p : ops_with_index) {
result.emplace_back(p.first, level);
}
}

Expand Down Expand Up @@ -628,7 +673,7 @@ bool mlir::neura::tryRouteDataMove(Operation *mov_op, MappingLoc src_loc,
continue;
}

// Explores two routing options from current tile:
int next_time = current_state.current_time + 1;

// Option 1: Moves to adjacent tile through link.
for (Link *out_link : current_state.current_tile->getOutLinks()) {
Expand Down Expand Up @@ -904,6 +949,10 @@ mlir::neura::calculateAward(Operation *op, std::set<Operation *> &critical_ops,
std::vector<MappingLoc> backward_users_locs;
for (Operation *user : backward_users) {
std::vector<MappingLoc> user_locs = mapping_state.getAllLocsOfOp(user);
if (user_locs.empty()) {
// llvm::errs() << "[Warning] No locations found for backward user " << *user << "\n";
continue;
}
assert(!user_locs.empty() && "No locations found for backward user");

MappingLoc backward_user_loc = user_locs.back();
Expand Down Expand Up @@ -972,7 +1021,39 @@ mlir::neura::calculateAward(Operation *op, std::set<Operation *> &critical_ops,
if (meet_producer_constraint && meet_backward_user_constraint) {
// Earlier time steps get higher scores.
int time_bonus = latest_end_time_step - t;
int total_award = tile_award + time_bonus;

// === Balanced Link congestion penalty ===
// A conservative penalty to guide the mapper away from hotspots
// without being too restrictive for small IIs.
int total_in = tile->getInLinks().size();
int total_out = tile->getOutLinks().size();
int occupied_in = 0;
int occupied_out = 0;

for (auto *link : tile->getInLinks()) {
if (!mapping_state.isAvailableAcrossTime({link, t}))
occupied_in++;
}
for (auto *link : tile->getOutLinks()) {
if (!mapping_state.isAvailableAcrossTime({link, t}))
occupied_out++;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

}

float in_ratio = (total_in > 0) ? (float)occupied_in / total_in : 0;
float out_ratio = (total_out > 0) ? (float)occupied_out / total_out : 0;

// Adaptive penalty strategy:
// - Use very strong penalty (60) only for high fan-in ops (>= 3 producers)
// - Use weak penalty (15) for low fan-in ops
// This optimizes fuse-pattern (II=11 target) without breaking iter-merge
int base_penalty_coeff = (producers.size() >= 3)
? STRONG_CONGESTION_PENALTY
: WEAK_CONGESTION_PENALTY;

int congestion_penalty = static_cast<int>(in_ratio * in_ratio * base_penalty_coeff) +
static_cast<int>(out_ratio * out_ratio * base_penalty_coeff);

int total_award = tile_award + time_bonus - congestion_penalty;
updateAward(locs_with_award, tile_loc_candidate, total_award);
}
}
Expand All @@ -983,11 +1064,17 @@ mlir::neura::calculateAward(Operation *op, std::set<Operation *> &critical_ops,
std::vector<std::pair<MappingLoc, int>> locs_award_vec(
locs_with_award.begin(), locs_with_award.end());

// Sorts by award (descending).
// Sorts by award (descending). Use stable sort/tie-breaker logic
// to minimize noise in mapping results.
std::sort(
locs_award_vec.begin(), locs_award_vec.end(),
[](const std::pair<MappingLoc, int> &a,
const std::pair<MappingLoc, int> &b) { return a.second > b.second; });
const std::pair<MappingLoc, int> &b) {
if (a.second != b.second)
return a.second > b.second;
// Tie-breaker: earlier time step first.
return a.first.time_step < b.first.time_step;
});
// TODO: Needs to handle tie case and prioritize lower resource utilization,
// however, compiled II becomes worse after adding this tie-breaker:
// https://github.com/coredac/dataflow/issues/59.
Expand Down
2 changes: 1 addition & 1 deletion lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ struct MapToAcceleratorPass
}
}
std::vector<std::pair<Operation *, int>> sorted_ops_with_alap_levels =
flatten_level_buckets(level_buckets);
flatten_level_buckets(level_buckets, critical_ops);
for (const auto &[op, level] : sorted_ops_with_alap_levels) {
llvm::outs() << "[MapToAcceleratorPass] ALAP sorted op: " << *op
<< " (ALAP level: " << level << ")\n";
Expand Down
2 changes: 1 addition & 1 deletion test/c2llvm2mlir/nested_loop/test.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@
// CHECK-LLVM2NEURA: %188 = neura.sext %187 : !neura.data<i32, i1> -> !neura.data<i64, i1>
// CHECK-LLVM2NEURA: %207 = "neura.mul"(%205, %206) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>

// CHECK-LLVM2NEURA-MAP: func.func @_Z6kernelPiS_S_(%arg0: !llvm.ptr {llvm.noundef}, %arg1: !llvm.ptr {llvm.noundef}, %arg2: !llvm.ptr {llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", frame_pointer = #llvm.framePointerKind<all>, linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 13 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 9 : i32, res_mii = 6 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 0 : i64, visibility_ = 0 : i64} {
// CHECK-LLVM2NEURA-MAP: func.func @_Z6kernelPiS_S_(%arg0: !llvm.ptr {llvm.noundef}, %arg1: !llvm.ptr {llvm.noundef}, %arg2: !llvm.ptr {llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", frame_pointer = #llvm.framePointerKind<all>, linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 11 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 9 : i32, res_mii = 6 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 0 : i64, visibility_ = 0 : i64} {
Loading