Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions include/core/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
#include <map>
#include <unordered_set>

namespace infini {
namespace infini
{
class Allocator
{
private:
Expand All @@ -23,10 +24,9 @@ namespace infini {
// pointer to the memory actually allocated
void *ptr;

// =================================== 作业 ===================================
// TODO:可能需要设计一个数据结构来存储free block,以便于管理和合并
// HINT: 可以使用一个 map 来存储 free block,key 为 block 的起始/结尾地址,value 为 block 的大小
// =================================== 作业 ===================================
std::map<size_t, size_t> freeBlocks;
void addFreeBlock(size_t addr, size_t size);
std::map<size_t, size_t>::iterator findFreeBlock(size_t size);

public:
Allocator(Runtime runtime);
Expand Down
92 changes: 87 additions & 5 deletions src/core/allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,104 @@ namespace infini
IT_ASSERT(this->ptr == nullptr);
// pad the size to the multiple of alignment
size = this->getAlignedSize(size);

// =================================== 作业 ===================================
// TODO: 设计一个算法来分配内存,返回起始地址偏移量
// =================================== 作业 ===================================

return 0;
auto it = findFreeBlock(size);
if (it != freeBlocks.end())
{
const size_t addr = it->first;
const size_t blockSize = it->second;
IT_ASSERT(blockSize >= size);
freeBlocks.erase(it);

const size_t remain = blockSize - size;
if (remain > 0)
{
freeBlocks.emplace(addr + size, remain);
}

used += size;
return addr;
}

const size_t addr = peak;
peak += size;
used += size;
return addr;
}

void Allocator::free(size_t addr, size_t size)
{
    // Freeing is a planning-time operation: it must happen before the
    // backing buffer has actually been materialized.
    IT_ASSERT(this->ptr == nullptr);

    // Round up to the allocator's alignment so the bookkeeping matches
    // what alloc() recorded for this block.
    const size_t aligned = this->getAlignedSize(size);
    IT_ASSERT(used >= aligned);
    used -= aligned;

    // Hand the region back to the free list; addFreeBlock coalesces it
    // with any adjacent free neighbors.
    addFreeBlock(addr, aligned);
}

std::map<size_t, size_t>::iterator Allocator::findFreeBlock(size_t size)
{
    // First-fit policy: walk the address-ordered free list and return the
    // first block whose size is sufficient; freeBlocks.end() if none is.
    // Linear scan is O(n) in the number of free blocks, which is fine for
    // the small tensor counts of typical computation graphs.
    auto it = freeBlocks.begin();
    while (it != freeBlocks.end() && it->second < size)
        ++it;
    return it;
}

void Allocator::addFreeBlock(size_t addr, size_t size)
{
    // Insert a free block [addr, addr+size) and coalesce it with any
    // directly adjacent free neighbors so the free list never holds two
    // touching blocks.

    // Merge with the block immediately to the left, if contiguous.
    auto right = freeBlocks.lower_bound(addr);
    if (right != freeBlocks.begin())
    {
        auto left = std::prev(right);
        if (left->first + left->second == addr)
        {
            // Grow the pending block leftwards and drop the old entry.
            addr = left->first;
            size += left->second;
            freeBlocks.erase(left);
        }
    }

    // Merge with the block immediately to the right, if contiguous.
    // Re-seek: addr may have moved after the left merge.
    right = freeBlocks.lower_bound(addr);
    if (right != freeBlocks.end() && addr + size == right->first)
    {
        size += right->second;
        freeBlocks.erase(right);
    }

    freeBlocks.emplace(addr, size);

    // If a free block now ends exactly at the heap top (addr+size == peak),
    // shrink peak past it; repeat in case the new top is also free.
    for (;;)
    {
        auto top = freeBlocks.upper_bound(peak);
        if (top == freeBlocks.begin())
            break; // no block below peak (also covers an empty free list)
        --top;
        if (top->first + top->second != peak)
            break; // highest block does not reach the top: stop shrinking
        peak = top->first;
        freeBlocks.erase(top);
    }
}

void *Allocator::getPtr()
Expand Down
228 changes: 223 additions & 5 deletions src/core/graph.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "core/graph.h"
#include "core/blob.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include <algorithm>
#include <numeric>
#include <queue>
Expand Down Expand Up @@ -106,6 +109,166 @@ namespace infini
// 1. 去除冗余的算子(例如,两个相邻的算子都是 transpose 算子,且做的是相反的操作,可以将其全部删除)
// 2. 合并算子(例如,矩阵乘算子中含有属性transA、transB,如果其输入存在transpose,且对最后两个维度做交换,就可以将transpose融入到矩阵乘算子的属性中去)
// =================================== 作业 ===================================

IT_ASSERT(topo_sort() == true);

auto isSwapLast2Permute = [](const std::vector<int> &perm) -> bool
{
const int r = static_cast<int>(perm.size());
if (r < 2)
return false;
for (int i = 0; i < r - 2; ++i)
if (perm[i] != i)
return false;
return perm[r - 2] == r - 1 && perm[r - 1] == r - 2;
};

auto isInversePermute = [](const std::vector<int> &p1,
const std::vector<int> &p2) -> bool
{
if (p1.size() != p2.size())
return false;
const int r = static_cast<int>(p1.size());
std::vector<int> inv(r, -1);
for (int i = 0; i < r; ++i)
{
const int v = p1[i];
if (v < 0 || v >= r || inv[v] != -1)
return false;
inv[v] = i;
}
return inv == p2;
};

bool changed = true;
while (changed)
{
changed = false;

// 规则 1:消除连续 transpose(perm 互逆)
for (size_t i = 0; i < ops.size(); ++i)
{
auto op1 = ops[i];
if (op1->getOpType() != OpType::Transpose)
continue;
auto t1 = as<TransposeObj>(op1);
auto out1 = t1->getOutput();
auto targets = out1->getTargets();
if (targets.size() != 1)
continue;
auto op2 = targets[0];
if (!op2 || op2->getOpType() != OpType::Transpose)
continue;
auto t2 = as<TransposeObj>(op2);
if (t2->getInputs(0) != out1)
continue;
if (!isInversePermute(t1->getPermute(), t2->getPermute()))
continue;

auto in = t1->getInputs(0);
auto out2 = t2->getOutput();
for (auto &consumer : out2->getTargets())
consumer->replaceInput(out2, in);

ops.erase(std::remove(ops.begin(), ops.end(), op1), ops.end());
ops.erase(std::remove(ops.begin(), ops.end(), op2), ops.end());
changed = true;
break;
}
if (changed)
continue;

// 规则 2:将 transpose(交换最后两维) 融合到 matmul 的 transA/transB
std::unordered_set<OperatorObj *> toRemove;
for (auto &op : ops)
{
if (op->getOpType() != OpType::MatMul)
continue;
auto mm = as<MatmulObj>(op);
for (int inputIdx = 0; inputIdx < 2; ++inputIdx)
{
auto in = mm->getInputs(inputIdx);
auto src = in->getSource();
if (!src || src->getOpType() != OpType::Transpose)
continue;
auto tr = as<TransposeObj>(src);
if (tr->getOutput() != in)
continue;
if (!isSwapLast2Permute(tr->getPermute()))
continue;

auto trIn = tr->getInputs(0);
mm->replaceInput(in, trIn);
if (inputIdx == 0)
mm->setTransA(!mm->getTransA());
else
mm->setTransB(!mm->getTransB());
toRemove.insert(src.get());
changed = true;
Comment on lines +194 to +207
Copy link

Copilot AI Feb 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential issue: When a transpose operator is marked for removal and added to the toRemove set, the code doesn't verify that the transpose output is only consumed by this matmul operator. If the transpose output is used by multiple consumers, removing it would break other parts of the graph. Add a check to ensure tr->getOutput()->getTargets().size() == 1 before adding to toRemove, similar to the check in rule 1 at line 157.

Copilot uses AI. Check for mistakes.
}
}
if (!toRemove.empty())
{
ops.erase(std::remove_if(ops.begin(), ops.end(),
[&](const Operator &op)
{ return toRemove.count(op.get()) != 0; }),
ops.end());
}
}

// 清理不再被任何算子引用的张量
{
std::unordered_set<TensorObj *> referenced;
referenced.reserve(tensors.size());
for (auto &op : ops)
{
for (auto &t : op->getInputs())
referenced.insert(t.get());
for (auto &t : op->getOutputs())
referenced.insert(t.get());
}
TensorVec kept;
kept.reserve(tensors.size());
for (auto &t : tensors)
if (referenced.count(t.get()) != 0)
kept.emplace_back(t);
tensors = std::move(kept);
}

// 重新构建 pred/succ 与 tensor source/target
for (auto &t : tensors)
{
t->targets.clear();
t->source.reset();
}
for (auto &op : ops)
{
op->predecessors.clear();
op->successors.clear();
}
for (auto &op : ops)
{
for (auto &input : op->getInputs())
{
if (input)
{
input->addTarget(op);
if (auto pred = input->getSource())
{
pred->addSuccessors(op);
op->addPredecessors(pred);
}
}
}
for (auto &output : op->getOutputs())
{
if (output)
output->setSource(op);
}
}

sorted = false;
IT_ASSERT(topo_sort() == true);
}

Tensor GraphObj::getTensor(int fuid) const
Expand Down Expand Up @@ -148,10 +311,65 @@ namespace infini
// topological sorting first
IT_ASSERT(topo_sort() == true);

// =================================== 作业 ===================================
// TODO:利用 allocator 给计算图分配内存
// HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存
// =================================== 作业 ===================================
std::unordered_map<TensorObj *, int> remainingUses;
std::unordered_map<TensorObj *, size_t> bytes;
std::unordered_map<TensorObj *, size_t> offsets;
remainingUses.reserve(tensors.size());
bytes.reserve(tensors.size());
offsets.reserve(tensors.size());

std::unordered_set<TensorObj *> keepAlive;
keepAlive.reserve(tensors.size());
for (auto &t : tensors)
{
bytes[t.get()] = t->getBytes();
remainingUses[t.get()] = static_cast<int>(t->getTargets().size());
if (t->getTargets().empty())
keepAlive.insert(t.get());
}

auto ensureAlloc = [&](const Tensor &t)
{
auto *p = t.get();
if (offsets.find(p) == offsets.end())
offsets[p] = allocator.alloc(bytes[p]);
};

// 输入张量:dataMalloc 后会 setData
for (auto &t : getInputs())
ensureAlloc(t);

// 遍历 op:分配输出、回收“已完成最后一次使用”的输入
for (auto &op : ops)
{
for (auto &out : op->getOutputs())
ensureAlloc(out);

for (auto &in : op->getInputs())
{
auto *p = in.get();
auto it = remainingUses.find(p);
if (it == remainingUses.end())
continue;
if (it->second > 0)
--(it->second);
if (it->second == 0 && keepAlive.count(p) == 0)
{
auto offIt = offsets.find(p);
if (offIt != offsets.end())
allocator.free(offIt->second, bytes[p]);
}
}
}

void *base = allocator.getPtr();
for (auto &t : tensors)
{
auto it = offsets.find(t.get());
IT_ASSERT(it != offsets.end(), "Tensor not allocated in dataMalloc");
void *ptr = static_cast<void *>(static_cast<char *>(base) + it->second);
t->setDataBlob(make_ref<BlobObj>(runtime, ptr));
}

allocator.info();
}
Expand Down Expand Up @@ -227,4 +445,4 @@ namespace infini
return true;
}

} // namespace infini
} // namespace infini
Loading
Loading