Couldn't find allocation mapping for T8_l__bfloat[iblockIdx.x604{2048} ...] #3740

tfogal · 2025-01-21T19:21:37Z

# CUDA devices:
#  0: NVIDIA RTX 6000 Ada Generation
# torch version: 2.6.0a0+[redacted]
# cuda version: [redacted]
# nvfuser version: 0.2.24+git5e08e1d
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id4(fd : FusionDefinition) -> None :
    """Populate ``fd`` with the fusion used to reproduce the reported failure.

    The definition declares six bfloat16 tensor inputs and two integer scalar
    inputs, builds a chain of cast/slice/cat/broadcast/reshape ops on them,
    and registers five outputs. The statement order is kept exactly as
    generated; do not reorder it, since the repro is tied to this sequence.

    Args:
        fd: The ``FusionDefinition`` being built; mutated in place.

    Returns:
        None. Outputs are registered through ``fd.add_output``.
    """
    # --- Fusion inputs -----------------------------------------------------
    T0 = fd.define_tensor(shape=[4096, 512], contiguity=[True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[1, 0])
    # S1 and S2 are runtime integer scalars; they are consumed below as the
    # RNG offset (S1) and RNG seed (S2) of fd.ops.uniform.
    S1 = fd.define_scalar(None, dtype=DataType.Int)
    S2 = fd.define_scalar(None, dtype=DataType.Int)
    # T3 (4 on axis 1) and T6 (28 on axis 1) go through the same
    # slice/neg/cat pattern further down; T4 and T5 are the tensors they are
    # multiplied with after broadcasting.
    T3 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 1, 2, 0])
    T4 = fd.define_tensor(shape=[1, 4096, 128], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 0, 1])
    T5 = fd.define_tensor(shape=[1, 4096, 128], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 0, 1])
    T6 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 1, 2, 0])
    T7 = fd.define_tensor(shape=[1, 4096, 512], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # --- Branch on T0: reshape to [1, 4096, 512] and promote to float ------
    T12 = fd.ops.reshape(T0, new_shape=[1, 4096, 512])
    T13 = fd.ops.cast(T12, dtype=DataType.Float)
    # Random bfloat16 tensor of shape [1, 4096, 512] drawn with bounds 0.0 / 1.0.
    S14 = fd.define_scalar(0.00000, dtype=DataType.Double)
    S15 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S16 = fd.define_scalar(1, dtype=DataType.Int)
    S17 = fd.define_scalar(4096, dtype=DataType.Int)
    S18 = fd.define_scalar(512, dtype=DataType.Int)
    T20 = fd.ops.uniform(S14, S15, shape=[S16, S17, S18], rng_seed=S2, rng_offset=S1, dtype=DataType.BFloat16)
    # T22 = T13 * 4.0
    S21 = fd.define_scalar(4.00000, dtype=DataType.Double)
    T22 = fd.ops.mul(T13, S21)
    # T25 is a float 0/1 mask: random value < 0.9.
    # NOTE(review): mask + the 1.11111 scale below looks like dropout with
    # keep probability 0.9 (1 / 0.9 ~= 1.11111) -- confirm against the model.
    S23 = fd.define_scalar(0.900000, dtype=DataType.Double)
    T24 = fd.ops.lt(T20, S23)
    T25 = fd.ops.cast(T24, dtype=DataType.Float)
    # Upper half of T3's last axis ([64:128]), negated below.
    T41 = fd.ops.slice(T3, start_indices=[0, 0, 0, 64], end_indices=[1, 4, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    # Apply the mask to the scaled T0 branch.
    T42 = fd.ops.mul(T22, T25)
    T43 = fd.ops.cast(T41, dtype=DataType.Float)
    T44 = fd.ops.neg(T43)
    # Insert a size-1 axis at position 1 of T4: [1, 4096, 128] -> [1, 1, 4096, 128].
    T50 = fd.ops.broadcast_in_dim(T4, shape=[1, 1, 4096, 128], broadcast_dims=[0, 2, 3])
    # Lower half of T3's last axis ([0:64]).
    T66 = fd.ops.slice(T3, start_indices=[0, 0, 0, 0], end_indices=[1, 4, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T67 = fd.ops.cast(T44, dtype=DataType.BFloat16)
    # Same axis insertion for T5.
    T73 = fd.ops.broadcast_in_dim(T5, shape=[1, 1, 4096, 128], broadcast_dims=[0, 2, 3])
    # Upper half of T6's last axis ([64:128]); mirrors T41 for the 28-wide input.
    T89 = fd.ops.slice(T6, start_indices=[0, 0, 0, 64], end_indices=[1, 28, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    # Rescale the masked T0 branch by 1.11111.
    S90 = fd.define_scalar(1.11111, dtype=DataType.Double)
    T91 = fd.ops.mul(T42, S90)
    # Expand the T4 broadcast to T3's shape.
    T97 = fd.ops.broadcast_in_dim(T50, shape=[1, 4, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    # cat([-T3[..., 64:], T3[..., :64]]) along the last axis -> [1, 4, 4096, 128].
    # NOTE(review): this matches the "rotate half" step of rotary position
    # embeddings -- presumably T4/T5 are sin/cos tables; confirm.
    T98 = fd.ops.cat([T67, T66], dim=-1, manual_padding=0)
    # Expand the T5 broadcast to T3's shape.
    T104 = fd.ops.broadcast_in_dim(T73, shape=[1, 4, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    # Promote operands to float before the multiplies/adds.
    T105 = fd.ops.cast(T89, dtype=DataType.Float)
    T106 = fd.ops.cast(T97, dtype=DataType.Float)
    T107 = fd.ops.cast(T98, dtype=DataType.Float)
    T108 = fd.ops.cast(T104, dtype=DataType.Float)
    T109 = fd.ops.cast(T3, dtype=DataType.Float)
    T110 = fd.ops.neg(T105)
    T111 = fd.ops.cast(T7, dtype=DataType.Float)
    # T112 = rotated(T3) * broadcast(T4); T113 = T3 * broadcast(T5).
    T112 = fd.ops.mul(T107, T106)
    T113 = fd.ops.mul(T109, T108)
    # Lower half of T6's last axis ([0:64]); mirrors T66.
    T129 = fd.ops.slice(T6, start_indices=[0, 0, 0, 0], end_indices=[1, 28, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T130 = fd.ops.cast(T110, dtype=DataType.BFloat16)
    # T131 = T7 + masked/rescaled T0 branch.
    T131 = fd.ops.add(T111, T91)
    # Expand the T4 broadcast to T6's shape.
    T137 = fd.ops.broadcast_in_dim(T50, shape=[1, 28, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    # cat([-T6[..., 64:], T6[..., :64]]) along the last axis -> [1, 28, 4096, 128].
    T138 = fd.ops.cat([T130, T129], dim=-1, manual_padding=0)
    # Expand the T5 broadcast to T6's shape.
    T144 = fd.ops.broadcast_in_dim(T73, shape=[1, 28, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    T145 = fd.ops.cast(T131, dtype=DataType.BFloat16)
    T146 = fd.ops.cast(T137, dtype=DataType.Float)
    T147 = fd.ops.cast(T138, dtype=DataType.Float)
    T148 = fd.ops.cast(T144, dtype=DataType.Float)
    T149 = fd.ops.cast(T6, dtype=DataType.Float)
    # Split the 512-wide axis of T145 into (4, 128): [1, 4096, 512] -> [1, 4096, 4, 128].
    T155 = fd.ops.reshape(T145, new_shape=[1, 4096, 4, 128])
    # T156 = T3 * broadcast(T5) + rotated(T3) * broadcast(T4).
    T156 = fd.ops.add(T113, T112)
    # Same two products for the T6 input.
    T157 = fd.ops.mul(T147, T146)
    T158 = fd.ops.mul(T149, T148)
    # Swap axes 1 and 2: [1, 4096, 4, 128] -> [1, 4, 4096, 128].
    T159 = fd.ops.permute(T155, dims=[0, 2, 1, 3])
    T160 = fd.ops.cast(T156, dtype=DataType.BFloat16)
    # Insert a size-1 axis at position 2, expand it to 7, then (below) merge
    # axes 1 and 2 so 4 * 7 = 28.
    # NOTE(review): presumably repeating 4 key/value heads to match 28 query
    # heads (grouped-query attention) -- confirm.
    T167 = fd.ops.broadcast_in_dim(T159, shape=[1, 4, 1, 4096, 128], broadcast_dims=[0, 1, 3, 4])
    T174 = fd.ops.broadcast_in_dim(T160, shape=[1, 4, 1, 4096, 128], broadcast_dims=[0, 1, 3, 4])
    T181 = fd.ops.broadcast_in_dim(T167, shape=[1, 4, 7, 4096, 128], broadcast_dims=[0, 1, 2, 3, 4])
    T188 = fd.ops.broadcast_in_dim(T174, shape=[1, 4, 7, 4096, 128], broadcast_dims=[0, 1, 2, 3, 4])
    # T189 = T6 * broadcast(T5) + rotated(T6) * broadcast(T4).
    T189 = fd.ops.add(T158, T157)
    T195 = fd.ops.reshape(T181, new_shape=[1, 28, 4096, 128])
    T201 = fd.ops.reshape(T188, new_shape=[1, 28, 4096, 128])
    T202 = fd.ops.cast(T189, dtype=DataType.BFloat16)
    # Apply stride_order=[3, 2, 1, 0] to the three [1, 28, 4096, 128] results.
    T203 = fd.ops.stride_order(T195, stride_order=[3, 2, 1, 0])
    T204 = fd.ops.stride_order(T201, stride_order=[3, 2, 1, 0])
    T205 = fd.ops.stride_order(T202, stride_order=[3, 2, 1, 0])
    # --- Fusion outputs ----------------------------------------------------
    fd.add_output(T159)
    fd.add_output(T160)
    fd.add_output(T203)
    fd.add_output(T204)
    fd.add_output(T205)

# Build the fusion by running the definition function inside the
# FusionDefinition context.
with FusionDefinition() as fd:
    nvfuser_fusion_id4(fd)

# Eight runtime inputs, listed in the same order as the define_tensor /
# define_scalar(None, ...) calls in nvfuser_fusion_id4 (T0, S1, S2, T3, T4,
# T5, T6, T7). NOTE(review): presumably bound positionally -- confirm.
inputs = [
    # T0: [4096, 512] bfloat16.
    torch.testing.make_tensor((4096, 512), dtype=torch.bfloat16, device='cuda:0'),
    # S1: integer scalar (used as rng_offset in the fusion).
    25546,
    # S2: integer scalar (used as rng_seed in the fusion).
    4178507003375096,
    # T3: [1, 4, 4096, 128] view over a flat buffer with strides (2097152, 128, 512, 1).
    torch.randn(2097152, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4, 4096, 128), (2097152, 128, 512, 1)),
    # T4, T5: [1, 4096, 128] views with strides (524288, 1, 4096), i.e. the
    # 4096-long axis has unit stride.
    torch.randn(524288, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4096, 128), (524288, 1, 4096)),
    torch.randn(524288, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4096, 128), (524288, 1, 4096)),
    # T6: [1, 28, 4096, 128] view over a flat buffer with strides (14680064, 128, 3584, 1).
    torch.randn(14680064, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 28, 4096, 128), (14680064, 128, 3584, 1)),
    # T7: [1, 4096, 512] bfloat16.
    torch.testing.make_tensor((1, 4096, 512), dtype=torch.bfloat16, device='cuda:0'),
]
# Executing the fusion is the call that raises the reported RuntimeError.
fd.execute(inputs)

Error:

RuntimeError:  INTERNAL ASSERT FAILED at "/tmp/pip-req-build-3_7aabj7/csrc/index_compute.cpp":1727, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Couldn't find allocation mapping for T8_l___bfloat[iblockIdx.x604{2048}, iUS603{1}, iS601{8}, ithreadIdx.x605{128}] ca_pos( 4 ) dim: 2 id: iS27{512}
Exception raised from getConsumerAllocationIndices at /tmp/pip-req-build-3_7aabj7/csrc/index_compute.cpp:1727 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x103 (0x73151835eb97 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x62 (0x731518782bd2 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x66ecb4 (0x731518898cb4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x6708da (0x73151889a8da in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x670e45 (0x73151889ae45 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x4601d4 (0x73151868a1d4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #6: <unknown function> + 0x46ddef (0x731518697def in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #7: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #8: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x46ddef (0x731518697def in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x45831b (0x73151868231b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #13: <unknown function> + 0x41b622 (0x731518645622 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #14: nvfuser::GpuLower::run() + 0x239 (0x73151863e979 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #15: nvfuser::KernelExecutor::compile(nvfuser::Fusion*, nvfuser::KernelArgumentHolder const&, nvfuser::LaunchParams const&, nvfuser::CompileParams, nvfuser::SchedulerType) + 0x64a (0x731518ac625a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #16: <unknown function> + 0x8a541f (0x731518acf41f in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #17: <unknown function> + 0x8dd0a4 (0x731518b070a4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #18: nvfuser::FusionKernelRuntime::compileFusionParallel(nvfuser::KernelArgumentHolder) + 0x423 (0x731518b0a753 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #19: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0x1cb (0x731518afff5b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #20: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, std::optional<signed char>, bool, bool, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >) const + 0xbc4 (0x731518ccad54 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #21: <unknown function> + 0x1cde24 (0x7315183f7e24 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #22: <unknown function> + 0x29aa03 (0x7315184c4a03 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #23: <unknown function> + 0x1bb283 (0x7315183e5283 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #24: python3() [0x5820ff]
<omitting python frames>
frame #27: python3() [0x54cae4]
frame #31: python3() [0x5a3698]
frame #34: python3() [0x54ca1d]
frame #36: python3() [0x54ca1d]
frame #39: python3() [0x5a3698]
frame #42: python3() [0x54ca1d]
frame #44: python3() [0x54ca1d]
frame #46: python3() [0x54ca1d]
frame #49: python3() [0x5a3698]
frame #52: python3() [0x54ca1d]
frame #54: <unknown function> + 0x7c6078 (0x73161d968078 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #56: python3() [0x54ca1d]
frame #58: <unknown function> + 0x7c6078 (0x73161d968078 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #60: python3() [0x5a3698]
frame #63: <unknown function> + 0x7c59fe (0x73161d9679fe in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)

This comes up in the Qwen2 network ("Qwen/Qwen2.5-7B-Instruct") when disabling the inductor backend of Thunder, e.g.:

# Look up each Thunder executor by its registered name.
cudnn = thunder.extend.get_executor("cudnn")
sdpa = thunder.extend.get_executor("sdpa")
apex = thunder.extend.get_executor("apex")
nvf = thunder.extend.get_executor("nvfuser")
pyt = thunder.extend.get_executor("torch")
pyth = thunder.extend.get_executor("python")
# Executor list handed to ThunderCompiler. Every entry must be one of the
# names assigned above (`apex`, not `aspex`, which would raise NameError).
execs = [cudnn, sdpa, apex, nvf, pyt, pyth]
be = thunder.dynamo.ThunderCompiler(executors=execs)
# Compile the model with the Thunder compiler object as the torch.compile backend.
model = torch.compile(model, backend=be)

The text was updated successfully, but these errors were encountered:

tfogal added Thunder Indexing Ops labels Jan 21, 2025

tfogal mentioned this issue Jan 31, 2025

Add a few benchmarks for concatenation. #3751

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Couldn't find allocation mapping for T8_l__bfloat[iblockIdx.x604{2048} ...] #3740

Couldn't find allocation mapping for T8_l__bfloat[iblockIdx.x604{2048} ...] #3740

tfogal commented Jan 21, 2025 •

edited

Loading

Couldn't find allocation mapping for T8_l__bfloat[iblockIdx.x604{2048} ...] #3740

Couldn't find allocation mapping for T8_l__bfloat[iblockIdx.x604{2048} ...] #3740

Comments

tfogal commented Jan 21, 2025 • edited Loading

tfogal commented Jan 21, 2025 •

edited

Loading