Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Couldn't find allocation mapping for T8_l___bfloat[iblockIdx.x604{2048} ...] #3740

Open
tfogal opened this issue Jan 21, 2025 · 0 comments
Open

Comments

@tfogal
Copy link
Collaborator

tfogal commented Jan 21, 2025

# CUDA devices:
#  0: NVIDIA RTX 6000 Ada Generation
# torch version: 2.6.0a0+[redacted]
# cuda version: [redacted]
# nvfuser version: 0.2.24+git5e08e1d
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id4(fd : FusionDefinition) -> None :
    """Populate ``fd`` with the fusion that triggers the reported failure.

    The definition declares six bfloat16 tensor inputs (T0, T3-T7) and two
    integer scalar inputs (S1, S2), builds the op graph below, and registers
    five outputs (T159, T160, T203, T204, T205).

    NOTE(review): this looks like an auto-generated repro; statement order
    and tensor numbering are kept exactly as reported because they may
    affect whether the bug reproduces.
    """
    # --- Fusion inputs (the caller must pass arguments in this order) ---
    T0 = fd.define_tensor(shape=[4096, 512], contiguity=[True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[1, 0])
    # S1 is used as rng_offset and S2 as rng_seed by the uniform op below.
    S1 = fd.define_scalar(None, dtype=DataType.Int)
    S2 = fd.define_scalar(None, dtype=DataType.Int)
    T3 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 1, 2, 0])
    T4 = fd.define_tensor(shape=[1, 4096, 128], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 0, 1])
    T5 = fd.define_tensor(shape=[1, 4096, 128], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 0, 1])
    T6 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 1, 2, 0])
    T7 = fd.define_tensor(shape=[1, 4096, 512], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    # --- T0 branch: reshape, scale by 4.0, mask with (uniform < 0.9), scale by 1.11111 ---
    # NOTE(review): the mask-then-rescale sequence resembles dropout with p=0.1 — confirm.
    T12 = fd.ops.reshape(T0, new_shape=[1, 4096, 512])
    T13 = fd.ops.cast(T12, dtype=DataType.Float)
    S14 = fd.define_scalar(0.00000, dtype=DataType.Double)
    S15 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S16 = fd.define_scalar(1, dtype=DataType.Int)
    S17 = fd.define_scalar(4096, dtype=DataType.Int)
    S18 = fd.define_scalar(512, dtype=DataType.Int)
    T20 = fd.ops.uniform(S14, S15, shape=[S16, S17, S18], rng_seed=S2, rng_offset=S1, dtype=DataType.BFloat16)
    S21 = fd.define_scalar(4.00000, dtype=DataType.Double)
    T22 = fd.ops.mul(T13, S21)
    S23 = fd.define_scalar(0.900000, dtype=DataType.Double)
    T24 = fd.ops.lt(T20, S23)
    T25 = fd.ops.cast(T24, dtype=DataType.Float)
    # --- T3 branch: negate the upper half of the last dim (64:128) and concatenate
    # it in front of the lower half (0:64); the result is multiplied with broadcast T4,
    # while T3 itself is multiplied with broadcast T5, and the two products are added (T156).
    # NOTE(review): presumably a rotary-embedding "rotate half" pattern — confirm.
    T41 = fd.ops.slice(T3, start_indices=[0, 0, 0, 64], end_indices=[1, 4, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    T42 = fd.ops.mul(T22, T25)
    T43 = fd.ops.cast(T41, dtype=DataType.Float)
    T44 = fd.ops.neg(T43)
    T50 = fd.ops.broadcast_in_dim(T4, shape=[1, 1, 4096, 128], broadcast_dims=[0, 2, 3])
    T66 = fd.ops.slice(T3, start_indices=[0, 0, 0, 0], end_indices=[1, 4, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T67 = fd.ops.cast(T44, dtype=DataType.BFloat16)
    T73 = fd.ops.broadcast_in_dim(T5, shape=[1, 1, 4096, 128], broadcast_dims=[0, 2, 3])
    # The same slice/negate/concatenate sequence is applied to T6 (second dim 28 instead of 4).
    T89 = fd.ops.slice(T6, start_indices=[0, 0, 0, 64], end_indices=[1, 28, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    S90 = fd.define_scalar(1.11111, dtype=DataType.Double)
    T91 = fd.ops.mul(T42, S90)
    T97 = fd.ops.broadcast_in_dim(T50, shape=[1, 4, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    T98 = fd.ops.cat([T67, T66], dim=-1, manual_padding=0)
    T104 = fd.ops.broadcast_in_dim(T73, shape=[1, 4, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    T105 = fd.ops.cast(T89, dtype=DataType.Float)
    T106 = fd.ops.cast(T97, dtype=DataType.Float)
    T107 = fd.ops.cast(T98, dtype=DataType.Float)
    T108 = fd.ops.cast(T104, dtype=DataType.Float)
    T109 = fd.ops.cast(T3, dtype=DataType.Float)
    T110 = fd.ops.neg(T105)
    T111 = fd.ops.cast(T7, dtype=DataType.Float)
    T112 = fd.ops.mul(T107, T106)
    T113 = fd.ops.mul(T109, T108)
    T129 = fd.ops.slice(T6, start_indices=[0, 0, 0, 0], end_indices=[1, 28, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T130 = fd.ops.cast(T110, dtype=DataType.BFloat16)
    # T7 plus the masked/rescaled T0 branch; later reshaped and permuted into T159.
    T131 = fd.ops.add(T111, T91)
    T137 = fd.ops.broadcast_in_dim(T50, shape=[1, 28, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    T138 = fd.ops.cat([T130, T129], dim=-1, manual_padding=0)
    T144 = fd.ops.broadcast_in_dim(T73, shape=[1, 28, 4096, 128], broadcast_dims=[0, 1, 2, 3])
    T145 = fd.ops.cast(T131, dtype=DataType.BFloat16)
    T146 = fd.ops.cast(T137, dtype=DataType.Float)
    T147 = fd.ops.cast(T138, dtype=DataType.Float)
    T148 = fd.ops.cast(T144, dtype=DataType.Float)
    T149 = fd.ops.cast(T6, dtype=DataType.Float)
    T155 = fd.ops.reshape(T145, new_shape=[1, 4096, 4, 128])
    T156 = fd.ops.add(T113, T112)
    T157 = fd.ops.mul(T147, T146)
    T158 = fd.ops.mul(T149, T148)
    T159 = fd.ops.permute(T155, dims=[0, 2, 1, 3])
    T160 = fd.ops.cast(T156, dtype=DataType.BFloat16)
    # --- Expand T159 and T160 from [1, 4, 4096, 128] to [1, 28, 4096, 128]:
    # insert a size-1 dim, broadcast it to 7, then merge the 4 and 7 dims via reshape.
    T167 = fd.ops.broadcast_in_dim(T159, shape=[1, 4, 1, 4096, 128], broadcast_dims=[0, 1, 3, 4])
    T174 = fd.ops.broadcast_in_dim(T160, shape=[1, 4, 1, 4096, 128], broadcast_dims=[0, 1, 3, 4])
    T181 = fd.ops.broadcast_in_dim(T167, shape=[1, 4, 7, 4096, 128], broadcast_dims=[0, 1, 2, 3, 4])
    T188 = fd.ops.broadcast_in_dim(T174, shape=[1, 4, 7, 4096, 128], broadcast_dims=[0, 1, 2, 3, 4])
    T189 = fd.ops.add(T158, T157)
    T195 = fd.ops.reshape(T181, new_shape=[1, 28, 4096, 128])
    T201 = fd.ops.reshape(T188, new_shape=[1, 28, 4096, 128])
    T202 = fd.ops.cast(T189, dtype=DataType.BFloat16)
    T203 = fd.ops.stride_order(T195, stride_order=[3, 2, 1, 0])
    T204 = fd.ops.stride_order(T201, stride_order=[3, 2, 1, 0])
    T205 = fd.ops.stride_order(T202, stride_order=[3, 2, 1, 0])
    # --- Fusion outputs ---
    fd.add_output(T159)
    fd.add_output(T160)
    fd.add_output(T203)
    fd.add_output(T204)
    fd.add_output(T205)

# Build the fusion by running the definition function inside the FusionDefinition context.
with FusionDefinition() as fd:
    nvfuser_fusion_id4(fd)

# Concrete arguments, listed in the same order as the define_tensor/define_scalar
# calls in nvfuser_fusion_id4: T0, S1, S2, T3, T4, T5, T6, T7.
inputs = [
    torch.testing.make_tensor((4096, 512), dtype=torch.bfloat16, device='cuda:0'),
    25546,
    4178507003375096,
    # The as_strided views below give the random buffers explicit, non-default strides.
    torch.randn(2097152, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4, 4096, 128), (2097152, 128, 512, 1)),
    torch.randn(524288, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4096, 128), (524288, 1, 4096)),
    torch.randn(524288, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 4096, 128), (524288, 1, 4096)),
    torch.randn(14680064, dtype=torch.bfloat16, device='cuda:0').as_strided((1, 28, 4096, 128), (14680064, 128, 3584, 1)),
    torch.testing.make_tensor((1, 4096, 512), dtype=torch.bfloat16, device='cuda:0'),
]
# Executing the fusion is the call that raises the RuntimeError reported below.
fd.execute(inputs)

Error:

RuntimeError:  INTERNAL ASSERT FAILED at "/tmp/pip-req-build-3_7aabj7/csrc/index_compute.cpp":1727, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Couldn't find allocation mapping for T8_l___bfloat[iblockIdx.x604{2048}, iUS603{1}, iS601{8}, ithreadIdx.x605{128}] ca_pos( 4 ) dim: 2 id: iS27{512}
Exception raised from getConsumerAllocationIndices at /tmp/pip-req-build-3_7aabj7/csrc/index_compute.cpp:1727 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x103 (0x73151835eb97 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x62 (0x731518782bd2 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #2: <unknown function> + 0x66ecb4 (0x731518898cb4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x6708da (0x73151889a8da in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x670e45 (0x73151889ae45 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x4601d4 (0x73151868a1d4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #6: <unknown function> + 0x46ddef (0x731518697def in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #7: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #8: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #9: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #10: <unknown function> + 0x46ddef (0x731518697def in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #11: <unknown function> + 0x459b57 (0x731518683b57 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #12: <unknown function> + 0x45831b (0x73151868231b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #13: <unknown function> + 0x41b622 (0x731518645622 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #14: nvfuser::GpuLower::run() + 0x239 (0x73151863e979 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #15: nvfuser::KernelExecutor::compile(nvfuser::Fusion*, nvfuser::KernelArgumentHolder const&, nvfuser::LaunchParams const&, nvfuser::CompileParams, nvfuser::SchedulerType) + 0x64a (0x731518ac625a in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #16: <unknown function> + 0x8a541f (0x731518acf41f in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #17: <unknown function> + 0x8dd0a4 (0x731518b070a4 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #18: nvfuser::FusionKernelRuntime::compileFusionParallel(nvfuser::KernelArgumentHolder) + 0x423 (0x731518b0a753 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #19: nvfuser::FusionExecutorCache::runFusionWithInputs(c10::ArrayRef<c10::IValue> const&, std::optional<nvfuser::PrimDataType>, std::optional<signed char>) + 0x1cb (0x731518afff5b in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #20: nvfuser::python_frontend::FusionDefinition::execute(c10::ArrayRef<c10::IValue> const&, std::optional<signed char>, bool, bool, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >) const + 0xbc4 (0x731518ccad54 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #21: <unknown function> + 0x1cde24 (0x7315183f7e24 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #22: <unknown function> + 0x29aa03 (0x7315184c4a03 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #23: <unknown function> + 0x1bb283 (0x7315183e5283 in /usr/local/lib/python3.12/dist-packages/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)
frame #24: python3() [0x5820ff]
<omitting python frames>
frame #27: python3() [0x54cae4]
frame #31: python3() [0x5a3698]
frame #34: python3() [0x54ca1d]
frame #36: python3() [0x54ca1d]
frame #39: python3() [0x5a3698]
frame #42: python3() [0x54ca1d]
frame #44: python3() [0x54ca1d]
frame #46: python3() [0x54ca1d]
frame #49: python3() [0x5a3698]
frame #52: python3() [0x54ca1d]
frame #54: <unknown function> + 0x7c6078 (0x73161d968078 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #56: python3() [0x54ca1d]
frame #58: <unknown function> + 0x7c6078 (0x73161d968078 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)
frame #60: python3() [0x5a3698]
frame #63: <unknown function> + 0x7c59fe (0x73161d9679fe in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so)

This comes up in the Qwen2 network ("Qwen/Qwen2.5-7B-Instruct") when Thunder's inductor backend is disabled, e.g. with the following executor setup:

# Look up each Thunder executor by its registered name.
cudnn = thunder.extend.get_executor("cudnn")
sdpa = thunder.extend.get_executor("sdpa")
apex = thunder.extend.get_executor("apex")
nvf = thunder.extend.get_executor("nvfuser")
pyt = thunder.extend.get_executor("torch")
pyth = thunder.extend.get_executor("python")
# Executor list passed to the Thunder compiler; no inductor executor is included.
# (Fixed: the original listed the undefined name `aspex` instead of `apex`.)
execs = [cudnn, sdpa, apex, nvf, pyt, pyth]
be = thunder.dynamo.ThunderCompiler(executors=execs)
# NOTE(review): `model` is assumed to be defined earlier (not shown in this snippet).
model = torch.compile(model, backend=be)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

1 participant