-
Notifications
You must be signed in to change notification settings - Fork 77
Open
Description
Transpose scheduler is used for the following fusion:
// Reproducer: a row-broadcast multiply (262144x5120 * 262144x1, bf16 in/out,
// fp32 compute) for which the transpose scheduler is selected but reaches only
// ~58% SOL, while the pointwise scheduler reaches ~86% SOL (see tables below).
TEST_F(NVFuserTest, TransposeMaverick17B) {
auto fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
Fusion& fusion = *fusion_ptr;
auto dtype = DataType::BFloat16;
// tv0: full-size operand; tv1: one value per row (inner extent 1, so the
// multiply broadcasts it across the 5120-wide inner dimension).
auto tv0 = makeContigConcreteTensor({262144, 5120}, dtype);
auto tv1 = makeContigConcreteTensor({262144, 1}, dtype);
fusion.addInput(tv0);
fusion.addInput(tv1);
// Upcast both inputs to fp32, multiply, then cast the result back to bf16.
auto tv2 = castOp(DataType::Float, tv0);
auto tv3 = castOp(DataType::Float, tv1);
auto tv4 = mul(tv2, tv3);
auto tv5 = castOp(dtype, tv4);
fusion.addOutput(tv5);
// Concrete bf16 CUDA inputs matching the fusion's declared shapes.
auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input0 = at::randn({262144, 5120}, options);
at::Tensor input1 = at::randn({262144, 1}, options);
// Run through FusionExecutorCache so a scheduler is chosen automatically
// (the transpose scheduler, per the report), then validate numerics.
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({input0, input1});
testValidate(executor_cache.fusion(), outputs, {input0, input1}, __LINE__, __FILE__);
}
Originally found in benchmark_inference.py with: NVFUSER_DISABLE=parallel_compile NVFUSER_DUMP=scheduler_params,python_definition,fusion_ir_presched python /opt/pytorch/nvfuser/benchmarks/python/benchmark_inference.py --use-hardcoded-model --batch-size 128 --input-length 2048 --output-length 2 --num-layers 2 --mode thunder --warmup-iterations 1 --num-iterations 1 2>&1 |tee params.log
The transpose scheduler was selected, vectorizing both inputs, t0 and t1:
===== Transpose Stats ========
inputs: T0_g___bfloat[iS0{262144}, iS1{5120}], T1_g___bfloat[iS2{262144}, bS3{1}]
outputs: T5_g___bfloat[iS10{262144}, iS11{5120}]
shape: 262144 5120
num_elems: 1342177280
n_io_tensors: 3
max_io_dtype_size: 2
group 1: T5_g___bfloat[iS10{262144}, iS11{5120}], T0_g___bfloat[iS0{262144}, iS1{5120}]
reference1: T5_g___bfloat[iS10{262144}, iS11{5120}]
inner_most_id1 position: 1 (in reference 1)
group 2: T1_g___bfloat[iS2{262144}, bS3{1}]
reference2: T1_g___bfloat[iS2{262144}, bS3{1}]
inner_most_id2 position: 0 (in reference 1)
Its performance is only 58% of SOL (speed of light, i.e. peak achievable bandwidth).
Fus# NSegs CuEvtTm(ms) HstTm(ms) CmpTm(ms) KerTm(ms) EffBw(GB/s) %PkBw S-Seg# S-KerTm(ms) S-EffBw(GB/s) S-%PkBw S-In(MB) S-Out(MB) S-Smem[Dyn,Stat] S-Regs S-Grid S-Block S-KerName
0 1 202.812 201.072 192.821 1.863 2881.960 36.35 0 1.171 4583.737 57.82 2684.879 2684.355 [2048, 0] 27 [1310720, 1, 1] [128, 1, 1] nvfuser_transpose_f0_c1_r0_g0
If the pointwise scheduler is used instead, performance reaches 86% SOL.
Fus# NSegs CuEvtTm(ms) HstTm(ms) CmpTm(ms) KerTm(ms) EffBw(GB/s) %PkBw S-Seg# S-KerTm(ms) S-EffBw(GB/s) S-%PkBw S-In(MB) S-Out(MB) S-Smem[Dyn,Stat] S-Regs S-Grid S-Block S-KerName
0 1 218.388 217.037 207.627 1.475 3640.200 45.92 0 0.783 6855.192 86.47 2684.879 2684.355 [0, 4] 37 [3, 65535, 1] [128, 1, 1] nvfuser_pointwise_f0_c1_r0_g0
Metadata
Metadata
Assignees
Labels
No labels