Skip to content

The all_to_all benchmark test results on TPUv7x don't meet expectations. #73

@jimoosciuc

Description

@jimoosciuc

Results summary

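  • 1d and 2d all_to_all (8-device replica groups): achieved_bw peaks at ~68 GB/s (matrix_dim = 8192)
  • 3d all_to_all (single 32-device replica group): achieved_bw peaks at ~43 GB/s (matrix_dim = 8192)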

all_to_all_1d

benchmarks:
- benchmark_name: all_to_all
  benchmark_sweep_params:
  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x8", ici_size_range: 32, sharding_strategy: "1x1x8", op_dimension: 1, num_runs: 5} # Non Parallel Replica
  trace_dir: "../microbenchmarks/all_to_all_1d"
  csv_path: "../microbenchmarks/all_to_all_1d"
  xlml_metrics_dir: "../microbenchmarks/all_to_all_1d"
  xla_dump_dir: "../microbenchmarks/all_to_all_1d/hlo_graphs"
iteration op_type replica_group_type rank mesh_shape op_dimension sharding_strategy input_num_elements matrix_shape transferred_data (GB) dtype_bytes hlo_input_shape hlo_output_shape hlo_replica_groups step_time_ms_p50 step_time_ms_p90 step_time_ms_p95 step_time_ms_p99 step_time_ms_avg step_time_ms_max step_time_ms_num_runs step_time_ms_min achieved_bw (GB/s)_p50 achieved_bw (GB/s)_p90 achieved_bw (GB/s)_p95 achieved_bw (GB/s)_p99 achieved_bw (GB/s)_avg achieved_bw (GB/s)_max achieved_bw (GB/s)_num_runs achieved_bw (GB/s)_min
8 A2A non-parallel 8 2x2x8 1 1x1x8 8192 ((8, 8, 128)) 2.46E-05 4 f32[8,8,128] f32[8,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.008611044 0.0086439376 0.008652100800000001 0.00865863136 0.0085675868 0.008660264 5 0.008460984 2.8540093396340795 2.9011751774001655 2.9029009366511014 2.9042815440518504 2.8687298763576714 2.9046266959020377 5 2.8377887787254523
16 A2A non-parallel 8 2x2x8 1 1x1x8 16384 ((16, 8, 128)) 4.92E-05 4 f32[16,8,128] f32[16,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.008716687 0.0088573828 0.0088662664 0.00887337328 0.008748859600000002 0.00887515 5 0.008632653 5.638839618768003 5.6789415140169 5.686336025425934 5.692251634553162 5.618700910331254 5.693730536834969 5 5.538159918423915
32 A2A non-parallel 8 2x2x8 1 1x1x8 32768 ((32, 8, 128)) 9.83E-05 4 f32[32,8,128] f32[32,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.009296519 0.009470828 0.009508523 0.009538679 0.009323889600000001 0.009546218 5 0.009152461 10.574280545223434 10.687841873483267 10.714279794321033 10.735430130991245 10.545262852131069 10.740717715158798 5 10.297690666607448
64 A2A non-parallel 8 2x2x8 1 1x1x8 65536 ((64, 8, 128)) 0.00019660800000000003 4 f32[64,8,128] f32[64,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.010689076 0.0107200478 0.0107291714 0.01073647028 0.0106840336 0.010738295 5 0.010648259 18.393357854317813 18.461365962403796 18.462614698865792 18.463613688035387 18.40221127502451 18.463863435327788 5 18.309051855997627
128 A2A non-parallel 8 2x2x8 1 1x1x8 131072 ((128, 8, 128)) 0.00039321600000000005 4 f32[128,8,128] f32[128,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.013545018 0.0137250902 0.0137406966 0.013753181719999999 0.013599039599999998 0.013756303 5 0.013496999 29.03030472163271 29.114960671163793 29.12427403542584 29.13172472683548 28.916565977041564 29.13358739968789 5 28.584424172686518
256 A2A non-parallel 8 2x2x8 1 1x1x8 262144 ((256, 8, 128)) 0.0007864320000000001 4 f32[256,8,128] f32[256,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.018891957 0.018996638399999998 0.0190139252 0.01902775464 0.018865306 0.019031212 5 0.018728691 41.62787370307905 41.98968464649197 41.99022315365213 41.99065395938026 41.68835756148295 41.99076166081229 5 41.32327462906725
512 A2A non-parallel 8 2x2x8 1 1x1x8 524288 ((512, 8, 128)) 0.0015728640000000002 4 f32[512,8,128] f32[512,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.030623049 0.0306439374 0.0306460982 0.03064782684 0.0306050418 0.030648259 5 0.030540216 51.36209656980924 51.47713969107542 51.48927180520952 51.498977496516794 51.3924075433177 51.50140391934361 5 51.31984821715322
1024 A2A non-parallel 8 2x2x8 1 1x1x8 1048576 ((1024, 8, 128)) 0.0031457280000000004 4 f32[1024,8,128] f32[1024,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.052677071 0.0527111644 0.0527181272 0.05272369744 0.05267010800000001 0.05272509 5 0.052584634 59.71721548451319 59.78183404976464 59.80201234599838 59.81815498298537 59.72515663368107 59.822190642232115 5 59.6628284560538
2048 A2A non-parallel 8 2x2x8 1 1x1x8 2097152 ((2048, 8, 128)) 0.006291456000000001 4 f32[2048,8,128] f32[2048,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.097835534 0.0979282112 0.0979346936 0.09793987952000001 0.0978175268 0.097941176 5 0.097668667 64.30645127362418 64.3992281271826 64.40777351354059 64.41460982262699 64.31836091336609 64.41631889989858 5 64.237088596935
4096 A2A non-parallel 8 2x2x8 1 1x1x8 4194304 ((4096, 8, 128)) 0.012582912000000002 4 f32[4096,8,128] f32[4096,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.187986795 0.1880893156 0.18809987979999998 0.18810833115999998 0.18800864339999998 0.188110444 5 0.187920768 66.9350844563311 66.95193124379138 66.95526685591364 66.95793534561145 66.92731453563388 66.9586024680359 5 66.89108660016774
8192 A2A non-parallel 8 2x2x8 1 1x1x8 8388608 ((8192, 8, 128)) 0.025165824000000003 4 f32[8192,8,128] f32[8192,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.368803121 0.3688235292 0.36882953160000004 0.36883433352 0.3687771908 0.368835534 5 0.368639856 68.23647243484147 68.25469378450728 68.26069355892552 68.26549337846012 68.24127285875251 68.26669333334377 5 68.2304758629899

all_to_all_2d

benchmarks:
- benchmark_name: all_to_all
  benchmark_sweep_params:
  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x8", ici_size_range: 32, sharding_strategy: "1x8", op_dimension: 2, num_runs: 5} # Non Parallel Replica
  trace_dir: "../microbenchmarks/all_to_all_2d"
  csv_path: "../microbenchmarks/all_to_all_2d"
  xlml_metrics_dir: "../microbenchmarks/all_to_all_2d"
  xla_dump_dir: "../microbenchmarks/all_to_all_2d/hlo_graphs"
iteration op_type replica_group_type rank mesh_shape op_dimension sharding_strategy input_num_elements matrix_shape transferred_data (GB) dtype_bytes hlo_input_shape hlo_output_shape hlo_replica_groups step_time_ms_p50 step_time_ms_p90 step_time_ms_p95 step_time_ms_p99 step_time_ms_avg step_time_ms_max step_time_ms_num_runs step_time_ms_min achieved_bw (GB/s)_p50 achieved_bw (GB/s)_p90 achieved_bw (GB/s)_p95 achieved_bw (GB/s)_p99 achieved_bw (GB/s)_avg achieved_bw (GB/s)_max achieved_bw (GB/s)_num_runs achieved_bw (GB/s)_min
8 A2A non-parallel 8 4x8 2 1x8 8192 ((8, 8, 128)) 2.46E-05 4 f32[8,8,128] f32[8,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.008563025 0.008734933799999999 0.0087918364 0.00883735848 0.008537815 0.008848739 5 0.008272509 2.8700138093722725 2.947142047540046 2.9589728528886137 2.9684374971674674 2.8798846636937085 2.970803658237181 5 2.777344884960445
16 A2A non-parallel 8 4x8 2 1x8 16384 ((16, 8, 128)) 4.92E-05 4 f32[16,8,128] f32[16,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.0087491 0.0088256904 0.0088348142 0.008842113240000001 0.00874958 0.008843938 5 0.008639856 5.617949274782551 5.668926070677588 5.678954887980783 5.686977941823339 5.6179998946065774 5.688983705283978 5 5.55770517613308
32 A2A non-parallel 8 4x8 2 1x8 32768 ((32, 8, 128)) 9.83E-05 4 f32[32,8,128] f32[32,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.009254502 0.009404802 0.009412485 0.0094186314 0.009276590599999999 0.009420168 5 0.009130852 10.62228956242054 10.7357777581096 10.750957184181201 10.763100725038482 10.59847213318989 10.766136610252802 5 10.4354826792898
64 A2A non-parallel 8 4x8 2 1x8 65536 ((64, 8, 128)) 0.00019660800000000003 4 f32[64,8,128] f32[64,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.010660264 0.0107322934 0.0107442982 0.01075390204 0.010629532 0.010756303 5 0.010483794 18.443070453039443 18.705720804781432 18.72961847298997 18.7487366075568 18.497997026276316 18.753516141198507 5 18.27839918604004
128 A2A non-parallel 8 4x8 2 1x8 131072 ((128, 8, 128)) 0.00039321600000000005 4 f32[128,8,128] f32[128,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.013602641 0.0136981992 0.0137164466 0.013731044520000001 0.013605522200000001 0.013734694 5 0.013506603 28.90732762850979 29.0839631046883 29.098417393391674 29.109980824354377 28.9022087992601 29.11287168209505 5 28.6293964758152
256 A2A non-parallel 8 4x8 2 1x8 262144 ((256, 8, 128)) 0.0007864320000000001 4 f32[256,8,128] f32[256,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.018792317 0.018972629 0.019007923 0.0190361582 0.018821848800000002 0.019043217 5 0.018619448 41.848591634549386 42.085991701888126 42.16155962361874 42.22201396100324 41.785145728040824 42.23712754534937 5 41.2972240982183
512 A2A non-parallel 8 4x8 2 1x8 524288 ((512, 8, 128)) 0.0015728640000000002 4 f32[512,8,128] f32[512,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.03054982 0.030619688 0.030627370999999997 0.030633517399999997 0.0305560626 0.030635054 5 0.030462185 51.485213333499196 51.58298787134468 51.60815810470684 51.628294291396564 51.47488373996223 51.633328338069 5 51.34196923563446
1024 A2A non-parallel 8 4x8 2 1x8 1048576 ((1024, 8, 128)) 0.0031457280000000004 4 f32[1024,8,128] f32[1024,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.052659064 0.05272509 0.052732293 0.0527380554 0.052642017 0.052739496 5 0.052541417 59.737636050652185 59.85991101808027 59.86565366503011 59.870247782589985 59.757107524934476 59.87139631197995 5 59.64653132066337
2048 A2A non-parallel 8 4x8 2 1x8 2097152 ((2048, 8, 128)) 0.006291456000000001 4 f32[2048,8,128] f32[2048,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.097747899 0.0979452578 0.0979918364 0.09802909928 0.0977875148 0.098038415 5 0.097648259 64.36410464433615 64.41679769761421 64.4232896437367 64.42848320063467 64.33815347050663 64.42978158985918 5 64.17337530395612
4096 A2A non-parallel 8 4x8 2 1x8 4194304 ((4096, 8, 128)) 0.012582912000000002 4 f32[4096,8,128] f32[4096,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.188061224 0.188235054 0.188262185 0.18828388980000002 0.1880792316 0.188289316 5 0.187936375 66.90859355461816 66.95030520692961 66.95167358031136 66.95276827901677 66.90221981343015 66.95304195369312 5 66.8275410804509
8192 A2A non-parallel 8 4x8 2 1x8 8388608 ((8192, 8, 128)) 0.025165824000000003 4 f32[8192,8,128] f32[8192,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15},{16,17,18,19,20,21,22,23},{24,25,26,27,28,29,30,31}} 0.368863145 0.3691212486 0.36916206479999997 0.36919471775999996 0.3688938776 0.369202881 5 0.368669868 68.22536851709596 68.25633569983937 68.25873585175053 68.26065597327946 68.21970299159307 68.26113600366169 5 68.16258836290068

all_to_all_3d

benchmarks:
- benchmark_name: all_to_all
  benchmark_sweep_params:
  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x8", ici_size_range: 32, sharding_strategy: "2x2x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
  trace_dir: "../microbenchmarks/all_to_all_3d"
  csv_path: "../microbenchmarks/all_to_all_3d"
  xlml_metrics_dir: "../microbenchmarks/all_to_all_3d"
  xla_dump_dir: "../microbenchmarks/all_to_all_3d/hlo_graphs"
iteration op_type replica_group_type rank mesh_shape op_dimension sharding_strategy input_num_elements matrix_shape transferred_data (GB) dtype_bytes hlo_input_shape hlo_output_shape hlo_replica_groups step_time_ms_p50 step_time_ms_p90 step_time_ms_p95 step_time_ms_p99 step_time_ms_avg step_time_ms_max step_time_ms_num_runs step_time_ms_min achieved_bw (GB/s)_p50 achieved_bw (GB/s)_p90 achieved_bw (GB/s)_p95 achieved_bw (GB/s)_p99 achieved_bw (GB/s)_avg achieved_bw (GB/s)_max achieved_bw (GB/s)_num_runs achieved_bw (GB/s)_min
32 A2A non-parallel 32 2x2x8 3 2x2x8 32768 ((32, 8, 128)) 0.00012288000000000002 4 f32[32,8,128] f32[32,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.012151261 0.0121591836 0.012159423800000001 0.01215961596 0.0121342138 0.012159664 5 0.012072029 10.112530707718319 10.159559674225536 10.169230831639041 10.176967757569845 10.126812378684606 10.178901989052546 5 10.105542389987093
64 A2A non-parallel 32 2x2x8 3 2x2x8 65536 ((64, 8, 128)) 0.00024576000000000003 4 f32[64,8,128] f32[64,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.014726291 0.014879231599999999 0.0148993998 0.01491553436 0.0147654262 0.014919568 5 0.014648259 16.688519872383345 16.747306504967447 16.762363473950998 16.77440904913784 16.64496464732585 16.777420442934552 5 16.472326812679835
128 A2A non-parallel 32 2x2x8 3 2x2x8 131072 ((128, 8, 128)) 0.0004915200000000001 4 f32[128,8,128] f32[128,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.020040816 0.0200741894 0.0200785112 0.02008196864 0.019980072 0.020082833 5 0.019726291 24.525947446451287 24.785913469866404 24.851456916239464 24.90389167333791 24.60157262312181 24.91700036261252 5 24.474634629486786
256 A2A non-parallel 32 2x2x8 3 2x2x8 262144 ((256, 8, 128)) 0.0009830400000000001 4 f32[256,8,128] f32[256,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.031109244 0.0311798316 0.0311829528 0.03118544976 0.031123169200000002 0.031186074 5 0.031056423 31.599610713780127 31.638202014918498 31.645778793242886 31.6518402159024 31.58554873921394 31.65335557156728 5 31.52176192488994
512 A2A non-parallel 32 2x2x8 3 2x2x8 524288 ((512, 8, 128)) 0.0019660800000000003 4 f32[512,8,128] f32[512,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.05367587 0.053797838800000004 0.053799279400000004 0.053800431880000006 0.0536489794 0.05380072 5 0.053444178 36.628749566611596 36.76378324795936 36.775663158832366 36.78516708753077 36.647365421214516 36.78754306970537 5 36.5437488568926
1024 A2A non-parallel 32 2x2x8 3 2x2x8 1048576 ((1024, 8, 128)) 0.0039321600000000005 4 f32[1024,8,128] f32[1024,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.098608643 0.0986607444 0.09866770720000001 0.09867327744000001 0.0985812724 0.09867467 5 0.09845018 39.87642340844302 39.92717689659808 39.933892210059554 39.93926446082874 39.88752152176063 39.94060752352104 5 39.849740566652
2048 A2A non-parallel 32 2x2x8 3 2x2x8 2097152 ((2048, 8, 128)) 0.007864320000000001 4 f32[2048,8,128] f32[2048,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.188404562 0.1885515006 0.1885692678 0.18858348156 0.18844681859999998 0.188587035 5 0.188345738 41.741664408317256 41.750018454446625 41.7523597940835 41.754232865793 41.732313023456484 41.754701133720374 5 41.70127601825863
4096 A2A non-parallel 32 2x2x8 3 2x2x8 4194304 ((4096, 8, 128)) 0.015728640000000002 4 f32[4096,8,128] f32[4096,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.368291717 0.3685068432 0.3685553426 0.36859414212 0.368291957 0.368603842 5 0.368027611 42.707015319597865 42.73080687995496 42.73423492148282 42.73697735470511 42.70699917383609 42.737662963010685 5 42.670852030891204
8192 A2A non-parallel 32 2x2x8 3 2x2x8 8388608 ((8192, 8, 128)) 0.031457280000000004 4 f32[8192,8,128] f32[8192,8,128]{2,1,0:T(8,128) {{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}} 0.727989196 0.7280333732 0.7280472986 0.72805843892 0.7279903962000001 0.728061224 5 0.727941176 43.21119073311083 43.213385593207654 43.213713416632615 43.21397567537258 43.21111962202942 43.21404124005757 5 43.2069157964111

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions