From b7214419cb6ec8273d71874fb5d6ce0b52f2a5f7 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 7 May 2024 13:15:10 -0700 Subject: [PATCH 001/187] gpu: jit: gemm: use scanline ordering in more thin-m/n cases --- src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp | 24 +++++++++++++++---- .../intel/jit/gemm/xe_hp_systolic_gemm.cpp | 5 ++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index 2780c3cb6e9..d003dbc2063 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -110,15 +110,20 @@ status_t gen_gemm_kernel_desc_t::finalize(const char *tags) { int wg_tile_m = strategy_.wg[LoopM] * strategy_.unroll[LoopM]; int wg_tile_n = strategy_.wg[LoopN] * strategy_.unroll[LoopN]; if (wg_tile_m > 0 && wg_tile_n > 0) { - dim_t thread_count = dim_t(utils::div_up(m_, wg_tile_m)) - * utils::div_up(n_, wg_tile_n) * strategy_.wg[LoopM] - * strategy_.wg[LoopN]; + dim_t m_tiles = dim_t(utils::div_up(m_, wg_tile_m)); + dim_t n_tiles = dim_t(utils::div_up(n_, wg_tile_n)); + dim_t thread_per_tg = strategy_.wg[LoopM] * strategy_.wg[LoopN]; if (!strategy_.kParallelVariable) - thread_count *= std::max(strategy_.wg[LoopK], 1); + thread_per_tg *= std::max(strategy_.wg[LoopK], 1); dim_t thread_gpu = eu_count_ * compute::device_info_t::threads_per_eu( arch_, strategy_.GRFs > 128); - if (thread_count <= thread_gpu) { + dim_t tiles_gpu = thread_gpu / thread_per_tg; + + bool use_linear = (m_tiles * n_tiles <= tiles_gpu); + bool use_linear_m = (m_tiles * m_tiles <= 2 * tiles_gpu); + bool use_linear_n = (n_tiles * n_tiles <= 2 * tiles_gpu); + if (use_linear) { if (strategy_.kParallelVariable) strategy_.cWalkOrder = WalkOrder::SimpleLinear; else if (strategy_.kParallel @@ -131,6 +136,15 @@ status_t gen_gemm_kernel_desc_t::finalize(const char *tags) { strategy_.blocking[LoopM] = 16777216; strategy_.blocking[LoopN] = 16777216; } + } else if 
(use_linear_m || use_linear_n) { + if (use_linear_n && !use_linear_m) { + strategy_.loopOrder[0] = LoopN; + strategy_.loopOrder[1] = LoopM; + } else if (use_linear_m && !use_linear_n) { + strategy_.loopOrder[0] = LoopM; + strategy_.loopOrder[1] = LoopN; + } + strategy_.cWalkOrder = WalkOrder::SimpleLinear; } } } diff --git a/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp b/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp index 93042d9159c..0f46d1572f0 100644 --- a/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp +++ b/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp @@ -857,6 +857,11 @@ status_t xe_hp_systolic_gemm_t::launch_compute(const gemm_exec_ctx_t &ctx, compute::range_t lws(size_t(tg_m), size_t(tg_n), 1); if (pd()->with_batch()) gws[2] = batch; + if (compute_info_.isNMK()) { + std::swap(lws[0], lws[1]); + std::swap(gws[0], gws[1]); + } + lws[1] *= compute_info_.wgExpand; gws[1] *= compute_info_.wgExpand; From 3fd77ce3226ed2b098689f0a19d240ddffe5d5f7 Mon Sep 17 00:00:00 2001 From: "Kassen, Andrew" Date: Mon, 22 Apr 2024 16:38:05 -0700 Subject: [PATCH 002/187] gpu: intel: ocl: reorder: explicitly promote offset --- src/gpu/intel/ocl/ref_reorder.cl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/ref_reorder.cl b/src/gpu/intel/ocl/ref_reorder.cl index 1aad91c1212..364a7410e74 100644 --- a/src/gpu/intel/ocl/ref_reorder.cl +++ b/src/gpu/intel/ocl/ref_reorder.cl @@ -22,7 +22,8 @@ #define TO_I4 ((DST_DT_U4 || DST_DT_S4) && (!SRC_DT_U4 && !SRC_DT_S4)) #define FROM_I4 ((SRC_DT_U4 || SRC_DT_S4) && (!DST_DT_U4 && !DST_DT_S4)) -#define GWS_GET_THREAD_ID(index) (get_global_id(index) + offset.array[index]) +#define GWS_GET_THREAD_ID(index) \ + (off_t)(get_global_id(index) + offset.array[index]) KERNEL_ATTR __kernel void ref_reorder(__global SRC_DATA_T *restrict src, From 7af61e3533638e04e09fdcbe5a8d904daf6ce80b Mon Sep 17 00:00:00 2001 From: Robert Cohn Date: Tue, 7 May 2024 14:09:45 -0500 Subject: [PATCH 003/187] github: workflows: add 
slack notification for RFCs --- .github/workflows/slack-pr.yaml | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/slack-pr.yaml diff --git a/.github/workflows/slack-pr.yaml b/.github/workflows/slack-pr.yaml new file mode 100644 index 00000000000..963b14d9f2f --- /dev/null +++ b/.github/workflows/slack-pr.yaml @@ -0,0 +1,40 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +name: Slack PR Notification +on: + # use pull_request_target to run on PRs from forks and have access to secrets + pull_request_target: + types: [labeled] + +env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + channel: "onednn" + +jobs: + rfc: + name: RFC Notification + runs-on: ubuntu-latest + # Trigger when labeling a PR with "RFC" + if: | + github.event.action == 'labeled' && + contains(toJson(github.event.pull_request.labels.*.name), '"RFC"') + steps: + - name: Notify Slack + uses: slackapi/slack-github-action@70cd7be8e40a46e8b0eced40b0de447bdb42f68e # v1.26.0 + with: + channel-id: ${{ env.channel }} + slack-message: "${{ github.actor }} posted a RFC: ${{ github.event.pull_request.title }}. 
URL: ${{ github.event.pull_request.html_url }}" From 568666b25d25aa0cb6e9e6a751038264b74e41a6 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 7 May 2024 14:23:40 -0700 Subject: [PATCH 004/187] xehpc: jit: gemm: use more nocopy --- src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp b/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp index 0f46d1572f0..e8d6600c35f 100644 --- a/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp +++ b/src/gpu/intel/jit/gemm/xe_hp_systolic_gemm.cpp @@ -157,11 +157,11 @@ const nocopy_table_t xe_hp_x8x8s32_nocopy_bad_ld_table[] = { {{{656, 528}, {352, 384}}, {{656, 528}, {352, 384}}}}; const nocopy_table_t xe_hpc_f16_nocopy_table[] = { - // NN NT TN TT - {{{14848, 12800}, {8193, 8193}}, {{0, 0}, {0, 0}}}}; + // NN NT TN TT + {{{0, 12800}, {0, 0}}, {{0, 0}, {0, 0}}}}; const nocopy_table_t xe_hpc_x8x8s32_nocopy_table[] = { - // NN NT TN TT + // NN NT TN TT {{{0, 10000}, {0, 0}}, {{0, 0}, {0, 0}}}}; const nocopy_table_t xe_hpc_f16_nocopy_bad_ld_table[] = { From f6d54c6b5cadcd913a9fd35f5139b13e12fcf70d Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 8 May 2024 10:58:41 -0700 Subject: [PATCH 005/187] fixup: cpu: gemm: distinguish interfaces by name --- src/common/gemm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/gemm.cpp b/src/common/gemm.cpp index 42beb369166..6a2578cd5ca 100644 --- a/src/common/gemm.cpp +++ b/src/common/gemm.cpp @@ -182,7 +182,7 @@ dnnl_status_t dnnl_threadpool_interop_gemm_u8s8s32(char transa, char transb, status_t status = dnnl_success; MAYBE_VERBOSE(status, "u8", "s8", "s32", MAYBE_RUN_STACK_CHECKER(dnnl_threadpool_interop_gemm_u8s8s32, - cpu::gemm_s8x8s32, &transb, &transa, c2f_offsetC(&offsetc), + cpu::gemm_s8u8s32, &transb, &transa, c2f_offsetC(&offsetc), &N, &M, &K, &alpha, B, &ldb, &bo, A, &lda, &ao, &beta, C, &ldc, co)); 
threadpool_utils::deactivate_threadpool(); @@ -198,7 +198,7 @@ dnnl_status_t dnnl_threadpool_interop_gemm_s8s8s32(char transa, char transb, status_t status = dnnl_success; MAYBE_VERBOSE(status, "s8", "s8", "s32", MAYBE_RUN_STACK_CHECKER(dnnl_threadpool_interop_gemm_s8s8s32, - cpu::gemm_s8x8s32, &transb, &transa, c2f_offsetC(&offsetc), + cpu::gemm_s8s8s32, &transb, &transa, c2f_offsetC(&offsetc), &N, &M, &K, &alpha, B, &ldb, &bo, A, &lda, &ao, &beta, C, &ldc, co)); threadpool_utils::deactivate_threadpool(); From 30376efa11e1a0fd0072043de56b711c371bda05 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 7 May 2024 15:17:00 -0700 Subject: [PATCH 006/187] gpu: jit: gemm: separate strategy parameter for A/B block 2D path --- src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp | 12 ++++++------ src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp | 6 +++--- src/gpu/intel/jit/gemm/strategy_parser.cpp | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp index 4daa1364df7..976623a618c 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp @@ -21661,12 +21661,12 @@ void gemm_kernel_generator_t::gemmSubkernel( int optAlignB = strategy.optAlignAB; // Handle block 2D alignment checks. 
- if (optAlignA == GEMMStrategy::AlignBlock2D) { - optAlignA = std::max({block2DMinAlignment(hw, problem.A, strategy.A), + if (strategy.optAlignAB2D) { + optAlignA = std::max({optAlignA, + block2DMinAlignment(hw, problem.A, strategy.A), block2DMinAlignment(hw, problem.A, strategy.A_prefetch)}); - } - if (optAlignB == GEMMStrategy::AlignBlock2D) { - optAlignB = std::max({block2DMinAlignment(hw, problem.B, strategy.B), + optAlignB = std::max({optAlignB, + block2DMinAlignment(hw, problem.B, strategy.B), block2DMinAlignment(hw, problem.B, strategy.B_prefetch)}); } @@ -21714,7 +21714,7 @@ void gemm_kernel_generator_t::gemmSubkernel( InstructionModifier bmod = checkLDB ? 1 | f0[1] | anyv : 1 | f0[1]; ejmpi(bmod, labelUnaligned); } - if (strategy.optAlignAB == GEMMStrategy::AlignBlock2D) { + if (strategy.optAlignAB2D) { if (doA) and_(1 | nz | f0[0], null.ud(), state.inputs.lda, 0xFF000000); if (doB) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp index 05113e03548..7141e156321 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp @@ -1086,9 +1086,9 @@ struct GEMMStrategyPOD : public CommonStrategy { uint8_t pad4[3] = {}; int optAlignAB = 0; // Optional alignment for A/B. If > 0, create two versions of k loop, one for A/B aligned to this value, one not. - enum { - AlignBlock2D = 65536 // Special optAlignAB value for block 2D loads. - }; + bool optAlignAB2D + = false; // If true, create two versions of k loop, one for A/B aligned to block 2D requirements, one not. + uint8_t pad4b[3] = {}; AccessType unalignedAccA, unalignedAccB; // Access types to use for A/B on unaligned path. 
uint8_t pad5[2] = {}; diff --git a/src/gpu/intel/jit/gemm/strategy_parser.cpp b/src/gpu/intel/jit/gemm/strategy_parser.cpp index 0f5b421d8fd..194689f3508 100644 --- a/src/gpu/intel/jit/gemm/strategy_parser.cpp +++ b/src/gpu/intel/jit/gemm/strategy_parser.cpp @@ -407,7 +407,7 @@ void parseStrategy(const char *str, HW hw, const GEMMProblem &problem, strategy.arbitrationMode = ngen::ThreadArbitrationMode::RoundRobinOnStall; else if (mod == "l2d") - strategy.optAlignAB = GEMMStrategy::AlignBlock2D; + strategy.optAlignAB2D = true; else if (mod == "nq") { strategy.A.noExtraPad = strategy.A_prefetch.noExtraPad = true; strategy.B.noExtraPad = strategy.B_prefetch.noExtraPad = true; @@ -539,7 +539,7 @@ void adjustStrategy(HW hw, const GEMMProblem &problem, GEMMStrategy &strategy, && !isPacked(problem.C.layout); // Notify kernel generator to downgrade block 2D prefetches if block 2D cannot be used. - if (tags && strategy.optAlignAB != GEMMStrategy::AlignBlock2D) { + if (tags && !strategy.optAlignAB2D) { bool block2DA = false, block2DB = false; while (*tags) { block2DA |= (*tags == kcatalog::ReqBlock2DA); From d4dba336ba18183b338eeaee21ad996dbf5bdd72 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 7 May 2024 15:21:40 -0700 Subject: [PATCH 007/187] gpu: jit: gemm: update catalog sorting --- src/gpu/intel/jit/gemm/kernel.db | 163 +++++++++++----------- src/gpu/intel/jit/gemm/kernel_catalog.hpp | 3 +- 2 files changed, 82 insertions(+), 84 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index 040d9d71505..a627cbee965 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -246,23 +246,24 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, 
(LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {596002, 1.33567e+06, 0, 0, 0, 0, 5.53572, 5.47044, 6.55621, 18.2618, 0.0211424, 0.0211424, 0, 1, 1.31489, 1.18381, 6.85524e-13}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf st dw vav bo sr bk0 dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06751e+06, 204301, 68125.3, 363183, 0, 0, 5.23091, 5.8341, 5.50916, 14.4411, 0.0246809, 0.0146692, 0.0199046, 0.894464, 1.4112, 1.17796, 1.27893e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ip"}, "aB32 aB16 aB wg 2x4x4 kr ca3 ks64 af dw vav bo sr bk0 sm dm sys grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14066e+06, 578017, 75917, 84236.6, 0, 0, 6.02279, 6.09029, 4.49028, 11.4256, 0.0521067, 0.0432382, 0.0332794, 0.97984, 1.21403, 1.20108, 7.3813e-15}}}, -{{'E', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 
0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, -{{'E', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.31167e+06, 785275, 0, 0, 0, 0, 7.11381, 8.75643, 6.11098, 15.9972, 0.0503546, 0.0303966, 0.0484271, 0.842682, 1.20649, 1.2023, -1.8357e-15}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.35147e+06, 357789, 0, 0, 0, 0, 11.5708, 11.8958, 6.40012, 17.218, 0.14396, 0.139657, 0.0130437, 0.882761, 1.16324, 1.0488, 3.12079e-12}}}, -{{'E', "gemm", {"F", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyzI"}, "sB32 sB32 sB wg 2x1x16 akr fg 0.25 nse sr sb32 bk0 bm0 pab sys", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, 
(LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 512, {8, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, -{{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, +{{'E', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, {{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 4x16 
cab4 ks32 af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, {{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, +{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, +{{'E', "gemm", {"F", "B", "S"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"F", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyzI"}, "sB32 sB32 sB wg 2x1x16 akr fg 0.25 nse sr sb32 bk0 bm0 pab sys", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 512, {8, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, +{{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, {{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS8 sB wg 2x1x16 akr fg 0.5 kc4 nse sr sb32 bk0 bm0 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 525061, 0, 2048, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 
sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, 
{1.05172e+06, 557389, 0, 0, 0, 0, 2.23646, 5.1076, 6.24833, 17.6243, 0.0206356, 0.0106181, 0.0095341, 0.748815, 1.31282, 1.18539, 7.88099e-13}}}, -{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, -{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, -{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 
16777216}, {16, 8, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {4, 4, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, {{'E', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "smqp"}, "ab16x3 ab16x3 ab fs sc bo acb bk2048 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsmqp"}, "sB16 sB16 sb fs wg 8x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsmqp"}, "sB16 sB16 sb fs wg 4x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, @@ -309,7 +310,6 @@ kcatalog::FlatCatalog<1022> 
_CATALOG_ {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {986809, 437370, 0, 0, 0, 0, 5.68695, 6.48375, 6.53936, 16.9925, 0.0282225, 0.0226525, 0.0239336, 0.736827, 1.37762, 1.17817, 1.10401e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {585301, 1.13594e+06, 0, 0, 0, 0, 6.24472, 6.60826, 6.57509, 18.329, 0.0236345, 0.0236345, 0, 0.985563, 1.32741, 1.18438, 6.65402e-13}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {982154, 437724, 0, 0, 0, 0, 6.2565, 6.37164, 6.49676, 16.9924, 0.0283206, 0.0139661, 0.0208344, 0.966273, 1.36049, 1.17681, 1.17759e-12}}}, -{{'E', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, 
(LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, {{'E', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8/4 aS8 aB wg 4x8 kc8 ca4 ks8 nse di bo sr bk0 sm dm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.0836e+06, 535346, 0, 0, 0, 0, 5.23021, 5.23405, 6.51087, 16.4718, 0.0217916, 0.0114948, 0.0166387, 0.885878, 1.48291, 1.16519, 1.92343e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06855e+06, 487726, 0, 0, 0, 0, 4.61972, 4.65428, 5.62471, 14.4914, 0.0515614, 0.0277546, 0.0342646, 0.995874, 1.21142, 1.20166, 4.60295e-15}}}, @@ -334,17 +334,29 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'E', "gemm", {"H", "H", "S"}, {"T", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {599248, 1.33458e+06, 0, 0, 0, 0, 5.52852, 5.45748, 6.54024, 18.2766, 0.0211426, 0.0211426, 0, 1, 1.40808, 1.175, 1.4855e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf st dw vav bo sr bk0 dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06367e+06, 203220, 74026.4, 363426, 0, 0, 5.22592, 5.83255, 5.53361, 14.4283, 0.0245863, -0.000813833, 0.0346436, 0.773746, 1.49124, 1.1734, 2.5839e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ip"}, "aB32 aB16 aB wg 2x4x4 kr ca3 ks64 af dw vav bo sr bk0 sm dm sys grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13288e+06, 578431, 81868.2, 83815, 0, 0, 6.01706, 6.08456, 4.49408, 11.3129, 0.0520799, 0.0402399, 0.0372522, 0.940886, 1.2079, 1.2015, 4.21515e-15}}}, -{{'E', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, 
(LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, {{'E', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8 aB8/4 aB wg 4x8 kc8 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.30894e+06, 785550, 0, 0, 0, 0, 7.11935, 8.75656, 6.13129, 16.041, 0.0504259, 0.0420693, 0.0674256, 0.73758, 1.20696, 1.20187, 1.15297e-15}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34765e+06, 357923, 0, 0, 0, 0, 11.5919, 11.8886, 6.4384, 17.1449, 0.145301, 0.14134, 0.0125881, 0.886981, 1.17153, 1.00812, 9.37293e-12}}}, +{{'E', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, 
"sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, +{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, +{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'E', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 11.0509, 6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, 
+{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, +{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, +{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"O", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'E', "gemm", {"O", 
"H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, +{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01594e+06, 428512, 105061, 111208, 0, 0, 4.57657, 4.56121, 3.76336, 13.6659, 0.0951637, 0.0550303, 0.0735522, 1, 1.21613, 1.20157, 3.14445e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, 
{1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 
sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsm"}, "sB16 sB16 sb fs wg 4x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {4, 4, 1}, 1, (WGType) 1, 256, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsm"}, "sB16 sB16 sb fs wg 8x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 256, 61440, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "sm"}, "ab16x3 ab16x3 ab fs sc bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 4096}, {8192, 8192, 4096}, {32, 32, 32}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16x2 aB wg 8x4 nse di bo sr sb32 bk0 grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"O", "H", "S"}, {"A", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB32 sB32 aB wg 4x8x2 kr cab4 ks32 af dw vav di bo bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.00712e+06, 406000, 172581, 272859, 0, 0, 3.12891, 
3.34068, 4.87654, 14.094, 0.0252127, 0.0218325, 0.0113697, 0.982774, 1.22754, 1.20144, 7.00834e-14}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav di bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {552020, 1.12529e+06, 0, 0, 0, 0, 5.16852, 2.80811, 6.1605, 15.5799, 0.0215997, 0.0215997, 0, 1, 1.2111, 1.20564, -3.21736e-14}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB32 sB64 aB wg 4x16 cab4 ks64 af dw vav di bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {8192, 8192, 16777216}, {32, 4, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {545420, 1.0938e+06, 0, 0, 0, 0, 3.27178, 5.27297, 4.83132, 13.78, 0.0374045, 0.0374045, 0, 1, 1.21335, 1.20085, 1.04884e-14}}}, @@ -367,9 +379,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpqxy"}, "sB32 sB32 aB wg 2x8x2 kr ca3 ks64 af dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {905358, -142888, 156169, 708291, 0, 0, 2.80808, 2.70632, 5.86735, 16.3366, 0.0116269, 0.00613968, 0.00837311, 0.898385, 1.42173, 1.18156, 1.07791e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{4, 4, 1}, "Inpxy"}, "sB32 sB64x2 aB wg 4x8 ca4 ks128 xaf st dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.00948e+06, 476755, 0, 0, 0, 0, 3.40086, 2.5695, 6.47179, 16.9, 0.0122444, 0.00343553, 0.0130855, 0.923057, 1.31761, 1.18319, 6.35596e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "IPnxy"}, "sB64 sB32 aB wg 4x8 cab3 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 11.0509, 6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4x2 aB8/4x2 aB wg 4x4x4 kr cab4 ks16 nse di bo bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 49152, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {969115, 583269, 188510, 252220, 0, 0, 3.42328, 4.82691, 4.54757, 13.174, 0.0460941, 0.043499, 0.00642526, 0.922341, 1.28419, 1.04574, 3.16587e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8 aB wg 4x1x8 kr cb4 ks16 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 1, 8}, 1, (WGType) 1, 261, 16384, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {925117, 381536, 217388, 254033, 0, 0, 3.37631, 5.16271, 2.19276, 9.90428, 0.0485492, 0.0129792, 0.042163, 0.629897, 1.27615, 1.05045, 4.02882e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32x2 aB32x2 aB wg 8x8 cab4 ks32 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 
4}, {true, true, true}}, {'E', 17, {563627, 1.13673e+06, 0, 0, 0, 0, 4.3956, 9.25108, 5.16051, 13.8631, 0.0539423, 0.0539423, 0, 1, 1.21113, 1.20077, 9.36221e-15}}}, @@ -386,13 +395,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Inxy"}, "sB32 sB32 aB wg 8x4 cab4 ks32 xaf st dw vav bo sr bk0 grf256 sys pab l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.01391e+06, 509797, 0, 0, 0, 0, 3.18295, 3.81981, 6.57362, 18.2308, 0.0132914, 0.00669773, 0.00903364, 0.924525, 1.25888, 1.19572, 2.0617e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB32 sB32 aB wg 4x4x2 kr cab4 ks32 xaf st dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {588062, -204062, 489793, 781400, 0, 0, 3.40801, 3.41401, 5.96958, 15.8543, 0.0153168, 0.00865183, 0.0102862, 0.874684, 1.30067, 1.19121, 4.28268e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB32 aB wg 4x8 cab4 ks64 xaf dw vav bo sr bk0 grf256 sys pab l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.03222e+06, 489032, 0, 0, 0, 0, 3.6293, 4.30606, 6.4398, 16.8446, 0.0180904, 0.00975268, 0.0127029, 0.887231, 1.21983, 1.20127, -3.55483e-15}}}, -{{'E', 
"gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01594e+06, 428512, 105061, 111208, 0, 0, 4.57657, 4.56121, 3.76336, 13.6659, 0.0951637, 0.0550303, 0.0735522, 1, 1.21613, 1.20157, 3.14445e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 
65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, 
{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08485e+06, 534090, 0, 0, 0, 0, 2.26038, 2.28822, 6.54241, 16.5807, 0.0108941, -0.00616224, 0.023865, 0.743702, 1.35093, 1.1815, 6.50109e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 8x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {546628, 1.16417e+06, 0, 0, 0, 0, 2.69496, 2.82552, 5.90182, 15.4398, 0.0165818, 0.0165818, 0, 1, 1.23827, 1.20187, -6.19745e-15}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB128 sB64 aB wg 8x4 cab4 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.18851e+06, 476513, 0, 0, 0, 0, 2.34744, 2.36664, 5.65933, 14.5041, 0.0251992, 0.0179545, 0.0233066, 0.991699, 1.2182, 1.2025, -6.17732e-15}}}, @@ -412,8 +414,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2 aB8x2 aB wg 8x8 cab4 ks16 nse di bo sr bk0 sm sn grf256 dm l4", 
{8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {582554, 1.48982e+06, 0, 0, 0, 0, 3.76758, 3.77334, 6.62494, 17.6039, 0.0363451, 0.0363451, 0, 1, 1.20147, 1.02529, 3.01473e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 4, 1}, "xyI"}, "sS32x2 sB32 aB wg 16x2 cb4x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {64, 4, 4}, {false, false, true}}, {'E', 17, {1.06789e+06, 533234, 0, 0, 0, 0, 2.49913, 2.42261, 6.47452, 17.5868, 0.00865422, 0.00164569, 0.0106404, 0.852414, 1.37422, 1.18475, 8.41776e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB64 sB32 aB wg 8x4 cab3 ks64 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.0553e+06, 523707, 0, 0, 0, 0, 2.27443, 2.47386, 6.75043, 17.4507, 0.00973723, 0.0173445, 0.00450943, 0.936605, 1.39613, 1.18223, 6.42735e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "B", "S"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs di sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 4096, 0, {1, 1, 4}, {false, false, false}}, {'E', 17, {1.01946e+06, 85192.9, 0, 0, 0, 0, 3.69873, 4.09617, 6.42674, 17.041, 0.0424222, 0.0270009, 0.0195008, 0.698122, 1.40723, 1.13886, 7.68905e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB16 aB wg 4x8 cab4 ks16 nse di bo sr bk0 sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.31114e+06, 828815, 0, 0, 0, 0, 4.12868, 4.53677, 6.35113, 17.6714, 0.04015, 0.0278237, 0.024414, 0.810338, 1.20421, 1.02447, 3.99205e-12}}}, {{'E', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 nse di sb64 bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 4}, {4, 8, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, @@ -508,8 +508,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"B", "B", "S"}, 
{"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879752, 350276, 0, 0, 0, 0, 0.455042, 18.3558, 0.734906, 2.00265, 0.0538244, 0.0538244, 0, 0.0478715, 1.11789, 0.996902, 3.19089e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sr br sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {892682, 628982, 0, 0, 3.18669e+06, 6.10304e+06, 1.35529, 2.79397, 0.639478, 1.11864, 0.062606, 0.062606, 0, 1, 1.00571, 1.0015, -7.07015e-15}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sr br sb32 bk0 sm grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.27607e+06, 520636, 0, 0, 3.24403e+06, 3.02285e+06, 1.78044, 2.04816, 0.439647, 0.861601, 0.0628166, 0.0628166, 0, 1, 1.00681, 0.891913, 8.19094e-13}}}, -{{'F', "gemm", {"B", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, 
(LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, -{{'F', "gemm", {"B", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03592e+06, 704672, 0, 0, 7.32365e+06, 1.08544e+07, 0.894904, 1.09998, 0.983005, 1.70679, 0.00421397, 0.00421397, 0, 1, 1.62696, 1.15394, 2.31737e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00518e+06, 647557, 0, 0, 
5.6361e+06, 8.82278e+06, 0.824036, 1.52239, 1.05594, 1.7661, 0.00544276, 0.00544276, 0, 0.821492, 1.58157, 1.1307, 1.25438e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01546e+06, 551460, 0, 0, 5.50994e+06, 5.39853e+06, 0.786066, 1.60403, 1.03564, 1.72987, 0.00627267, 0.00627267, 0, 0.945912, 1.4299, 1.12176, 2.119e-12}}}, @@ -584,34 +582,41 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, {{'F', "gemm", {"B", "B", 
"S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m16@12 aS8+m16@12 aB wg 8x4 kc8 nse hi pt sr br sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {874480, 814710, 0, 0, 3.14573e+06, 1.04694e+07, 1.59572, 1.73308, 0.796063, 1.18049, 0.0626559, 0.0626559, 0, 1, 1.00457, 1.00258, -3.94402e-14}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn 
grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 
32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs di sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {705599, 1.02394e+06, 0, 0, 0, 0, 0.719208, 0.662079, 1.08787, 2.05052, 0.00435156, 0.00435156, 0, 0.998842, 1.73144, 1.10326, 2.79532e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.2896e+06, 327092, 0, 0, 0, 0, 1.64753, 1.69722, 1.01172, 1.48812, 0.0145767, 0.000763122, 0.0157325, 0.871871, 1.01157, 1.00431, 1.22757e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sr sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {888082, 626619, 0, 0, 1.5319e+07, 2.15941e+07, 3.12178, 1.38894, 4.6861, 5.03272, 0.0626843, 0.0626843, 0, 1, 1.00285, 1.00112, -2.5704e-15}}}, +{{'F', 
"gemm", {"B", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt 
sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 
16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 
2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, -{{'F', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, -{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, -{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 
1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "BIp"}, "aB16+m32@48 am32+S32@64 aB wg 4x8 xaf st vav hi pt ca4x2 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {953955, 796880, 0, 0, 6.54705e+06, 1.02728e+07, 0.868475, 0.996885, 0.927331, 1.55842, 0.00423299, 0.00423299, 0, 0.997407, 1.58612, 1.11537, 2.43106e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 
0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, -{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, +{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, 
"aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, +{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 
262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, -{{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 
0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, -{{'F', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, +{{'F', "gemm", {"H", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "F", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, 
"at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, +{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, 
{(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"H", "H", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 aB vav di sys grf256 af hi pt wg 4x8 sb256 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, {{'F', "gemm", {"H", "H", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 aB vav sb256 wg 4x8 di bo pt sys bk0 sr br", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {256}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@72 am32+m32@64 aB wg 4x8 xaf vav di hi pt sr 
br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {903365, 697556, 0, 0, 8.2903e+06, 1.21651e+07, 0.724506, 0.722081, 0.92287, 1.55416, 0.00402055, 0.00402055, 0, 0.997691, 1.6726, 1.18622, 5.18793e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m16@64 am32+m32@72 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {876646, 564122, 0, 0, 6.5151e+06, 7.83974e+06, 0.629669, 0.87362, 0.885543, 1.48097, 0.00440774, 0.00440774, 0, 1, 1.66234, 1.24996, 2.85794e-12}}}, @@ -650,7 +655,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, 
{1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iv"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {882414, 629829, 0, 0, 2.86065e+06, 5.90643e+06, 1.12485, 2.37649, 0.641517, 1.11886, 0.0675119, 0.0675119, 0, 1, 1.00282, 1.00087, -2.48723e-15}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {899247, 559339, 0, 0, 3.09248e+06, 3.03104e+06, 1.69164, 2.03658, 0.425236, 0.833504, 0.0686503, 0.0686503, 0, 1, 1.0052, 0.967023, 2.81088e-13}}}, -{{'F', "gemm", {"H", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, 
{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, @@ -685,12 +689,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, 
{4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13361e+06, -87922.6, -10020.2, 138891, 2.8672e+06, 1.83501e+06, 0.818937, 0.8499, 0.851117, 1.53666, 0.0324244, 0.0285585, 0.0107132, 0.756667, 1.09867, 0.84864, 1.283e-11}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2+m8@12 aB8+m16@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {872740, 634178, 0, 0, 2.88358e+06, 5.61971e+06, 1.27345, 2.56135, 0.654382, 1.05183, 0.0667366, 0.0667366, 0, 1, 1.0025, 1.03224, -1.62343e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, 
(LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "F", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 
8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 
0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m32@32 am32+m64@48 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {879324, 573732, 0, 0, 5.95558e+06, 8.37222e+06, 0.790222, 0.904604, 0.883707, 1.48127, 0.0050314, 0.0050314, 0, 0.985529, 1.4878, 1.23641, 2.41747e-12}}}, @@ -730,18 +728,33 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, 
-1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m16@12 aS8+m16@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {878230, 813623, 0, 0, 3.31776e+06, 9.39622e+06, 1.5936, 1.75404, 0.798399, 1.185, 0.0656902, 0.0656902, 0, 1, 1.00344, 1.00135, -9.21479e-15}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, 
{262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17895e+06, 343529, 0, 0, 0, 0, 1.63411, 1.77325, 
1.00531, 1.48275, 0.0145617, 0.000936039, 0.0155971, 0.877282, 1.01034, 1.0048, 9.67486e-14}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs di sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {971283, 550381, 0, 0, 1.25911e+07, 8.13466e+06, 2.33425, 1.5484, 4.45361, 4.80392, 0.0689395, 0.0689395, 0, 1, 1.00725, 0.817916, 1.37423e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 aB vav di sys grf256 af hi pt wg 4x8 sb512 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, 
(LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 
0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, +{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, 
+{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, {{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 
255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, 
{8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, +{{'F', 
"gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, 
-1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 
8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 aB vav di sys grf256 af hi pt wg 4x8 sb512 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m128@96 am64+m64@96 aB wg 4x8 xaf st vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886295, 679810, 0, 0, 6.14973e+06, 1.05103e+07, 0.387882, 0.34723, 0.844766, 1.28789, 0.00202312, 0.00202312, 0, 0.99971, 1.60161, 1.05949, 2.70909e-12}}}, {{'F', "gemm", {"O", "O", "I"}, 
{"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m64@128 am64x2+m64@128 aB wg 4x8 xaf vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {878684, 628439, 0, 0, 5.63446e+06, 8.38042e+06, 0.326785, 0.347891, 0.805503, 1.24882, 0.00204557, 0.00204557, 0, 1, 1.50403, 1.06152, 3.02972e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m128@64 aB wg 4x8 af vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 64}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {893468, 542180, 0, 0, 4.62356e+06, 6.62733e+06, 0.391937, 0.505587, 0.774621, 1.23145, 0.00235858, 0.00235858, 0, 0.996539, 1.5023, 1.05328, 2.52696e-12}}}, @@ -779,16 +792,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "i"}, "av8 am16x2+m32@24 aB wg 4x8 ca3 ks32 nse hi pt sr br bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {970346, 653718, 0, 0, 3.67821e+06, 9.17504e+06, 0.766571, 0.880554, 0.876014, 1.54472, 0.0160172, 0.0160172, 0, 1, 1.11467, 0.986994, 1.14313e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#i"}, "aB32 aB32 aB wg 4x8 cab3 ks32 nse hi pt bk0 
grf256 kv afb sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.54615e+06, 742084, 0, 0, 3.95674e+06, 8.85555e+06, 0.656903, 1.11296, 0.865967, 1.49351, 0.0166951, 0.0166951, 0, 1, 1.09758, 0.989116, 8.61864e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB16 aB16 aB wg 4x8 cab3 ks32 nse hi pt bk0 grf256 kv afb sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.01957e+06, 906366, 0, 0, 4.13696e+06, 8.73267e+06, 1.51183, 1.36399, 0.870951, 1.51109, 0.0167461, 0.0167461, 0, 1, 1.09321, 0.977493, 1.29851e-12}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, 
-20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, -{{'F', "gemm", 
{"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse 
hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32x2+m32@64 av32+m32@64 aB wg 4x8 cb3 ks32 xaf st vav di hi pt sr br bk0 nb 0x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {968994, 716920, 0, 0, 5.78437e+06, 9.22419e+06, 0.534082, 0.742409, 0.894677, 1.49963, 0.00222583, 0.00222583, 0, 0.887893, 1.57089, 1.08446, 1.91309e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32x2+m128@64 av64+m32@64 aB wg 4x4x2 kr cb3 ks64 xaf vav di hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, 
{1048576, 524288, 64}, {64, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.48013e+06, -879245, -212784, 1.19787e+06, 3.27516e+06, 5.10362e+06, 0.353883, 0.703019, 0.911615, 1.47833, 0.00312427, 0.000484667, 0.00281171, 0.593822, 1.43189, 1.02998, 1.76713e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@128 av64+m32@128 aB wg 4x8 cab4x2 ks64 af vav di hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04641e+06, 551646, 0, 0, 0, 0, 0.775015, 0.878808, 0.742373, 1.49937, 0.00558315, 0.00558315, 0, 1, 1.29356, 0.970044, 1.0419e-12}}}, @@ -821,16 +824,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "i"}, "av32 aS32+m32@8 aB wg 4x8 cab4 ks32 nse hi pt bk0 sn nb 4x8 grf256 kv afb sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.4409e+06, 852858, 0, 0, 4.12877e+06, 8.43776e+06, 0.61084, 0.782227, 0.876523, 1.49805, 0.0164756, 0.0164756, 0, 1, 1.10945, 0.995623, 8.03063e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#i"}, "aB8x2 aS8x2 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, 
{1.04685e+06, 855332, 0, 0, 3.9977e+06, 9.216e+06, 0.658789, 0.718726, 0.867169, 1.54002, 0.0168389, 0.0168389, 0, 1, 1.09776, 0.992762, 7.01917e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB8/4 aS8 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {991806, 971407, 0, 0, 4.04685e+06, 9.01939e+06, 1.72245, 1.50029, 0.860423, 1.51374, 0.0176793, 0.0176793, 0, 1, 1.07978, 0.979007, 6.31428e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 
8.14061e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af 
vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 
32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, -{{'F', "gemm", {"O", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at16x2+m16@32 at16+m32@32 aB wg 16x1x2 kr kc16 nse nmk li pt sr sb256 bk0 sm grf256 kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {262144, 65536, 32}, {16, 4, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m128@96 am64+m64@112 aB wg 4x8 xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m128@96 am64+m64@128 aB wg 8x4 xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {895996, 619115, 0, 0, 5.44113e+06, 8.45414e+06, 0.385983, 0.342777, 
0.788529, 1.22228, 0.00199536, 0.00199536, 0, 1, 1.60168, 1.14344, 3.24663e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m64@64 am64+m128@96 aB wg 8x4 xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {873940, 565190, 0, 0, 4.58752e+06, 7.00416e+06, 0.375538, 0.421029, 0.747052, 1.19407, 0.00251587, 0.00251587, 0, 1, 1.69069, 1.13204, 1.62736e-12}}}, @@ -868,25 +861,31 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@96 aB128 aB wg 1x4x8 kr af vav di li pt sr br sb128 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aS32+S64@96 aB64+S32@96 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 64}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.18068e+06, 53960.8, 0, 0, 0, 0, 0.213645, 3.70034, 2.54839, 9.45781, 0.067707, 0.0150101, 0.0808417, 1, 1.00383, 0, 0}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m32@32 aB8x2+m16@32 aB wg 8x2 cb3 ks32 nse hi pt sr br bk0 sm sn grf256 kv afb l4 dm", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 441, 12288, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.43403e+06, 452176, 0, 0, 4.48102e+06, 9.67475e+06, 0.486151, 0.555121, 0.890086, 1.40877, 0.0164251, 0.000123601, 0.0164842, 0.490983, 1.14664, 1.00079, 8.17626e-13}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABpqI"}, "am64+S1,64@128 av64+B64@128 aS cs di sys grf256 af wg 8x4 bo sb512 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32 aS32 aB sys grf256 cab2 wg 4x4 ek l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, 
{8192, 8192, 2048}, {48, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.16401e+06, 348644, 0, 0, 0, 0, 0.807306, 0.892675, 0.990554, 1.4802, 0.00939438, 0.000733543, 0.0109328, 0.899502, 1.01113, 1.00523, 2.16142e-14}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 aB wg 8x4 cab3 ks32 nse hi pt sr bk0 grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.02524e+06, 922411, 0, 0, 4.21888e+06, 8.7081e+06, 0.917542, 0.658478, 0.919692, 1.40366, 0.0167399, 0.0167399, 0, 1, 1.08933, 0.991295, 6.92853e-13}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 
0.229788, 1.06051e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 
2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at16x2+m16@32 at16+m32@32 aB wg 16x1x2 kr kc16 nse nmk li pt sr sb256 bk0 sm grf256 kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {262144, 65536, 32}, {16, 4, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, 
{1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ip"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav di hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 
5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, -1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, - 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, -2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys 
kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+S1,16@24 aS16+S32@16 aB wg 2x2x8 kr vav hi pt sr sb256 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {16, 32, 32}, {4, 4, 2}, 1, 
(WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@16 aB8+B8@16 aU vav di wg 8x4 bo pt sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, +{{'F', "gemm", {"S", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "gemm", {"S", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB8+B8@8 aB nse di wg 4x8 bo pt sb256 kc8 bk0 sr", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m8@32 aS32+m16@40 aB wg 4x4 kc16 nse di hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08792e+06, 260070, 0, 0, 0, 0, 1.27159, 2.25336, 0.633711, 1.35704, 0.0632943, 0.00105479, 0.0694168, 0.543903, 1.15915, 0.195161, 2.93818e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 am/S16+S32@32 aB wg 4x8 kc8 nse di hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {933004, 470490, 0, 0, 0, 0, 2.49562, 3.97982, 0.810184, 1.38841, 0.0630776, 0.0630776, 0, 1, 1.22055, -0.309162, 2.6504e-11}}}, @@ -927,7 +926,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8+m8@32 am16x2+m16@16 aB wg 2x8 kc8 nse di hi pt sr sb256 bk0 grf256 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, 
{'E', 17, {1.09546e+06, 254169, 0, 0, 0, 0, 1.63097, 3.84728, 1.50643, 2.37923, 0.0742032, 0.0154716, 0.0594372, 0.827878, 1.42273, 0.913754, 1.12977e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "aB8+m8@8 aB32+m32@8 aB wg 8x1x4 kr kc8 nse di li nmk pt sr sb32 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {524288, 65536, 32}, {32, 4, 32}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18493e+06, -151099, 5931.1, 262767, 2.36749e+06, 0, 0.843802, 28.5958, 1.39394, 2.57371, 0.107758, 0.0474906, 0.0797103, 0.994029, 1.25885, 0.690983, 1.3586e-11}}}, -{{'F', "gemm", {"S", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+S1,8@56 am/S8x2+S16@56 aB wg 4x8 kc8 nse di hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {900022, 588464, 0, 0, 0, 0, 2.11582, 2.1833, 0.547862, 1.23354, 0.0626136, 0.0626136, 0, 1, 1.1599, -0.725344, 3.06459e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+m32@24 am/S16+m8@32 aB wg 8x4 kc8 nse di hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {893971, 565638, 0, 0, 0, 0, 2.10566, 2.60265, 0.8239, 1.43017, 0.072004, 0.072004, 0, 0.497868, 1.18996, 0.90853, 4.46328e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16+m16@64 am/S16x2+m32@48 aB wg 4x4 kc16 nse di hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.30686e+06, 256588, 0, 0, 0, 0, 1.11607, 1.55713, 0.941507, 2.42007, 0.0726522, 0.00111933, 0.0659252, 0.759172, 1.32447, 0.944187, 7.04547e-12}}}, @@ -947,7 +945,6 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "am32+m32@32 at8x2+m16@24 aS wg 1x8x4 kr kc8 nse li pt sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, 
(LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {16384, 262144, 32}, {1, 16, 32}, {1, 8, 4}, 1, (WGType) 1, 413, 0, 512, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.26572e+06, -129216, -67284, 239862, 2.15941e+06, 0, 24.5032, 0.852871, 5.82167, 16.1942, 0.788258, 0.295269, 0.225216, 0.354473, 1.07957, 0, 0}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+m16@24 aS8x2+m32@24 aB wg 2x4x4 kr kc8 nse di hi pt sb256 bk0 sm sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {964742, 420594, -264.455, 102391, 0, 0, 1.36495, 1.22335, 2.7783, 6.70529, 0.075127, 0.075127, 0, 1, 1.36799, 0.837729, 1.63216e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8+S1,16@32 at16+S8@32 aB wg 4x4x2 kr kc8 nse di hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07352e+06, 346157, 1765.64, 296014, 0, 0, 1.38106, 2.15672, 0.936372, 2.08073, 0.062816, 0.062816, 0, 1, 1.1961, 0.493637, 1.23641e-11}}}, -{{'F', "gemm", {"S", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 
0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aB8/4+B8@32 aU nse di wg 8x4 bo pt kc8 sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "am16+B16@20 am16+m16@20 aB wg 4x8 xaf st rr vav hi pt sr br sb32 bk0 sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {870519, 757436, 0, 0, 7.0656e+06, 9.74029e+06, 1.61519, 1.51882, 0.777223, 1.2159, 0.00794099, 0.00794099, 0, 1, 1.52399, 1.13177, 3.83469e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "am16+B8@32 am32+m16@32 aB wg 4x8 af rr vav hi pt sr br sb32 bk0 sn grf256 sys kv afb np", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 32}, {64, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {881335, 689481, 0, 0, 4.45891e+06, 7.58579e+06, 1.46799, 1.88849, 0.735162, 1.19884, 0.00844968, 0.00844968, 0, 1, 1.56029, 1.15624, 3.29649e-12}}}, diff --git a/src/gpu/intel/jit/gemm/kernel_catalog.hpp b/src/gpu/intel/jit/gemm/kernel_catalog.hpp index a2300d92079..d9bcc0dafeb 100644 --- a/src/gpu/intel/jit/gemm/kernel_catalog.hpp 
+++ b/src/gpu/intel/jit/gemm/kernel_catalog.hpp @@ -104,7 +104,8 @@ struct Selector { friend bool operator<(const Selector &sel1, const Selector &sel2) { auto tupleize = [](const Selector &sel) { return std::make_tuple(sel.hw, sel.precisions[0][0] & 0x1F, - sel.layouts[0][0], sel.layouts[1][0]); + sel.precisions[1][0] & 0x1F, sel.layouts[0][0], + sel.layouts[1][0]); }; return tupleize(sel1) < tupleize(sel2); }; From ea63c95c25f24b57cc43c539c9cd4eeff9974b80 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Wed, 8 May 2024 09:38:22 -0700 Subject: [PATCH 008/187] gpu: jit: gemm: more Xe2 block 2D fallback paths --- src/gpu/intel/jit/gemm/kernel.db | 142 +++++++++++++++---------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index a627cbee965..8fe264a92e6 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -525,19 +525,19 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav di li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {774823, -159604, 298303, 251381, 2.48054e+06, 3.06135e+06, 0.49199, 4.0514, 0.750455, 1.55245, 0.0179351, 0.0189282, 0.0113946, 1, 1.20887, 0.775438, 7.38219e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav di hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, 
{4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04733e+06, 850139, 0, 0, 7.14342e+06, 1.02482e+07, 1.35439, 1.1268, 0.968002, 1.56332, 0.0049908, 0.0049908, 0, 1, 1.5967, 1.09547, 2.65254e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.62752e+06, -1.34654e+06, -263082, 1.70692e+06, 5.27729e+06, 7.17619e+06, 1.02404, 1.43681, 1.0099, 1.67277, 0.00669992, 0.000934616, 0.00604576, 0.813201, 1.39102, 1.03493, 5.39508e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07446e+06, 778165, 0, 0, 0, 0, 1.04453, 2.06434, 0.993405, 1.67185, 0.00809777, 0.00809777, 0, 0.966922, 1.35225, 0.983739, 2.64748e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66893e+06, -477403, -298914, 
842863, 4.01408e+06, 4.51379e+06, 1.32266, 0.980356, 0.957919, 1.60267, 0.00874929, 0.000443986, 0.00826981, 0.601212, 1.33967, 0.974327, 4.12771e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06612e+06, 307476, 2776.1, 398222, 0, 0, 1.55249, 0.974457, 0.947265, 1.9437, 0.00891094, 0.00891094, 0, 0.999859, 1.35378, 0.955966, 4.71609e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08109e+06, 676708, 0, 0, 0, 0, 1.46721, 1.49766, 0.911352, 1.83399, 0.0123018, 0.0123018, 0, 0.930092, 1.22199, 0.994137, 1.11339e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07446e+06, 778165, 0, 0, 0, 0, 1.04453, 2.06434, 0.993405, 1.67185, 0.00809777, 0.00809777, 0, 0.966922, 1.35225, 0.983739, 2.64748e-12}}}, +{{'F', "gemm", {"B", 
"B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66893e+06, -477403, -298914, 842863, 4.01408e+06, 4.51379e+06, 1.32266, 0.980356, 0.957919, 1.60267, 0.00874929, 0.000443986, 0.00826981, 0.601212, 1.33967, 0.974327, 4.12771e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06612e+06, 307476, 2776.1, 398222, 0, 0, 1.55249, 0.974457, 0.947265, 1.9437, 0.00891094, 0.00891094, 0, 0.999859, 1.35378, 0.955966, 4.71609e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08109e+06, 676708, 0, 0, 0, 0, 1.46721, 1.49766, 0.911352, 1.83399, 0.0123018, 0.0123018, 0, 0.930092, 1.22199, 0.994137, 1.11339e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, 
"aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53714e+06, -226698, -110555, 381984, 4.01408e+06, 3.08838e+06, 1.02014, 1.2588, 0.99644, 1.73352, 0.0120789, 0.00866919, 0.00421095, 0.619534, 1.33753, 0.955156, 5.93572e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav di hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08374e+06, 622037, -3355.41, 197096, 0, 0, 1.60574, 1.79337, 1.16087, 2.52873, 0.0170908, 0.0170908, 0, 0.954666, 1.0389, 0.191897, 4.8148e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08079e+06, 618844, 0, 0, 0, 0, 1.25013, 3.49017, 1.047, 2.28189, 0.0251551, 0.0251551, 0, 0.955754, 1.00322, 1.00053, 1.67181e-14}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4", {16, (LoopType) 255, 256, 
{(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.59894e+06, -165886, -123352, 328698, 3.3538e+06, 2.68698e+06, 1.29378, 0.935395, 0.914424, 1.63719, 0.0161479, 0.0115282, 0.00624899, 0.60152, 1.23882, 0.978417, 2.1418e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07496e+06, 610685, -736.443, 135973, 0, 0, 2.10737, 1.71053, 1.57349, 3.40087, 0.0274909, 0.0274909, 0, 0.932509, 1.00516, 0.683467, 2.27771e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46165e+06, -119762, -43325.5, 189248, 3.71917e+06, 2.31834e+06, 0.816729, 1.39844, 0.648548, 1.38561, 0.0254865, 0.0233101, 0.00872739, 0.939824, 1.2192, -0.0765997, 2.67484e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 
32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07937e+06, 601693, -1889.77, 111799, 0, 0, 1.25642, 1.87167, 2.39915, 4.97536, 0.0272706, 0.0272706, 0, 0.945164, 1.20671, 0.942435, 2.33159e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08079e+06, 618844, 0, 0, 0, 0, 1.25013, 3.49017, 1.047, 2.28189, 0.0251551, 0.0251551, 0, 0.955754, 1.00322, 1.00053, 1.67181e-14}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.59894e+06, -165886, -123352, 328698, 3.3538e+06, 2.68698e+06, 1.29378, 0.935395, 0.914424, 1.63719, 0.0161479, 0.0115282, 0.00624899, 0.60152, 1.23882, 0.978417, 2.1418e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07496e+06, 610685, -736.443, 135973, 0, 0, 
2.10737, 1.71053, 1.57349, 3.40087, 0.0274909, 0.0274909, 0, 0.932509, 1.00516, 0.683467, 2.27771e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46165e+06, -119762, -43325.5, 189248, 3.71917e+06, 2.31834e+06, 0.816729, 1.39844, 0.648548, 1.38561, 0.0254865, 0.0233101, 0.00872739, 0.939824, 1.2192, -0.0765997, 2.67484e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07937e+06, 601693, -1889.77, 111799, 0, 0, 1.25642, 1.87167, 2.39915, 4.97536, 0.0272706, 0.0272706, 0, 0.945164, 1.20671, 0.942435, 2.33159e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav di hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.0392e+06, 894929, 0, 0, 0, 0, 1.62399, 5.20002, 2.09792, 4.09594, 0.063358, 0.063358, 0, 0.952898, 1.11995, 0.921533, 1.32456e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav di li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.4553e+06, -444405, -228349, 795645, 2.85082e+06, 2.41664e+06, 0.798343, 8.5823, 0.957266, 1.78402, 0.0570677, 0.00285529, 0.0528963, 0.711108, 1.00269, 0.593861, 3.05276e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, 
"aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03388e+06, 205982, 0, 0, 2.69353e+06, 2.9311e+06, 5.87694, 0.62071, 0.59549, 1.16678, 0.0302158, 0.00216455, 0.029364, 0.590076, 1.35965, 0.82526, 6.19021e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.12446e+06, -88323.8, -10316.2, 139507, 2.89997e+06, 1.7449e+06, 0.820255, 0.848685, 0.849377, 1.53444, 0.0324984, 0.0285972, 0.0106938, 0.747769, 1.17424, 0.248229, 3.53369e-11}}}, @@ -564,20 +564,20 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav di li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03147e+06, 232150, 0, 0, 2.25853e+06, 2.39288e+06, 0.636799, 3.56799, 1.54923, 2.28375, 0.0371495, 0.00331608, 0.0414442, 0.906021, 1.00388, 1.00126, 2.00587e-14}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv 
afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05957e+06, 973070, 0, 0, 7.52845e+06, 1.03301e+07, 0.801208, 0.831268, 0.985204, 1.59017, 0.00445847, 0.00445847, 0, 0.989412, 1.66875, 1.16553, 2.2562e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07221e+06, 859869, 0, 0, 5.8327e+06, 7.83974e+06, 0.766638, 1.10644, 1.03215, 1.48351, 0.00498767, 0.00498767, 0, 0.940961, 1.6724, 1.16768, 2.0947e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {997268, 779168, 0, 0, 4.36634e+06, 5.75078e+06, 0.740136, 1.4932, 1.0067, 1.5422, 0.00624117, 0.00624117, 0, 0.962734, 1.45555, 1.07753, 2.97255e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 
1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79219e+06, -531031, -361044, 940628, 4.07142e+06, 4.00589e+06, 0.824906, 0.816235, 1.02815, 1.50776, 0.00733025, 0.00114441, 0.00653515, 0.764373, 1.47826, 1.10061, 2.80374e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00274e+06, 774511, 0, 0, 0, 0, 0.955103, 0.935315, 0.976805, 1.72673, 0.00884678, 0.00884678, 0, 1, 1.29366, 0.922321, 3.85241e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73971e+06, -258868, -158599, 451327, 3.80928e+06, 3.05316e+06, 0.482671, 0.706304, 1.03317, 1.54125, 0.00920456, 0.00608899, 0.00408001, 0.912767, 1.36748, 1.00539, 5.47259e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIpq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, 
{1.11084e+06, 1.12066e+06, -4115.24, 188026, 0, 0, 0.763818, 1.33134, 1.06246, 2.16315, 0.0116126, 0.0116126, 0, 0.839402, 1.31397, 1.00293, 1.95132e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {965920, 744570, 0, 0, 0, 0, 0.791602, 1.36895, 0.948542, 2.16931, 0.0146708, 0.0146708, 0, 0.923434, 1.19189, 0.953727, 1.73617e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88588e+06, -200788, -193135, 423330, 3.44883e+06, 2.47398e+06, 0.572193, 0.527027, 0.933872, 1.59818, 0.0129759, 0.0095749, 0.00498263, 0.687385, 1.38427, 0.929972, 5.58992e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06355e+06, 786133, -174.663, 137254, 0, 0, 0.678583, 0.661981, 1.59904, 3.40955, 0.0149449, 0.0149449, 0, 0.904272, 
1.23465, 1.00013, 1.46897e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {994587, 706072, 0, 0, 0, 0, 1.16172, 1.12925, 1.50732, 3.11789, 0.0225454, 0.0225454, 0, 0.932581, 1.112, -1.70688, 2.19465e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63784e+06, -161637, -133730, 320507, 2.49037e+06, 1.72196e+06, 0.553135, 0.77437, 0.93673, 1.73418, 0.0199906, 0.0144326, 0.00833266, 0.7979, 1.32142, 0.974789, 3.83166e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.56172e+06, 526043, -33531.5, 75691, 0, 0, 0.616943, 0.690186, 3.77748, 8.29004, 0.0313301, 0.0313301, 0, 0.830451, 1.08692, 0.965356, 1.18354e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.28894e+06, -131687, -76296.7, 251879, 2.71155e+06, 2.04882e+06, 0.531564, 1.43928, 1.43076, 2.58493, 0.0282574, 0.0178971, 0.0205543, 1, 1.23116, -0.0703731, 1.35882e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19242e+06, -90063.1, -14616.2, 144232, 2.71974e+06, 1.70476e+06, 1.08584, 0.412189, 0.608623, 1.21556, 0.0200475, 0.0220263, 0.0153697, 1, 1.02408, 0.835925, 1.49508e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm 
sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {997268, 779168, 0, 0, 4.36634e+06, 5.75078e+06, 0.740136, 1.4932, 1.0067, 1.5422, 0.00624117, 0.00624117, 0, 0.962734, 1.45555, 1.07753, 2.97255e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79219e+06, -531031, -361044, 940628, 4.07142e+06, 4.00589e+06, 0.824906, 0.816235, 1.02815, 1.50776, 0.00733025, 0.00114441, 0.00653515, 0.764373, 1.47826, 1.10061, 2.80374e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00274e+06, 774511, 0, 0, 0, 0, 0.955103, 0.935315, 0.976805, 1.72673, 0.00884678, 0.00884678, 0, 1, 1.29366, 0.922321, 3.85241e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, 
{262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73971e+06, -258868, -158599, 451327, 3.80928e+06, 3.05316e+06, 0.482671, 0.706304, 1.03317, 1.54125, 0.00920456, 0.00608899, 0.00408001, 0.912767, 1.36748, 1.00539, 5.47259e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.11084e+06, 1.12066e+06, -4115.24, 188026, 0, 0, 0.763818, 1.33134, 1.06246, 2.16315, 0.0116126, 0.0116126, 0, 0.839402, 1.31397, 1.00293, 1.95132e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {965920, 744570, 0, 0, 0, 0, 0.791602, 1.36895, 0.948542, 2.16931, 0.0146708, 0.0146708, 0, 0.923434, 1.19189, 0.953727, 1.73617e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, 
{true, true, true}}, {'E', 17, {1.88588e+06, -200788, -193135, 423330, 3.44883e+06, 2.47398e+06, 0.572193, 0.527027, 0.933872, 1.59818, 0.0129759, 0.0095749, 0.00498263, 0.687385, 1.38427, 0.929972, 5.58992e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06355e+06, 786133, -174.663, 137254, 0, 0, 0.678583, 0.661981, 1.59904, 3.40955, 0.0149449, 0.0149449, 0, 0.904272, 1.23465, 1.00013, 1.46897e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {994587, 706072, 0, 0, 0, 0, 1.16172, 1.12925, 1.50732, 3.11789, 0.0225454, 0.0225454, 0, 0.932581, 1.112, -1.70688, 2.19465e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63784e+06, -161637, -133730, 320507, 2.49037e+06, 1.72196e+06, 0.553135, 0.77437, 0.93673, 1.73418, 
0.0199906, 0.0144326, 0.00833266, 0.7979, 1.32142, 0.974789, 3.83166e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.56172e+06, 526043, -33531.5, 75691, 0, 0, 0.616943, 0.690186, 3.77748, 8.29004, 0.0313301, 0.0313301, 0, 0.830451, 1.08692, 0.965356, 1.18354e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.28894e+06, -131687, -76296.7, 251879, 2.71155e+06, 2.04882e+06, 0.531564, 1.43928, 1.43076, 2.58493, 0.0282574, 0.0178971, 0.0205543, 1, 1.23116, -0.0703731, 1.35882e-11}}}, +{{'F', "gemm", 
{"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19242e+06, -90063.1, -14616.2, 144232, 2.71974e+06, 1.70476e+06, 1.08584, 0.412189, 0.608623, 1.21556, 0.0200475, 0.0220263, 0.0153697, 1, 1.02408, 0.835925, 1.49508e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, 
(LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, @@ -594,12 +594,12 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, {{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, {{'F', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, 
{32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, {{'F', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 
0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "BIp"}, "aB16+m32@48 am32+S32@64 aB wg 4x8 xaf st vav hi pt ca4x2 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {953955, 796880, 0, 0, 6.54705e+06, 1.02728e+07, 0.868475, 0.996885, 0.927331, 1.55842, 0.00423299, 0.00423299, 0, 0.997407, 1.58612, 1.11537, 2.43106e-12}}}, -{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, 
"#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, 
(LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, @@ -610,7 +610,7 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 
246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"H", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, -{{'F', "gemm", {"H", "F", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "F", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, 
{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, {{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, {{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, @@ -672,19 +672,19 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav di li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {726421, -157896, 326311, 250406, 2.55754e+06, 3.09658e+06, 0.491778, 4.05366, 0.747811, 1.5515, 0.0177586, 0.0187752, 0.0114616, 1, 1.28009, 0.929609, 5.62845e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav di hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06016e+06, 848171, 0, 0, 7.38099e+06, 1.0453e+07, 1.36089, 1.12404, 0.968578, 1.56365, 0.00464109, 0.00464109, 0, 1, 1.59464, 1.11033, 6.56771e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.60943e+06, -1.34483e+06, -253630, 1.70607e+06, 5.342e+06, 7.41376e+06, 1.0168, 1.44036, 1.01244, 1.68138, 0.00675092, 0.0008524, 0.00611925, 0.798697, 1.5218, 1.14975, 5.05128e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 
ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08227e+06, 776908, 0, 0, 0, 0, 1.04569, 2.05877, 0.996213, 1.67003, 0.00827411, 0.00827411, 0, 0.961616, 1.45528, 1.11502, 2.00546e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63235e+06, -473591, -275698, 840536, 4.04275e+06, 4.34176e+06, 1.32077, 0.982863, 0.96265, 1.60044, 0.00888121, 0.000531032, 0.00822524, 0.585331, 1.48926, 0.995812, 5.75876e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09218e+06, 305991, -14323.3, 399143, 0, 0, 1.5323, 0.974695, 0.948184, 1.94112, 0.00890454, 0.00890454, 0, 1, 1.48844, 1.02749, 3.65821e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, 
{524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07062e+06, 676936, 0, 0, 0, 0, 1.46521, 1.49357, 0.913257, 1.83797, 0.0123376, 0.0123376, 0, 0.978477, 1.29593, 1.02611, 1.67976e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08227e+06, 776908, 0, 0, 0, 0, 1.04569, 2.05877, 0.996213, 1.67003, 0.00827411, 0.00827411, 0, 0.961616, 1.45528, 1.11502, 2.00546e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63235e+06, -473591, -275698, 840536, 4.04275e+06, 4.34176e+06, 1.32077, 0.982863, 0.96265, 1.60044, 0.00888121, 0.000531032, 0.00822524, 0.585331, 1.48926, 0.995812, 5.75876e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, 
{true, true, true}}, {'E', 17, {1.09218e+06, 305991, -14323.3, 399143, 0, 0, 1.5323, 0.974695, 0.948184, 1.94112, 0.00890454, 0.00890454, 0, 1, 1.48844, 1.02749, 3.65821e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07062e+06, 676936, 0, 0, 0, 0, 1.46521, 1.49357, 0.913257, 1.83797, 0.0123376, 0.0123376, 0, 0.978477, 1.29593, 1.02611, 1.67976e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.51888e+06, -225905, -111399, 382282, 3.85024e+06, 3.13754e+06, 1.02462, 1.25826, 0.997766, 1.72944, 0.0116731, 0.00921103, 0.00406354, 0.785111, 1.49836, 0.998742, 6.32281e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav di hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07937e+06, 621635, 2230.81, 196698, 0, 0, 1.60744, 1.79127, 1.1599, 2.52429, 0.0169955, 0.0169955, 0, 0.96773, 
1.11098, 0.946816, 1.02001e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09253e+06, 617958, 0, 0, 0, 0, 1.24883, 3.48717, 1.05505, 2.27656, 0.0251952, 0.0251952, 0, 0.980002, 1.05051, 0.285902, 7.08638e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {2.50658e+06, -199989, -350122, 337287, 3.39968e+06, 2.49856e+06, 0.784682, 0.339369, 0.497498, 1.59625, 0.016336, 0.0120673, 0.0057422, 0.609408, 1.33312, 0.989535, 3.60691e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06937e+06, 611760, 4194.1, 135343, 0, 0, 2.11403, 1.71129, 1.57142, 3.39186, 0.0275862, 0.0275862, 0, 0.920705, 1.00345, 0.413363, 6.7934e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.47272e+06, -119971, -42490.2, 189240, 3.69459e+06, 2.31014e+06, 0.811882, 1.39447, 0.646674, 1.39008, 0.025457, 0.0234095, 0.00867396, 0.960377, 1.27279, 0.825982, 8.35402e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09253e+06, 617958, 0, 0, 0, 0, 1.24883, 3.48717, 1.05505, 2.27656, 0.0251952, 0.0251952, 0, 0.980002, 1.05051, 0.285902, 7.08638e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 
l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {2.50658e+06, -199989, -350122, 337287, 3.39968e+06, 2.49856e+06, 0.784682, 0.339369, 0.497498, 1.59625, 0.016336, 0.0120673, 0.0057422, 0.609408, 1.33312, 0.989535, 3.60691e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06937e+06, 611760, 4194.1, 135343, 0, 0, 2.11403, 1.71129, 1.57142, 3.39186, 0.0275862, 0.0275862, 0, 0.920705, 1.00345, 0.413363, 6.7934e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.47272e+06, -119971, -42490.2, 189240, 3.69459e+06, 2.31014e+06, 0.811882, 1.39447, 0.646674, 1.39008, 0.025457, 0.0234095, 0.00867396, 0.960377, 1.27279, 0.825982, 8.35402e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, 
{524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav di hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04437e+06, 894474, 0, 0, 0, 0, 1.61942, 5.20006, 2.09495, 4.09093, 0.063329, 0.063329, 0, 0.953584, 1.16518, 0.929804, 1.67951e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 
2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav di li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.42997e+06, -443650, -203140, 794602, 2.88358e+06, 2.22003e+06, 0.794365, 8.5854, 0.944109, 1.77614, 0.0570978, 0.00252013, 0.0532169, 0.706182, 1.0381, 0.381417, 6.1353e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13361e+06, -87922.6, -10020.2, 138891, 2.8672e+06, 1.83501e+06, 0.818937, 0.8499, 0.851117, 1.53666, 0.0324244, 0.0285585, 0.0107132, 0.756667, 1.09867, 0.84864, 
1.283e-11}}}, @@ -710,20 +710,20 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav di li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03017e+06, 232218, 0, 0, 2.06356e+06, 2.32817e+06, 0.635122, 3.56777, 1.53227, 2.28154, 0.0372029, 0.00337686, 0.0415229, 0.905521, 1.06254, 0.957223, 9.43647e-13}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06632e+06, 970822, 0, 0, 6.83377e+06, 1.0453e+07, 0.802893, 0.831622, 0.987736, 1.60125, 0.0044336, 0.0044336, 0, 0.99801, 1.7879, 1.21352, 4.70293e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06174e+06, 862966, 0, 0, 5.06266e+06, 7.90528e+06, 0.76759, 1.1074, 1.02291, 1.47362, 0.00497143, 0.00497143, 0, 0.968925, 1.63841, 1.28013, 3.04063e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {996043, 778753, 0, 0, 4.31555e+06, 5.99654e+06, 0.739151, 1.50326, 1.00767, 1.54244, 0.00616043, 0.00616043, 0, 0.997048, 1.53835, 1.17302, 3.4064e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.77071e+06, -527987, -353300, 939437, 4.14515e+06, 4.096e+06, 0.810631, 0.820929, 1.02923, 1.5067, 0.0071533, 0.000990959, 0.00658984, 0.834339, 1.70826, 1.24022, 4.35712e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00127e+06, 774753, 0, 0, 0, 0, 0.947461, 0.934863, 0.97605, 1.72158, 0.00894179, 0.00894179, 0, 0.970326, 1.50724, 1.0217, 2.9666e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm 
sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.78479e+06, -261349, -173232, 452033, 4.02227e+06, 3.01466e+06, 0.481975, 0.723918, 1.03793, 1.54262, 0.00927887, 0.00655604, 0.00397641, 0.95291, 1.53912, 1.14817, 5.51161e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIpq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.10265e+06, 1.12141e+06, -262.878, 187723, 0, 0, 0.762329, 1.33218, 1.05656, 2.16208, 0.011677, 0.011677, 0, 0.866398, 1.36265, 1.01013, 3.61321e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {972331, 751881, 0, 0, 0, 0, 0.799573, 1.37171, 0.93861, 2.1425, 0.0147472, 0.0147472, 0, 0.928068, 1.27036, 0.985348, 1.75902e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, 
{262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88533e+06, -200778, -198593, 423600, 3.39149e+06, 2.28557e+06, 0.572852, 0.529126, 0.934694, 1.58997, 0.012793, 0.00968393, 0.00496565, 0.873155, 1.53254, 0.953462, 7.13806e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.073e+06, 784464, -1329.54, 137590, 0, 0, 0.675201, 0.663483, 1.58515, 3.38345, 0.0150557, 0.0150557, 0, 0.919981, 1.2957, 0.987294, 2.89092e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00586e+06, 704834, 0, 0, 0, 0, 1.16313, 1.1353, 1.51168, 3.11912, 0.0225951, 0.0225951, 0, 0.923429, 1.15703, 0.85336, 2.75385e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, 
-158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66288e+06, -98375, -71966.2, 195025, 2.84262e+06, 2.06438e+06, 0.483255, 0.500833, 0.809319, 1.72387, 0.023295, 0.0176286, 0.00890789, 0.991568, 1.447, 0.919946, 9.68796e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05941e+06, 583908, 26225.8, 68801.4, 0, 0, 0.627148, 0.702832, 3.725, 8.29062, 0.0313015, 0.0313015, 0, 0.85187, 1.15432, 0.975457, 2.06896e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26276e+06, -130899, -63587.1, 251457, 2.73613e+06, 1.99803e+06, 0.535584, 1.4389, 1.4172, 2.56436, 0.0281714, 
0.0174514, 0.0209074, 1, 1.28388, 0.655999, 5.90369e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {996043, 778753, 0, 0, 4.31555e+06, 5.99654e+06, 0.739151, 1.50326, 1.00767, 1.54244, 0.00616043, 0.00616043, 0, 0.997048, 1.53835, 1.17302, 3.4064e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.77071e+06, -527987, -353300, 939437, 4.14515e+06, 4.096e+06, 0.810631, 0.820929, 1.02923, 1.5067, 0.0071533, 0.000990959, 0.00658984, 0.834339, 1.70826, 1.24022, 4.35712e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00127e+06, 774753, 0, 0, 0, 0, 0.947461, 0.934863, 0.97605, 1.72158, 0.00894179, 0.00894179, 0, 0.970326, 1.50724, 1.0217, 2.9666e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.78479e+06, -261349, -173232, 452033, 4.02227e+06, 3.01466e+06, 0.481975, 0.723918, 1.03793, 1.54262, 0.00927887, 0.00655604, 0.00397641, 0.95291, 1.53912, 1.14817, 5.51161e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.10265e+06, 1.12141e+06, -262.878, 187723, 0, 0, 0.762329, 1.33218, 1.05656, 2.16208, 0.011677, 0.011677, 0, 0.866398, 1.36265, 1.01013, 3.61321e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 
cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {972331, 751881, 0, 0, 0, 0, 0.799573, 1.37171, 0.93861, 2.1425, 0.0147472, 0.0147472, 0, 0.928068, 1.27036, 0.985348, 1.75902e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88533e+06, -200778, -198593, 423600, 3.39149e+06, 2.28557e+06, 0.572852, 0.529126, 0.934694, 1.58997, 0.012793, 0.00968393, 0.00496565, 0.873155, 1.53254, 0.953462, 7.13806e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.073e+06, 784464, -1329.54, 137590, 0, 0, 0.675201, 0.663483, 1.58515, 3.38345, 0.0150557, 0.0150557, 0, 0.919981, 1.2957, 0.987294, 2.89092e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 
255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00586e+06, 704834, 0, 0, 0, 0, 1.16313, 1.1353, 1.51168, 3.11912, 0.0225951, 0.0225951, 0, 0.923429, 1.15703, 0.85336, 2.75385e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66288e+06, -98375, -71966.2, 195025, 2.84262e+06, 2.06438e+06, 0.483255, 0.500833, 0.809319, 1.72387, 0.023295, 0.0176286, 0.00890789, 0.991568, 1.447, 0.919946, 9.68796e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 
1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05941e+06, 583908, 26225.8, 68801.4, 0, 0, 0.627148, 0.702832, 3.725, 8.29062, 0.0313015, 0.0313015, 0, 0.85187, 1.15432, 0.975457, 2.06896e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26276e+06, -130899, -63587.1, 251457, 2.73613e+06, 1.99803e+06, 0.535584, 1.4389, 1.4172, 2.56436, 0.0281714, 0.0174514, 0.0209074, 1, 1.28388, 0.655999, 5.90369e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 
10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, @@ -732,25 +732,25 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs di sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 
1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {971283, 550381, 0, 0, 1.25911e+07, 8.13466e+06, 2.33425, 1.5484, 4.45361, 4.80392, 0.0689395, 0.0689395, 0, 1, 1.00725, 0.817916, 1.37423e-12}}}, {{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 
16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, {{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, {{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, {{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, 
true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, {{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, {{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", 
"T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 
1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, {{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, {{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, {{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 
16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 
1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, {{'F', 
"gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
"I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, {{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, @@ -845,20 +845,20 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "IAB"}, "at32+m128@32 am128+m128@128 aB wg 8x4 af vav di li nmk pt sr br sb128 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 
255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 128}, {16, 4, 128}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01309e+06, 225543, 0, 0, 1.84812e+06, 1.93577e+06, 0.318973, 1.6272, 0.589134, 1.33561, 0.014082, 0.00175378, 0.0216268, 1, 1.06459, 0.958519, 4.93673e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32+m64@96 aB32 aB wg 8x4 cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05324e+06, 979401, 0, 0, 5.54107e+06, 7.84794e+06, 0.402313, 0.410019, 0.918697, 1.43736, 0.00226438, 0.00226438, 0, 1, 1.62189, 1.08285, 3.19321e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at32+m64@96 aB32 aB wg 16x2 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 128}, {16, 96, 128}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04068e+06, 873177, 0, 0, 3.37183e+06, 6.29965e+06, 0.384445, 0.528477, 0.860175, 1.32814, 0.00247393, 0.00247393, 0, 0.990759, 1.57684, 1.14116, 2.03781e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIpq"}, "at64+m64@128 aB32x2 aB wg 16x2 cb3 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {1, 
1, 4}, {true, true, true}}, {'E', 17, {973856, 781280, 0, 0, 3.47341e+06, 4.52198e+06, 0.376801, 0.745941, 0.834762, 1.36837, 0.00306578, 0.00306578, 0, 1, 1.548, 1.17568, 1.37045e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32x2+m64@64 aB32x2 aB wg 8x2x2 kr cb4 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.76235e+06, -528635, -346576, 928208, 2.73039e+06, 3.28253e+06, 0.411331, 0.407059, 0.82943, 1.26791, 0.00365219, 0.000634094, 0.00340305, 0.949468, 1.57605, 1.11643, 2.18464e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@96 aB64 aB wg 8x4 cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {979561, 775985, 0, 0, 0, 0, 0.472998, 0.448938, 0.785458, 1.5451, 0.00427182, 0.00427182, 0, 1, 1.48807, 0.994577, 1.6929e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32+m64@32 aB64 aB wg 8x1x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.72793e+06, -260474, -161085, 447531, 2.31834e+06, 2.89178e+06, 0.238818, 0.354785, 0.799686, 
1.20956, 0.00460907, 0.00312961, 0.00215107, 0.989106, 1.46994, 1.02912, 3.02074e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at64+m64@64 aB32 aB wg 8x2x4 kr cb4 ks32 af vav di hi pt sr br bk0 sm sn dm sys l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10055e+06, 1.06985e+06, 1310.76, 199871, 0, 0, 0.375142, 0.660095, 0.719737, 1.72017, 0.00584039, 0.00584039, 0, 1, 1.38677, 1.01044, 1.17577e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@128 aB64x2 aB wg 8x4 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00856e+06, 729582, 0, 0, 0, 0, 0.391803, 0.680643, 0.557725, 1.58519, 0.00733339, 0.00733339, 0, 0.974679, 1.32826, 0.97557, 9.11096e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at64+m64@64 aB32x2 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 64}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.84304e+06, -196719, -186130, 412502, 2.21921e+06, 2.048e+06, 0.286789, 0.263596, 0.436547, 1.25452, 0.00648641, 0.00488563, 0.00263508, 0.940889, 1.37843, 0.889564, 5.23147e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at64+m32@64 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05176e+06, 782569, -1901.36, 140619, 0, 0, 0.336566, 0.331805, 0.795679, 2.44302, 0.00752315, 0.00752315, 0, 1, 1.33186, 0.99299, 1.10267e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@128 aB128 aB wg 4x8 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.68462e+06, 614668, 0, 0, 0, 0, 0.528613, 0.557291, -0.992969, 0.00078125, 0.0113836, 0.0113836, 0, 0.96247, 1.20355, 0.897505, 1.16415e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32x2+m64@96 aB64 aB wg 4x2x4 kr cb4x2 ks128 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.63724e+06, -117354, -138376, 322423, 1.85139e+06, 1.3271e+06, 0.217482, 0.358374, -0.173415, 0.826585, 0.00954542, 0.00723565, 0.00380699, 0.869435, 1.30653, 0.807528, 6.89227e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at64+m64@96 aB64x2 aB wg 2x2x8 
kr cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.65053e+06, -99807.1, -68122.5, 193337, 2.53952e+06, 1.70394e+06, 0.244079, 0.25177, 0.0702585, 0.887446, 0.0117307, 0.00844281, 0.00495289, 0.997313, 1.45603, 0.945166, 4.40941e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "at32+m32@64 aB128+m64@64 aB wg 2x2x8 kr af vav di hi pt sr br sb128 bk0 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {995441, 575346, 2016.61, 75427, 0, 0, 0.311133, 0.344824, 2.79512, 7.15467, 0.0157164, 0.0157164, 0, 0.82504, 1.17356, 0.963482, 1.218e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at32x2+m128@32 aB64+m32@64 aB wg 4x2x4 kr af vav di li nmk pt sr br sb128 bk0 sm sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 64}, {16, 8, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2871e+06, -132026, -84694.7, 251812, 2.17989e+06, 1.66871e+06, 0.26071, 0.702238, 0.035417, 1.12604, 0.0101197, 0.0087237, 0.010893, 1, 1.29244, 0.838716, 2.17924e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@96 aB128 aB wg 1x4x8 kr af vav di li pt sr br sb128 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 
224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at64+m64@128 aB32x2 aB wg 16x2 cb3 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {973856, 781280, 0, 0, 3.47341e+06, 4.52198e+06, 0.376801, 0.745941, 0.834762, 1.36837, 0.00306578, 0.00306578, 0, 1, 1.548, 1.17568, 1.37045e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32x2+m64@64 aB32x2 aB wg 8x2x2 kr cb4 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.76235e+06, -528635, -346576, 928208, 2.73039e+06, 3.28253e+06, 0.411331, 0.407059, 0.82943, 1.26791, 0.00365219, 0.000634094, 0.00340305, 0.949468, 1.57605, 1.11643, 2.18464e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB64 aB wg 8x4 cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 
16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {979561, 775985, 0, 0, 0, 0, 0.472998, 0.448938, 0.785458, 1.5451, 0.00427182, 0.00427182, 0, 1, 1.48807, 0.994577, 1.6929e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@32 aB64 aB wg 8x1x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.72793e+06, -260474, -161085, 447531, 2.31834e+06, 2.89178e+06, 0.238818, 0.354785, 0.799686, 1.20956, 0.00460907, 0.00312961, 0.00215107, 0.989106, 1.46994, 1.02912, 3.02074e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@64 aB32 aB wg 8x2x4 kr cb4 ks32 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10055e+06, 1.06985e+06, 1310.76, 199871, 0, 0, 0.375142, 0.660095, 0.719737, 1.72017, 0.00584039, 0.00584039, 0, 1, 1.38677, 1.01044, 1.17577e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@128 aB64x2 aB wg 8x4 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00856e+06, 729582, 0, 
0, 0, 0, 0.391803, 0.680643, 0.557725, 1.58519, 0.00733339, 0.00733339, 0, 0.974679, 1.32826, 0.97557, 9.11096e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@64 aB32x2 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 64}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.84304e+06, -196719, -186130, 412502, 2.21921e+06, 2.048e+06, 0.286789, 0.263596, 0.436547, 1.25452, 0.00648641, 0.00488563, 0.00263508, 0.940889, 1.37843, 0.889564, 5.23147e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at64+m32@64 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05176e+06, 782569, -1901.36, 140619, 0, 0, 0.336566, 0.331805, 0.795679, 2.44302, 0.00752315, 0.00752315, 0, 1, 1.33186, 0.99299, 1.10267e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@128 aB128 aB wg 4x8 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.68462e+06, 614668, 0, 0, 0, 0, 0.528613, 0.557291, -0.992969, 0.00078125, 0.0113836, 0.0113836, 0, 0.96247, 1.20355, 0.897505, 1.16415e-12}}}, 
+{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32x2+m64@96 aB64 aB wg 4x2x4 kr cb4x2 ks128 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.63724e+06, -117354, -138376, 322423, 1.85139e+06, 1.3271e+06, 0.217482, 0.358374, -0.173415, 0.826585, 0.00954542, 0.00723565, 0.00380699, 0.869435, 1.30653, 0.807528, 6.89227e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@96 aB64x2 aB wg 2x2x8 kr cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.65053e+06, -99807.1, -68122.5, 193337, 2.53952e+06, 1.70394e+06, 0.244079, 0.25177, 0.0702585, 0.887446, 0.0117307, 0.00844281, 0.00495289, 0.997313, 1.45603, 0.945166, 4.40941e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB128+m64@64 aB wg 2x2x8 kr af vav di hi pt sr br sb128 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {995441, 575346, 2016.61, 75427, 0, 0, 0.311133, 0.344824, 2.79512, 7.15467, 0.0157164, 0.0157164, 0, 0.82504, 1.17356, 0.963482, 1.218e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at32x2+m128@32 aB64+m32@64 aB wg 4x2x4 kr af vav di li nmk pt sr br sb128 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 64}, {16, 8, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2871e+06, -132026, -84694.7, 251812, 2.17989e+06, 1.66871e+06, 0.26071, 0.702238, 0.035417, 1.12604, 0.0101197, 0.0087237, 0.010893, 1, 1.29244, 0.838716, 2.17924e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB128 aB wg 1x4x8 kr af vav di li pt sr br sb128 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aS32+S64@96 aB64+S32@96 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 64}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.18068e+06, 53960.8, 0, 0, 0, 0, 0.213645, 3.70034, 2.54839, 9.45781, 0.067707, 0.0150101, 0.0808417, 1, 1.00383, 0, 0}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m32@32 aB8x2+m16@32 aB wg 8x2 cb3 ks32 nse hi pt sr br bk0 sm sn grf256 kv afb l4 dm", {16, 
(LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 441, 12288, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.43403e+06, 452176, 0, 0, 4.48102e+06, 9.67475e+06, 0.486151, 0.555121, 0.890086, 1.40877, 0.0164251, 0.000123601, 0.0164842, 0.490983, 1.14664, 1.00079, 8.17626e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABpqI"}, "am64+S1,64@128 av64+B64@128 aS cs di sys grf256 af wg 8x4 bo sb512 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}}, @@ -880,7 +880,7 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 16, 32}, {8, 4, 1}, 1, 
(WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 
1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+S1,16@24 aS16+S32@16 aB wg 2x2x8 kr vav hi pt sr sb256 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {16, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@16 aB8+B8@16 aU vav di wg 8x4 bo pt sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, From 1fbe670b4e631f5288181cbe057bb0980d5d4a6d Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Sun, 28 Apr 2024 
16:50:47 -0700 Subject: [PATCH 009/187] build, doc: set DNNL_GPU_VENDOR to NONE when DNNL_GPU_RUNTIME is NONE --- cmake/options.cmake | 13 +++++++++---- doc/build/build_options.md | 2 +- src/CMakeLists.txt | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/options.cmake b/cmake/options.cmake index 01ffa6e03d1..91359d92100 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -269,10 +269,15 @@ if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$") message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}") endif() -set(DNNL_GPU_VENDOR "INTEL" CACHE STRING - "specifies target GPU vendor for GPU engines. - Can be INTEL (default) or NVIDIA.") -if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(INTEL|NVIDIA|AMD)$") +set(DNNL_GPU_VENDOR "NONE" CACHE STRING + "When DNNL_GPU_RUNTIME is not NONE DNNL_GPU_VENDOR specifies target GPU + vendor for GPU engines. Can be INTEL (default), NVIDIA or AMD.") + +if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND DNNL_GPU_VENDOR STREQUAL "NONE") + set(DNNL_GPU_VENDOR "INTEL") +endif() + +if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|INTEL|NVIDIA|AMD)$") message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}") endif() diff --git a/doc/build/build_options.md b/doc/build/build_options.md index 493855e3041..7632809c16d 100644 --- a/doc/build/build_options.md +++ b/doc/build/build_options.md @@ -31,7 +31,7 @@ oneDNN supports the following build-time options. 
| ONEDNN_DEV_MODE | ON, **OFF** | Enables internal tracing and `debuginfo` logging in verbose output (for oneDNN developers) | | ONEDNN_AARCH64_USE_ACL | ON, **OFF** | Enables integration with Arm Compute Library for AArch64 builds | | ONEDNN_BLAS_VENDOR | **NONE**, ARMPL, ACCELERATE | Defines an external BLAS library to link to for GEMM-like operations | -| ONEDNN_GPU_VENDOR | **INTEL**, NVIDIA, AMD | Defines GPU vendor for GPU engines | +| ONEDNN_GPU_VENDOR | NONE, **INTEL**, NVIDIA, AMD | When DNNL_GPU_RUNTIME is not NONE defines GPU vendor for GPU engines otherwise its value is NONE| | ONEDNN_DPCPP_HOST_COMPILER | **DEFAULT**, *GNU or Clang C++ compiler executable* | Specifies host compiler executable for SYCL runtime | | ONEDNN_LIBRARY_NAME | **dnnl**, *library name* | Specifies name of the library | | ONEDNN_TEST_SET | SMOKE, **CI**, NIGHTLY, MODIFIER_NAME | Specifies the testing coverage enabled through the generated testing targets | diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2482385d9e2..f2a306d66e3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -134,7 +134,7 @@ endif() if(ONEDNN_BUILD_GRAPH) message(STATUS "Graph component is enabled") - if (NOT DNNL_GPU_VENDOR STREQUAL "INTEL") + if (NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND NOT DNNL_GPU_VENDOR STREQUAL "INTEL") message(FATAL_ERROR "Graph API does not support ${DNNL_GPU_VENDOR} GPU. 
" "Either disable Graph API with ONEDNN_BUILD_GRAPH=OFF or change GPU " "vendor to INTEL with ONEDNN_GPU_VENDOR=INTEL.") From faa37bc7d488a1eed6e2856a0c5e012ade29ade3 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Fri, 26 Apr 2024 11:24:11 -0700 Subject: [PATCH 010/187] common: introduce vendor macros --- include/oneapi/dnnl/dnnl_config.h.in | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/oneapi/dnnl/dnnl_config.h.in b/include/oneapi/dnnl/dnnl_config.h.in index b4035f51ac3..30c8a8c0ccf 100644 --- a/include/oneapi/dnnl/dnnl_config.h.in +++ b/include/oneapi/dnnl/dnnl_config.h.in @@ -82,6 +82,18 @@ /// DPC++ runtime #define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL +/// No vendor (corresponding runtime is disabled) +#define DNNL_VENDOR_NONE 0u + +/// Intel vendor +#define DNNL_VENDOR_INTEL 1u + +/// NVIDIA vendor +#define DNNL_VENDOR_NVIDIA 2u + +/// AMD vendor +#define DNNL_VENDOR_AMD 4u + /// @} dnnl_api_service // oneDNN CPU threading runtime @@ -93,6 +105,9 @@ // oneDNN GPU engine runtime #cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME} +// oneDNN GPU vendor +#cmakedefine DNNL_GPU_VENDOR DNNL_VENDOR_${DNNL_GPU_VENDOR} + // clang-format on #if defined(DNNL_CPU_RUNTIME) && defined(DNNL_GPU_RUNTIME) From f426dc708e9f4e0e9bc5565128bd30a008981c4f Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Fri, 26 Apr 2024 11:27:40 -0700 Subject: [PATCH 011/187] common: hrt: ocl: move device agnostic ocl utils to common --- src/CMakeLists.txt | 4 + src/gpu/intel/compute/compute_engine.hpp | 3 +- src/gpu/intel/compute/kernel.hpp | 6 +- src/gpu/intel/compute/utils.hpp | 10 - src/gpu/intel/gpu_primitive.hpp | 3 +- src/gpu/intel/jit/codegen/kernel.hpp | 12 +- src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp | 2 +- src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp | 4 +- src/gpu/intel/jit/jit_generator.hpp | 4 +- src/gpu/intel/jit/jit_generator_base.hpp | 5 +- src/gpu/intel/ocl/capi/engine.cpp | 5 +- src/gpu/intel/ocl/capi/primitive.cpp | 5 +- 
src/gpu/intel/ocl/mdapi_utils.cpp | 5 +- .../intel/ocl/ocl_buffer_memory_storage.cpp | 7 +- .../intel/ocl/ocl_buffer_memory_storage.hpp | 5 +- src/gpu/intel/ocl/ocl_context.hpp | 16 +- src/gpu/intel/ocl/ocl_engine.hpp | 5 +- src/gpu/intel/ocl/ocl_gpu_engine.cpp | 32 +- src/gpu/intel/ocl/ocl_gpu_engine.hpp | 7 +- src/gpu/intel/ocl/ocl_gpu_engine_id.hpp | 4 +- src/gpu/intel/ocl/ocl_gpu_kernel.cpp | 14 +- src/gpu/intel/ocl/ocl_gpu_kernel.hpp | 3 +- src/gpu/intel/ocl/ocl_stream.cpp | 8 +- src/gpu/intel/ocl/ocl_stream.hpp | 2 +- src/gpu/intel/ocl/ocl_usm_utils.cpp | 34 +- src/gpu/intel/ocl/ocl_utils.cpp | 265 +---------- src/gpu/intel/ocl/ocl_utils.hpp | 393 +---------------- src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 3 +- src/hrt/CMakeLists.txt | 29 ++ src/hrt/ocl/CMakeLists.txt | 27 ++ src/hrt/ocl/utils.cpp | 416 ++++++++++++++++++ src/hrt/ocl/utils.hpp | 302 +++++++++++++ src/hrt/utils.cpp | 33 ++ src/hrt/utils.hpp | 43 ++ src/sycl/level_zero_utils.cpp | 6 +- src/sycl/level_zero_utils.hpp | 4 +- src/sycl/sycl_compat.cpp | 8 +- src/sycl/sycl_compat.hpp | 4 +- src/sycl/sycl_device_info.cpp | 4 +- src/sycl/sycl_engine_base.hpp | 10 +- src/sycl/sycl_utils.cpp | 66 ++- src/sycl/sycl_utils.hpp | 4 +- 42 files changed, 1019 insertions(+), 803 deletions(-) create mode 100644 src/hrt/CMakeLists.txt create mode 100644 src/hrt/ocl/CMakeLists.txt create mode 100644 src/hrt/ocl/utils.cpp create mode 100644 src/hrt/ocl/utils.hpp create mode 100644 src/hrt/utils.cpp create mode 100644 src/hrt/utils.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2a306d66e3..a681ce19e6e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -122,6 +122,10 @@ if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE") add_subdirectory(gpu) endif() +if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL") + add_subdirectory(hrt) +endif() + if(DNNL_WITH_SYCL) # Enable linking SYCL kernels. 
if(DNNL_SYCL_CUDA) diff --git a/src/gpu/intel/compute/compute_engine.hpp b/src/gpu/intel/compute/compute_engine.hpp index e1543aa120c..0c108ffa167 100644 --- a/src/gpu/intel/compute/compute_engine.hpp +++ b/src/gpu/intel/compute/compute_engine.hpp @@ -33,6 +33,7 @@ #include "gpu/intel/compute/kernel.hpp" #include "gpu/intel/compute/kernel_ctx.hpp" #include "gpu/intel/jit/jit_generator_base.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -80,7 +81,7 @@ class compute_engine_t : public engine_t { }; virtual status_t create_kernel_from_binary(compute::kernel_t &kernel, - const compute::binary_t &binary, const char *kernel_name) const = 0; + const hrt::binary_t &binary, const char *kernel_name) const = 0; virtual status_t create_kernels_from_cache_blob( const cache_blob_t &cache_blob, diff --git a/src/gpu/intel/compute/kernel.hpp b/src/gpu/intel/compute/kernel.hpp index 440e6f06b6d..1350eedd132 100644 --- a/src/gpu/intel/compute/kernel.hpp +++ b/src/gpu/intel/compute/kernel.hpp @@ -26,6 +26,7 @@ #include "gpu/intel/compute/kernel_arg_list.hpp" #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/utils.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -60,7 +61,7 @@ class kernel_impl_t { return status::runtime_error; } virtual status_t get_binary( - const engine_t *engine, compute::binary_t &binary) const { + const engine_t *engine, hrt::binary_t &binary) const { gpu_assert(false) << "unimplemented function get_binary() called"; return status::runtime_error; } @@ -144,8 +145,7 @@ class kernel_t { return impl_->get_binary_size(engine, binary_size); } - status_t get_binary( - const engine_t *engine, compute::binary_t &binary) const { + status_t get_binary(const engine_t *engine, hrt::binary_t &binary) const { return impl_->get_binary(engine, binary); } diff --git a/src/gpu/intel/compute/utils.hpp b/src/gpu/intel/compute/utils.hpp index e27482d88ff..02cb0c7bcc3 100644 --- a/src/gpu/intel/compute/utils.hpp +++ 
b/src/gpu/intel/compute/utils.hpp @@ -33,16 +33,6 @@ namespace gpu { namespace intel { namespace compute { -using binary_t = std::vector; -using device_uuid_t = std::tuple; - -struct device_uuid_hasher_t { - size_t operator()(const device_uuid_t &uuid) const { - const size_t seed = hash_combine(0, std::get<0>(uuid)); - return hash_combine(seed, std::get<1>(uuid)); - } -}; - class range_t { public: static constexpr size_t max_ndims = 3; diff --git a/src/gpu/intel/gpu_primitive.hpp b/src/gpu/intel/gpu_primitive.hpp index 918cd9d01bf..03e14c41d9e 100644 --- a/src/gpu/intel/gpu_primitive.hpp +++ b/src/gpu/intel/gpu_primitive.hpp @@ -31,6 +31,7 @@ #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/kernel_cache.hpp" #include "gpu/intel/ocl/types_interop.hpp" +#include "hrt/utils.hpp" #define CTX_GPU_RES_STORAGE(arg) \ (*(ctx.get_resource_mapper() \ @@ -119,7 +120,7 @@ struct gpu_primitive_t : public primitive_t { switch (cb.kind()) { case compute_block_t::kind_t::kernel: { // Get a binary for each kernel within current primitive. 
- compute::binary_t binary; + hrt::binary_t binary; CHECK(cb.kernel().get_binary(engine, binary)); CHECK(blob.add_binary(binary.data(), binary.size())); break; diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index 2d58169801b..94a6101df4a 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -32,6 +32,7 @@ #include "gpu/intel/jit/jit_generator.hpp" #include "gpu/intel/jit/ngen/ngen.hpp" #include "gpu/intel/jit/ngen/ngen_register_allocator.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -46,11 +47,10 @@ struct ir_generator_t : public jit_generator_base { const char *kernel_name() const override { return kernel_name_.c_str(); } - gpu::intel::compute::binary_t get_binary( - cl_context context, cl_device_id device) override { + hrt::binary_t get_binary(cl_context context, cl_device_id device) override { kernel_info_t kernel_info; auto status = kernel_desc_.init_kernel_info(kernel_info); - if (status != status::success) return gpu::intel::compute::binary_t(); + if (status != status::success) return hrt::binary_t(); try { #define CASE(hw) \ case ngen::HW::hw: { \ @@ -68,10 +68,8 @@ struct ir_generator_t : public jit_generator_base { default: gpu_assert(false) << "Unexpected GPU architecture"; } #undef CASE - } catch (ngen::out_of_registers_exception &) { - return gpu::intel::compute::binary_t(); - } - return gpu::intel::compute::binary_t(); + } catch (ngen::out_of_registers_exception &) { return hrt::binary_t(); } + return hrt::binary_t(); } private: diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index d003dbc2063..d8642c849fa 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -703,7 +703,7 @@ void gen_gemm_kernel_t::init_interface() { interface_.externalName(kernel_name()); } -gpu::intel::compute::binary_t gen_gemm_kernel_t::get_binary( +hrt::binary_t 
gen_gemm_kernel_t::get_binary( cl_context context, cl_device_id device) { init_interface(); diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp index d11fb36da94..9f1c27a6757 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp @@ -25,6 +25,7 @@ #include "gpu/intel/jit/gemm/kernel_evaluator.hpp" #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/kernel_cache.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -159,8 +160,7 @@ struct gen_gemm_kernel_t : public jit_generator_base { : desc_(desc) {} const char *kernel_name() const override { return "gemm_kernel"; } - gpu::intel::compute::binary_t get_binary( - cl_context context, cl_device_id device) override; + hrt::binary_t get_binary(cl_context context, cl_device_id device) override; const gen_gemm_kernel_desc_t *desc() const { return &desc_; } diff --git a/src/gpu/intel/jit/jit_generator.hpp b/src/gpu/intel/jit/jit_generator.hpp index 13b7ac402e5..94445fecb0c 100644 --- a/src/gpu/intel/jit/jit_generator.hpp +++ b/src/gpu/intel/jit/jit_generator.hpp @@ -30,6 +30,7 @@ #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/jit/utils/ngen_type_bridge.hpp" #include "gpu/intel/jit/utils/utils.hpp" +#include "hrt/utils.hpp" #include "gpu/intel/jit/ngen/ngen_opencl.hpp" @@ -137,8 +138,7 @@ class jit_generator : public ngen::OpenCLCodeGenerator, return ngen::OpenCLCodeGenerator::getExternalName().c_str(); } - compute::binary_t get_binary( - cl_context context, cl_device_id device) override { + hrt::binary_t get_binary(cl_context context, cl_device_id device) override { return ngen::OpenCLCodeGenerator::getBinary(context, device); } diff --git a/src/gpu/intel/jit/jit_generator_base.hpp b/src/gpu/intel/jit/jit_generator_base.hpp index 2efc82c709e..592aa99c9ba 100644 --- a/src/gpu/intel/jit/jit_generator_base.hpp +++ b/src/gpu/intel/jit/jit_generator_base.hpp @@ -20,7 +20,7 @@ 
#include #include -#include "gpu/intel/compute/utils.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -31,8 +31,7 @@ namespace jit { struct jit_generator_base { virtual ~jit_generator_base() = default; virtual const char *kernel_name() const = 0; - virtual compute::binary_t get_binary( - cl_context context, cl_device_id device) + virtual hrt::binary_t get_binary(cl_context context, cl_device_id device) = 0; }; diff --git a/src/gpu/intel/ocl/capi/engine.cpp b/src/gpu/intel/ocl/capi/engine.cpp index 0d5eeff4dee..4f50f70bc77 100644 --- a/src/gpu/intel/ocl/capi/engine.cpp +++ b/src/gpu/intel/ocl/capi/engine.cpp @@ -21,6 +21,7 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" +#include "hrt/ocl/utils.hpp" using namespace dnnl::impl; using namespace dnnl::impl::gpu::intel::ocl; @@ -33,7 +34,7 @@ status_t dnnl_ocl_interop_engine_create( ocl_engine_factory_t f(engine_kind::gpu); size_t index; - CHECK(get_ocl_device_index(&index, device)); + CHECK(hrt::ocl::get_device_index(&index, device)); return f.engine_create(engine, device, context, index); } @@ -71,7 +72,7 @@ status_t dnnl_ocl_interop_engine_create_from_cache_blob(engine_t **engine, ocl_engine_factory_t f(engine_kind::gpu); size_t index; - CHECK(get_ocl_device_index(&index, device)); + CHECK(hrt::ocl::get_device_index(&index, device)); const std::vector cb(cache_blob, cache_blob + size); return f.engine_create(engine, device, context, index, cb); diff --git a/src/gpu/intel/ocl/capi/primitive.cpp b/src/gpu/intel/ocl/capi/primitive.cpp index b483d363a44..2c67c5a8c06 100644 --- a/src/gpu/intel/ocl/capi/primitive.cpp +++ b/src/gpu/intel/ocl/capi/primitive.cpp @@ -24,6 +24,7 @@ #include "common/primitive_desc_iface.hpp" #include "common/primitive_iface.hpp" #include "common/utils.hpp" +#include "hrt/ocl/utils.hpp" #include "gpu/intel/ocl/ocl_c_types_map.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" @@ -49,9 +50,9 @@ status_t 
dnnl_ocl_interop_primitive_execute( ocl_stream->before_exec_hook(); if (deps != nullptr) { - std::vector> events(ndeps); + std::vector> events(ndeps); for (int i = 0; i < ndeps; i++) { - events[i] = gpu::intel::ocl::ocl_wrapper_t(deps[i], true); + events[i] = hrt::ocl::wrapper_t(deps[i], true); } ocl_stream->ocl_ctx().set_deps(events); } diff --git a/src/gpu/intel/ocl/mdapi_utils.cpp b/src/gpu/intel/ocl/mdapi_utils.cpp index 9240317bee6..72b984cafbc 100644 --- a/src/gpu/intel/ocl/mdapi_utils.cpp +++ b/src/gpu/intel/ocl/mdapi_utils.cpp @@ -91,11 +91,12 @@ class mdapi_helper_impl_t { using clCreatePerfCountersCommandQueueINTEL_func_t = cl_command_queue (*)(cl_context, cl_device_id, cl_command_queue_properties, cl_uint, cl_int *); - static ext_func_t + static hrt::ocl::ext_func_t< + clCreatePerfCountersCommandQueueINTEL_func_t> create_queue_with_perf_counters( "clCreatePerfCountersCommandQueueINTEL"); auto func = create_queue_with_perf_counters.get_func( - get_ocl_platform(dev)); + hrt::ocl::get_platform(dev)); if (!func) { *err = CL_INVALID_VALUE; return nullptr; diff --git a/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp b/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp index dad28ff1871..2bc08ccf521 100644 --- a/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp +++ b/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp @@ -84,7 +84,7 @@ status_t ocl_buffer_memory_storage_t::map_data( cl_int err; *mapped_ptr = clEnqueueMapBuffer(queue, mem_object(), CL_TRUE, map_flags, 0, mem_bytes, 0, nullptr, nullptr, &err); - return convert_to_dnnl(err); + return hrt::ocl::convert_to_dnnl(err); } status_t ocl_buffer_memory_storage_t::unmap_data( @@ -116,8 +116,9 @@ std::unique_ptr ocl_buffer_memory_storage_t::get_sub_storage( gpu_assert(offset % OCL_BUFFER_ALIGNMENT == 0); cl_buffer_region buffer_region = {base_offset_ + offset, size}; - ocl_wrapper_t sub_buffer = clCreateSubBuffer(parent_mem_object(), - mem_flags, CL_BUFFER_CREATE_TYPE_REGION, &buffer_region, &err); + 
hrt::ocl::wrapper_t sub_buffer + = clCreateSubBuffer(parent_mem_object(), mem_flags, + CL_BUFFER_CREATE_TYPE_REGION, &buffer_region, &err); gpu_assert(err == CL_SUCCESS); if (err != CL_SUCCESS) return nullptr; diff --git a/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp b/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp index 264adc12d51..c242e2e3211 100644 --- a/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp +++ b/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp @@ -45,7 +45,8 @@ class ocl_buffer_memory_storage_t : public ocl_memory_storage_base_t { } status_t set_data_handle(void *handle) override { - mem_object_ = ocl_wrapper_t(static_cast(handle), true); + mem_object_ = hrt::ocl::wrapper_t( + static_cast(handle), true); return status::success; } @@ -70,7 +71,7 @@ class ocl_buffer_memory_storage_t : public ocl_memory_storage_base_t { private: cl_mem parent_mem_object() const; - ocl_wrapper_t mem_object_; + hrt::ocl::wrapper_t mem_object_; size_t base_offset_ = 0; DNNL_DISALLOW_COPY_AND_ASSIGN(ocl_buffer_memory_storage_t); diff --git a/src/gpu/intel/ocl/ocl_context.hpp b/src/gpu/intel/ocl/ocl_context.hpp index d89618d64d1..44820bbe379 100644 --- a/src/gpu/intel/ocl/ocl_context.hpp +++ b/src/gpu/intel/ocl/ocl_context.hpp @@ -28,18 +28,18 @@ namespace ocl { struct ocl_event_t final : compute::event_t { ocl_event_t() = default; - ocl_event_t(const std::vector> &events) + ocl_event_t(const std::vector> &events) : events(events) {} - ocl_event_t(std::vector> &&events) + ocl_event_t(std::vector> &&events) : events(std::move(events)) {} - ocl_event_t(ocl_wrapper_t &&event) { + ocl_event_t(hrt::ocl::wrapper_t &&event) { events.emplace_back(std::move(event)); } - const ocl_wrapper_t &operator[](size_t i) const { + const hrt::ocl::wrapper_t &operator[](size_t i) const { return events[i]; } - ocl_wrapper_t &operator[](size_t i) { return events[i]; } + hrt::ocl::wrapper_t &operator[](size_t i) { return events[i]; } size_t size() const { return events.size(); } static 
ocl_event_t &from(compute::event_t &event) { @@ -57,12 +57,12 @@ struct ocl_event_t final : compute::event_t { events.insert(events.end(), other.events.begin(), other.events.end()); }; - std::vector> events; + std::vector> events; }; struct ocl_context_t final : public gpu::intel::compute::context_t { ocl_context_t() = default; - ocl_context_t(const std::vector> &&events) + ocl_context_t(const std::vector> &&events) : events_(std::move(events)) {}; ocl_context_t(const ocl_context_t &) = default; ~ocl_context_t() = default; @@ -79,7 +79,7 @@ struct ocl_context_t final : public gpu::intel::compute::context_t { return events_; } - void set_deps(std::vector> &&event) { + void set_deps(std::vector> &&event) { events_ = ocl_event_t(std::move(event)); } void set_deps(ocl_event_t &&events) { events_ = std::move(events); }; diff --git a/src/gpu/intel/ocl/ocl_engine.hpp b/src/gpu/intel/ocl/ocl_engine.hpp index 8043a5b2e61..ae81b63f7b6 100644 --- a/src/gpu/intel/ocl/ocl_engine.hpp +++ b/src/gpu/intel/ocl/ocl_engine.hpp @@ -34,7 +34,8 @@ class ocl_engine_factory_t : public engine_factory_t { size_t count() const override { std::vector ocl_devices; - status_t status = get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + status_t status + = hrt::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); if (status != status::success) return status; return ocl_devices.size(); } @@ -43,7 +44,7 @@ class ocl_engine_factory_t : public engine_factory_t { status_t status; std::vector ocl_devices; - status = get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + status = hrt::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); VERROR_ENGINE( status == status::success, status, "no ocl devices found"); diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.cpp b/src/gpu/intel/ocl/ocl_gpu_engine.cpp index 828cd4521ed..ce9146f77d6 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.cpp @@ -86,7 +86,7 @@ status_t ocl_gpu_engine_t::init(const std::vector &cache_blob) { 
OCL_CHECK(err); - CHECK(check_device(engine_kind::gpu, device_, context_)); + CHECK(hrt::ocl::check_device(engine_kind::gpu, device_, context_)); compute::compute_engine_t::init(cache_blob); return status::success; @@ -139,7 +139,7 @@ status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine, CHECK(cache_blob.get_binary(&binary, &binary_size)); - auto program = make_ocl_wrapper(clCreateProgramWithBinary( + auto program = hrt::ocl::make_wrapper(clCreateProgramWithBinary( ctx, 1, &dev, &binary_size, &binary, nullptr, &err)); OCL_CHECK(err); err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr); @@ -165,7 +165,7 @@ status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine, // Remove the null terminator as std::string already includes it. kernel_name.pop_back(); } - auto ocl_kernel = make_ocl_wrapper( + auto ocl_kernel = hrt::ocl::make_wrapper( clCreateKernel(program, kernel_name.c_str(), &err)); OCL_CHECK(err); @@ -229,7 +229,7 @@ inline status_t preprocess_headers( } // namespace status_t ocl_gpu_engine_t::build_program_from_source( - ocl_wrapper_t &program, const char *code_string, + hrt::ocl::wrapper_t &program, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const { std::string options = kernel_ctx.options(); @@ -252,7 +252,7 @@ status_t ocl_gpu_engine_t::build_program_from_source( debugdump_processed_source( pp_code_str, options, dev_info->get_cl_ext_options()); - program = make_ocl_wrapper(clCreateProgramWithSource( + program = hrt::ocl::make_wrapper(clCreateProgramWithSource( context(), 1, &pp_code_str_ptr, nullptr, &err)); OCL_CHECK(err); @@ -262,10 +262,10 @@ status_t ocl_gpu_engine_t::build_program_from_source( return status::success; } -status_t ocl_gpu_engine_t::create_binary_from_ocl_source( - compute::binary_t &binary, const char *code_string, +status_t ocl_gpu_engine_t::create_binary_from_ocl_source(hrt::binary_t &binary, + const char *code_string, const compute::kernel_ctx_t 
&kernel_ctx) const { - ocl_wrapper_t program; + hrt::ocl::wrapper_t program; CHECK(build_program_from_source(program, code_string, kernel_ctx)); CHECK(get_ocl_program_binary(program, device(), binary)); @@ -273,14 +273,14 @@ status_t ocl_gpu_engine_t::create_binary_from_ocl_source( } status_t ocl_gpu_engine_t::create_kernel_from_binary(compute::kernel_t &kernel, - const compute::binary_t &binary, const char *kernel_name) const { - ocl_wrapper_t program; - CHECK(ocl::create_ocl_program( + const hrt::binary_t &binary, const char *kernel_name) const { + hrt::ocl::wrapper_t program; + CHECK(hrt::ocl::create_program( program, this->device(), this->context(), binary)); cl_int err; - auto ocl_kernel - = make_ocl_wrapper(clCreateKernel(program, kernel_name, &err)); + auto ocl_kernel = hrt::ocl::make_wrapper( + clCreateKernel(program, kernel_name, &err)); OCL_CHECK(err); std::vector arg_types; @@ -315,7 +315,7 @@ status_t ocl_gpu_engine_t::create_kernel(compute::kernel_t *kernel, return status::success; } - compute::binary_t binary = jitter->get_binary(context(), device()); + hrt::binary_t binary = jitter->get_binary(context(), device()); if (binary.empty()) return status::runtime_error; return create_kernel_from_binary(*kernel, binary, kernel_name); } @@ -346,14 +346,14 @@ status_t ocl_gpu_engine_t::create_kernels_from_ocl_source( std::vector *kernels, const std::vector &kernel_names, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const { - ocl_wrapper_t program; + hrt::ocl::wrapper_t program; CHECK(build_program_from_source(program, code_string, kernel_ctx)); *kernels = std::vector(kernel_names.size()); for (size_t i = 0; i < kernel_names.size(); ++i) { if (!kernel_names[i]) continue; cl_int err; - ocl_wrapper_t ocl_kernel + hrt::ocl::wrapper_t ocl_kernel = clCreateKernel(program, kernel_names[i], &err); OCL_CHECK(err); std::vector arg_types; diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.hpp b/src/gpu/intel/ocl/ocl_gpu_engine.hpp index 
fd7febc8e73..b5d17fdb746 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.hpp @@ -23,6 +23,7 @@ #include "gpu/intel/compute/compute_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_engine_id.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -47,12 +48,12 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { status_t create_stream(stream_t **stream, unsigned flags) override; status_t create_stream(stream_t **stream, cl_command_queue queue); - status_t create_binary_from_ocl_source(compute::binary_t &binary, + status_t create_binary_from_ocl_source(hrt::binary_t &binary, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const; status_t create_kernel_from_binary(compute::kernel_t &kernel, - const compute::binary_t &binary, + const hrt::binary_t &binary, const char *kernel_name) const override; status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob, @@ -117,7 +118,7 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { } protected: - status_t build_program_from_source(ocl_wrapper_t &program, + status_t build_program_from_source(hrt::ocl::wrapper_t &program, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const; diff --git a/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp b/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp index 25f6497d159..af6b71804fa 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp @@ -49,8 +49,8 @@ struct ocl_gpu_engine_id_impl_t : public engine_id_impl_t { return seed; } - ocl_wrapper_t device_; - ocl_wrapper_t context_; + hrt::ocl::wrapper_t device_; + hrt::ocl::wrapper_t context_; }; } // namespace ocl diff --git a/src/gpu/intel/ocl/ocl_gpu_kernel.cpp b/src/gpu/intel/ocl/ocl_gpu_kernel.cpp index b861f8a55f0..0e0b4a4da46 100644 --- a/src/gpu/intel/ocl/ocl_gpu_kernel.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_kernel.cpp @@ -44,13 +44,13 @@ class kernel_wrapper_t { 
status_t set_arg(int arg_index, size_t arg_size, const void *arg_value) { cl_int err = clSetKernelArg(kernel_, arg_index, arg_size, arg_value); - return convert_to_dnnl(err); + return hrt::ocl::convert_to_dnnl(err); } status_t set_svm_arg(int arg_index, const void *arg_value) { #ifdef CL_VERSION_2_0 cl_int err = clSetKernelArgSVMPointer(kernel_, arg_index, arg_value); - return convert_to_dnnl(err); + return hrt::ocl::convert_to_dnnl(err); #else // SVM is not supported. UNUSED(arg_index); @@ -92,7 +92,7 @@ class ocl_gpu_kernel_cache_t { // No copy for this thread, clone the original kernel and save the // copy. cl_kernel cloned_kernel; - CHECK(clone_kernel(main_kernel_, &cloned_kernel)); + CHECK(hrt::ocl::clone_kernel(main_kernel_, &cloned_kernel)); utils::lock_write_t lock_write(mutex_); auto ret = kernels_.emplace(id, cloned_kernel); @@ -118,7 +118,7 @@ ocl_gpu_kernel_t::~ocl_gpu_kernel_t() { } status_t ocl_gpu_kernel_t::get_binary( - const engine_t *engine, compute::binary_t &binary) const { + const engine_t *engine, hrt::binary_t &binary) const { auto *ocl_engine = utils::downcast(engine); return get_ocl_program_binary(ocl_kernel(), ocl_engine->device(), binary); } @@ -205,7 +205,7 @@ status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream, cl_uint ndims = static_cast(range.ndims()); if (range.is_zero()) { return status::success; } - ocl_wrapper_t event; + hrt::ocl::wrapper_t event; if (ocl_stream->flags() & stream_flags::out_of_order) { const auto &event_wrappers = ocl_event_t::from(deps).events; std::vector events( @@ -237,14 +237,14 @@ status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream, } status_t ocl_gpu_kernel_t::dump() const { - compute::binary_t binary; + hrt::binary_t binary; CHECK(get_ocl_kernel_binary(ocl_kernel(), binary)); CHECK(gpu_utils::dump_kernel_binary(binary, name())); return status::success; } std::string ocl_gpu_kernel_t::name() const { - return get_kernel_name(ocl_kernel()); + return hrt::ocl::get_kernel_name(ocl_kernel()); } } // 
namespace ocl diff --git a/src/gpu/intel/ocl/ocl_gpu_kernel.hpp b/src/gpu/intel/ocl/ocl_gpu_kernel.hpp index bc464122f5e..36237c11194 100644 --- a/src/gpu/intel/ocl/ocl_gpu_kernel.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_kernel.hpp @@ -21,6 +21,7 @@ #include #include "gpu/intel/compute/kernel.hpp" +#include "hrt/utils.hpp" namespace dnnl { namespace impl { @@ -39,7 +40,7 @@ class ocl_gpu_kernel_t : public compute::kernel_impl_t { cl_kernel ocl_kernel() const { return ocl_kernel_; } status_t get_binary( - const engine_t *engine, compute::binary_t &binary) const override; + const engine_t *engine, hrt::binary_t &binary) const override; status_t get_binary_size( const engine_t *engine, size_t *binary_size) const override; diff --git a/src/gpu/intel/ocl/ocl_stream.cpp b/src/gpu/intel/ocl/ocl_stream.cpp index 35efab4478d..d3a997d993b 100644 --- a/src/gpu/intel/ocl/ocl_stream.cpp +++ b/src/gpu/intel/ocl/ocl_stream.cpp @@ -131,7 +131,7 @@ status_t ocl_stream_t::copy(const memory_storage_t &src, cl_uint num_events = (cl_uint)events.size(); const cl_event *events_ptr = events.data(); - ocl_wrapper_t out_event; + hrt::ocl::wrapper_t out_event; bool need_out_event = is_profiling_enabled() || flags() & stream_flags::out_of_order; cl_event *out_event_ptr = need_out_event ? &out_event.unwrap() : nullptr; @@ -247,7 +247,7 @@ status_t ocl_stream_t::copy(const memory_storage_t &src, if (is_profiling_enabled()) { auto ocl_event = utils::make_unique( - std::vector> {out_event}); + std::vector> {out_event}); profiler_->register_event(std::move(ocl_event)); } @@ -274,7 +274,7 @@ status_t ocl_stream_t::fill(const memory_storage_t &dst, uint8_t pattern, cl_uint num_events = (cl_uint)events.size(); const cl_event *events_ptr = events.data(); - ocl_wrapper_t out_event; + hrt::ocl::wrapper_t out_event; bool need_out_event = is_profiling_enabled() || flags() & stream_flags::out_of_order; cl_event *out_event_ptr = need_out_event ? 
&out_event.unwrap() : nullptr; @@ -295,7 +295,7 @@ status_t ocl_stream_t::fill(const memory_storage_t &dst, uint8_t pattern, if (is_profiling_enabled()) { auto ocl_event = utils::make_unique( - std::vector> {out_event}); + std::vector> {out_event}); profiler_->register_event(std::move(ocl_event)); } diff --git a/src/gpu/intel/ocl/ocl_stream.hpp b/src/gpu/intel/ocl/ocl_stream.hpp index 572608b889b..dc1d38d8212 100644 --- a/src/gpu/intel/ocl/ocl_stream.hpp +++ b/src/gpu/intel/ocl/ocl_stream.hpp @@ -114,7 +114,7 @@ struct ocl_stream_t : public compute::compute_stream_t { return ocl_ctx(); } - const ocl_wrapper_t &get_output_event() const { + const hrt::ocl::wrapper_t &get_output_event() const { auto &deps = ocl_event_t::from(ctx().get_deps()); assert(deps.size() == 1); return deps[0]; diff --git a/src/gpu/intel/ocl/ocl_usm_utils.cpp b/src/gpu/intel/ocl/ocl_usm_utils.cpp index 2af15032d63..7a239a9f59e 100644 --- a/src/gpu/intel/ocl/ocl_usm_utils.cpp +++ b/src/gpu/intel/ocl/ocl_usm_utils.cpp @@ -53,7 +53,7 @@ cl_command_queue get_ocl_queue(stream_t *stream) { bool is_usm_supported(engine_t *engine) { using clSharedMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id, cl_ulong *, size_t, cl_uint, cl_int *); - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clSharedMemAllocINTEL"); return (bool)ext_func.get_func(engine); } @@ -64,7 +64,7 @@ void *malloc_host(engine_t *engine, size_t size) { if (size == 0) return nullptr; - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clHostMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err); @@ -79,7 +79,7 @@ void *malloc_device(engine_t *engine, size_t size) { if (size == 0) return nullptr; - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clDeviceMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine), @@ -95,7 +95,7 @@ void *malloc_shared(engine_t *engine, size_t 
size) { if (size == 0) return nullptr; - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clSharedMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine), @@ -109,7 +109,8 @@ void free(engine_t *engine, void *ptr) { using clMemFreeINTEL_func_t = cl_int (*)(cl_context, void *); if (!ptr) return; - static ext_func_t ext_func("clMemFreeINTEL"); + static hrt::ocl::ext_func_t ext_func( + "clMemFreeINTEL"); cl_int err = ext_func(engine, get_ocl_context(engine), ptr); assert(err == CL_SUCCESS); MAYBE_UNUSED(err); @@ -119,9 +120,10 @@ status_t set_kernel_arg_usm(engine_t *engine, cl_kernel kernel, int arg_index, const void *arg_value) { using clSetKernelArgMemPointerINTEL_func_t = cl_int (*)(cl_kernel, cl_uint, const void *); - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clSetKernelArgMemPointerINTEL"); - return convert_to_dnnl(ext_func(engine, kernel, arg_index, arg_value)); + return hrt::ocl::convert_to_dnnl( + ext_func(engine, kernel, arg_index, arg_value)); } status_t memcpy(stream_t *stream, void *dst, const void *src, size_t size, @@ -129,11 +131,12 @@ status_t memcpy(stream_t *stream, void *dst, const void *src, size_t size, using clEnqueueMemcpyINTEL_func_t = cl_int (*)(cl_command_queue, cl_bool, void *, const void *, size_t, cl_uint, const cl_event *, cl_event *); - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clEnqueueMemcpyINTEL"); - return convert_to_dnnl(ext_func(stream->engine(), get_ocl_queue(stream), - /* blocking */ CL_FALSE, dst, src, size, num_events, events, - out_event)); + return hrt::ocl::convert_to_dnnl( + ext_func(stream->engine(), get_ocl_queue(stream), + /* blocking */ CL_FALSE, dst, src, size, num_events, events, + out_event)); } status_t memcpy(stream_t *stream, void *dst, const void *src, size_t size) { @@ -146,10 +149,11 @@ status_t fill(stream_t *stream, void *ptr, const void *pattern, using clEnqueueMemFillINTEL_func_t = 
cl_int (*)(cl_command_queue, void *, const void *, size_t, size_t, cl_uint, const cl_event *, cl_event *); - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clEnqueueMemFillINTEL"); - return convert_to_dnnl(ext_func(stream->engine(), get_ocl_queue(stream), - ptr, pattern, pattern_size, size, num_events, events, out_event)); + return hrt::ocl::convert_to_dnnl( + ext_func(stream->engine(), get_ocl_queue(stream), ptr, pattern, + pattern_size, size, num_events, events, out_event)); } status_t memset(stream_t *stream, void *ptr, int value, size_t size) { @@ -170,7 +174,7 @@ ocl_usm_kind_t get_pointer_type(engine_t *engine, const void *ptr) { static constexpr cl_uint cl_mem_alloc_type_intel = 0x419A; - static ext_func_t ext_func( + static hrt::ocl::ext_func_t ext_func( "clGetMemAllocInfoINTEL"); if (!ptr) return ocl_usm_kind_t::unknown; diff --git a/src/gpu/intel/ocl/ocl_utils.cpp b/src/gpu/intel/ocl/ocl_utils.cpp index c7cf6f72740..b85ba1d7e6c 100644 --- a/src/gpu/intel/ocl/ocl_utils.cpp +++ b/src/gpu/intel/ocl/ocl_utils.cpp @@ -75,194 +75,6 @@ namespace gpu { namespace intel { namespace ocl { -template -static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) { - size_t name_size; - cl_int err = get_func(obj, name_query, 0, nullptr, &name_size); - // Ignore error. - UNUSED_OCL_RESULT(err); - - // Include null terminator explicitly - to safely overwrite it in - // clGetKernelInfo - std::string name(name_size, 0); - err = get_func(obj, name_query, name_size, &name[0], nullptr); - // Ignore error. 
- UNUSED_OCL_RESULT(err); - - // Remove the null terminator as std::string already includes it - name.resize(name_size - 1); - return name; -} - -std::string get_kernel_name(cl_kernel kernel) { - return get_ocl_name(kernel, clGetKernelInfo, CL_KERNEL_FUNCTION_NAME); -} - -static std::string get_platform_name(cl_platform_id platform) { - return get_ocl_name(platform, clGetPlatformInfo, CL_PLATFORM_NAME); -} - -static bool is_intel_platform(cl_platform_id platform) { - auto name = get_platform_name(platform); - return name.find("Intel") != std::string::npos; -} - -status_t check_device( - engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) { - assert(dev && ctx); - - // Check device and context consistency. - size_t dev_bytes; - OCL_CHECK( - clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, nullptr, &dev_bytes)); - - std::vector ctx_devices(dev_bytes / sizeof(cl_device_id)); - OCL_CHECK(clGetContextInfo( - ctx, CL_CONTEXT_DEVICES, dev_bytes, &ctx_devices[0], nullptr)); - - bool found = false; - for (size_t i = 0; i < ctx_devices.size(); ++i) { - if (ctx_devices[i] == dev) { - found = true; - break; - } - } - VERROR_ENGINE( - found, status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); - - // Check engine kind and device consistency. - cl_device_type dev_type; - OCL_CHECK(clGetDeviceInfo( - dev, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, nullptr)); - VERROR_ENGINE(!((eng_kind == engine_kind::cpu) - && (dev_type & CL_DEVICE_TYPE_CPU) == 0), - status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); - VERROR_ENGINE(!((eng_kind == engine_kind::gpu) - && (dev_type & CL_DEVICE_TYPE_GPU) == 0), - status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); - - // Check that the platform is an Intel platform. 
- cl_platform_id platform; - OCL_CHECK(clGetDeviceInfo( - dev, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr)); - - VERROR_ENGINE(is_intel_platform(platform), status::invalid_arguments, - VERBOSE_INVALID_PLATFORM, "ocl", "intel", - get_platform_name(platform).c_str()); - - return status::success; -} - -status_t get_ocl_devices( - std::vector *devices, cl_device_type device_type) { - cl_uint num_platforms = 0; - - cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); - // No platforms - a valid scenario - if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; - - OCL_CHECK(err); - - std::vector platforms(num_platforms); - OCL_CHECK(clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); - - for (size_t i = 0; i < platforms.size(); ++i) { - if (!is_intel_platform(platforms[i])) continue; - - cl_uint num_devices = 0; - cl_int err = clGetDeviceIDs( - platforms[i], device_type, 0, nullptr, &num_devices); - - if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { - return status::runtime_error; - } - - if (num_devices != 0) { - std::vector plat_devices; - plat_devices.resize(num_devices); - OCL_CHECK(clGetDeviceIDs(platforms[i], device_type, num_devices, - &plat_devices[0], nullptr)); - - // Use Intel devices only - for (size_t j = 0; j < plat_devices.size(); ++j) { - cl_uint vendor_id; - OCL_CHECK(clGetDeviceInfo(plat_devices[j], CL_DEVICE_VENDOR_ID, - sizeof(cl_uint), &vendor_id, nullptr)); - if (vendor_id == 0x8086) { - devices->push_back(plat_devices[j]); - } - } - } - } - // No devices found but still return success - return status::success; -} - -status_t get_ocl_devices(std::vector *devices, - std::vector> *sub_devices, - cl_device_type device_type) { - std::vector devices_tmp; - std::vector> sub_devices_tmp; - - CHECK(get_ocl_devices(&devices_tmp, device_type)); - - for (cl_device_id d : devices_tmp) { - cl_uint max_sub_devices; - cl_device_partition_property properties[3] - = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - 
CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0}; - cl_int err = clCreateSubDevices( - d, properties, 0, nullptr, &max_sub_devices); - if (err == CL_DEVICE_PARTITION_FAILED) continue; - OCL_CHECK(err); - std::vector sds(max_sub_devices); - OCL_CHECK(clCreateSubDevices( - d, properties, max_sub_devices, sds.data(), nullptr)); - for (cl_device_id sd : sds) - sub_devices_tmp.emplace_back(sd); - } - *devices = devices_tmp; - *sub_devices = std::move(sub_devices_tmp); - return status::success; -} - -status_t get_ocl_device_index(size_t *index, cl_device_id device) { - std::vector ocl_devices; - CHECK(get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU)); - - // Search the top level device unconditionally - auto parent_device = device; - auto top_level_device = device; - while (parent_device) { - top_level_device = parent_device; - OCL_CHECK(clGetDeviceInfo(top_level_device, CL_DEVICE_PARENT_DEVICE, - sizeof(cl_device_id), &parent_device, nullptr)); - } - - // Find the top level device in the list - auto it = std::find( - ocl_devices.begin(), ocl_devices.end(), top_level_device); - if (it != ocl_devices.end()) { - *index = it - ocl_devices.begin(); - return status::success; - } else { - *index = SIZE_MAX; - return status::invalid_arguments; - } -} - -cl_platform_id get_ocl_platform(cl_device_id device) { - cl_platform_id platform; - cl_int err = clGetDeviceInfo( - device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr); - if (err != CL_SUCCESS) return nullptr; - return platform; -} - -cl_platform_id get_ocl_platform(engine_t *engine) { - return utils::downcast(engine)->platform(); -} - status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type, cl_kernel ocl_kernel, cl_uint idx, bool allow_undef) { char s_type[16]; @@ -325,7 +137,7 @@ status_t get_ocl_program_binary_size( OCL_CHECK(err); // Identify local device index in the list of devices the program was - // compiled for. Using global indexing through `get_ocl_device_index` may + // compiled for. 
Using global indexing through `get_device_index` may // fail due to presence of two or more physical devices in the system. std::vector devices(n_devices); err = clGetProgramInfo(program, CL_PROGRAM_DEVICES, @@ -341,7 +153,7 @@ status_t get_ocl_program_binary_size( } status_t get_ocl_program_binary( - cl_program program, cl_device_id device, compute::binary_t &binary) { + cl_program program, cl_device_id device, hrt::binary_t &binary) { size_t n_devices = 0; CHECK(get_number_devices(program, &n_devices)); @@ -358,9 +170,9 @@ status_t get_ocl_program_binary( size_t device_idx = std::distance( devices.begin(), std::find(devices.begin(), devices.end(), device)); std::vector binary_pointers(n_devices); - std::vector binaries(n_devices); + std::vector binaries(n_devices); for (size_t i = 0; i < n_devices; ++i) { - binaries[i] = compute::binary_t(binarySize[i]); + binaries[i] = hrt::binary_t(binarySize[i]); binary_pointers[i] = binaries[i].data(); } @@ -372,7 +184,7 @@ status_t get_ocl_program_binary( } status_t get_ocl_program_binary( - cl_kernel kernel, cl_device_id device, compute::binary_t &binary) { + cl_kernel kernel, cl_device_id device, hrt::binary_t &binary) { cl_int err; cl_program program; @@ -383,8 +195,7 @@ status_t get_ocl_program_binary( return get_ocl_program_binary(program, device, binary); } -status_t get_ocl_kernel_binary( - cl_kernel ocl_kernel, compute::binary_t &binary) { +status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, hrt::binary_t &binary) { binary.clear(); size_t binary_size; OCL_CHECK(clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, 0, @@ -585,70 +396,6 @@ status_t get_ocl_device_eu_count(cl_device_id device, return status::success; } -status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) { - cl_int err; -#if !defined(DNNL_SYCL_HIP) && !defined(DNNL_SYCL_CUDA) \ - && defined(CL_VERSION_2_1) - *cloned_kernel = clCloneKernel(kernel, &err); - OCL_CHECK(err); -#else - // clCloneKernel is not available - recreate from the 
program. - auto name = get_kernel_name(kernel); - - cl_program program; - err = clGetKernelInfo( - kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); - OCL_CHECK(err); - - *cloned_kernel = clCreateKernel(program, name.c_str(), &err); - OCL_CHECK(err); -#endif - - return status::success; -} - -status_t create_ocl_program( - gpu::intel::ocl::ocl_wrapper_t &ocl_program, - cl_device_id dev, cl_context ctx, - const gpu::intel::compute::binary_t &binary) { - cl_int err; - const unsigned char *binary_buffer = binary.data(); - size_t binary_size = binary.size(); - assert(binary_size > 0); - - ocl_program = clCreateProgramWithBinary( - ctx, 1, &dev, &binary_size, &binary_buffer, nullptr, &err); - OCL_CHECK(err); - err = clBuildProgram(ocl_program, 1, &dev, nullptr, nullptr, nullptr); - OCL_CHECK(err); - - return status::success; -} - -status_t get_device_uuid( - gpu::intel::compute::device_uuid_t &uuid, cl_device_id ocl_dev) { - // This function is used only with SYCL that works with OpenCL 3.0 - // that supports `cl_khr_device_uuid` extension. 
-#if defined(cl_khr_device_uuid) - static_assert( - CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16"); - - cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {}; - OCL_CHECK(clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR, CL_UUID_SIZE_KHR, - ocl_dev_uuid, nullptr)); - - uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {}; - for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) { - size_t shift = i % sizeof(uint64_t) * CHAR_BIT; - uuid_packed[i / sizeof(uint64_t)] - |= (((uint64_t)ocl_dev_uuid[i]) << shift); - } - uuid = gpu::intel::compute::device_uuid_t(uuid_packed[0], uuid_packed[1]); - return status::success; -#endif - return status::runtime_error; -} - } // namespace ocl } // namespace intel } // namespace gpu diff --git a/src/gpu/intel/ocl/ocl_utils.hpp b/src/gpu/intel/ocl/ocl_utils.hpp index 7d2de7a7722..8c24d608ec3 100644 --- a/src/gpu/intel/ocl/ocl_utils.hpp +++ b/src/gpu/intel/ocl/ocl_utils.hpp @@ -30,6 +30,7 @@ #include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/compute/kernel_arg_list.hpp" #include "gpu/intel/compute/utils.hpp" +#include "hrt/ocl/utils.hpp" namespace dnnl { namespace impl { @@ -37,380 +38,8 @@ namespace gpu { namespace intel { namespace ocl { -std::string get_kernel_name(cl_kernel kernel); - -inline status_t convert_to_dnnl(cl_int cl_status) { - switch (cl_status) { - case CL_SUCCESS: return status::success; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - case CL_OUT_OF_RESOURCES: - case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; - case CL_DEVICE_NOT_FOUND: - case CL_DEVICE_NOT_AVAILABLE: - case CL_COMPILER_NOT_AVAILABLE: - case CL_PROFILING_INFO_NOT_AVAILABLE: - case CL_MEM_COPY_OVERLAP: - case CL_IMAGE_FORMAT_MISMATCH: - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - case CL_BUILD_PROGRAM_FAILURE: - case CL_MAP_FAILURE: - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - case CL_COMPILE_PROGRAM_FAILURE: - case CL_LINKER_NOT_AVAILABLE: - case CL_LINK_PROGRAM_FAILURE: - case 
CL_DEVICE_PARTITION_FAILED: - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - case CL_INVALID_PLATFORM: - case CL_INVALID_DEVICE: return status::runtime_error; - case CL_INVALID_VALUE: - case CL_INVALID_DEVICE_TYPE: - case CL_INVALID_CONTEXT: - case CL_INVALID_QUEUE_PROPERTIES: - case CL_INVALID_COMMAND_QUEUE: - case CL_INVALID_HOST_PTR: - case CL_INVALID_MEM_OBJECT: - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - case CL_INVALID_IMAGE_SIZE: - case CL_INVALID_SAMPLER: - case CL_INVALID_BINARY: - case CL_INVALID_BUILD_OPTIONS: - case CL_INVALID_PROGRAM: - case CL_INVALID_PROGRAM_EXECUTABLE: - case CL_INVALID_KERNEL_NAME: - case CL_INVALID_KERNEL_DEFINITION: - case CL_INVALID_KERNEL: - case CL_INVALID_ARG_INDEX: - case CL_INVALID_ARG_VALUE: - case CL_INVALID_ARG_SIZE: - case CL_INVALID_KERNEL_ARGS: - case CL_INVALID_WORK_DIMENSION: - case CL_INVALID_WORK_GROUP_SIZE: - case CL_INVALID_WORK_ITEM_SIZE: - case CL_INVALID_GLOBAL_OFFSET: - case CL_INVALID_EVENT_WAIT_LIST: - case CL_INVALID_EVENT: - case CL_INVALID_OPERATION: - case CL_INVALID_GL_OBJECT: - case CL_INVALID_BUFFER_SIZE: - case CL_INVALID_MIP_LEVEL: - case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments; - - default: return status::runtime_error; - } -} - -// Ordered by value as defined by opencl -inline const char *convert_cl_int_to_str(cl_int cl_status) { -#define CL_STATUS_CASE(status) \ - case status: return #status - switch (cl_status) { - CL_STATUS_CASE(CL_SUCCESS); - CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); - CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); - CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); - CL_STATUS_CASE(CL_OUT_OF_RESOURCES); - CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); - CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); - CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); - CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); - CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_MAP_FAILURE); - 
CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); - CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); - CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); - CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); - CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); - CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); - CL_STATUS_CASE(CL_INVALID_VALUE); - CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); - CL_STATUS_CASE(CL_INVALID_PLATFORM); - CL_STATUS_CASE(CL_INVALID_DEVICE); - CL_STATUS_CASE(CL_INVALID_CONTEXT); - CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); - CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); - CL_STATUS_CASE(CL_INVALID_HOST_PTR); - CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); - CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); - CL_STATUS_CASE(CL_INVALID_SAMPLER); - CL_STATUS_CASE(CL_INVALID_BINARY); - CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); - CL_STATUS_CASE(CL_INVALID_PROGRAM); - CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); - CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); - CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); - CL_STATUS_CASE(CL_INVALID_KERNEL); - CL_STATUS_CASE(CL_INVALID_ARG_INDEX); - CL_STATUS_CASE(CL_INVALID_ARG_VALUE); - CL_STATUS_CASE(CL_INVALID_ARG_SIZE); - CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); - CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); - CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); - CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); - CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); - CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); - CL_STATUS_CASE(CL_INVALID_EVENT); - CL_STATUS_CASE(CL_INVALID_OPERATION); - CL_STATUS_CASE(CL_INVALID_GL_OBJECT); - CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); - CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); - CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); -#undef CL_STATUS_CASE - default: return "unknown macro name"; - } -} enum { OCL_BUFFER_ALIGNMENT = 128 }; -#define MAYBE_REPORT_ERROR(msg) \ - do { \ - VERROR(primitive, gpu, msg); \ - } while (0) - -#define 
MAYBE_REPORT_OCL_ERROR(s) \ - do { \ - VERROR(primitive, ocl, "errcode %d,%s,%s:%d", int(s), \ - gpu::intel::ocl::convert_cl_int_to_str(s), __FILENAME__, \ - __LINE__); \ - } while (0) - -#define OCL_CHECK_V(x) \ - do { \ - cl_int s = x; \ - if (s != CL_SUCCESS) { \ - MAYBE_REPORT_OCL_ERROR(s); \ - return; \ - } \ - } while (0) - -#define OCL_CHECK(x) \ - do { \ - cl_int s = x; \ - if (s != CL_SUCCESS) { \ - MAYBE_REPORT_OCL_ERROR(s); \ - return dnnl::impl::gpu::intel::ocl::convert_to_dnnl(s); \ - } \ - } while (0) - -#define UNUSED_OCL_RESULT(x) \ - do { \ - cl_int s = x; \ - if (s != CL_SUCCESS) { MAYBE_REPORT_OCL_ERROR(s); } \ - assert(s == CL_SUCCESS); \ - MAYBE_UNUSED(s); \ - } while (false) - -// Check for three conditions: -// 1. Device and context are compatible, i.e. the device belongs to -// the context devices. -// 2. Device type matches the passed engine kind -// 3. Device/context platfrom is an Intel platform -status_t check_device(engine_kind_t eng_kind, cl_device_id dev, cl_context ctx); - -status_t get_ocl_devices( - std::vector *devices, cl_device_type device_type); - -status_t get_ocl_device_index(size_t *index, cl_device_id device); - -cl_platform_id get_ocl_platform(cl_device_id device); -cl_platform_id get_ocl_platform(engine_t *engine); - -namespace details { - -// OpenCL objects reference counting traits -template -struct ocl_ref_traits; -//{ -// static void retain(T t) {} -// static void release(T t) {} -//}; - -template <> -struct ocl_ref_traits { - static void retain(cl_context t) { UNUSED_OCL_RESULT(clRetainContext(t)); } - static void release(cl_context t) { - UNUSED_OCL_RESULT(clReleaseContext(t)); - } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_command_queue t) { - UNUSED_OCL_RESULT(clRetainCommandQueue(t)); - } - static void release(cl_command_queue t) { - UNUSED_OCL_RESULT(clReleaseCommandQueue(t)); - } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_program t) { 
UNUSED_OCL_RESULT(clRetainProgram(t)); } - static void release(cl_program t) { - UNUSED_OCL_RESULT(clReleaseProgram(t)); - } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_kernel t) { UNUSED_OCL_RESULT(clRetainKernel(t)); } - static void release(cl_kernel t) { UNUSED_OCL_RESULT(clReleaseKernel(t)); } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_mem t) { UNUSED_OCL_RESULT(clRetainMemObject(t)); } - static void release(cl_mem t) { UNUSED_OCL_RESULT(clReleaseMemObject(t)); } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_sampler t) { UNUSED_OCL_RESULT(clRetainSampler(t)); } - static void release(cl_sampler t) { - UNUSED_OCL_RESULT(clReleaseSampler(t)); - } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_event t) { UNUSED_OCL_RESULT(clRetainEvent(t)); } - static void release(cl_event t) { UNUSED_OCL_RESULT(clReleaseEvent(t)); } -}; - -template <> -struct ocl_ref_traits { - static void retain(cl_device_id t) { UNUSED_OCL_RESULT(clRetainDevice(t)); } - static void release(cl_device_id t) { - UNUSED_OCL_RESULT(clReleaseDevice(t)); - } -}; - -} // namespace details - -// Generic class providing RAII support for OpenCL objects -template -struct ocl_wrapper_t { - ocl_wrapper_t(T t = nullptr, bool retain = false) : t_(t) { - if (retain) { do_retain(); } - } - - ocl_wrapper_t(const ocl_wrapper_t &other) : t_(other.t_) { do_retain(); } - - ocl_wrapper_t(ocl_wrapper_t &&other) noexcept : ocl_wrapper_t() { - swap(*this, other); - } - - ocl_wrapper_t &operator=(ocl_wrapper_t other) { - swap(*this, other); - return *this; - } - - friend void swap(ocl_wrapper_t &a, ocl_wrapper_t &b) noexcept { - using std::swap; - swap(a.t_, b.t_); - } - - ~ocl_wrapper_t() { do_release(); } - - operator T() const { return t_; } - T get() const { return t_; } - T &unwrap() { return t_; } - const T &unwrap() const { return t_; } - - T release() { - T t = t_; - t_ = nullptr; - return t; - } - -private: - T t_; - - void 
do_retain() { - if (t_) { details::ocl_ref_traits::retain(t_); } - } - - void do_release() { - if (t_) { details::ocl_ref_traits::release(t_); } - } -}; - -// Constructs an OpenCL wrapper object (providing RAII support) -template -ocl_wrapper_t make_ocl_wrapper(T t, bool retain = false) { - return ocl_wrapper_t(t, retain); -} - -template -struct ext_func_t { - ext_func_t(const char *name) : ext_func_ptrs_(intel_platforms().size()) { - for (size_t i = 0; i < intel_platforms().size(); ++i) { - auto p = intel_platforms()[i]; - auto it = ext_func_ptrs_.insert({p, load_ext_func(p, name)}); - assert(it.second); - MAYBE_UNUSED(it); - } - } - - template - typename cpp_compat::invoke_result::type operator()( - engine_t *engine, Args... args) const { - auto f = get_func(engine); - return f(args...); - } - - F get_func(engine_t *engine) const { - return get_func(get_ocl_platform(engine)); - } - - F get_func(cl_platform_id platform) const { - return ext_func_ptrs_.at(platform); - } - -private: - std::unordered_map ext_func_ptrs_; - - static F load_ext_func(cl_platform_id platform, const char *name) { - return reinterpret_cast( - clGetExtensionFunctionAddressForPlatform(platform, name)); - } - - static const std::vector &intel_platforms() { - static auto intel_platforms = get_intel_platforms(); - return intel_platforms; - } - - static std::vector get_intel_platforms() { - cl_uint num_platforms = 0; - cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); - if (err != CL_SUCCESS) return {}; - - std::vector platforms(num_platforms); - err = clGetPlatformIDs(num_platforms, platforms.data(), nullptr); - if (err != CL_SUCCESS) return {}; - - std::vector intel_platforms; - char vendor_name[128] = {}; - for (cl_platform_id p : platforms) { - err = clGetPlatformInfo(p, CL_PLATFORM_VENDOR, sizeof(vendor_name), - vendor_name, nullptr); - if (err != CL_SUCCESS) continue; - if (std::string(vendor_name).find("Intel") != std::string::npos) - intel_platforms.push_back(p); - } - - // 
OpenCL can return a list of platforms that contains duplicates. - std::sort(intel_platforms.begin(), intel_platforms.end()); - intel_platforms.erase( - std::unique(intel_platforms.begin(), intel_platforms.end()), - intel_platforms.end()); - return intel_platforms; - } -}; - status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type, cl_kernel ocl_kernel, int idx, bool allow_undef = false); @@ -423,12 +52,12 @@ cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags, #endif status_t get_ocl_program_binary( - cl_program program, cl_device_id device, compute::binary_t &binary); + cl_program program, cl_device_id device, hrt::binary_t &binary); status_t get_ocl_program_binary( - cl_kernel kernel, cl_device_id device, compute::binary_t &binary); + cl_kernel kernel, cl_device_id device, hrt::binary_t &binary); -status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, compute::binary_t &binary); +status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, hrt::binary_t &binary); status_t get_ocl_program_binary_size( cl_kernel kernel, cl_device_id device, size_t *size); @@ -448,20 +77,6 @@ status_t get_ocl_device_enabled_systolic_intel( status_t get_ocl_device_enabled_native_float_atomics( cl_device_id device, uint64_t &native_extensions, bool is_xelpg); -status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel); - -status_t create_ocl_program( - gpu::intel::ocl::ocl_wrapper_t &ocl_program, - cl_device_id dev, cl_context ctx, - const gpu::intel::compute::binary_t &binary); - -status_t get_device_uuid( - gpu::intel::compute::device_uuid_t &uuid, cl_device_id ocl_dev); - -status_t get_ocl_devices(std::vector *devices, - std::vector> *sub_devices, - cl_device_type device_type); - } // namespace ocl } // namespace intel } // namespace gpu diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index 36c92995dd8..f861b6c43a4 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp 
@@ -22,6 +22,7 @@ #include "gpu/intel/ocl/stream_profiler.hpp" #include "gpu/intel/ocl/types_interop.hpp" #include "gpu/intel/utils.hpp" +#include "hrt/utils.hpp" #include "sycl/level_zero_utils.hpp" #include "sycl/sycl_c_types_map.hpp" #include "sycl/sycl_stream.hpp" @@ -178,7 +179,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, } status_t sycl_interop_gpu_kernel_t::dump() const { - intel::compute::binary_t binary; + hrt::binary_t binary; CHECK(get_kernel_binary(sycl_kernel(), binary)); return gpu::intel::gpu_utils::dump_kernel_binary(binary, name()); } diff --git a/src/hrt/CMakeLists.txt b/src/hrt/CMakeLists.txt new file mode 100644 index 00000000000..fb6323f6309 --- /dev/null +++ b/src/hrt/CMakeLists.txt @@ -0,0 +1,29 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) + +add_subdirectory(ocl) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_common_hrt) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/hrt/ocl/CMakeLists.txt b/src/hrt/ocl/CMakeLists.txt new file mode 100644 index 00000000000..f5f62bcec33 --- /dev/null +++ b/src/hrt/ocl/CMakeLists.txt @@ -0,0 +1,27 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt_ocl) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/hrt/ocl/utils.cpp b/src/hrt/ocl/utils.cpp new file mode 100644 index 00000000000..da956a42399 --- /dev/null +++ b/src/hrt/ocl/utils.cpp @@ -0,0 +1,416 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +// Include for: +// - CL_PLATFORM_NOT_FOUND_KHR +// - CL_UUID_SIZE_KHR +// - CL_DEVICE_UUID_KHR +#include + +#include "hrt/ocl/utils.hpp" + +// XXX: Include this header for VERROR_ENGINE. +// TODO: Move VERROR_ENGINE and other similar macros to a separate file. +#include "common/engine.hpp" + +// TODO: remove it when engine_impl_t is introduced. 
+#include "gpu/intel/ocl/ocl_gpu_engine.hpp" + +namespace dnnl { +namespace impl { +namespace hrt { +namespace ocl { + +status_t convert_to_dnnl(cl_int cl_status) { + switch (cl_status) { + case CL_SUCCESS: return status::success; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + case CL_OUT_OF_RESOURCES: + case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; + case CL_DEVICE_NOT_FOUND: + case CL_DEVICE_NOT_AVAILABLE: + case CL_COMPILER_NOT_AVAILABLE: + case CL_PROFILING_INFO_NOT_AVAILABLE: + case CL_MEM_COPY_OVERLAP: + case CL_IMAGE_FORMAT_MISMATCH: + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + case CL_BUILD_PROGRAM_FAILURE: + case CL_MAP_FAILURE: + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + case CL_COMPILE_PROGRAM_FAILURE: + case CL_LINKER_NOT_AVAILABLE: + case CL_LINK_PROGRAM_FAILURE: + case CL_DEVICE_PARTITION_FAILED: + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + case CL_INVALID_PLATFORM: + case CL_INVALID_DEVICE: return status::runtime_error; + case CL_INVALID_VALUE: + case CL_INVALID_DEVICE_TYPE: + case CL_INVALID_CONTEXT: + case CL_INVALID_QUEUE_PROPERTIES: + case CL_INVALID_COMMAND_QUEUE: + case CL_INVALID_HOST_PTR: + case CL_INVALID_MEM_OBJECT: + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + case CL_INVALID_IMAGE_SIZE: + case CL_INVALID_SAMPLER: + case CL_INVALID_BINARY: + case CL_INVALID_BUILD_OPTIONS: + case CL_INVALID_PROGRAM: + case CL_INVALID_PROGRAM_EXECUTABLE: + case CL_INVALID_KERNEL_NAME: + case CL_INVALID_KERNEL_DEFINITION: + case CL_INVALID_KERNEL: + case CL_INVALID_ARG_INDEX: + case CL_INVALID_ARG_VALUE: + case CL_INVALID_ARG_SIZE: + case CL_INVALID_KERNEL_ARGS: + case CL_INVALID_WORK_DIMENSION: + case CL_INVALID_WORK_GROUP_SIZE: + case CL_INVALID_WORK_ITEM_SIZE: + case CL_INVALID_GLOBAL_OFFSET: + case CL_INVALID_EVENT_WAIT_LIST: + case CL_INVALID_EVENT: + case CL_INVALID_OPERATION: + case CL_INVALID_GL_OBJECT: + case CL_INVALID_BUFFER_SIZE: + case CL_INVALID_MIP_LEVEL: + case CL_INVALID_GLOBAL_WORK_SIZE: 
return status::invalid_arguments; + + default: return status::runtime_error; + } +} + +// Ordered by value as defined by opencl +const char *convert_cl_int_to_str(cl_int cl_status) { +#define CL_STATUS_CASE(status) \ + case status: return #status + switch (cl_status) { + CL_STATUS_CASE(CL_SUCCESS); + CL_STATUS_CASE(CL_DEVICE_NOT_FOUND); + CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE); + CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_STATUS_CASE(CL_OUT_OF_RESOURCES); + CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY); + CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_MEM_COPY_OVERLAP); + CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH); + CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_MAP_FAILURE); + CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET); + CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE); + CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE); + CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED); + CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); + CL_STATUS_CASE(CL_INVALID_VALUE); + CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE); + CL_STATUS_CASE(CL_INVALID_PLATFORM); + CL_STATUS_CASE(CL_INVALID_DEVICE); + CL_STATUS_CASE(CL_INVALID_CONTEXT); + CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES); + CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE); + CL_STATUS_CASE(CL_INVALID_HOST_PTR); + CL_STATUS_CASE(CL_INVALID_MEM_OBJECT); + CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE); + CL_STATUS_CASE(CL_INVALID_SAMPLER); + CL_STATUS_CASE(CL_INVALID_BINARY); + CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS); + CL_STATUS_CASE(CL_INVALID_PROGRAM); + CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE); + CL_STATUS_CASE(CL_INVALID_KERNEL_NAME); + CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION); + CL_STATUS_CASE(CL_INVALID_KERNEL); + CL_STATUS_CASE(CL_INVALID_ARG_INDEX); + 
CL_STATUS_CASE(CL_INVALID_ARG_VALUE); + CL_STATUS_CASE(CL_INVALID_ARG_SIZE); + CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS); + CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION); + CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE); + CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE); + CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET); + CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST); + CL_STATUS_CASE(CL_INVALID_EVENT); + CL_STATUS_CASE(CL_INVALID_OPERATION); + CL_STATUS_CASE(CL_INVALID_GL_OBJECT); + CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE); + CL_STATUS_CASE(CL_INVALID_MIP_LEVEL); + CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE); +#undef CL_STATUS_CASE + default: return "unknown macro name"; + } +} + +template +static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) { + size_t name_size; + cl_int err = get_func(obj, name_query, 0, nullptr, &name_size); + // Ignore error. + UNUSED_OCL_RESULT(err); + + // Include null terminator explicitly - to safely overwrite it in + // clGetKernelInfo + std::string name(name_size, 0); + err = get_func(obj, name_query, name_size, &name[0], nullptr); + // Ignore error. 
+ UNUSED_OCL_RESULT(err); + + // Remove the null terminator as std::string already includes it + name.resize(name_size - 1); + return name; +} + +std::string get_kernel_name(cl_kernel kernel) { + return get_ocl_name(kernel, clGetKernelInfo, CL_KERNEL_FUNCTION_NAME); +} + +static std::string get_platform_name(cl_platform_id platform) { + return get_ocl_name(platform, clGetPlatformInfo, CL_PLATFORM_NAME); +} + +static bool is_intel_platform(cl_platform_id platform) { + auto name = get_platform_name(platform); + return name.find("Intel") != std::string::npos; +} + +status_t get_devices(std::vector *devices, + cl_device_type device_type, cl_uint vendor_id /* = 0x8086 */) { + cl_uint num_platforms = 0; + + cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); + // No platforms - a valid scenario + if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; + + OCL_CHECK(err); + + std::vector platforms(num_platforms); + OCL_CHECK(clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); + + for (size_t i = 0; i < platforms.size(); ++i) { + if (!is_intel_platform(platforms[i])) continue; + + cl_uint num_devices = 0; + cl_int err = clGetDeviceIDs( + platforms[i], device_type, 0, nullptr, &num_devices); + + if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { + return status::runtime_error; + } + + if (num_devices != 0) { + std::vector plat_devices; + plat_devices.resize(num_devices); + OCL_CHECK(clGetDeviceIDs(platforms[i], device_type, num_devices, + &plat_devices[0], nullptr)); + + // Use the devices for the requested vendor only. 
+ for (size_t j = 0; j < plat_devices.size(); ++j) { + cl_uint v_id; + OCL_CHECK(clGetDeviceInfo(plat_devices[j], CL_DEVICE_VENDOR_ID, + sizeof(cl_uint), &v_id, nullptr)); + if (v_id == vendor_id) { devices->push_back(plat_devices[j]); } + } + } + } + // No devices found but still return success + return status::success; +} + +status_t get_devices(std::vector *devices, + std::vector> *sub_devices, + cl_device_type device_type) { + std::vector devices_tmp; + std::vector> sub_devices_tmp; + + CHECK(get_devices(&devices_tmp, device_type)); + + for (cl_device_id d : devices_tmp) { + cl_uint max_sub_devices; + cl_device_partition_property properties[3] + = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0}; + cl_int err = clCreateSubDevices( + d, properties, 0, nullptr, &max_sub_devices); + if (err == CL_DEVICE_PARTITION_FAILED) continue; + OCL_CHECK(err); + std::vector sds(max_sub_devices); + OCL_CHECK(clCreateSubDevices( + d, properties, max_sub_devices, sds.data(), nullptr)); + for (cl_device_id sd : sds) + sub_devices_tmp.emplace_back(sd); + } + *devices = devices_tmp; + *sub_devices = std::move(sub_devices_tmp); + return status::success; +} + +status_t get_device_index(size_t *index, cl_device_id device) { + std::vector ocl_devices; + cl_device_type device_type; + OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type), + &device_type, nullptr)); + CHECK(get_devices(&ocl_devices, device_type)); + + // Search the top level device unconditionally + auto parent_device = device; + auto top_level_device = device; + while (parent_device) { + top_level_device = parent_device; + OCL_CHECK(clGetDeviceInfo(top_level_device, CL_DEVICE_PARENT_DEVICE, + sizeof(cl_device_id), &parent_device, nullptr)); + } + + // Find the top level device in the list + auto it = std::find( + ocl_devices.begin(), ocl_devices.end(), top_level_device); + if (it != ocl_devices.end()) { + *index = it - ocl_devices.begin(); + return 
status::success; + } else { + *index = SIZE_MAX; + return status::invalid_arguments; + } +} + +cl_platform_id get_platform(cl_device_id device) { + cl_platform_id platform; + cl_int err = clGetDeviceInfo( + device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr); + if (err != CL_SUCCESS) return nullptr; + return platform; +} + +cl_platform_id get_platform(engine_t *engine) { + return utils::downcast(engine) + ->platform(); +} + +status_t create_program(ocl::wrapper_t &ocl_program, + cl_device_id dev, cl_context ctx, const hrt::binary_t &binary) { + cl_int err; + const unsigned char *binary_buffer = binary.data(); + size_t binary_size = binary.size(); + assert(binary_size > 0); + + ocl_program = clCreateProgramWithBinary( + ctx, 1, &dev, &binary_size, &binary_buffer, nullptr, &err); + OCL_CHECK(err); + err = clBuildProgram(ocl_program, 1, &dev, nullptr, nullptr, nullptr); + OCL_CHECK(err); + + return status::success; +} + +status_t get_device_uuid(hrt::device_uuid_t &uuid, cl_device_id ocl_dev) { + // This function is used only with SYCL that works with OpenCL 3.0 + // that supports `cl_khr_device_uuid` extension. +#if defined(cl_khr_device_uuid) + static_assert( + CL_UUID_SIZE_KHR == 16, "CL_UUID_SIZE_KHR is expected to be 16"); + + cl_uchar ocl_dev_uuid[CL_UUID_SIZE_KHR] = {}; + OCL_CHECK(clGetDeviceInfo(ocl_dev, CL_DEVICE_UUID_KHR, CL_UUID_SIZE_KHR, + ocl_dev_uuid, nullptr)); + + uint64_t uuid_packed[CL_UUID_SIZE_KHR / sizeof(uint64_t)] = {}; + for (size_t i = 0; i < CL_UUID_SIZE_KHR; ++i) { + size_t shift = i % sizeof(uint64_t) * CHAR_BIT; + uuid_packed[i / sizeof(uint64_t)] + |= (((uint64_t)ocl_dev_uuid[i]) << shift); + } + uuid = hrt::device_uuid_t(uuid_packed[0], uuid_packed[1]); + return status::success; +#endif + return status::runtime_error; +} + +status_t check_device( + engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) { + assert(dev && ctx); + + // Check device and context consistency. 
+ size_t dev_bytes; + OCL_CHECK( + clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, nullptr, &dev_bytes)); + + std::vector ctx_devices(dev_bytes / sizeof(cl_device_id)); + OCL_CHECK(clGetContextInfo( + ctx, CL_CONTEXT_DEVICES, dev_bytes, &ctx_devices[0], nullptr)); + + bool found = false; + for (size_t i = 0; i < ctx_devices.size(); ++i) { + if (ctx_devices[i] == dev) { + found = true; + break; + } + } + VERROR_ENGINE( + found, status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); + + // Check engine kind and device consistency. + cl_device_type dev_type; + OCL_CHECK(clGetDeviceInfo( + dev, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, nullptr)); + VERROR_ENGINE(!((eng_kind == engine_kind::cpu) + && (dev_type & CL_DEVICE_TYPE_CPU) == 0), + status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); + VERROR_ENGINE(!((eng_kind == engine_kind::gpu) + && (dev_type & CL_DEVICE_TYPE_GPU) == 0), + status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL + // Check that the platform is an Intel platform. + cl_platform_id platform; + OCL_CHECK(clGetDeviceInfo( + dev, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr)); + + VERROR_ENGINE(is_intel_platform(platform), status::invalid_arguments, + VERBOSE_INVALID_PLATFORM, "ocl", "intel", + get_platform_name(platform).c_str()); +#endif + return status::success; +} + +status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) { + cl_int err; +#if defined(CL_VERSION_2_1) + *cloned_kernel = clCloneKernel(kernel, &err); + OCL_CHECK(err); +#else + // clCloneKernel is not available - recreate from the program. 
+ auto name = get_kernel_name(kernel); + + cl_program program; + err = clGetKernelInfo( + kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); + OCL_CHECK(err); + + *cloned_kernel = clCreateKernel(program, name.c_str(), &err); + OCL_CHECK(err); +#endif + + return status::success; +} + +} // namespace ocl +} // namespace hrt +} // namespace impl +} // namespace dnnl diff --git a/src/hrt/ocl/utils.hpp b/src/hrt/ocl/utils.hpp new file mode 100644 index 00000000000..bca644ee0e2 --- /dev/null +++ b/src/hrt/ocl/utils.hpp @@ -0,0 +1,302 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_HRT_OCL_UTILS_HPP +#define COMMON_HRT_OCL_UTILS_HPP + +#include + +#include "oneapi/dnnl/dnnl_config.h" + +#include "common/c_types_map.hpp" +#include "common/cpp_compat.hpp" +#include "common/utils.hpp" +#include "common/verbose.hpp" + +#include "hrt/utils.hpp" + +namespace dnnl { +namespace impl { +namespace hrt { +namespace ocl { + +status_t convert_to_dnnl(cl_int cl_status); +const char *convert_cl_int_to_str(cl_int cl_status); + +#define MAYBE_REPORT_ERROR(msg) \ + do { \ + VERROR(primitive, gpu, msg); \ + } while (0) + +#define MAYBE_REPORT_OCL_ERROR(s) \ + do { \ + VERROR(primitive, ocl, "errcode %d,%s,%s:%d", int(s), \ + dnnl::impl::hrt::ocl::convert_cl_int_to_str(s), __FILENAME__, \ + __LINE__); \ + } while (0) + +#define OCL_CHECK_V(x) \ + do { \ + cl_int s = x; \ + if (s != CL_SUCCESS) { \ + MAYBE_REPORT_OCL_ERROR(s); \ + return; \ + } \ + } while (0) + +#define OCL_CHECK(x) \ + do { \ + cl_int s = x; \ + if (s != CL_SUCCESS) { \ + MAYBE_REPORT_OCL_ERROR(s); \ + return dnnl::impl::hrt::ocl::convert_to_dnnl(s); \ + } \ + } while (0) + +#define UNUSED_OCL_RESULT(x) \ + do { \ + cl_int s = x; \ + if (s != CL_SUCCESS) { MAYBE_REPORT_OCL_ERROR(s); } \ + assert(s == CL_SUCCESS); \ + MAYBE_UNUSED(s); \ + } while (false) + +// OpenCL objects reference counting traits +template +struct ref_traits; +//{ +// static void retain(T t) {} +// static void release(T t) {} +//}; + +template <> +struct ref_traits { + static void retain(cl_context t) { UNUSED_OCL_RESULT(clRetainContext(t)); } + static void release(cl_context t) { + UNUSED_OCL_RESULT(clReleaseContext(t)); + } +}; + +template <> +struct ref_traits { + static void retain(cl_command_queue t) { + UNUSED_OCL_RESULT(clRetainCommandQueue(t)); + } + static void release(cl_command_queue t) { + UNUSED_OCL_RESULT(clReleaseCommandQueue(t)); + } +}; + +template <> +struct ref_traits { + static void retain(cl_program t) { 
UNUSED_OCL_RESULT(clRetainProgram(t)); } + static void release(cl_program t) { + UNUSED_OCL_RESULT(clReleaseProgram(t)); + } +}; + +template <> +struct ref_traits { + static void retain(cl_kernel t) { UNUSED_OCL_RESULT(clRetainKernel(t)); } + static void release(cl_kernel t) { UNUSED_OCL_RESULT(clReleaseKernel(t)); } +}; + +template <> +struct ref_traits { + static void retain(cl_mem t) { UNUSED_OCL_RESULT(clRetainMemObject(t)); } + static void release(cl_mem t) { UNUSED_OCL_RESULT(clReleaseMemObject(t)); } +}; + +template <> +struct ref_traits { + static void retain(cl_sampler t) { UNUSED_OCL_RESULT(clRetainSampler(t)); } + static void release(cl_sampler t) { + UNUSED_OCL_RESULT(clReleaseSampler(t)); + } +}; + +template <> +struct ref_traits { + static void retain(cl_event t) { UNUSED_OCL_RESULT(clRetainEvent(t)); } + static void release(cl_event t) { UNUSED_OCL_RESULT(clReleaseEvent(t)); } +}; + +template <> +struct ref_traits { + static void retain(cl_device_id t) { UNUSED_OCL_RESULT(clRetainDevice(t)); } + static void release(cl_device_id t) { + UNUSED_OCL_RESULT(clReleaseDevice(t)); + } +}; + +// Generic class providing RAII support for OpenCL objects +template +struct wrapper_t { + wrapper_t(T t = nullptr, bool retain = false) : t_(t) { + if (retain) { do_retain(); } + } + + wrapper_t(const wrapper_t &other) : t_(other.t_) { do_retain(); } + + wrapper_t(wrapper_t &&other) noexcept : wrapper_t() { swap(*this, other); } + + wrapper_t &operator=(wrapper_t other) { + swap(*this, other); + return *this; + } + + friend void swap(wrapper_t &a, wrapper_t &b) noexcept { + using std::swap; + swap(a.t_, b.t_); + } + + ~wrapper_t() { do_release(); } + + operator T() const { return t_; } + T get() const { return t_; } + T &unwrap() { return t_; } + const T &unwrap() const { return t_; } + + T release() { + T t = t_; + t_ = nullptr; + return t; + } + +private: + T t_; + + void do_retain() { + if (t_) { ref_traits::retain(t_); } + } + + void do_release() { + if (t_) { 
ref_traits::release(t_); } + } +}; + +// Constructs an OpenCL wrapper object (providing RAII support) +template +wrapper_t make_wrapper(T t, bool retain = false) { + return wrapper_t(t, retain); +} + +cl_platform_id get_platform(cl_device_id device); +cl_platform_id get_platform(engine_t *engine); + +template +struct ext_func_t { + ext_func_t(const char *ext_func_name, const char *vendor_name = "Intel") + : ext_func_ptrs_(vendor_platforms().size()) { + for (size_t i = 0; i < vendor_platforms().size(); ++i) { + auto p = vendor_platforms()[i]; + auto it = ext_func_ptrs_.insert( + {p, load_ext_func(p, ext_func_name)}); + assert(it.second); + MAYBE_UNUSED(it); + } + } + + template + typename cpp_compat::invoke_result::type operator()( + engine_t *engine, Args... args) const { + auto f = get_func(engine); + return f(args...); + } + + F get_func(engine_t *engine) const { + return get_func(get_platform(engine)); + } + + F get_func(cl_platform_id platform) const { + return ext_func_ptrs_.at(platform); + } + +private: + std::unordered_map ext_func_ptrs_; + + static F load_ext_func(cl_platform_id platform, const char *ext_func_name) { + return reinterpret_cast(clGetExtensionFunctionAddressForPlatform( + platform, ext_func_name)); + } + + static const std::vector &vendor_platforms() { + static auto vendor_platforms = get_vendor_platforms(); + return vendor_platforms; + } + + static std::vector get_vendor_platforms() { + cl_uint num_platforms = 0; + cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); + if (err != CL_SUCCESS) return {}; + + std::vector platforms(num_platforms); + err = clGetPlatformIDs(num_platforms, platforms.data(), nullptr); + if (err != CL_SUCCESS) return {}; + + std::vector vendor_platforms; + char vendor_name[128] = {}; + for (cl_platform_id p : platforms) { + err = clGetPlatformInfo(p, CL_PLATFORM_VENDOR, sizeof(vendor_name), + vendor_name, nullptr); + if (err != CL_SUCCESS) continue; + if (std::string(vendor_name).find(vendor_name) != 
std::string::npos) + vendor_platforms.push_back(p); + } + + // OpenCL can return a list of platforms that contains duplicates. + std::sort(vendor_platforms.begin(), vendor_platforms.end()); + vendor_platforms.erase( + std::unique(vendor_platforms.begin(), vendor_platforms.end()), + vendor_platforms.end()); + return vendor_platforms; + } +}; + +std::string get_kernel_name(cl_kernel kernel); + +status_t get_devices(std::vector *devices, + cl_device_type device_type, cl_uint vendor_id = 0x8086); + +status_t get_devices(std::vector *devices, + std::vector> *sub_devices, + cl_device_type device_type); + +status_t get_device_index(size_t *index, cl_device_id device); + +cl_platform_id get_platform(cl_device_id device); +cl_platform_id get_platform(engine_t *engine); + +status_t create_program(ocl::wrapper_t &ocl_program, + cl_device_id dev, cl_context ctx, const hrt::binary_t &binary); + +status_t get_device_uuid(hrt::device_uuid_t &uuid, cl_device_id ocl_dev); + +// Check for three conditions: +// 1. Device and context are compatible, i.e. the device belongs to +// the context devices. +// 2. Device type matches the passed engine kind +// 3. Device/context platfrom is an Intel platform +status_t check_device(engine_kind_t eng_kind, cl_device_id dev, cl_context ctx); + +status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel); + +} // namespace ocl +} // namespace hrt +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/hrt/utils.cpp b/src/hrt/utils.cpp new file mode 100644 index 00000000000..682967272fa --- /dev/null +++ b/src/hrt/utils.cpp @@ -0,0 +1,33 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include + +#include "hrt/utils.hpp" + +namespace dnnl { +namespace impl { +namespace hrt { + +size_t device_uuid_hasher_t::operator()(const device_uuid_t &uuid) const { + const size_t seed = hash_combine(0, std::get<0>(uuid)); + return hash_combine(seed, std::get<1>(uuid)); +} + +} // namespace hrt +} // namespace impl +} // namespace dnnl diff --git a/src/hrt/utils.hpp b/src/hrt/utils.hpp new file mode 100644 index 00000000000..3564bf3a879 --- /dev/null +++ b/src/hrt/utils.hpp @@ -0,0 +1,43 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef COMMON_HRT_UTILS_HPP +#define COMMON_HRT_UTILS_HPP + +#include +#include + +#include "common/utils.hpp" + +// This file contains utility functionality for heterogeneous runtimes such +// as OpenCL and SYCL. 
+ +namespace dnnl { +namespace impl { +namespace hrt { + +using binary_t = std::vector; +using device_uuid_t = std::tuple; + +struct device_uuid_hasher_t { + size_t operator()(const device_uuid_t &uuid) const; +}; + +} // namespace hrt +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/sycl/level_zero_utils.cpp b/src/sycl/level_zero_utils.cpp index a8f73a951e0..d3bb29345d7 100644 --- a/src/sycl/level_zero_utils.cpp +++ b/src/sycl/level_zero_utils.cpp @@ -152,7 +152,7 @@ status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize, // we query it directly from Level0 with the zeDeviceGetProperties function. // The `get_device_uuid` function packs 128 bits of the device UUID, which are // represented as an uint8_t array of size 16, to 2 uint64_t values. -gpu::intel::compute::device_uuid_t get_device_uuid(const ::sycl::device &dev) { +hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); @@ -171,13 +171,13 @@ gpu::intel::compute::device_uuid_t get_device_uuid(const ::sycl::device &dev) { size_t shift = i % sizeof(uint64_t) * CHAR_BIT; uuid[i / sizeof(uint64_t)] |= (((uint64_t)ze_device_id[i]) << shift); } - return gpu::intel::compute::device_uuid_t(uuid[0], uuid[1]); + return hrt::device_uuid_t(uuid[0], uuid[1]); } status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, const std::string &kernel_name, const sycl_engine_base_t *sycl_engine, - const gpu::intel::compute::binary_t &binary) { + const hrt::binary_t &binary) { auto desc = ze_module_desc_t(); desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; desc.format = ZE_MODULE_FORMAT_NATIVE; diff --git a/src/sycl/level_zero_utils.hpp b/src/sycl/level_zero_utils.hpp index f4deeba4b63..ad5e42fd93c 100644 --- a/src/sycl/level_zero_utils.hpp +++ b/src/sycl/level_zero_utils.hpp @@ -28,7 +28,7 @@ namespace dnnl { namespace impl { namespace sycl { 
-gpu::intel::compute::device_uuid_t get_device_uuid(const ::sycl::device &dev); +hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev); // including sycl_engine_base.hpp leads to circular dependencies, w/a for now. class sycl_engine_base_t; @@ -36,7 +36,7 @@ class sycl_engine_base_t; status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, const std::string &kernel_name, const sycl_engine_base_t *sycl_engine, - const gpu::intel::compute::binary_t &binary); + const hrt::binary_t &binary); bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); diff --git a/src/sycl/sycl_compat.cpp b/src/sycl/sycl_compat.cpp index b1e866b2252..bcf6b887ed8 100644 --- a/src/sycl/sycl_compat.cpp +++ b/src/sycl/sycl_compat.cpp @@ -79,12 +79,12 @@ void *get_native(const ::sycl::context &ctx) { } status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, - const sycl_engine_base_t *sycl_engine, - const gpu::intel::compute::binary_t &binary, const char *kernel_name) { + const sycl_engine_base_t *sycl_engine, const hrt::binary_t &binary, + const char *kernel_name) { auto backend = get_sycl_backend(sycl_engine->device()); if (backend == backend_t::opencl) { - gpu::intel::ocl::ocl_wrapper_t ocl_program; - CHECK(create_ocl_program(ocl_program, sycl_engine->ocl_device(), + hrt::ocl::wrapper_t ocl_program; + CHECK(hrt::ocl::create_program(ocl_program, sycl_engine->ocl_device(), sycl_engine->ocl_context(), binary)); cl_int err; cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err); diff --git a/src/sycl/sycl_compat.hpp b/src/sycl/sycl_compat.hpp index d612289fd7e..14f132936f1 100644 --- a/src/sycl/sycl_compat.hpp +++ b/src/sycl/sycl_compat.hpp @@ -28,8 +28,8 @@ class sycl_engine_base_t; namespace compat { status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, - const sycl_engine_base_t *sycl_engine, - const gpu::intel::compute::binary_t &binary, const char *kernel_name); + const sycl_engine_base_t 
*sycl_engine, const hrt::binary_t &binary, + const char *kernel_name); void *get_native(const ::sycl::device &dev); void *get_native(const ::sycl::context &ctx); diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index 8fa69e6b4da..83fe676ee94 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -44,9 +44,9 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { cl_int err = CL_SUCCESS; auto ocl_dev = compat::get_native(device); - auto ocl_dev_wrapper = gpu::intel::ocl::make_ocl_wrapper(ocl_dev); + auto ocl_dev_wrapper = hrt::ocl::make_wrapper(ocl_dev); - auto ocl_ctx_wrapper = gpu::intel::ocl::make_ocl_wrapper( + auto ocl_ctx_wrapper = hrt::ocl::make_wrapper( clCreateContext(nullptr, 1, &ocl_dev, nullptr, nullptr, &err)); OCL_CHECK(err); diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index 4e18ef77664..5ab0d5bda0b 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -73,7 +73,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { if (!ocl_kernels[i]) continue; auto *k = utils::downcast( ocl_kernels[i].impl()); - gpu::intel::compute::binary_t binary; + hrt::binary_t binary; CHECK(k->get_binary(ocl_engine, binary)); CHECK(create_kernel_from_binary( kernels[i], binary, kernel_names[i])); @@ -82,7 +82,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { } status_t create_kernel_from_binary(gpu::intel::compute::kernel_t &kernel, - const gpu::intel::compute::binary_t &binary, + const hrt::binary_t &binary, const char *kernel_name) const override { std::vector arg_types; @@ -133,7 +133,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { auto kernel_name = jitter->kernel_name(); - gpu::intel::compute::binary_t binary = jitter->get_binary( + hrt::binary_t binary = jitter->get_binary( ocl_engine->context(), ocl_engine->device()); return create_kernel_from_binary(*kernel, binary, 
kernel_name); } @@ -171,7 +171,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); - return gpu::intel::ocl::make_ocl_wrapper( + return hrt::ocl::make_wrapper( compat::get_native(device())); } cl_context ocl_context() const { @@ -180,7 +180,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); - return gpu::intel::ocl::make_ocl_wrapper( + return hrt::ocl::make_wrapper( compat::get_native(context())); } diff --git a/src/sycl/sycl_utils.cpp b/src/sycl/sycl_utils.cpp index 0cd78205244..7a3719c1552 100644 --- a/src/sycl/sycl_utils.cpp +++ b/src/sycl/sycl_utils.cpp @@ -112,7 +112,7 @@ device_id_t sycl_device_id(const ::sycl::device &dev) { = device_id_t {static_cast(backend_t::unknown), 0, 0}; switch (get_sycl_backend(dev)) { case backend_t::opencl: { - auto ocl_device = gpu::intel::ocl::make_ocl_wrapper( + auto ocl_device = hrt::ocl::make_wrapper( compat::get_native(dev)); device_id = std::make_tuple(static_cast(backend_t::opencl), reinterpret_cast(ocl_device.get()), 0); @@ -189,14 +189,14 @@ status_t check_device(engine_kind_t eng_kind, const ::sycl::device &dev, struct uuid2ocl_dev_t { uuid2ocl_dev_t() = default; - status_t add(gpu::intel::compute::device_uuid_t uuid, - const gpu::intel::ocl::ocl_wrapper_t &d) { + status_t add(hrt::device_uuid_t uuid, + const hrt::ocl::wrapper_t &d) { auto it = mapper_.insert(std::make_pair(uuid, d)); if (!it.second) return status::runtime_error; return status::success; } - cl_device_id get(gpu::intel::compute::device_uuid_t uuid) const { + cl_device_id get(hrt::device_uuid_t uuid) const { auto it = mapper_.find(uuid); if (it == mapper_.end()) return nullptr; return it->second; @@ -212,9 +212,8 @@ struct uuid2ocl_dev_t { } private: - using mapper_t = std::unordered_map, - gpu::intel::compute::device_uuid_hasher_t>; + using mapper_t = std::unordered_map, 
hrt::device_uuid_hasher_t>; void release() { auto t = utils::make_unique(); @@ -237,26 +236,25 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { auto uuid2ocl_dev_tmp = uuid2ocl_dev_t(); std::vector ocl_devices; - std::vector> - ocl_sub_devices; - auto st = gpu::intel::ocl::get_ocl_devices( + std::vector> ocl_sub_devices; + auto st = hrt::ocl::get_devices( &ocl_devices, &ocl_sub_devices, CL_DEVICE_TYPE_GPU); assert(st == status::success); MAYBE_UNUSED(st); - const auto register_ocl_dev = - [&uuid2ocl_dev_tmp]( - const gpu::intel::ocl::ocl_wrapper_t &d) { - device_uuid_t ocl_dev_uuid; - auto st = gpu::intel::ocl::get_device_uuid(ocl_dev_uuid, d); - assert(st == status::success); - st = uuid2ocl_dev_tmp.add(ocl_dev_uuid, d); - assert(st == status::success); - MAYBE_UNUSED(st); - }; + const auto register_ocl_dev + = [&uuid2ocl_dev_tmp]( + const hrt::ocl::wrapper_t &d) { + hrt::device_uuid_t ocl_dev_uuid; + auto st = hrt::ocl::get_device_uuid(ocl_dev_uuid, d); + assert(st == status::success); + st = uuid2ocl_dev_tmp.add(ocl_dev_uuid, d); + assert(st == status::success); + MAYBE_UNUSED(st); + }; for (cl_device_id d : ocl_devices) { - register_ocl_dev(gpu::intel::ocl::make_ocl_wrapper(d)); + register_ocl_dev(hrt::ocl::make_wrapper(d)); } for (const auto &sd_wrapper : ocl_sub_devices) { register_ocl_dev(sd_wrapper); @@ -267,7 +265,7 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { if (uuid2ocl_dev.empty()) return status::runtime_error; - const device_uuid_t l0_dev_uuid = get_device_uuid(dev); + const hrt::device_uuid_t l0_dev_uuid = get_device_uuid(dev); auto d = uuid2ocl_dev.get(l0_dev_uuid); if (!d) return status::runtime_error; @@ -287,23 +285,23 @@ static status_t create_ocl_engine( // The SYCL context is always provided for OpenCL backend. 
if (backend == backend_t::opencl && !sycl_ctx) return status::runtime_error; - gpu::intel::ocl::ocl_wrapper_t ocl_dev; - gpu::intel::ocl::ocl_wrapper_t ocl_ctx; + hrt::ocl::wrapper_t ocl_dev; + hrt::ocl::wrapper_t ocl_ctx; switch (backend) { case backend_t::opencl: - ocl_dev = gpu::intel::ocl::make_ocl_wrapper( + ocl_dev = hrt::ocl::make_wrapper( compat::get_native(sycl_dev)); - ocl_ctx = gpu::intel::ocl::make_ocl_wrapper( + ocl_ctx = hrt::ocl::make_wrapper( compat::get_native(*sycl_ctx)); break; case backend_t::level0: { cl_device_id d {nullptr}; CHECK(sycl_dev2ocl_dev(&d, sycl_dev)); - ocl_dev = gpu::intel::ocl::make_ocl_wrapper(d, true); + ocl_dev = hrt::ocl::make_wrapper(d, true); cl_int err; - ocl_ctx = gpu::intel::ocl::make_ocl_wrapper( + ocl_ctx = hrt::ocl::make_wrapper( clCreateContext(nullptr, 1, &d, nullptr, nullptr, &err)); OCL_CHECK(err); break; @@ -312,7 +310,7 @@ static status_t create_ocl_engine( } engine_t *ocl_engine_ptr; size_t index; - CHECK(gpu::intel::ocl::get_ocl_device_index(&index, ocl_dev)); + CHECK(hrt::ocl::get_device_index(&index, ocl_dev)); CHECK(f.engine_create(&ocl_engine_ptr, ocl_dev, ocl_ctx, index)); ocl_engine->reset(utils::downcast( ocl_engine_ptr)); @@ -328,7 +326,7 @@ status_t create_ocl_engine( } status_t get_kernel_binary( - const ::sycl::kernel &kernel, gpu::intel::compute::binary_t &binary) { + const ::sycl::kernel &kernel, hrt::binary_t &binary) { auto devs = kernel.get_context().get_devices(); assert(!devs.empty()); switch (get_sycl_backend(devs[0])) { @@ -338,7 +336,7 @@ status_t get_kernel_binary( ::sycl::backend::ext_oneapi_level_zero>(bundle); auto module = module_vec[0]; size_t module_binary_size; - gpu::intel::compute::binary_t module_binary; + hrt::binary_t module_binary; CHECK(func_zeModuleGetNativeBinary( module, &module_binary_size, nullptr)); module_binary.resize(module_binary_size); @@ -349,15 +347,15 @@ status_t get_kernel_binary( engine_deleter_t> ocl_engine; CHECK(create_ocl_engine(&ocl_engine, devs[0])); - 
gpu::intel::ocl::ocl_wrapper_t ocl_program; - CHECK(gpu::intel::ocl::create_ocl_program(ocl_program, + hrt::ocl::wrapper_t ocl_program; + CHECK(hrt::ocl::create_program(ocl_program, ocl_engine->device(), ocl_engine->context(), module_binary)); cl_int err; auto name = kernel.get_info< ::sycl::info::kernel::function_name>(); - auto ocl_kernel = gpu::intel::ocl::make_ocl_wrapper( + auto ocl_kernel = hrt::ocl::make_wrapper( clCreateKernel(ocl_program, name.c_str(), &err)); OCL_CHECK(err); CHECK(gpu::intel::ocl::get_ocl_kernel_binary( diff --git a/src/sycl/sycl_utils.hpp b/src/sycl/sycl_utils.hpp index c514d67b4ae..9a634defe09 100644 --- a/src/sycl/sycl_utils.hpp +++ b/src/sycl/sycl_utils.hpp @@ -21,6 +21,7 @@ #include "common/utils.hpp" #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/ocl/ocl_gpu_engine.hpp" +#include "hrt/utils.hpp" #if __has_include() #include @@ -173,8 +174,7 @@ status_t create_ocl_engine( *ocl_engine, const sycl_engine_base_t *engine); -status_t get_kernel_binary( - const ::sycl::kernel &kernel, gpu::intel::compute::binary_t &binary); +status_t get_kernel_binary(const ::sycl::kernel &kernel, hrt::binary_t &binary); status_t create_ocl_engine( std::unique_ptr From 67da38c8561ee6d63b157a2a37a2ade6f310e89e Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 29 Apr 2024 15:33:06 -0700 Subject: [PATCH 012/187] sycl: move l0 header and utils to a new place --- src/gpu/intel/CMakeLists.txt | 4 +++ src/gpu/intel/jit/ngen/ngen_level_zero.hpp | 2 +- .../intel/sycl/l0}/level_zero/_clang-format | 0 .../l0}/level_zero/layers/zel_tracing_api.h | 0 .../l0}/level_zero/layers/zel_tracing_ddi.h | 0 .../layers/zel_tracing_register_cb.h | 0 .../sycl/l0}/level_zero/loader/ze_loader.h | 0 .../intel/sycl/l0}/level_zero/ze.py | 0 .../intel/sycl/l0}/level_zero/ze_api.h | 0 .../intel/sycl/l0}/level_zero/ze_ddi.h | 0 .../intel/sycl/l0}/level_zero/zes.py | 0 .../intel/sycl/l0}/level_zero/zes_api.h | 0 .../intel/sycl/l0}/level_zero/zes_ddi.h | 0 
.../intel/sycl/l0}/level_zero/zet.py | 0 .../intel/sycl/l0}/level_zero/zet_api.h | 0 .../intel/sycl/l0}/level_zero/zet_ddi.h | 0 .../intel/sycl/l0/utils.cpp} | 29 ++++++++++++------- .../intel/sycl/l0/utils.hpp} | 21 ++++++++++---- src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 2 +- src/sycl/sycl_compat.cpp | 5 ++-- src/sycl/sycl_utils.cpp | 15 ++++++---- 21 files changed, 52 insertions(+), 26 deletions(-) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/_clang-format (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/layers/zel_tracing_api.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/layers/zel_tracing_ddi.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/layers/zel_tracing_register_cb.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/loader/ze_loader.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/ze.py (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/ze_api.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/ze_ddi.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zes.py (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zes_api.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zes_ddi.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zet.py (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zet_api.h (100%) rename src/{sycl => gpu/intel/sycl/l0}/level_zero/zet_ddi.h (100%) rename src/{sycl/level_zero_utils.cpp => gpu/intel/sycl/l0/utils.cpp} (90%) rename src/{sycl/level_zero_utils.hpp => gpu/intel/sycl/l0/utils.hpp} (83%) diff --git a/src/gpu/intel/CMakeLists.txt b/src/gpu/intel/CMakeLists.txt index dc308e646c2..1b8c3b5d594 100644 --- a/src/gpu/intel/CMakeLists.txt +++ b/src/gpu/intel/CMakeLists.txt @@ -29,6 +29,10 @@ add_subdirectory(compute) add_subdirectory(jit) add_subdirectory(ocl) +if(DNNL_WITH_SYCL) + add_subdirectory(sycl) +endif() + set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY 
DNNL_LIB_DEPS diff --git a/src/gpu/intel/jit/ngen/ngen_level_zero.hpp b/src/gpu/intel/jit/ngen/ngen_level_zero.hpp index f721d6e6b78..e5b8c691a4d 100644 --- a/src/gpu/intel/jit/ngen/ngen_level_zero.hpp +++ b/src/gpu/intel/jit/ngen/ngen_level_zero.hpp @@ -19,7 +19,7 @@ #include "ngen_config.hpp" -#include +#include "gpu/intel/sycl/l0/level_zero/ze_api.h" #include diff --git a/src/sycl/level_zero/_clang-format b/src/gpu/intel/sycl/l0/level_zero/_clang-format similarity index 100% rename from src/sycl/level_zero/_clang-format rename to src/gpu/intel/sycl/l0/level_zero/_clang-format diff --git a/src/sycl/level_zero/layers/zel_tracing_api.h b/src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_api.h similarity index 100% rename from src/sycl/level_zero/layers/zel_tracing_api.h rename to src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_api.h diff --git a/src/sycl/level_zero/layers/zel_tracing_ddi.h b/src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_ddi.h similarity index 100% rename from src/sycl/level_zero/layers/zel_tracing_ddi.h rename to src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_ddi.h diff --git a/src/sycl/level_zero/layers/zel_tracing_register_cb.h b/src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_register_cb.h similarity index 100% rename from src/sycl/level_zero/layers/zel_tracing_register_cb.h rename to src/gpu/intel/sycl/l0/level_zero/layers/zel_tracing_register_cb.h diff --git a/src/sycl/level_zero/loader/ze_loader.h b/src/gpu/intel/sycl/l0/level_zero/loader/ze_loader.h similarity index 100% rename from src/sycl/level_zero/loader/ze_loader.h rename to src/gpu/intel/sycl/l0/level_zero/loader/ze_loader.h diff --git a/src/sycl/level_zero/ze.py b/src/gpu/intel/sycl/l0/level_zero/ze.py similarity index 100% rename from src/sycl/level_zero/ze.py rename to src/gpu/intel/sycl/l0/level_zero/ze.py diff --git a/src/sycl/level_zero/ze_api.h b/src/gpu/intel/sycl/l0/level_zero/ze_api.h similarity index 100% rename from src/sycl/level_zero/ze_api.h rename 
to src/gpu/intel/sycl/l0/level_zero/ze_api.h diff --git a/src/sycl/level_zero/ze_ddi.h b/src/gpu/intel/sycl/l0/level_zero/ze_ddi.h similarity index 100% rename from src/sycl/level_zero/ze_ddi.h rename to src/gpu/intel/sycl/l0/level_zero/ze_ddi.h diff --git a/src/sycl/level_zero/zes.py b/src/gpu/intel/sycl/l0/level_zero/zes.py similarity index 100% rename from src/sycl/level_zero/zes.py rename to src/gpu/intel/sycl/l0/level_zero/zes.py diff --git a/src/sycl/level_zero/zes_api.h b/src/gpu/intel/sycl/l0/level_zero/zes_api.h similarity index 100% rename from src/sycl/level_zero/zes_api.h rename to src/gpu/intel/sycl/l0/level_zero/zes_api.h diff --git a/src/sycl/level_zero/zes_ddi.h b/src/gpu/intel/sycl/l0/level_zero/zes_ddi.h similarity index 100% rename from src/sycl/level_zero/zes_ddi.h rename to src/gpu/intel/sycl/l0/level_zero/zes_ddi.h diff --git a/src/sycl/level_zero/zet.py b/src/gpu/intel/sycl/l0/level_zero/zet.py similarity index 100% rename from src/sycl/level_zero/zet.py rename to src/gpu/intel/sycl/l0/level_zero/zet.py diff --git a/src/sycl/level_zero/zet_api.h b/src/gpu/intel/sycl/l0/level_zero/zet_api.h similarity index 100% rename from src/sycl/level_zero/zet_api.h rename to src/gpu/intel/sycl/l0/level_zero/zet_api.h diff --git a/src/sycl/level_zero/zet_ddi.h b/src/gpu/intel/sycl/l0/level_zero/zet_ddi.h similarity index 100% rename from src/sycl/level_zero/zet_ddi.h rename to src/gpu/intel/sycl/l0/level_zero/zet_ddi.h diff --git a/src/sycl/level_zero_utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp similarity index 90% rename from src/sycl/level_zero_utils.cpp rename to src/gpu/intel/sycl/l0/utils.cpp index d3bb29345d7..276093065a7 100644 --- a/src/sycl/level_zero_utils.cpp +++ b/src/gpu/intel/sycl/l0/utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you 
may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include "sycl/level_zero_utils.hpp" +#include "gpu/intel/sycl/l0/utils.hpp" #include "oneapi/dnnl/dnnl_config.h" #include @@ -27,7 +27,7 @@ #error "Level Zero is supported on Linux and Windows only" #endif -#include +#include "gpu/intel/sycl/l0/level_zero/ze_api.h" #if !defined(__SYCL_COMPILER_VERSION) #error "Unsupported compiler" @@ -47,6 +47,8 @@ namespace dnnl { namespace impl { +namespace gpu { +namespace intel { namespace sycl { namespace { @@ -159,7 +161,7 @@ hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { auto ze_device_properties = ze_device_properties_t(); ze_device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - auto ze_device = compat::get_native(dev); + auto ze_device = impl::sycl::compat::get_native(dev); auto status = func_zeDeviceGetProperties(ze_device, &ze_device_properties); MAYBE_UNUSED(status); assert(status == status::success); @@ -176,7 +178,8 @@ hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, - const std::string &kernel_name, const sycl_engine_base_t *sycl_engine, + const std::string &kernel_name, + const impl::sycl::sycl_engine_base_t *sycl_engine, const hrt::binary_t &binary) { auto desc = ze_module_desc_t(); desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; @@ -188,10 +191,10 @@ status_t sycl_create_kernel_with_level_zero( ze_module_handle_t ze_module; - auto ze_device - = compat::get_native(sycl_engine->device()); - auto ze_ctx - = compat::get_native(sycl_engine->context()); + auto ze_device = impl::sycl::compat::get_native( + sycl_engine->device()); + auto ze_ctx = impl::sycl::compat::get_native( + sycl_engine->context()); CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); 
::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle @@ -211,12 +214,16 @@ status_t sycl_create_kernel_with_level_zero( } bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_ze_handle = compat::get_native(lhs); - auto rhs_ze_handle = compat::get_native(rhs); + auto lhs_ze_handle + = impl::sycl::compat::get_native(lhs); + auto rhs_ze_handle + = impl::sycl::compat::get_native(rhs); return lhs_ze_handle == rhs_ze_handle; } } // namespace sycl +} // namespace intel +} // namespace gpu } // namespace impl } // namespace dnnl diff --git a/src/sycl/level_zero_utils.hpp b/src/gpu/intel/sycl/l0/utils.hpp similarity index 83% rename from src/sycl/level_zero_utils.hpp rename to src/gpu/intel/sycl/l0/utils.hpp index ad5e42fd93c..c976df21414 100644 --- a/src/sycl/level_zero_utils.hpp +++ b/src/gpu/intel/sycl/l0/utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,18 +24,27 @@ #include "gpu/intel/compute/kernel.hpp" #include "sycl/sycl_compat.hpp" +// including sycl_engine_base.hpp leads to circular dependencies, w/a for now. namespace dnnl { namespace impl { namespace sycl { +class sycl_engine_base_t; +} +} // namespace impl +} // namespace dnnl -hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev); +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace sycl { -// including sycl_engine_base.hpp leads to circular dependencies, w/a for now. 
-class sycl_engine_base_t; +hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev); status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, - const std::string &kernel_name, const sycl_engine_base_t *sycl_engine, + const std::string &kernel_name, + const impl::sycl::sycl_engine_base_t *sycl_engine, const hrt::binary_t &binary); bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); @@ -44,6 +53,8 @@ status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize, uint8_t *pModuleNativeBinary); } // namespace sycl +} // namespace intel +} // namespace gpu } // namespace impl } // namespace dnnl diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index f861b6c43a4..f93340412f4 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp @@ -21,9 +21,9 @@ #include "gpu/intel/ocl/ocl_utils.hpp" #include "gpu/intel/ocl/stream_profiler.hpp" #include "gpu/intel/ocl/types_interop.hpp" +#include "gpu/intel/sycl/l0/utils.hpp" #include "gpu/intel/utils.hpp" #include "hrt/utils.hpp" -#include "sycl/level_zero_utils.hpp" #include "sycl/sycl_c_types_map.hpp" #include "sycl/sycl_stream.hpp" #include "sycl/sycl_utils.hpp" diff --git a/src/sycl/sycl_compat.cpp b/src/sycl/sycl_compat.cpp index bcf6b887ed8..2060a1db9ab 100644 --- a/src/sycl/sycl_compat.cpp +++ b/src/sycl/sycl_compat.cpp @@ -14,12 +14,13 @@ * limitations under the License. 
*******************************************************************************/ -#include #include #include "oneapi/dnnl/dnnl_config.h" #include "sycl/sycl_utils.hpp" +#include "gpu/intel/sycl/l0/level_zero/ze_api.h" + #if __has_include() #include #elif __has_include() @@ -32,7 +33,7 @@ #include "common/utils.hpp" #include "gpu/intel/compute/device_info.hpp" -#include "sycl/level_zero_utils.hpp" +#include "gpu/intel/sycl/l0/utils.hpp" #include "sycl/sycl_compat.hpp" #include "sycl/sycl_engine_base.hpp" diff --git a/src/sycl/sycl_utils.cpp b/src/sycl/sycl_utils.cpp index 7a3719c1552..66fdfbc464a 100644 --- a/src/sycl/sycl_utils.cpp +++ b/src/sycl/sycl_utils.cpp @@ -19,7 +19,7 @@ #include "sycl/sycl_compat.hpp" #include "sycl/sycl_engine_base.hpp" -#include "sycl/level_zero_utils.hpp" +#include "gpu/intel/sycl/l0/utils.hpp" #include @@ -87,7 +87,9 @@ bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs) { return lhs_ocl_handle == rhs_ocl_handle; } - if (lhs_be == backend_t::level0) { return compare_ze_devices(lhs, rhs); } + if (lhs_be == backend_t::level0) { + return gpu::intel::sycl::compare_ze_devices(lhs, rhs); + } #ifdef DNNL_SYCL_CUDA if (lhs_be == backend_t::nvidia) { @@ -121,7 +123,7 @@ device_id_t sycl_device_id(const ::sycl::device &dev) { case backend_t::level0: { device_id = std::tuple_cat( std::make_tuple(static_cast(backend_t::level0)), - get_device_uuid(dev)); + gpu::intel::sycl::get_device_uuid(dev)); break; } case backend_t::unknown: assert(!"unknown backend"); break; @@ -265,7 +267,8 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { if (uuid2ocl_dev.empty()) return status::runtime_error; - const hrt::device_uuid_t l0_dev_uuid = get_device_uuid(dev); + const hrt::device_uuid_t l0_dev_uuid + = gpu::intel::sycl::get_device_uuid(dev); auto d = uuid2ocl_dev.get(l0_dev_uuid); if (!d) return status::runtime_error; @@ -337,10 +340,10 @@ status_t get_kernel_binary( auto module = module_vec[0]; size_t 
module_binary_size; hrt::binary_t module_binary; - CHECK(func_zeModuleGetNativeBinary( + CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary( module, &module_binary_size, nullptr)); module_binary.resize(module_binary_size); - CHECK(func_zeModuleGetNativeBinary( + CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary( module, &module_binary_size, module_binary.data())); { std::unique_ptr Date: Wed, 1 May 2024 00:27:59 -0700 Subject: [PATCH 013/187] sycl: split compat and utility code into common and vendor specific --- src/gpu/amd/sycl_hip_compat.hpp | 4 +- src/gpu/amd/sycl_hip_engine.cpp | 6 +- src/gpu/amd/sycl_hip_utils.hpp | 2 +- src/gpu/intel/compute/device_info.cpp | 2 +- src/gpu/intel/sycl/CMakeLists.txt | 29 ++ .../intel/sycl/compat.cpp} | 49 +-- src/gpu/intel/sycl/compat.hpp | 55 +++ src/gpu/intel/sycl/l0/utils.cpp | 16 +- src/gpu/intel/sycl/l0/utils.hpp | 2 +- .../intel/sycl/utils.cpp} | 203 ++-------- src/gpu/intel/sycl/utils.hpp | 59 +++ src/gpu/nvidia/sycl_cuda_compat.hpp | 4 +- src/gpu/nvidia/sycl_cuda_engine.cpp | 6 +- src/gpu/nvidia/sycl_cuda_utils.hpp | 6 +- src/gpu/sycl/sycl_gpu_engine.hpp | 4 +- src/gpu/sycl/sycl_gpu_kernel.hpp | 6 +- src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 11 +- src/gpu/sycl/sycl_interop_gpu_kernel.hpp | 2 +- src/gpu/sycl/sycl_math_utils.hpp | 4 +- src/gpu/sycl/sycl_q10n.hpp | 4 +- src/gpu/sycl/sycl_types.hpp | 6 +- src/hrt/CMakeLists.txt | 8 +- src/hrt/sycl/CMakeLists.txt | 27 ++ src/hrt/sycl/compat.cpp | 73 ++++ .../sycl_compat.hpp => hrt/sycl/compat.hpp} | 22 +- src/hrt/sycl/utils.cpp | 348 ++++++++++++++++++ src/hrt/sycl/utils.hpp | 92 +++++ src/sycl/capi/capi_engine.cpp | 6 +- src/sycl/stream_profiler.cpp | 2 +- src/sycl/sycl_buffer_memory_storage.cpp | 11 +- src/sycl/sycl_buffer_memory_storage.hpp | 14 +- src/sycl/sycl_cpu_engine.hpp | 4 +- src/sycl/sycl_device_info.cpp | 30 +- src/sycl/sycl_engine.cpp | 6 +- src/sycl/sycl_engine.hpp | 78 +--- src/sycl/sycl_engine_base.cpp | 4 +- src/sycl/sycl_engine_base.hpp | 43 ++- 
src/sycl/sycl_stream.cpp | 2 +- src/sycl/sycl_stream.hpp | 4 +- src/sycl/sycl_stream_submit_cpu_primitive.cpp | 4 +- src/sycl/sycl_stream_submit_cpu_primitive.hpp | 4 +- src/sycl/sycl_utils.hpp | 188 ---------- .../sycl/test_cpp_api_compiled_partition.cpp | 8 +- .../graph/api/sycl/test_cpp_api_engine.cpp | 4 +- .../graph/api/sycl/test_cpp_api_tensor.cpp | 2 +- tests/gtests/graph/api/test_api_common.cpp | 11 +- tests/gtests/graph/api/test_api_common.hpp | 4 +- .../unit/interface/sycl/test_allocator.cpp | 10 +- tests/gtests/graph/unit/unit_test_common.cpp | 4 +- tests/gtests/graph/unit/unit_test_common.hpp | 2 +- tests/gtests/sycl/api/test_engine.cpp | 6 +- tests/gtests/sycl/api/test_memory_buffer.cpp | 8 +- 52 files changed, 891 insertions(+), 618 deletions(-) create mode 100644 src/gpu/intel/sycl/CMakeLists.txt rename src/{sycl/sycl_compat.cpp => gpu/intel/sycl/compat.cpp} (78%) create mode 100644 src/gpu/intel/sycl/compat.hpp rename src/{sycl/sycl_utils.cpp => gpu/intel/sycl/utils.cpp} (55%) create mode 100644 src/gpu/intel/sycl/utils.hpp create mode 100644 src/hrt/sycl/CMakeLists.txt create mode 100644 src/hrt/sycl/compat.cpp rename src/{sycl/sycl_compat.hpp => hrt/sycl/compat.hpp} (88%) create mode 100644 src/hrt/sycl/utils.cpp create mode 100644 src/hrt/sycl/utils.hpp delete mode 100644 src/sycl/sycl_utils.hpp diff --git a/src/gpu/amd/sycl_hip_compat.hpp b/src/gpu/amd/sycl_hip_compat.hpp index 1dfe01b4dd7..ccdcf51f09f 100644 --- a/src/gpu/amd/sycl_hip_compat.hpp +++ b/src/gpu/amd/sycl_hip_compat.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ #include -#include "sycl/sycl_compat.hpp" +#include "hrt/sycl/compat.hpp" #include "gpu/amd/sycl_hip_utils.hpp" diff --git a/src/gpu/amd/sycl_hip_engine.cpp b/src/gpu/amd/sycl_hip_engine.cpp index 26d2bad2214..8399789f0d6 100644 --- a/src/gpu/amd/sycl_hip_engine.cpp +++ b/src/gpu/amd/sycl_hip_engine.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,8 +19,8 @@ #include "common/utils.hpp" #include "hip/hip_runtime.h" +#include "hrt/sycl/utils.hpp" #include "miopen/miopen.h" -#include "sycl/sycl_utils.hpp" #include "gpu/amd/miopen_batch_normalization.hpp" #include "gpu/amd/miopen_binary.hpp" @@ -136,7 +136,7 @@ rocblas_handle *sycl_hip_engine_t::get_rocblas_handle() { } device_id_t sycl_hip_engine_t::device_id() const { - return device_id_t(static_cast(impl::sycl::backend_t::amd), + return device_id_t(static_cast(hrt::sycl::backend_t::amd), static_cast(compat::get_native(device())), static_cast(0)); } diff --git a/src/gpu/amd/sycl_hip_utils.hpp b/src/gpu/amd/sycl_hip_utils.hpp index 9ee9515b4a3..a60423a5928 100644 --- a/src/gpu/amd/sycl_hip_utils.hpp +++ b/src/gpu/amd/sycl_hip_utils.hpp @@ -29,7 +29,7 @@ #include "common/utils.hpp" #include "common/z_magic.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" #include "gpu/amd/sycl_hip_compat.hpp" diff --git a/src/gpu/intel/compute/device_info.cpp b/src/gpu/intel/compute/device_info.cpp index 2b75c66102a..3bfb6b59570 100644 --- a/src/gpu/intel/compute/device_info.cpp +++ b/src/gpu/intel/compute/device_info.cpp @@ -253,7 +253,7 @@ status_t device_info_t::init_attributes_common(engine_t *engine) { using namespace impl::sycl; if (engine->runtime_kind() == runtime_kind::sycl) { auto *sycl_engine = utils::downcast(engine); - ocl_backend = 
(sycl_engine->backend() == backend_t::opencl); + ocl_backend = (sycl_engine->backend() == hrt::sycl::backend_t::opencl); } #endif diff --git a/src/gpu/intel/sycl/CMakeLists.txt b/src/gpu/intel/sycl/CMakeLists.txt new file mode 100644 index 00000000000..63d937ddb7b --- /dev/null +++ b/src/gpu/intel/sycl/CMakeLists.txt @@ -0,0 +1,29 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) + +include_directories_with_host_compiler(${CMAKE_CURRENT_SOURCE_DIR}) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_sycl) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/sycl/sycl_compat.cpp b/src/gpu/intel/sycl/compat.cpp similarity index 78% rename from src/sycl/sycl_compat.cpp rename to src/gpu/intel/sycl/compat.cpp index 2060a1db9ab..c0f8615fba9 100644 --- a/src/sycl/sycl_compat.cpp +++ b/src/gpu/intel/sycl/compat.cpp @@ -16,8 +16,8 @@ #include +#include "gpu/intel/sycl/utils.hpp" #include "oneapi/dnnl/dnnl_config.h" -#include "sycl/sycl_utils.hpp" #include "gpu/intel/sycl/l0/level_zero/ze_api.h" @@ -33,12 +33,14 @@ #include "common/utils.hpp" #include "gpu/intel/compute/device_info.hpp" +#include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/l0/utils.hpp" -#include "sycl/sycl_compat.hpp" #include "sycl/sycl_engine_base.hpp" namespace dnnl { namespace impl { +namespace gpu { +namespace intel { namespace sycl { status_t func_zeKernelCreate( @@ -48,42 +50,11 @@ namespace compat { using namespace gpu::intel::compute; -namespace { -template -void *get_native_impl(backend_t backend, const sycl_object_t &sycl_object) { - if (backend == backend_t::opencl) { - return ::sycl::get_native<::sycl::backend::opencl>(sycl_object); - } else if (backend == backend_t::level0) { - return ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( - sycl_object); - } else { - assert(!"unexpected"); - return nullptr; - } - return nullptr; -} - -} // namespace - -void *get_native(const ::sycl::device &dev) { - auto backend = get_sycl_backend(dev); - return get_native_impl(backend, dev); -} - -void *get_native(const ::sycl::context &ctx) { - auto devices = 
ctx.get_devices(); - assert(!devices.empty()); - if (devices.empty()) return nullptr; - // backend is expected to be the same for all devices in a context. - auto backend = get_sycl_backend(devices[0]); - return get_native_impl(backend, ctx); -} - status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, - const sycl_engine_base_t *sycl_engine, const hrt::binary_t &binary, - const char *kernel_name) { - auto backend = get_sycl_backend(sycl_engine->device()); - if (backend == backend_t::opencl) { + const impl::sycl::sycl_engine_base_t *sycl_engine, + const hrt::binary_t &binary, const char *kernel_name) { + auto backend = hrt::sycl::get_backend(sycl_engine->device()); + if (backend == hrt::sycl::backend_t::opencl) { hrt::ocl::wrapper_t ocl_program; CHECK(hrt::ocl::create_program(ocl_program, sycl_engine->ocl_device(), sycl_engine->ocl_context(), binary)); @@ -93,7 +64,7 @@ status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, sycl_kernel = utils::make_unique<::sycl::kernel>( ::sycl::make_kernel<::sycl::backend::opencl>( ocl_kernel, sycl_engine->context())); - } else if (backend == backend_t::level0) { + } else if (backend == hrt::sycl::backend_t::level0) { CHECK(sycl_create_kernel_with_level_zero( sycl_kernel, kernel_name, sycl_engine, binary)); } else { @@ -160,5 +131,7 @@ uint64_t init_extensions(const ::sycl::device &dev) { } // namespace compat } // namespace sycl +} // namespace intel +} // namespace gpu } // namespace impl } // namespace dnnl diff --git a/src/gpu/intel/sycl/compat.hpp b/src/gpu/intel/sycl/compat.hpp new file mode 100644 index 00000000000..a25362e1350 --- /dev/null +++ b/src/gpu/intel/sycl/compat.hpp @@ -0,0 +1,55 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_SYCL_COMPAT_HPP +#define GPU_INTEL_SYCL_COMPAT_HPP + +#include "hrt/sycl/compat.hpp" + +#include "gpu/intel/sycl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace sycl { + +class sycl_engine_base_t; + +} +} // namespace impl +} // namespace dnnl + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace sycl { + +namespace compat { + +status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, + const impl::sycl::sycl_engine_base_t *sycl_engine, + const hrt::binary_t &binary, const char *kernel_name); + +uint64_t init_extensions(const ::sycl::device &dev); + +} // namespace compat +} // namespace sycl +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/intel/sycl/l0/utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp index 276093065a7..abaed7640f8 100644 --- a/src/gpu/intel/sycl/l0/utils.cpp +++ b/src/gpu/intel/sycl/l0/utils.cpp @@ -17,8 +17,6 @@ #include "gpu/intel/sycl/l0/utils.hpp" #include "oneapi/dnnl/dnnl_config.h" -#include - #if defined(__linux__) #include #elif defined(_WIN32) @@ -40,7 +38,7 @@ #include "common/c_types_map.hpp" #include "common/verbose.hpp" -#include "sycl/sycl_utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include #include "sycl/sycl_engine_base.hpp" @@ -161,7 +159,7 @@ hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { auto ze_device_properties = ze_device_properties_t(); ze_device_properties.stype = 
ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - auto ze_device = impl::sycl::compat::get_native(dev); + auto ze_device = hrt::sycl::compat::get_native(dev); auto status = func_zeDeviceGetProperties(ze_device, &ze_device_properties); MAYBE_UNUSED(status); assert(status == status::success); @@ -191,9 +189,9 @@ status_t sycl_create_kernel_with_level_zero( ze_module_handle_t ze_module; - auto ze_device = impl::sycl::compat::get_native( + auto ze_device = hrt::sycl::compat::get_native( sycl_engine->device()); - auto ze_ctx = impl::sycl::compat::get_native( + auto ze_ctx = hrt::sycl::compat::get_native( sycl_engine->context()); CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); @@ -214,10 +212,8 @@ status_t sycl_create_kernel_with_level_zero( } bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_ze_handle - = impl::sycl::compat::get_native(lhs); - auto rhs_ze_handle - = impl::sycl::compat::get_native(rhs); + auto lhs_ze_handle = hrt::sycl::compat::get_native(lhs); + auto rhs_ze_handle = hrt::sycl::compat::get_native(rhs); return lhs_ze_handle == rhs_ze_handle; } diff --git a/src/gpu/intel/sycl/l0/utils.hpp b/src/gpu/intel/sycl/l0/utils.hpp index c976df21414..c3bcb8c150a 100644 --- a/src/gpu/intel/sycl/l0/utils.hpp +++ b/src/gpu/intel/sycl/l0/utils.hpp @@ -22,7 +22,7 @@ #include #include "gpu/intel/compute/kernel.hpp" -#include "sycl/sycl_compat.hpp" +#include "gpu/intel/sycl/compat.hpp" // including sycl_engine_base.hpp leads to circular dependencies, w/a for now. namespace dnnl { diff --git a/src/sycl/sycl_utils.cpp b/src/gpu/intel/sycl/utils.cpp similarity index 55% rename from src/sycl/sycl_utils.cpp rename to src/gpu/intel/sycl/utils.cpp index 66fdfbc464a..0651c3262be 100644 --- a/src/sycl/sycl_utils.cpp +++ b/src/gpu/intel/sycl/utils.cpp @@ -14,178 +14,42 @@ * limitations under the License. 
*******************************************************************************/ -#include "sycl/sycl_utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" -#include "sycl/sycl_compat.hpp" +#include "gpu/intel/sycl/compat.hpp" #include "sycl/sycl_engine_base.hpp" #include "gpu/intel/sycl/l0/utils.hpp" +#include "hrt/ocl/utils.hpp" #include -#ifdef DNNL_SYCL_CUDA -// Do not include sycl_cuda_utils.hpp because it's intended for use in -// gpu/nvidia directory only. - -namespace dnnl { -namespace impl { -namespace gpu { -namespace nvidia { -bool compare_cuda_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); -} -} // namespace gpu -} // namespace impl -} // namespace dnnl -#endif - -#ifdef DNNL_SYCL_HIP -// Do not include sycl_cuda_utils.hpp because it's intended for use in -// gpu/amd directory only. namespace dnnl { namespace impl { namespace gpu { -namespace amd { -bool compare_hip_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); -} -} // namespace gpu -} // namespace impl -} // namespace dnnl -#endif -namespace dnnl { -namespace impl { +namespace intel { namespace sycl { -backend_t get_sycl_gpu_backend() { - // Create default GPU device and query its backend (assumed as default) - static backend_t default_backend = []() { - const backend_t fallback = backend_t::opencl; - - const auto gpu_type = ::sycl::info::device_type::gpu; - if (::sycl::device::get_devices(gpu_type).empty()) return fallback; - - ::sycl::device dev {compat::gpu_selector_v}; - backend_t backend = get_sycl_backend(dev); - - return backend; - }(); - - return default_backend; -} - -bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_be = get_sycl_backend(lhs); - auto rhs_be = get_sycl_backend(rhs); - if (lhs_be != rhs_be) return false; +::sycl::nd_range<3> to_sycl_nd_range( + const gpu::intel::compute::nd_range_t &range) { + const auto &local_range = range.local_range(); + const auto &global_range = 
range.global_range(); - // Only one host device exists. - if (lhs_be == backend_t::host) return true; + assert(range.ndims() <= 3); + auto sycl_global_range = ::sycl::range<3>( + global_range.ndims() >= 3 ? global_range[2] : 1, + global_range.ndims() >= 2 ? global_range[1] : 1, global_range[0]); - if (lhs_be == backend_t::opencl) { - // Use wrapper objects to avoid memory leak. - auto lhs_ocl_handle = compat::get_native(lhs); - auto rhs_ocl_handle = compat::get_native(rhs); - return lhs_ocl_handle == rhs_ocl_handle; + if (!local_range) { + assert(!"not expected"); + return ::sycl::nd_range<3>( + sycl_global_range, ::sycl::range<3>(1, 1, 1)); } - if (lhs_be == backend_t::level0) { - return gpu::intel::sycl::compare_ze_devices(lhs, rhs); - } - -#ifdef DNNL_SYCL_CUDA - if (lhs_be == backend_t::nvidia) { - return gpu::nvidia::compare_cuda_devices(lhs, rhs); - } -#endif - -#ifdef DNNL_SYCL_HIP - if (lhs_be == backend_t::amd) { - return gpu::amd::compare_hip_devices(lhs, rhs); - } -#endif - assert(!"not expected"); - return false; -} - -device_id_t sycl_device_id(const ::sycl::device &dev) { - if (is_host(dev)) - return std::make_tuple(static_cast(backend_t::host), 0, 0); - - device_id_t device_id - = device_id_t {static_cast(backend_t::unknown), 0, 0}; - switch (get_sycl_backend(dev)) { - case backend_t::opencl: { - auto ocl_device = hrt::ocl::make_wrapper( - compat::get_native(dev)); - device_id = std::make_tuple(static_cast(backend_t::opencl), - reinterpret_cast(ocl_device.get()), 0); - break; - } - case backend_t::level0: { - device_id = std::tuple_cat( - std::make_tuple(static_cast(backend_t::level0)), - gpu::intel::sycl::get_device_uuid(dev)); - break; - } - case backend_t::unknown: assert(!"unknown backend"); break; - default: assert(!"unreachable"); - } - assert(std::get<0>(device_id) != static_cast(backend_t::unknown)); - return device_id; -} - -bool dev_ctx_consistency_check( - const ::sycl::device &dev, const ::sycl::context &ctx) { - auto ctx_devs = 
ctx.get_devices(); - - // Try to find the given device in the given context. - auto it = std::find_if(ctx_devs.begin(), ctx_devs.end(), - [&](const ::sycl::device &ctx_dev) { - return are_equal(ctx_dev, dev); - }); - // If found. - if (it != ctx_devs.end()) return true; - - // If not found and the given device is not a sub-device. - if (!is_subdevice(dev)) return false; - - // Try to find a parent device of the given sub-device in the given - // context. - while (is_subdevice(dev)) { - auto parent_dev = get_parent_device(dev); - it = std::find_if(ctx_devs.begin(), ctx_devs.end(), - [&](const ::sycl::device &ctx_dev) { - return are_equal(ctx_dev, parent_dev); - }); - // If found. - if (it != ctx_devs.end()) return true; - } - - return false; -} - -status_t check_device(engine_kind_t eng_kind, const ::sycl::device &dev, - const ::sycl::context &ctx) { - // Check device and context consistency. - VERROR_ENGINE(dev_ctx_consistency_check(dev, ctx), - status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); - - // Check engine kind and device consistency. - VERROR_ENGINE( - !(eng_kind == engine_kind::cpu && !dev.is_cpu() && !is_host(dev)), - status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); - VERROR_ENGINE(!(eng_kind == engine_kind::gpu && !dev.is_gpu()), - status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); - -#if !defined(DNNL_SYCL_CUDA) && !defined(DNNL_SYCL_HIP) - // Check that platform is an Intel platform. - VERROR_ENGINE(!(!is_host(dev) && !is_intel_platform(dev.get_platform())), - status::invalid_arguments, VERBOSE_INVALID_PLATFORM, "sycl", - "intel", - dev.get_platform() - .get_info<::sycl::info::platform::name>() - .c_str()); -#endif - return status::success; + auto sycl_local_range = ::sycl::range<3>( + local_range.ndims() >= 3 ? local_range[2] : 1, + local_range.ndims() >= 2 ? 
local_range[1] : 1, local_range[0]); + return ::sycl::nd_range<3>(sycl_global_range, sycl_local_range); } struct uuid2ocl_dev_t { @@ -230,8 +94,8 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { #error "cl_khr_device_uuid is required" #endif using namespace gpu::intel::compute; - assert(get_sycl_backend(dev) == backend_t::level0); - if (get_sycl_backend(dev) != backend_t::level0) + assert(hrt::sycl::get_backend(dev) == hrt::sycl::backend_t::level0); + if (hrt::sycl::get_backend(dev) != hrt::sycl::backend_t::level0) return status::runtime_error; static const uuid2ocl_dev_t uuid2ocl_dev = []() { @@ -284,21 +148,22 @@ static status_t create_ocl_engine( const ::sycl::device &sycl_dev, const ::sycl::context *sycl_ctx = nullptr) { gpu::intel::ocl::ocl_engine_factory_t f(engine_kind::gpu); - const auto backend = get_sycl_backend(sycl_dev); + const auto backend = hrt::sycl::get_backend(sycl_dev); // The SYCL context is always provided for OpenCL backend. - if (backend == backend_t::opencl && !sycl_ctx) return status::runtime_error; + if (backend == hrt::sycl::backend_t::opencl && !sycl_ctx) + return status::runtime_error; hrt::ocl::wrapper_t ocl_dev; hrt::ocl::wrapper_t ocl_ctx; switch (backend) { - case backend_t::opencl: + case hrt::sycl::backend_t::opencl: ocl_dev = hrt::ocl::make_wrapper( - compat::get_native(sycl_dev)); + hrt::sycl::compat::get_native(sycl_dev)); ocl_ctx = hrt::ocl::make_wrapper( - compat::get_native(*sycl_ctx)); + hrt::sycl::compat::get_native(*sycl_ctx)); break; - case backend_t::level0: { + case hrt::sycl::backend_t::level0: { cl_device_id d {nullptr}; CHECK(sycl_dev2ocl_dev(&d, sycl_dev)); ocl_dev = hrt::ocl::make_wrapper(d, true); @@ -323,7 +188,7 @@ static status_t create_ocl_engine( status_t create_ocl_engine( std::unique_ptr *ocl_engine, - const sycl_engine_base_t *engine) { + const impl::sycl::sycl_engine_base_t *engine) { const auto sycl_ctx = engine->context(); return create_ocl_engine(ocl_engine, 
engine->device(), &sycl_ctx); } @@ -332,8 +197,8 @@ status_t get_kernel_binary( const ::sycl::kernel &kernel, hrt::binary_t &binary) { auto devs = kernel.get_context().get_devices(); assert(!devs.empty()); - switch (get_sycl_backend(devs[0])) { - case backend_t::level0: { + switch (hrt::sycl::get_backend(devs[0])) { + case hrt::sycl::backend_t::level0: { auto bundle = kernel.get_kernel_bundle(); auto module_vec = ::sycl::get_native< ::sycl::backend::ext_oneapi_level_zero>(bundle); @@ -366,7 +231,7 @@ status_t get_kernel_binary( } return status::success; } - case backend_t::opencl: { + case hrt::sycl::backend_t::opencl: { auto ocl_kernel = ::sycl::get_native<::sycl::backend::opencl>(kernel); CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary)); @@ -377,5 +242,7 @@ status_t get_kernel_binary( } } // namespace sycl +} // namespace intel +} // namespace gpu } // namespace impl } // namespace dnnl diff --git a/src/gpu/intel/sycl/utils.hpp b/src/gpu/intel/sycl/utils.hpp new file mode 100644 index 00000000000..49a2787b1d6 --- /dev/null +++ b/src/gpu/intel/sycl/utils.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_INTEL_SYCL_UTILS_HPP +#define GPU_INTEL_SYCL_UTILS_HPP + +#include "gpu/intel/compute/utils.hpp" +#include "gpu/intel/ocl/ocl_gpu_engine.hpp" +#include "hrt/sycl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace sycl { +class sycl_engine_base_t; +} +} // namespace impl +} // namespace dnnl + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace sycl { + +::sycl::nd_range<3> to_sycl_nd_range( + const gpu::intel::compute::nd_range_t &range); + +status_t create_ocl_engine( + std::unique_ptr + *ocl_engine, + const impl::sycl::sycl_engine_base_t *engine); + +status_t get_kernel_binary(const ::sycl::kernel &kernel, hrt::binary_t &binary); + +status_t create_ocl_engine( + std::unique_ptr + *ocl_engine, + const impl::sycl::sycl_engine_base_t *engine); + +} // namespace sycl +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/nvidia/sycl_cuda_compat.hpp b/src/gpu/nvidia/sycl_cuda_compat.hpp index 475a82b2e5c..46738790b46 100644 --- a/src/gpu/nvidia/sycl_cuda_compat.hpp +++ b/src/gpu/nvidia/sycl_cuda_compat.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +20,7 @@ #include -#include "sycl/sycl_compat.hpp" +#include "hrt/sycl/compat.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp index 000b1c2b9ef..3c5cac94234 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.cpp +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 
2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,7 +18,7 @@ #include "common/impl_list_item.hpp" #include "common/utils.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" #include "gpu/nvidia/cudnn_batch_normalization.hpp" #include "gpu/nvidia/cudnn_binary.hpp" @@ -147,7 +147,7 @@ cublasHandle_t *sycl_cuda_engine_t::get_cublas_handle() { } device_id_t sycl_cuda_engine_t::device_id() const { - return device_id_t(static_cast(impl::sycl::backend_t::nvidia), + return device_id_t(static_cast(hrt::sycl::backend_t::nvidia), static_cast(compat::get_native(device())), static_cast(0)); } diff --git a/src/gpu/nvidia/sycl_cuda_utils.hpp b/src/gpu/nvidia/sycl_cuda_utils.hpp index 395063021eb..26009db58b7 100644 --- a/src/gpu/nvidia/sycl_cuda_utils.hpp +++ b/src/gpu/nvidia/sycl_cuda_utils.hpp @@ -30,7 +30,7 @@ #include "common/primitive_attr.hpp" #include "common/z_magic.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" #include "gpu/nvidia/sycl_cuda_compat.hpp" @@ -325,7 +325,7 @@ ::sycl::event copy(::sycl::queue &q, T *src, ::sycl::buffer &dst) { auto event = q.submit([&, src](::sycl::handler &cgh) { // Retrieve a write accessor to a global buffer auto acc = dst.template get_access<::sycl::access::mode::write, - impl::sycl::compat::target_device>(cgh); + hrt::sycl::compat::target_device>(cgh); // Copy from the input pointer into the buffer associated with the // accessor cgh.copy(src, acc); @@ -339,7 +339,7 @@ ::sycl::event copy(::sycl::queue &q, ::sycl::buffer &src, T *dst) { auto event = q.submit([&, dst](::sycl::handler &cgh) { // Retrieve a read accessor to a global buffer auto acc = src.template get_access<::sycl::access::mode::read, - impl::sycl::compat::target_device>(cgh); + hrt::sycl::compat::target_device>(cgh); // Copy from the buffer associated with the accessor into the output // pointer cgh.copy(acc, dst); diff --git 
a/src/gpu/sycl/sycl_gpu_engine.hpp b/src/gpu/sycl/sycl_gpu_engine.hpp index 2543d42ad21..d939493a5ee 100644 --- a/src/gpu/sycl/sycl_gpu_engine.hpp +++ b/src/gpu/sycl/sycl_gpu_engine.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,8 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "common/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "sycl/sycl_engine_base.hpp" -#include "sycl/sycl_utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/sycl_gpu_kernel.hpp b/src/gpu/sycl/sycl_gpu_kernel.hpp index f8d34f28055..fd0bc0d0e95 100644 --- a/src/gpu/sycl/sycl_gpu_kernel.hpp +++ b/src/gpu/sycl/sycl_gpu_kernel.hpp @@ -18,14 +18,16 @@ #define GPU_SYCL_SYCL_GPU_KERNEL_HPP #include "common/utils.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" + +#include "gpu/intel/compute/kernel.hpp" namespace dnnl { namespace impl { namespace gpu { namespace sycl { -struct sycl_gpu_kernel_t : public intel::compute::kernel_impl_t { +struct sycl_gpu_kernel_t : public gpu::intel::compute::kernel_impl_t { using kernel_bundle_e_t = ::sycl::kernel_bundle<::sycl::bundle_state::executable>; diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index f93340412f4..022fbf5ded8 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp @@ -22,11 +22,11 @@ #include "gpu/intel/ocl/stream_profiler.hpp" #include "gpu/intel/ocl/types_interop.hpp" #include "gpu/intel/sycl/l0/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/utils.hpp" #include "hrt/utils.hpp" #include "sycl/sycl_c_types_map.hpp" #include "sycl/sycl_stream.hpp" -#include "sycl/sycl_utils.hpp" 
namespace dnnl { namespace impl { @@ -100,7 +100,8 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, // XXX: DPCPP/L0 does not support non-uniform work-groups and does not // provide any diagnostics. This is to catch potential issues on oneDNN // side. - if (sycl_engine->backend() == backend_t::level0 && range.local_range()) { + if (sycl_engine->backend() == hrt::sycl::backend_t::level0 + && range.local_range()) { for (size_t i = 0; i < range.ndims(); i++) { size_t gws = range.global_range()[i]; size_t lws = range.local_range()[i]; @@ -148,7 +149,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, cgh.set_arg((int)i, nullptr); } } else if (arg.is_local()) { - auto acc = compat::local_accessor( + auto acc = hrt::sycl::compat::local_accessor( ::sycl::range<1>(arg.size()), cgh); cgh.set_arg((int)i, acc); } else { @@ -156,7 +157,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, } } if (range.local_range()) { - auto sycl_nd_range = to_sycl_nd_range(range); + auto sycl_nd_range = gpu::intel::sycl::to_sycl_nd_range(range); cgh.parallel_for(sycl_nd_range, *sycl_kernel_); } else { const auto &global_range = range.global_range(); @@ -180,7 +181,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, status_t sycl_interop_gpu_kernel_t::dump() const { hrt::binary_t binary; - CHECK(get_kernel_binary(sycl_kernel(), binary)); + CHECK(gpu::intel::sycl::get_kernel_binary(sycl_kernel(), binary)); return gpu::intel::gpu_utils::dump_kernel_binary(binary, name()); } diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.hpp b/src/gpu/sycl/sycl_interop_gpu_kernel.hpp index 80957ab15c0..0304229b785 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.hpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.hpp @@ -19,7 +19,7 @@ #include -#include "sycl/sycl_utils.hpp" +#include "gpu/intel/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/sycl_math_utils.hpp b/src/gpu/sycl/sycl_math_utils.hpp index 
04b407fb278..234212b0153 100644 --- a/src/gpu/sycl/sycl_math_utils.hpp +++ b/src/gpu/sycl/sycl_math_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include "common/c_types_map.hpp" #include "common/math_utils.hpp" #include "common/utils.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/sycl_q10n.hpp b/src/gpu/sycl/sycl_q10n.hpp index bf0e106d924..d9910ea68d6 100644 --- a/src/gpu/sycl/sycl_q10n.hpp +++ b/src/gpu/sycl/sycl_q10n.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "common/math_utils.hpp" #include "common/utils.hpp" #include "gpu/sycl/sycl_types.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/sycl_types.hpp b/src/gpu/sycl/sycl_types.hpp index 6bd3a065fbd..236c90fce46 100644 --- a/src/gpu/sycl/sycl_types.hpp +++ b/src/gpu/sycl/sycl_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,8 @@ #include "common/c_types_map.hpp" #include "common/utils.hpp" -#include "sycl/sycl_compat.hpp" -#include "sycl/sycl_utils.hpp" +#include "hrt/sycl/compat.hpp" +#include "hrt/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/hrt/CMakeLists.txt b/src/hrt/CMakeLists.txt index fb6323f6309..0b470c61010 100644 --- a/src/hrt/CMakeLists.txt +++ b/src/hrt/CMakeLists.txt @@ -21,9 +21,15 @@ file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) +if(DNNL_WITH_SYCL) + add_subdirectory(sycl) +endif() + +# TODO: When the dependency of nvidia and amd on intel vendor is removed the +# `ocl` directory will have to be added only for the intel vendor. add_subdirectory(ocl) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_common_hrt) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) diff --git a/src/hrt/sycl/CMakeLists.txt b/src/hrt/sycl/CMakeLists.txt new file mode 100644 index 00000000000..4c4bdb95e2c --- /dev/null +++ b/src/hrt/sycl/CMakeLists.txt @@ -0,0 +1,27 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) + +set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt_sycl) +add_library(${OBJ_LIB} OBJECT ${SOURCES}) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/hrt/sycl/compat.cpp b/src/hrt/sycl/compat.cpp new file mode 100644 index 00000000000..8aada3971a8 --- /dev/null +++ b/src/hrt/sycl/compat.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +// TODO: Move backend specific code to gpu/intel/sycl +#if __has_include() +#include +#elif __has_include() +#include +#else +#error "Unsupported compiler" +#endif + +#include + +#include "hrt/sycl/compat.hpp" +#include "hrt/sycl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace hrt { +namespace sycl { + +namespace { +template +void *get_native_impl(backend_t backend, const sycl_object_t &sycl_object) { + if (backend == backend_t::opencl) { + return ::sycl::get_native<::sycl::backend::opencl>(sycl_object); + } else if (backend == backend_t::level0) { + return ::sycl::get_native<::sycl::backend::ext_oneapi_level_zero>( + sycl_object); + } else { + assert(!"unexpected"); + return nullptr; + } + return nullptr; +} + +} // namespace + +namespace compat { + +void *get_native(const ::sycl::device &dev) { + auto backend = get_backend(dev); + return get_native_impl(backend, dev); +} + +void *get_native(const ::sycl::context &ctx) { + auto devices = ctx.get_devices(); + assert(!devices.empty()); + if (devices.empty()) return nullptr; + // backend is expected to be the same for all devices in a context. + auto backend = get_backend(devices[0]); + return get_native_impl(backend, ctx); +} + +} // namespace compat +} // namespace sycl +} // namespace hrt +} // namespace impl +} // namespace dnnl diff --git a/src/sycl/sycl_compat.hpp b/src/hrt/sycl/compat.hpp similarity index 88% rename from src/sycl/sycl_compat.hpp rename to src/hrt/sycl/compat.hpp index 14f132936f1..bdebed7656b 100644 --- a/src/sycl/sycl_compat.hpp +++ b/src/hrt/sycl/compat.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,23 +14,20 @@ * limitations under the License. *******************************************************************************/ -#ifndef SYCL_SYCL_COMPAT_HPP -#define SYCL_SYCL_COMPAT_HPP +#ifndef COMMON_HRT_SYCL_COMPAT_HPP +#define COMMON_HRT_SYCL_COMPAT_HPP -#include "sycl/sycl_utils.hpp" +// This file contains a common SYCL compatibility layer. All vendor specific +// SYCL code that requires compatibility must reside in the vendor directories. + +#include "hrt/sycl/utils.hpp" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { - -class sycl_engine_base_t; - namespace compat { -status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, - const sycl_engine_base_t *sycl_engine, const hrt::binary_t &binary, - const char *kernel_name); - void *get_native(const ::sycl::device &dev); void *get_native(const ::sycl::context &ctx); @@ -59,8 +56,6 @@ inline void host_task(H &cgh, F &&f) { host_task_impl(cgh, f, 0); } -uint64_t init_extensions(const ::sycl::device &dev); - constexpr auto target_device = ::sycl::target::device; #if DNNL_USE_SYCL121_API @@ -95,6 +90,7 @@ inline const auto &gpu_selector_v = ::sycl::gpu_selector_v; } // namespace compat } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/utils.cpp b/src/hrt/sycl/utils.cpp new file mode 100644 index 00000000000..a9800bc9944 --- /dev/null +++ b/src/hrt/sycl/utils.cpp @@ -0,0 +1,348 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "hrt/sycl/utils.hpp" +#include "hrt/sycl/compat.hpp" + +// TODO: Include only for GPU vendor intel. +#include "gpu/intel/sycl/l0/utils.hpp" + +// TODO: Refactor build system for NVIDIA and AMD parts to enable them properly +// to be able to include their utility headers here. +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +// Do not include sycl_cuda_utils.hpp because it's intended for use in +// gpu/nvidia directory only. + +namespace dnnl { +namespace impl { +namespace gpu { +namespace nvidia { +bool compare_cuda_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); +} +} // namespace gpu +} // namespace impl +} // namespace dnnl +#endif + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +// Do not include sycl_hip_utils.hpp because it's intended for use in +// gpu/amd directory only. 
+namespace dnnl { +namespace impl { +namespace gpu { +namespace amd { +bool compare_hip_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); +} +} // namespace gpu +} // namespace impl +} // namespace dnnl +#endif + +namespace dnnl { +namespace impl { +namespace hrt { +namespace sycl { + +std::string to_string(backend_t backend) { + switch (backend) { + case backend_t::host: return "Host"; + case backend_t::level0: return "Level Zero"; + case backend_t::opencl: return "OpenCL"; + case backend_t::nvidia: return "Nvidia"; + case backend_t::amd: return "AMD"; + default: return "Unknown"; + } +} + +std::string to_string(::sycl::info::device_type dev_type) { + using namespace ::sycl::info; + switch (dev_type) { + case device_type::cpu: return "cpu"; + case device_type::gpu: return "gpu"; + case device_type::accelerator: return "accelerator"; + case device_type::custom: return "custom"; + case device_type::automatic: return "automatic"; + case device_type::host: return "host"; + case device_type::all: return "all"; + default: return "unknown"; + } +} + +backend_t get_gpu_backend() { + // Create default GPU device and query its backend (assumed as default) + static backend_t default_backend = []() { + const backend_t fallback = backend_t::opencl; + + const auto gpu_type = ::sycl::info::device_type::gpu; + if (::sycl::device::get_devices(gpu_type).empty()) return fallback; + + ::sycl::device dev {compat::gpu_selector_v}; + const auto backend = get_backend(dev); + + return backend; + }(); + + return default_backend; +} + +bool is_host(const ::sycl::device &dev) { + return dev.get_info<::sycl::info::device::device_type>() + == ::sycl::info::device_type::host; +} + +bool is_host(const ::sycl::platform &plat) { + auto devices = plat.get_devices(); + if (devices.size() != 1) return false; + return is_host(devices[0]); +} + +backend_t get_backend(const ::sycl::device &dev) { + if (is_host(dev)) return backend_t::host; + + auto plat = dev.get_platform(); + std::string 
plat_name = plat.get_info<::sycl::info::platform::name>(); + if (plat_name.find("OpenCL") != std::string::npos) return backend_t::opencl; + if (plat_name.find("NVIDIA") != std::string::npos) return backend_t::nvidia; + if (plat_name.find("AMD") != std::string::npos) return backend_t::amd; + if (plat_name.find("Level-Zero") != std::string::npos) + return backend_t::level0; + + return backend_t::unknown; +} + +bool is_intel_platform(const ::sycl::platform &plat) { + std::string plat_name = plat.get_info<::sycl::info::platform::name>(); + return plat_name.find("Intel") != std::string::npos; +} + +bool is_subdevice(const ::sycl::device &dev) { + return dev.get_info<::sycl::info::device::partition_type_property>() + != ::sycl::info::partition_property::no_partition; +} + +::sycl::device get_root_device(const ::sycl::device &dev) { + // Search for the top level device. + auto parent_device = dev; + while (is_subdevice(parent_device)) { + parent_device + = parent_device.get_info<::sycl::info::device::parent_device>(); + } + return parent_device; +} + +::sycl::device get_parent_device(const ::sycl::device &dev) { + return dev.get_info<::sycl::info::device::parent_device>(); +} + +bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs) { + auto lhs_be = get_backend(lhs); + auto rhs_be = get_backend(rhs); + if (lhs_be != rhs_be) return false; + + // Only one host device exists. + if (lhs_be == backend_t::host) return true; + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL + if (lhs_be == backend_t::opencl) { + // Use wrapper objects to avoid memory leak. 
+ auto lhs_ocl_handle = compat::get_native(lhs); + auto rhs_ocl_handle = compat::get_native(rhs); + return lhs_ocl_handle == rhs_ocl_handle; + } + + if (lhs_be == backend_t::level0) { + return gpu::intel::sycl::compare_ze_devices(lhs, rhs); + } +#endif + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA + if (lhs_be == backend_t::nvidia) { + return gpu::nvidia::compare_cuda_devices(lhs, rhs); + } +#endif + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD + if (lhs_be == backend_t::amd) { + return gpu::amd::compare_hip_devices(lhs, rhs); + } +#endif + assert(!"not expected"); + return false; +} + +device_id_t device_id(const ::sycl::device &dev) { + if (is_host(dev)) + return std::make_tuple(static_cast(backend_t::host), 0, 0); + + device_id_t device_id + = device_id_t {static_cast(backend_t::unknown), 0, 0}; + switch (get_backend(dev)) { + case backend_t::opencl: { + auto ocl_device = hrt::ocl::make_wrapper( + compat::get_native(dev)); + device_id = std::make_tuple(static_cast(backend_t::opencl), + reinterpret_cast(ocl_device.get()), 0); + break; + } + case backend_t::level0: { + device_id = std::tuple_cat( + std::make_tuple(static_cast(backend_t::level0)), + gpu::intel::sycl::get_device_uuid(dev)); + break; + } + case backend_t::unknown: assert(!"unknown backend"); break; + default: assert(!"unreachable"); + } + assert(std::get<0>(device_id) != static_cast(backend_t::unknown)); + return device_id; +} + +bool dev_ctx_consistency_check( + const ::sycl::device &dev, const ::sycl::context &ctx) { + auto ctx_devs = ctx.get_devices(); + + // Try to find the given device in the given context. + auto it = std::find_if(ctx_devs.begin(), ctx_devs.end(), + [&](const ::sycl::device &ctx_dev) { + return are_equal(ctx_dev, dev); + }); + // If found. + if (it != ctx_devs.end()) return true; + + // If not found and the given device is not a sub-device. + if (!is_subdevice(dev)) return false; + + // Try to find a parent device of the given sub-device in the given + // context. 
+ while (is_subdevice(dev)) { + auto parent_dev = get_parent_device(dev); + it = std::find_if(ctx_devs.begin(), ctx_devs.end(), + [&](const ::sycl::device &ctx_dev) { + return are_equal(ctx_dev, parent_dev); + }); + // If found. + if (it != ctx_devs.end()) return true; + } + + return false; +} + +status_t check_device(engine_kind_t eng_kind, const ::sycl::device &dev, + const ::sycl::context &ctx) { + // Check device and context consistency. + VERROR_ENGINE(dev_ctx_consistency_check(dev, ctx), + status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); + + // Check engine kind and device consistency. + VERROR_ENGINE( + !(eng_kind == engine_kind::cpu && !dev.is_cpu() && !is_host(dev)), + status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); + VERROR_ENGINE(!(eng_kind == engine_kind::gpu && !dev.is_gpu()), + status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); + +#if !defined(DNNL_SYCL_CUDA) && !defined(DNNL_SYCL_HIP) + // Check that platform is an Intel platform. + VERROR_ENGINE(!(!is_host(dev) && !is_intel_platform(dev.get_platform())), + status::invalid_arguments, VERBOSE_INVALID_PLATFORM, "sycl", + "intel", + dev.get_platform() + .get_info<::sycl::info::platform::name>() + .c_str()); +#endif + return status::success; +} + +bool is_intel_device(const ::sycl::device &dev) { + const int intel_vendor_id = 0x8086; + auto vendor_id = dev.get_info<::sycl::info::device::vendor_id>(); + return vendor_id == intel_vendor_id; +} + +std::vector<::sycl::device> get_devices( + ::sycl::info::device_type dev_type, backend_t backend) { + const uint32_t intel_vendor_id = 0x8086; +#ifdef DNNL_SYCL_CUDA + const uint32_t vendor_id + = ((dev_type == ::sycl::info::device_type::gpu) ? 0x10DE + : intel_vendor_id); +#elif defined(DNNL_SYCL_HIP) + const uint32_t vendor_id + = ((dev_type == ::sycl::info::device_type::gpu) ? 0x1002 + : intel_vendor_id); +#else + const uint32_t vendor_id = intel_vendor_id; +#endif + auto gpu_backend + = backend == backend_t::unknown ? 
get_gpu_backend() : backend; + + std::vector<::sycl::device> devices; + auto platforms = ::sycl::platform::get_platforms(); + + for (const auto &p : platforms) { +#if !defined(DNNL_SYCL_CUDA) && !defined(DNNL_SYCL_HIP) + if (!is_host(p) && !is_intel_platform(p)) continue; +#endif + auto p_devices = p.get_devices(dev_type); + devices.insert(devices.end(), p_devices.begin(), p_devices.end()); + } + + devices.erase(std::remove_if(devices.begin(), devices.end(), + [=](const ::sycl::device &dev) { + auto _vendor_id = dev.get_info< + ::sycl::info::device::vendor_id>(); + if (_vendor_id != vendor_id) return true; + + auto _dev_type = dev.get_info< + ::sycl::info::device::device_type>(); + if (_dev_type != dev_type) return true; + + if (dev_type == ::sycl::info::device_type::gpu) { + auto _backend = get_backend(dev); + if (_backend == backend_t::unknown + || _backend != gpu_backend) + return true; + } + + return false; + }), + devices.end()); + return devices; +} + +status_t get_device_index(size_t *index, const ::sycl::device &dev) { + auto dev_type = dev.get_info<::sycl::info::device::device_type>(); + auto backend = get_backend(dev); + auto devices = get_devices(dev_type, backend); + + // Find the top level device in the list + auto it = std::find(devices.begin(), devices.end(), get_root_device(dev)); + if (it != devices.end()) { + *index = it - devices.begin(); + return status::success; + } else { + *index = SIZE_MAX; + // TODO: remove this work around once Level-Zero is fixed + if (backend == backend_t::level0) return status::success; + VERROR_ENGINE(false, status::invalid_arguments, + VERBOSE_INVALID_ENGINE_IDX, SIZE_MAX, + to_string(dev_type).c_str(), devices.size()); + } +} + +} // namespace sycl +} // namespace hrt +} // namespace impl +} // namespace dnnl diff --git a/src/hrt/sycl/utils.hpp b/src/hrt/sycl/utils.hpp new file mode 100644 index 00000000000..af2b6934acd --- /dev/null +++ b/src/hrt/sycl/utils.hpp @@ -0,0 +1,92 @@ 
+/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef HRT_SYCL_UTILS_HPP +#define HRT_SYCL_UTILS_HPP + +#include "common/c_types_map.hpp" +#include "common/utils.hpp" + +#include "hrt/utils.hpp" + +#if __has_include() +#include +#elif __has_include() +#include +#else +#error "Unsupported compiler" +#endif + +#if defined(__INTEL_LLVM_COMPILER) +#if (__INTEL_LLVM_COMPILER < 20230000) +#define DNNL_USE_SYCL121_API 1 +#else +#define DNNL_USE_SYCL121_API 0 +#endif +#elif defined(__LIBSYCL_MAJOR_VERSION) +#if (__LIBSYCL_MAJOR_VERSION < 6) +#define DNNL_USE_SYCL121_API 1 +#else +#define DNNL_USE_SYCL121_API 0 +#endif +#else +#error "Unsupported compiler" +#endif + +namespace dnnl { +namespace impl { +namespace hrt { +namespace sycl { + +using buffer_u8_t = ::sycl::buffer; + +enum class backend_t { unknown, host, level0, opencl, nvidia, amd }; + +std::string to_string(backend_t backend); +std::string to_string(::sycl::info::device_type dev_type); +backend_t get_gpu_backend(); + +bool is_host(const ::sycl::device &dev); +bool is_host(const ::sycl::platform &plat); +backend_t get_backend(const ::sycl::device &dev); +bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs); + +status_t check_device(engine_kind_t eng_kind, const ::sycl::device &dev, + const 
::sycl::context &ctx); + +device_id_t device_id(const ::sycl::device &dev); +bool dev_ctx_consistency_check( + const ::sycl::device &dev, const ::sycl::context &ctx); + +bool is_intel_device(const ::sycl::device &dev); +bool is_intel_platform(const ::sycl::platform &plat); + +bool is_subdevice(const ::sycl::device &dev); + +::sycl::device get_root_device(const ::sycl::device &dev); +::sycl::device get_parent_device(const ::sycl::device &dev); + +std::vector<::sycl::device> get_devices(::sycl::info::device_type dev_type, + backend_t backend = backend_t::unknown); + +status_t get_device_index(size_t *index, const ::sycl::device &dev); + +} // namespace sycl +} // namespace hrt +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/sycl/capi/capi_engine.cpp b/src/sycl/capi/capi_engine.cpp index 0eef46ae6cb..15cb9eb0a27 100644 --- a/src/sycl/capi/capi_engine.cpp +++ b/src/sycl/capi/capi_engine.cpp @@ -19,8 +19,8 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "common/utils.hpp" +#include "hrt/sycl/utils.hpp" #include "sycl/sycl_engine.hpp" -#include "sycl/sycl_utils.hpp" using dnnl::impl::engine_t; using dnnl::impl::status_t; @@ -37,7 +37,7 @@ status_t dnnl_sycl_interop_engine_create( engine_kind_t kind; if (sycl_dev.is_gpu()) kind = engine_kind::gpu; - else if (sycl_dev.is_cpu() || dnnl::impl::sycl::is_host(sycl_dev)) + else if (sycl_dev.is_cpu() || dnnl::impl::hrt::sycl::is_host(sycl_dev)) kind = engine_kind::cpu; else VERROR_ENGINE( @@ -52,7 +52,7 @@ status_t dnnl_sycl_interop_engine_create( VERROR_ENGINE(ef, status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); size_t index; - CHECK(dnnl::impl::sycl::get_sycl_device_index(&index, sycl_dev)); + CHECK(dnnl::impl::hrt::sycl::get_device_index(&index, sycl_dev)); return ef->engine_create(engine, sycl_dev, sycl_ctx, index); } diff --git a/src/sycl/stream_profiler.cpp b/src/sycl/stream_profiler.cpp index 929aaccf1de..df754a27f36 100644 --- a/src/sycl/stream_profiler.cpp +++ 
b/src/sycl/stream_profiler.cpp @@ -21,8 +21,8 @@ #include "common/c_types_map.hpp" #include "common/utils.hpp" +#include "hrt/sycl/utils.hpp" #include "sycl/sycl_stream.hpp" -#include "sycl/sycl_utils.hpp" namespace dnnl { namespace impl { diff --git a/src/sycl/sycl_buffer_memory_storage.cpp b/src/sycl/sycl_buffer_memory_storage.cpp index 9eb80290fba..4fd8c442379 100644 --- a/src/sycl/sycl_buffer_memory_storage.cpp +++ b/src/sycl/sycl_buffer_memory_storage.cpp @@ -92,12 +92,13 @@ std::unique_ptr sycl_buffer_memory_storage_t::get_sub_storage( storage->buffer_ = buffer_; } else { gpu_assert(IMPLICATION( - is_intel_device( + hrt::sycl::is_intel_device( utils::downcast(engine()) ->device()), offset % gpu::intel::ocl::OCL_BUFFER_ALIGNMENT == 0)); - buffer_u8_t *sub_buffer = buffer_ - ? new buffer_u8_t(parent_buffer(), base_offset_ + offset, size) + hrt::sycl::buffer_u8_t *sub_buffer = buffer_ + ? new hrt::sycl::buffer_u8_t( + parent_buffer(), base_offset_ + offset, size) : nullptr; storage->buffer_.reset(sub_buffer); storage->base_offset_ = base_offset_ + offset; @@ -126,12 +127,12 @@ status_t sycl_buffer_memory_storage_t::init_allocate(size_t size) { return status::out_of_memory; } - buffer_ = std::make_shared(::sycl::range<1>(size)); + buffer_ = std::make_shared(::sycl::range<1>(size)); if (!buffer_) return status::out_of_memory; return status::success; } -buffer_u8_t &sycl_buffer_memory_storage_t::parent_buffer() const { +hrt::sycl::buffer_u8_t &sycl_buffer_memory_storage_t::parent_buffer() const { return utils::downcast( parent_storage()) ->buffer(); diff --git a/src/sycl/sycl_buffer_memory_storage.hpp b/src/sycl/sycl_buffer_memory_storage.hpp index 58c1a891cbd..95ba98c955c 100644 --- a/src/sycl/sycl_buffer_memory_storage.hpp +++ b/src/sycl/sycl_buffer_memory_storage.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under 
the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,9 +22,9 @@ #include "common/c_types_map.hpp" #include "common/memory_storage.hpp" #include "common/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "sycl/sycl_c_types_map.hpp" #include "sycl/sycl_memory_storage_base.hpp" -#include "sycl/sycl_utils.hpp" namespace dnnl { namespace impl { @@ -37,7 +37,7 @@ class sycl_buffer_memory_storage_t : public sycl_memory_storage_base_t { sycl_buffer_memory_storage_t( engine_t *engine, const memory_storage_t *parent_storage); - buffer_u8_t &buffer() const { return *buffer_; } + hrt::sycl::buffer_u8_t &buffer() const { return *buffer_; } memory_kind_t memory_kind() const override { return memory_kind::buffer; } @@ -49,8 +49,8 @@ class sycl_buffer_memory_storage_t : public sycl_memory_storage_base_t { status_t set_data_handle(void *handle) override { if (!handle) return status::success; - auto *buf_u8_ptr = static_cast(handle); - buffer_.reset(new buffer_u8_t(*buf_u8_ptr)); + auto *buf_u8_ptr = static_cast(handle); + buffer_.reset(new hrt::sycl::buffer_u8_t(*buf_u8_ptr)); return status::success; } @@ -78,9 +78,9 @@ class sycl_buffer_memory_storage_t : public sycl_memory_storage_base_t { status_t init_allocate(size_t size) override; private: - buffer_u8_t &parent_buffer() const; + hrt::sycl::buffer_u8_t &parent_buffer() const; - std::shared_ptr buffer_; + std::shared_ptr buffer_; size_t base_offset_ = 0; }; diff --git a/src/sycl/sycl_cpu_engine.hpp b/src/sycl/sycl_cpu_engine.hpp index d6bbe7d4bce..1283916668b 100644 --- a/src/sycl/sycl_cpu_engine.hpp +++ b/src/sycl/sycl_cpu_engine.hpp @@ -20,8 +20,8 @@ #include "common/impl_list_item.hpp" #include "cpu/cpu_engine.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "sycl/sycl_engine_base.hpp" -#include "sycl/sycl_utils.hpp" namespace dnnl { namespace impl { @@ -36,7 +36,7 @@ class sycl_cpu_engine_t : public 
sycl_engine_base_t { sycl_cpu_engine_t( const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) : sycl_engine_base_t(engine_kind::cpu, dev, ctx, index) { - assert(dev.is_cpu() || is_host(dev)); + assert(dev.is_cpu() || hrt::sycl::is_host(dev)); } status_t create_memory_storage(memory_storage_t **storage, unsigned flags, diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index 83fe676ee94..e426c36dbf7 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -15,12 +15,12 @@ *******************************************************************************/ #include "sycl/sycl_device_info.hpp" -#include "sycl/sycl_compat.hpp" -#include "sycl/sycl_utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_hw_info.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" +#include "gpu/intel/sycl/compat.hpp" #include "gpu/sycl/sycl_gpu_engine.hpp" #include "cpu/platform.hpp" @@ -37,13 +37,13 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { if (!device.is_gpu()) return status::success; // skip other vendors - if (!is_intel_device(device)) return status::success; + if (!hrt::sycl::is_intel_device(device)) return status::success; - backend_t be = get_sycl_backend(device); - if (be == backend_t::opencl) { + auto be = hrt::sycl::get_backend(device); + if (be == hrt::sycl::backend_t::opencl) { cl_int err = CL_SUCCESS; - auto ocl_dev = compat::get_native(device); + auto ocl_dev = hrt::sycl::compat::get_native(device); auto ocl_dev_wrapper = hrt::ocl::make_wrapper(ocl_dev); auto ocl_ctx_wrapper = hrt::ocl::make_wrapper( @@ -53,7 +53,7 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { gpu::intel::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper, ocl_ctx_wrapper, gpu_arch_, stepping_id_, native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); - } else if (be == backend_t::level0) { + } else if (be == hrt::sycl::backend_t::level0) { // TODO: add 
support for L0 binary ngen check // XXX: query from ocl_engine for now gpu::intel::ocl::ocl_engine_factory_t f(engine_kind::gpu); @@ -105,7 +105,7 @@ status_t sycl_device_info_t::init_extensions(engine_t *engine) { auto &device = utils::downcast(engine)->device(); - extensions_ = compat::init_extensions(device); + extensions_ = gpu::intel::sycl::compat::init_extensions(device); // Handle future extensions, not yet supported by the DPC++ API extensions_ @@ -117,18 +117,20 @@ status_t sycl_device_info_t::init_extensions(engine_t *engine) { status_t sycl_device_info_t::init_attributes(engine_t *engine) { auto &device = utils::downcast(engine)->device(); - if (device.is_gpu() && is_intel_device(device)) { - backend_t be = get_sycl_backend(device); - if (be == backend_t::opencl) { + if (device.is_gpu() && hrt::sycl::is_intel_device(device)) { + hrt::sycl::backend_t be = hrt::sycl::get_backend(device); + if (be == hrt::sycl::backend_t::opencl) { // XXX: OpenCL backend get_info() queries below are not yet // supported so query OpenCL directly. - cl_device_id ocl_dev = compat::get_native(device); + cl_device_id ocl_dev + = hrt::sycl::compat::get_native(device); CHECK(gpu::intel::ocl::get_ocl_device_eu_count( ocl_dev, gpu_arch_, &eu_count_)); } else { - auto slices = device.get_info(); + auto slices = device.get_info< + hrt::sycl::compat::ext_intel_gpu_slices>(); auto sub_slices = device.get_info< - compat::ext_intel_gpu_subslices_per_slice>(); + hrt::sycl::compat::ext_intel_gpu_subslices_per_slice>(); auto eus_per_subslice = device.get_info<::sycl::info::device:: ext_intel_gpu_eu_count_per_subslice>(); if (gpu_arch_ == gpu::intel::compute::gpu_arch_t::xe2) diff --git a/src/sycl/sycl_engine.cpp b/src/sycl/sycl_engine.cpp index 2194fa59f56..1c73b597c03 100644 --- a/src/sycl/sycl_engine.cpp +++ b/src/sycl/sycl_engine.cpp @@ -31,7 +31,7 @@ status_t sycl_engine_factory_t::engine_create( auto dev_type = (engine_kind_ == engine_kind::cpu) ? 
::sycl::info::device_type::cpu : ::sycl::info::device_type::gpu; - auto devices = get_sycl_devices(dev_type); + auto devices = hrt::sycl::get_devices(dev_type); auto &dev = devices[index]; auto exception_handler = [](const ::sycl::exception_list &eptr_list) { @@ -61,7 +61,7 @@ status_t sycl_engine_factory_t::engine_create(engine_t **engine, const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) const { // Validate device and context. - VERROR_ENGINE(dev_ctx_consistency_check(dev, ctx), + VERROR_ENGINE(hrt::sycl::dev_ctx_consistency_check(dev, ctx), status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); #ifdef DNNL_SYCL_CUDA @@ -76,7 +76,7 @@ status_t sycl_engine_factory_t::engine_create(engine_t **engine, engine, engine_kind_, dev, ctx, index); #endif VERROR_ENGINE(!(engine_kind_ == engine_kind::cpu && !dev.is_cpu() - && !is_host(dev)), + && !hrt::sycl::is_host(dev)), status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); VERROR_ENGINE(!(engine_kind_ == engine_kind::gpu && !dev.is_gpu()), status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); diff --git a/src/sycl/sycl_engine.hpp b/src/sycl/sycl_engine.hpp index 841126d7b74..135964afa26 100644 --- a/src/sycl/sycl_engine.hpp +++ b/src/sycl/sycl_engine.hpp @@ -27,8 +27,8 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "common/utils.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "gpu/sycl/sycl_gpu_engine.hpp" -#include "sycl/sycl_utils.hpp" #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE #include "sycl/sycl_cpu_engine.hpp" @@ -67,80 +67,6 @@ status_t hip_engine_create(engine_t **engine, engine_kind_t engine_kind, namespace sycl { -inline std::vector<::sycl::device> get_sycl_devices( - ::sycl::info::device_type dev_type, - backend_t backend = backend_t::unknown) { - const uint32_t intel_vendor_id = 0x8086; -#ifdef DNNL_SYCL_CUDA - const uint32_t vendor_id - = ((dev_type == ::sycl::info::device_type::gpu) ? 
0x10DE - : intel_vendor_id); -#elif defined(DNNL_SYCL_HIP) - const uint32_t vendor_id - = ((dev_type == ::sycl::info::device_type::gpu) ? 0x1002 - : intel_vendor_id); -#else - const uint32_t vendor_id = intel_vendor_id; -#endif - auto gpu_backend - = backend == backend_t::unknown ? get_sycl_gpu_backend() : backend; - - std::vector<::sycl::device> devices; - auto platforms = ::sycl::platform::get_platforms(); - - for (const auto &p : platforms) { -#if !defined(DNNL_SYCL_CUDA) && !defined(DNNL_SYCL_HIP) - if (!is_host(p) && !is_intel_platform(p)) continue; -#endif - auto p_devices = p.get_devices(dev_type); - devices.insert(devices.end(), p_devices.begin(), p_devices.end()); - } - - devices.erase(std::remove_if(devices.begin(), devices.end(), - [=](const ::sycl::device &dev) { - auto _vendor_id = dev.get_info< - ::sycl::info::device::vendor_id>(); - if (_vendor_id != vendor_id) return true; - - auto _dev_type = dev.get_info< - ::sycl::info::device::device_type>(); - if (_dev_type != dev_type) return true; - - if (dev_type == ::sycl::info::device_type::gpu) { - auto _backend = get_sycl_backend(dev); - if (_backend == backend_t::unknown - || _backend != gpu_backend) - return true; - } - - return false; - }), - devices.end()); - return devices; -} - -inline status_t get_sycl_device_index( - size_t *index, const ::sycl::device &dev) { - auto dev_type = dev.get_info<::sycl::info::device::device_type>(); - auto backend = get_sycl_backend(dev); - auto devices = get_sycl_devices(dev_type, backend); - - // Find the top level device in the list - auto it = std::find( - devices.begin(), devices.end(), get_main_parent_device(dev)); - if (it != devices.end()) { - *index = it - devices.begin(); - return status::success; - } else { - *index = SIZE_MAX; - // TODO: remove this work around once Level-Zero is fixed - if (backend == backend_t::level0) return status::success; - VERROR_ENGINE(false, status::invalid_arguments, - VERBOSE_INVALID_ENGINE_IDX, SIZE_MAX, - 
to_string(dev_type).c_str(), devices.size()); - } -} - class sycl_engine_factory_t : public engine_factory_t { public: sycl_engine_factory_t(engine_kind_t engine_kind) @@ -155,7 +81,7 @@ class sycl_engine_factory_t : public engine_factory_t { auto dev_type = (engine_kind_ == engine_kind::cpu) ? ::sycl::info::device_type::cpu : ::sycl::info::device_type::gpu; - return get_sycl_devices(dev_type).size(); + return hrt::sycl::get_devices(dev_type).size(); } status_t engine_create(engine_t **engine, size_t index) const override; diff --git a/src/sycl/sycl_engine_base.cpp b/src/sycl/sycl_engine_base.cpp index 20a210d9638..3d32795f251 100644 --- a/src/sycl/sycl_engine_base.cpp +++ b/src/sycl/sycl_engine_base.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ #include "common/memory.hpp" #include "common/memory_storage.hpp" -#include "sycl/sycl_compat.hpp" +#include "gpu/intel/sycl/compat.hpp" #include "sycl/sycl_device_info.hpp" #include "sycl/sycl_memory_storage.hpp" #include "sycl/sycl_stream.hpp" diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index 5ab0d5bda0b..31142592d29 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -25,9 +25,9 @@ #include "gpu/intel/compute/compute_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_kernel.hpp" +#include "gpu/intel/sycl/compat.hpp" +#include "gpu/intel/sycl/utils.hpp" #include "gpu/sycl/sycl_interop_gpu_kernel.hpp" -#include "sycl/sycl_compat.hpp" -#include "sycl/sycl_utils.hpp" #include "sycl_engine_id.hpp" namespace dnnl { @@ -41,16 +41,18 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { : gpu::intel::compute::compute_engine_t(kind, runtime_kind::sycl, index) , device_(dev) , context_(ctx) - , backend_(backend_t::unknown) {} + , backend_(hrt::sycl::backend_t::unknown) {} status_t init() override { - backend_ = get_sycl_backend(device_); - VERROR_ENGINE( - utils::one_of(backend_, backend_t::host, backend_t::opencl, - backend_t::level0, backend_t::nvidia, backend_t::amd), + backend_ = hrt::sycl::get_backend(device_); + VERROR_ENGINE(utils::one_of(backend_, hrt::sycl::backend_t::host, + hrt::sycl::backend_t::opencl, + hrt::sycl::backend_t::level0, + hrt::sycl::backend_t::nvidia, + hrt::sycl::backend_t::amd), status::invalid_arguments, VERBOSE_UNSUPPORTED_BACKEND, "sycl"); - CHECK(check_device(kind(), device_, context_)); + CHECK(hrt::sycl::check_device(kind(), device_, context_)); CHECK(gpu::intel::compute::compute_engine_t::init()); return status::success; @@ -87,7 +89,8 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { std::vector arg_types; std::unique_ptr<::sycl::kernel> sycl_kernel; - 
CHECK(compat::make_kernel(sycl_kernel, this, binary, kernel_name)); + CHECK(gpu::intel::sycl::compat::make_kernel( + sycl_kernel, this, binary, kernel_name)); std::shared_ptr kernel_impl = std::make_shared( @@ -106,7 +109,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { std::unique_ptr ocl_engine; - auto status = create_ocl_engine(&ocl_engine, this); + auto status = gpu::intel::sycl::create_ocl_engine(&ocl_engine, this); if (status != status::success) return status; std::vector ocl_kernels; @@ -129,7 +132,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { std::unique_ptr ocl_engine; - CHECK(create_ocl_engine(&ocl_engine, this)); + CHECK(gpu::intel::sycl::create_ocl_engine(&ocl_engine, this)); auto kernel_name = jitter->kernel_name(); @@ -150,7 +153,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { std::unique_ptr ocl_engine; - CHECK(create_ocl_engine(&ocl_engine, this)); + CHECK(gpu::intel::sycl::create_ocl_engine(&ocl_engine, this)); std::vector ocl_kernels; CHECK(ocl_engine->create_kernels( @@ -163,28 +166,30 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { const ::sycl::device &device() const { return device_; } const ::sycl::context &context() const { return context_; } - backend_t backend() const { return backend_; } + hrt::sycl::backend_t backend() const { return backend_; } cl_device_id ocl_device() const { - if (backend() != backend_t::opencl) { + if (backend() != hrt::sycl::backend_t::opencl) { assert(!"not expected"); return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); return hrt::ocl::make_wrapper( - compat::get_native(device())); + hrt::sycl::compat::get_native(device())); } cl_context ocl_context() const { - if (backend() != backend_t::opencl) { + if (backend() != hrt::sycl::backend_t::opencl) { assert(!"not expected"); return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); return hrt::ocl::make_wrapper( - 
compat::get_native(context())); + hrt::sycl::compat::get_native(context())); } - device_id_t device_id() const override { return sycl_device_id(device_); } + device_id_t device_id() const override { + return hrt::sycl::device_id(device_); + } engine_id_t engine_id() const override { return engine_id_t(new sycl_engine_id_impl_t( @@ -199,7 +204,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { ::sycl::device device_; ::sycl::context context_; - backend_t backend_; + hrt::sycl::backend_t backend_; }; } // namespace sycl diff --git a/src/sycl/sycl_stream.cpp b/src/sycl/sycl_stream.cpp index 152611ffc64..d8ae46ce5b3 100644 --- a/src/sycl/sycl_stream.cpp +++ b/src/sycl/sycl_stream.cpp @@ -68,7 +68,7 @@ status_t sycl_stream_t::init() { && IMPLICATION( engine()->kind() == engine_kind::gpu, sycl_dev.is_gpu()) && IMPLICATION(engine()->kind() == engine_kind::cpu, - (sycl_dev.is_cpu() || is_host(sycl_dev))); + (sycl_dev.is_cpu() || hrt::sycl::is_host(sycl_dev))); if (!args_ok) return status::invalid_arguments; } diff --git a/src/sycl/sycl_stream.hpp b/src/sycl/sycl_stream.hpp index 40af5132377..2d2e0a9d8ed 100644 --- a/src/sycl/sycl_stream.hpp +++ b/src/sycl/sycl_stream.hpp @@ -261,7 +261,7 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { out_event = queue_->submit([&](::sycl::handler &cgh) { // need a u8 accessor to get the proper range ::sycl::accessor + hrt::sycl::compat::target_device> acc_dst(buffer_dst->buffer(), cgh, ::sycl::range<1>(size), ::sycl::id<1>(0)); cgh.depends_on(sycl_event_t::from(deps).events); @@ -341,7 +341,7 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { // XXX: this is a temporary solution to make sycl_memory_arg_t // default constructible. 
- buffer_u8_t dummy_buffer_ = buffer_u8_t(1); + hrt::sycl::buffer_u8_t dummy_buffer_ = hrt::sycl::buffer_u8_t(1); private: status_t init(); diff --git a/src/sycl/sycl_stream_submit_cpu_primitive.cpp b/src/sycl/sycl_stream_submit_cpu_primitive.cpp index 45afca4c400..9e175b17218 100644 --- a/src/sycl/sycl_stream_submit_cpu_primitive.cpp +++ b/src/sycl/sycl_stream_submit_cpu_primitive.cpp @@ -25,8 +25,8 @@ #include "common/primitive_iface.hpp" #include "common/stream.hpp" #include "common/utils.hpp" +#include "gpu/intel/sycl/compat.hpp" #include "sycl/sycl_c_types_map.hpp" -#include "sycl/sycl_compat.hpp" #include "sycl/sycl_memory_storage.hpp" #include @@ -57,7 +57,7 @@ template status_t submit_cpu_primitive_with_params_impl( submit_ctx_t *submit_ctx, ::sycl::handler &cgh, param_types... params) { - compat::host_task(cgh, [=]() { + hrt::sycl::compat::host_task(cgh, [=]() { thunk_params_t thunk_params; thunk_params.submit_ctx_ptr = submit_ctx; diff --git a/src/sycl/sycl_stream_submit_cpu_primitive.hpp b/src/sycl/sycl_stream_submit_cpu_primitive.hpp index 10fbf09770a..400b0a948bd 100644 --- a/src/sycl/sycl_stream_submit_cpu_primitive.hpp +++ b/src/sycl/sycl_stream_submit_cpu_primitive.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,8 +18,8 @@ #define SYCL_STREAM_SUBMIT_CPU_DISPATCH_HPP #include "common/c_types_map.hpp" +#include "hrt/sycl/utils.hpp" #include "sycl/sycl_stream_cpu_thunk.hpp" -#include "sycl/sycl_utils.hpp" #include diff --git a/src/sycl/sycl_utils.hpp b/src/sycl/sycl_utils.hpp deleted file mode 100644 index 9a634defe09..00000000000 --- a/src/sycl/sycl_utils.hpp +++ /dev/null @@ -1,188 +0,0 @@ -/******************************************************************************* -* Copyright 2019-2024 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef SYCL_UTILS_HPP -#define SYCL_UTILS_HPP - -#include "common/c_types_map.hpp" -#include "common/utils.hpp" -#include "gpu/intel/compute/utils.hpp" -#include "gpu/intel/ocl/ocl_gpu_engine.hpp" -#include "hrt/utils.hpp" - -#if __has_include() -#include -#elif __has_include() -#include -#else -#error "Unsupported compiler" -#endif - -#if defined(__INTEL_LLVM_COMPILER) -#if (__INTEL_LLVM_COMPILER < 20230000) -#define DNNL_USE_SYCL121_API 1 -#else -#define DNNL_USE_SYCL121_API 0 -#endif -#elif defined(__LIBSYCL_MAJOR_VERSION) -#if (__LIBSYCL_MAJOR_VERSION < 6) -#define DNNL_USE_SYCL121_API 1 -#else -#define DNNL_USE_SYCL121_API 0 -#endif -#else -#error "Unsupported compiler" -#endif - -namespace dnnl { -namespace impl { -namespace sycl { - -using buffer_u8_t = ::sycl::buffer; - -inline ::sycl::nd_range<3> to_sycl_nd_range( - const gpu::intel::compute::nd_range_t &range) { - const auto &local_range = range.local_range(); - const auto &global_range = range.global_range(); - - assert(range.ndims() <= 3); - auto sycl_global_range = ::sycl::range<3>( - global_range.ndims() >= 3 ? global_range[2] : 1, - global_range.ndims() >= 2 ? global_range[1] : 1, global_range[0]); - - if (!local_range) { - assert(!"not expected"); - return ::sycl::nd_range<3>( - sycl_global_range, ::sycl::range<3>(1, 1, 1)); - } - - auto sycl_local_range = ::sycl::range<3>( - local_range.ndims() >= 3 ? local_range[2] : 1, - local_range.ndims() >= 2 ? 
local_range[1] : 1, local_range[0]); - return ::sycl::nd_range<3>(sycl_global_range, sycl_local_range); -} - -enum class backend_t { unknown, host, level0, opencl, nvidia, amd }; - -inline std::string to_string(backend_t backend) { - switch (backend) { - case backend_t::host: return "Host"; - case backend_t::level0: return "Level Zero"; - case backend_t::opencl: return "OpenCL"; - case backend_t::nvidia: return "Nvidia"; - case backend_t::amd: return "AMD"; - default: return "Unknown"; - } -} - -inline std::string to_string(::sycl::info::device_type dev_type) { - using namespace ::sycl::info; - switch (dev_type) { - case device_type::cpu: return "cpu"; - case device_type::gpu: return "gpu"; - case device_type::accelerator: return "accelerator"; - case device_type::custom: return "custom"; - case device_type::automatic: return "automatic"; - case device_type::host: return "host"; - case device_type::all: return "all"; - default: return "unknown"; - } -} - -backend_t get_sycl_gpu_backend(); - -inline bool is_host(const ::sycl::device &dev) { - return dev.get_info<::sycl::info::device::device_type>() - == ::sycl::info::device_type::host; -} - -inline bool is_host(const ::sycl::platform &plat) { - auto devices = plat.get_devices(); - if (devices.size() != 1) return false; - return is_host(devices[0]); -} - -inline backend_t get_sycl_backend(const ::sycl::device &dev) { - if (is_host(dev)) return backend_t::host; - - auto plat = dev.get_platform(); - std::string plat_name = plat.get_info<::sycl::info::platform::name>(); - if (plat_name.find("OpenCL") != std::string::npos) return backend_t::opencl; - if (plat_name.find("NVIDIA") != std::string::npos) return backend_t::nvidia; - if (plat_name.find("AMD") != std::string::npos) return backend_t::amd; - if (plat_name.find("Level-Zero") != std::string::npos) - return backend_t::level0; - - return backend_t::unknown; -} - -bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs); -device_id_t sycl_device_id(const 
::sycl::device &dev); - -status_t check_device(engine_kind_t eng_kind, const ::sycl::device &dev, - const ::sycl::context &ctx); - -bool dev_ctx_consistency_check( - const ::sycl::device &dev, const ::sycl::context &ctx); - -inline bool is_intel_device(const ::sycl::device &dev) { - const int intel_vendor_id = 0x8086; - auto vendor_id = dev.get_info<::sycl::info::device::vendor_id>(); - return vendor_id == intel_vendor_id; -} - -inline bool is_intel_platform(const ::sycl::platform &plat) { - std::string plat_name = plat.get_info<::sycl::info::platform::name>(); - return plat_name.find("Intel") != std::string::npos; -} - -inline bool is_subdevice(const ::sycl::device &dev) { - return dev.get_info<::sycl::info::device::partition_type_property>() - != ::sycl::info::partition_property::no_partition; -} - -inline ::sycl::device get_main_parent_device(const ::sycl::device &dev) { - // Search for the top level device. - auto parent_device = dev; - while (is_subdevice(parent_device)) { - parent_device - = parent_device.get_info<::sycl::info::device::parent_device>(); - } - return parent_device; -} - -inline ::sycl::device get_parent_device(const ::sycl::device &dev) { - return dev.get_info<::sycl::info::device::parent_device>(); -} - -class sycl_engine_base_t; -status_t create_ocl_engine( - std::unique_ptr - *ocl_engine, - const sycl_engine_base_t *engine); - -status_t get_kernel_binary(const ::sycl::kernel &kernel, hrt::binary_t &binary); - -status_t create_ocl_engine( - std::unique_ptr - *ocl_engine, - const sycl_engine_base_t *engine); - -} // namespace sycl -} // namespace impl -} // namespace dnnl - -#endif diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp index f5bac6afd06..12adf04ad7a 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp @@ -74,9 +74,9 @@ TEST(SYCLApi, 
CompiledPartitionExecute) { dnnl::graph::testing::sycl_free_wrapper); sycl::queue q = (ekind == dnnl::engine::kind::gpu) - ? sycl::queue(dnnl::impl::sycl::compat::gpu_selector_v, + ? sycl::queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}) - : sycl::queue(dnnl::impl::sycl::compat::cpu_selector_v, + : sycl::queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}); dnnl::engine eng = sycl_interop::make_engine_with_allocator( @@ -137,9 +137,9 @@ TEST(SYCLApi, CompiledPartitionInteropExecute) { dnnl::graph::testing::sycl_free_wrapper); sycl::queue q = (ekind == dnnl::engine::kind::gpu) - ? sycl::queue(dnnl::impl::sycl::compat::gpu_selector_v, + ? sycl::queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}) - : sycl::queue(dnnl::impl::sycl::compat::cpu_selector_v, + : sycl::queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}); dnnl::engine eng = sycl_interop::make_engine_with_allocator( diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp index 6e897878dfb..13db74c14f8 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp @@ -36,9 +36,9 @@ TEST(SYCLApi, Engine) { = static_cast(api_test_engine_kind); queue q = (ekind == dnnl::engine::kind::gpu) - ? queue(dnnl::impl::sycl::compat::gpu_selector_v, + ? 
queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, property::queue::in_order {}) - : queue(dnnl::impl::sycl::compat::cpu_selector_v, + : queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, property::queue::in_order {}); allocator alloc = dnnl::graph::sycl_interop::make_allocator( diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp index f131e867577..ba5c0143ba3 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp @@ -29,7 +29,7 @@ using namespace dnnl::graph; #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL TEST(SYCLApi, Tensor) { SKIP_IF(api_test_engine_kind == dnnl_cpu, "skip sycl test for cpu engine."); - sycl::device dev {dnnl::impl::sycl::compat::gpu_selector_v}; + sycl::device dev {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; sycl::context ctx {dev}; dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx); diff --git a/tests/gtests/graph/api/test_api_common.cpp b/tests/gtests/graph/api/test_api_common.cpp index 7345b6fdf59..f1535844e31 100644 --- a/tests/gtests/graph/api/test_api_common.cpp +++ b/tests/gtests/graph/api/test_api_common.cpp @@ -26,7 +26,8 @@ void api_test_dnnl_engine_create( dnnl_engine_t *engine, dnnl_engine_kind_t engine_kind) { if (engine_kind == dnnl_cpu) { #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL - static ::sycl::device dev {dnnl::impl::sycl::compat::cpu_selector_v}; + static ::sycl::device dev { + dnnl::impl::hrt::sycl::compat::cpu_selector_v}; static ::sycl::context ctx {dev}; if (!allocator_handle) { ASSERT_EQ(dnnl_graph_sycl_interop_allocator_create( @@ -58,7 +59,8 @@ void api_test_dnnl_engine_create( *engine = engine_handle.engine; } else { #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL - static ::sycl::device dev {dnnl::impl::sycl::compat::gpu_selector_v}; + static ::sycl::device dev { + dnnl::impl::hrt::sycl::compat::gpu_selector_v}; static ::sycl::context ctx {dev}; if (!allocator_handle) { 
ASSERT_EQ(dnnl_graph_sycl_interop_allocator_create( @@ -89,7 +91,8 @@ void api_test_dnnl_graph_graph_create( dnnl::engine &cpp_api_test_dnnl_engine_create(dnnl::engine::kind engine_kind) { if (engine_kind == dnnl::engine::kind::cpu) { #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL - static ::sycl::device dev {dnnl::impl::sycl::compat::cpu_selector_v}; + static ::sycl::device dev { + dnnl::impl::hrt::sycl::compat::cpu_selector_v}; static ::sycl::context ctx {dev}; static dnnl::graph::allocator alloc = dnnl::graph::sycl_interop::make_allocator( @@ -107,7 +110,7 @@ dnnl::engine &cpp_api_test_dnnl_engine_create(dnnl::engine::kind engine_kind) { } #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL - static ::sycl::device dev {dnnl::impl::sycl::compat::gpu_selector_v}; + static ::sycl::device dev {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; static ::sycl::context ctx {dev}; static dnnl::graph::allocator alloc = dnnl::graph::sycl_interop::make_allocator( diff --git a/tests/gtests/graph/api/test_api_common.hpp b/tests/gtests/graph/api/test_api_common.hpp index 2d00a2ba0ec..347a6487074 100644 --- a/tests/gtests/graph/api/test_api_common.hpp +++ b/tests/gtests/graph/api/test_api_common.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,7 +35,7 @@ #include "tests/gtests/dnnl_test_macros.hpp" #ifdef DNNL_WITH_SYCL -#include "sycl/sycl_compat.hpp" +#include "hrt/sycl/compat.hpp" #if __has_include() #include #elif __has_include() diff --git a/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp b/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp index d1e83b35c83..5967bf16bda 100644 --- a/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp +++ b/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,9 +33,9 @@ TEST(test_interface_test_allocator, DefaultSyclAllocator) { #endif graph::allocator_t *alloc = new graph::allocator_t(); sycl::queue q = kind == graph::engine_kind::gpu - ? sycl::queue {dnnl::impl::sycl::compat::gpu_selector_v, + ? sycl::queue {dnnl::impl::hrt::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}} - : sycl::queue {dnnl::impl::sycl::compat::cpu_selector_v, + : sycl::queue {dnnl::impl::hrt::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}}; graph::allocator_t::mem_attr_t attr { @@ -69,8 +69,8 @@ TEST(test_interface_test_allocator, SyclAllocator) { = new graph::allocator_t(dnnl::graph::testing::sycl_malloc_wrapper, dnnl::graph::testing::sycl_free_wrapper); sycl::device sycl_dev = (kind == graph::engine_kind::gpu) - ? sycl::device {dnnl::impl::sycl::compat::gpu_selector_v} - : sycl::device {dnnl::impl::sycl::compat::cpu_selector_v}; + ? 
sycl::device {dnnl::impl::hrt::sycl::compat::gpu_selector_v} + : sycl::device {dnnl::impl::hrt::sycl::compat::cpu_selector_v}; sycl::context sycl_ctx {sycl_dev}; auto *mem_ptr = sycl_alloc->allocate( diff --git a/tests/gtests/graph/unit/unit_test_common.cpp b/tests/gtests/graph/unit/unit_test_common.cpp index 694098b14ed..ae993d79611 100644 --- a/tests/gtests/graph/unit/unit_test_common.cpp +++ b/tests/gtests/graph/unit/unit_test_common.cpp @@ -37,8 +37,8 @@ namespace graph = dnnl::impl::graph; ::sycl::device &get_device() { static ::sycl::device dev = get_test_engine_kind() == graph::engine_kind::cpu - ? ::sycl::device {dnnl::impl::sycl::compat::cpu_selector_v} - : ::sycl::device {dnnl::impl::sycl::compat::gpu_selector_v}; + ? ::sycl::device {dnnl::impl::hrt::sycl::compat::cpu_selector_v} + : ::sycl::device {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; return dev; } diff --git a/tests/gtests/graph/unit/unit_test_common.hpp b/tests/gtests/graph/unit/unit_test_common.hpp index 1ceb99b6160..b9e30684a63 100644 --- a/tests/gtests/graph/unit/unit_test_common.hpp +++ b/tests/gtests/graph/unit/unit_test_common.hpp @@ -33,7 +33,7 @@ #include "tests/gtests/dnnl_test_common.hpp" #ifdef DNNL_WITH_SYCL -#include "sycl/sycl_compat.hpp" +#include "gpu/intel/sycl/compat.hpp" #if __has_include() #include #elif __has_include() diff --git a/tests/gtests/sycl/api/test_engine.cpp b/tests/gtests/sycl/api/test_engine.cpp index a320b3429a7..e6844a17d25 100644 --- a/tests/gtests/sycl/api/test_engine.cpp +++ b/tests/gtests/sycl/api/test_engine.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ #include "oneapi/dnnl/dnnl.hpp" #include "oneapi/dnnl/dnnl_sycl.hpp" -#include "sycl/sycl_compat.hpp" +#include "hrt/sycl/compat.hpp" #include @@ -207,7 +207,7 @@ TEST_P(sycl_engine_test, SubDevice) { #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_SYCL TEST_P(sycl_engine_test, non_sycl_cpu_runtime) { try { - device dev(dnnl::impl::sycl::compat::cpu_selector_v); + device dev(dnnl::impl::hrt::sycl::compat::cpu_selector_v); context ctx(dev); EXPECT_ANY_THROW(sycl_interop::make_engine(dev, ctx)); } catch (::sycl::exception &e) { diff --git a/tests/gtests/sycl/api/test_memory_buffer.cpp b/tests/gtests/sycl/api/test_memory_buffer.cpp index 61e226f6b79..8bc6ba055a7 100644 --- a/tests/gtests/sycl/api/test_memory_buffer.cpp +++ b/tests/gtests/sycl/api/test_memory_buffer.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ #include "oneapi/dnnl/dnnl.h" #include "oneapi/dnnl/dnnl_sycl.hpp" -#include "sycl/sycl_compat.hpp" +#include "hrt/sycl/compat.hpp" #include #include @@ -361,9 +361,9 @@ TEST_P(sycl_memory_buffer_test, EltwiseWithUserKernel) { std::unique_ptr q; if (eng_kind == engine::kind::cpu) { - q.reset(new queue(dnnl::impl::sycl::compat::cpu_selector_v)); + q.reset(new queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v)); } else { - q.reset(new queue(dnnl::impl::sycl::compat::gpu_selector_v)); + q.reset(new queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v)); } q->submit([&](handler &cgh) { From 2837ff84b5e6a1c7ef3b0be14ca5155829d5f318 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 6 May 2024 19:11:32 -0700 Subject: [PATCH 014/187] api: sycl: move sycl interop api files --- src/{ => hrt}/sycl/capi/capi_engine.cpp | 0 src/{ => hrt}/sycl/capi/capi_memory.cpp | 0 src/{ => hrt}/sycl/capi/capi_primitive.cpp | 2 +- src/{ => hrt}/sycl/capi/capi_stream.cpp | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename src/{ => hrt}/sycl/capi/capi_engine.cpp (100%) rename src/{ => hrt}/sycl/capi/capi_memory.cpp (100%) rename src/{ => hrt}/sycl/capi/capi_primitive.cpp (98%) rename src/{ => hrt}/sycl/capi/capi_stream.cpp (98%) diff --git a/src/sycl/capi/capi_engine.cpp b/src/hrt/sycl/capi/capi_engine.cpp similarity index 100% rename from src/sycl/capi/capi_engine.cpp rename to src/hrt/sycl/capi/capi_engine.cpp diff --git a/src/sycl/capi/capi_memory.cpp b/src/hrt/sycl/capi/capi_memory.cpp similarity index 100% rename from src/sycl/capi/capi_memory.cpp rename to src/hrt/sycl/capi/capi_memory.cpp diff --git a/src/sycl/capi/capi_primitive.cpp b/src/hrt/sycl/capi/capi_primitive.cpp similarity index 98% rename from src/sycl/capi/capi_primitive.cpp rename to src/hrt/sycl/capi/capi_primitive.cpp index 2d250e21c24..bee82fc8a7d 100644 --- a/src/sycl/capi/capi_primitive.cpp +++ b/src/hrt/sycl/capi/capi_primitive.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/sycl/capi/capi_stream.cpp b/src/hrt/sycl/capi/capi_stream.cpp similarity index 98% rename from src/sycl/capi/capi_stream.cpp rename to src/hrt/sycl/capi/capi_stream.cpp index 55c585be5a7..86f57088dde 100644 --- a/src/sycl/capi/capi_stream.cpp +++ b/src/hrt/sycl/capi/capi_stream.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 0015b2f54bff874b796b11d843d6348a671e0dde Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 6 May 2024 19:21:03 -0700 Subject: [PATCH 015/187] api: ocl: move ocl interop api files --- src/{gpu/intel => hrt}/ocl/capi/engine.cpp | 0 src/{gpu/intel => hrt}/ocl/capi/memory.cpp | 0 src/{gpu/intel => hrt}/ocl/capi/primitive.cpp | 0 src/{gpu/intel => hrt}/ocl/capi/stream.cpp | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/{gpu/intel => hrt}/ocl/capi/engine.cpp (100%) rename src/{gpu/intel => hrt}/ocl/capi/memory.cpp (100%) rename src/{gpu/intel => hrt}/ocl/capi/primitive.cpp (100%) rename src/{gpu/intel => hrt}/ocl/capi/stream.cpp (100%) diff --git a/src/gpu/intel/ocl/capi/engine.cpp b/src/hrt/ocl/capi/engine.cpp similarity index 100% rename from src/gpu/intel/ocl/capi/engine.cpp rename to src/hrt/ocl/capi/engine.cpp diff --git a/src/gpu/intel/ocl/capi/memory.cpp b/src/hrt/ocl/capi/memory.cpp similarity index 100% rename from src/gpu/intel/ocl/capi/memory.cpp rename to src/hrt/ocl/capi/memory.cpp diff --git a/src/gpu/intel/ocl/capi/primitive.cpp 
b/src/hrt/ocl/capi/primitive.cpp similarity index 100% rename from src/gpu/intel/ocl/capi/primitive.cpp rename to src/hrt/ocl/capi/primitive.cpp diff --git a/src/gpu/intel/ocl/capi/stream.cpp b/src/hrt/ocl/capi/stream.cpp similarity index 100% rename from src/gpu/intel/ocl/capi/stream.cpp rename to src/hrt/ocl/capi/stream.cpp From b7aa14e8ac20b745c9684f1f0b384fdf6777b408 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 6 May 2024 19:48:29 -0700 Subject: [PATCH 016/187] sycl: move memory storage and sycl types files to hrt --- .../miopen_batch_normalization_executor.hpp | 67 +++---- src/gpu/amd/miopen_binary.cpp | 6 +- src/gpu/amd/miopen_convolution.cpp | 14 +- src/gpu/amd/miopen_convolution.hpp | 4 +- src/gpu/amd/miopen_deconvolution.cpp | 4 +- src/gpu/amd/miopen_eltwise.cpp | 6 +- src/gpu/amd/miopen_inner_product.cpp | 6 +- src/gpu/amd/miopen_lrn.cpp | 8 +- src/gpu/amd/miopen_matmul_executor.hpp | 35 ++-- src/gpu/amd/miopen_pooling.cpp | 6 +- src/gpu/amd/miopen_reduction.cpp | 6 +- src/gpu/amd/miopen_reorder.cpp | 4 +- src/gpu/amd/miopen_softmax.cpp | 8 +- .../cudnn_batch_normalization_executor.hpp | 65 +++---- src/gpu/nvidia/cudnn_binary.cpp | 4 +- src/gpu/nvidia/cudnn_convolution.cpp | 16 +- src/gpu/nvidia/cudnn_convolution.hpp | 2 +- src/gpu/nvidia/cudnn_deconvolution.cpp | 2 +- src/gpu/nvidia/cudnn_eltwise.cpp | 4 +- src/gpu/nvidia/cudnn_inner_product.cpp | 4 +- src/gpu/nvidia/cudnn_lrn.cpp | 4 +- src/gpu/nvidia/cudnn_matmul_executor.hpp | 39 ++-- src/gpu/nvidia/cudnn_pooling.cpp | 4 +- src/gpu/nvidia/cudnn_reduction.cpp | 2 +- src/gpu/nvidia/cudnn_reorder.cpp | 2 +- src/gpu/nvidia/cudnn_resampling.cpp | 5 +- src/gpu/nvidia/cudnn_resampling.hpp | 6 +- src/gpu/nvidia/cudnn_softmax.cpp | 4 +- src/gpu/nvidia/sycl_cuda_stream_utils.hpp | 6 +- src/gpu/nvidia/sycl_cuda_utils.hpp | 6 +- src/gpu/sycl/batch_normalizations_kernels.hpp | 154 ++++++++-------- src/gpu/sycl/binary_kernels.hpp | 27 +-- src/gpu/sycl/eltwise_kernels.hpp | 73 ++++---- 
src/gpu/sycl/layer_normalizations_kernels.hpp | 167 +++++++++--------- src/gpu/sycl/lrn_kernels.hpp | 36 ++-- src/gpu/sycl/pooling_kernels.hpp | 98 +++++----- src/gpu/sycl/prelu_kernels.hpp | 62 ++++--- src/gpu/sycl/ref_batch_normalization.cpp | 45 ++--- src/gpu/sycl/ref_batch_normalization.hpp | 2 +- src/gpu/sycl/ref_binary.cpp | 10 +- src/gpu/sycl/ref_binary.hpp | 2 +- src/gpu/sycl/ref_eltwise.cpp | 14 +- src/gpu/sycl/ref_eltwise.hpp | 2 +- src/gpu/sycl/ref_layer_normalizations.cpp | 43 ++--- src/gpu/sycl/ref_layer_normalizations.hpp | 2 +- src/gpu/sycl/ref_lrn.cpp | 12 +- src/gpu/sycl/ref_lrn.hpp | 2 +- src/gpu/sycl/ref_pooling.cpp | 22 +-- src/gpu/sycl/ref_pooling.hpp | 2 +- src/gpu/sycl/ref_prelu.cpp | 18 +- src/gpu/sycl/ref_prelu.hpp | 2 +- src/gpu/sycl/ref_resampling.cpp | 12 +- src/gpu/sycl/ref_resampling.hpp | 2 +- src/gpu/sycl/ref_shuffle.cpp | 8 +- src/gpu/sycl/ref_shuffle.hpp | 2 +- src/gpu/sycl/ref_softmax.cpp | 12 +- src/gpu/sycl/resampling_kernels.hpp | 79 +++++---- src/gpu/sycl/shuffle_kernels.hpp | 40 ++--- src/gpu/sycl/softmax_kernels.hpp | 35 ++-- src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 13 +- src/gpu/sycl/sycl_io_helper.hpp | 20 +-- src/gpu/sycl/sycl_post_ops.hpp | 6 +- src/gpu/sycl/sycl_primitive_conf.hpp | 130 +++++++------- src/gpu/sycl/sycl_q10n.hpp | 2 +- .../sycl/buffer_memory_storage.cpp} | 54 +++--- .../sycl/buffer_memory_storage.hpp} | 18 +- .../sycl/c_types_map.hpp} | 8 +- src/hrt/sycl/capi/capi_memory.cpp | 13 +- .../sycl/memory_storage.hpp} | 10 +- .../sycl/memory_storage_base.cpp} | 21 ++- .../sycl/memory_storage_base.hpp} | 26 +-- .../sycl/memory_storage_helper.hpp} | 34 ++-- .../sycl_types.hpp => hrt/sycl/types.hpp} | 67 ++++--- .../sycl/usm_memory_storage.cpp} | 29 ++- .../sycl/usm_memory_storage.hpp} | 30 ++-- src/sycl/sycl_engine_base.cpp | 6 +- src/sycl/sycl_stream.hpp | 57 +++--- src/sycl/sycl_stream_submit_cpu_primitive.cpp | 16 +- 78 files changed, 980 insertions(+), 914 deletions(-) rename 
src/{sycl/sycl_buffer_memory_storage.cpp => hrt/sycl/buffer_memory_storage.cpp} (72%) rename src/{sycl/sycl_buffer_memory_storage.hpp => hrt/sycl/buffer_memory_storage.hpp} (86%) rename src/{sycl/sycl_c_types_map.hpp => hrt/sycl/c_types_map.hpp} (88%) rename src/{sycl/sycl_memory_storage.hpp => hrt/sycl/memory_storage.hpp} (78%) rename src/{sycl/sycl_memory_storage_base.cpp => hrt/sycl/memory_storage_base.cpp} (73%) rename src/{sycl/sycl_memory_storage_base.hpp => hrt/sycl/memory_storage_base.hpp} (67%) rename src/{sycl/sycl_memory_storage_helper.hpp => hrt/sycl/memory_storage_helper.hpp} (76%) rename src/{gpu/sycl/sycl_types.hpp => hrt/sycl/types.hpp} (83%) rename src/{sycl/sycl_usm_memory_storage.cpp => hrt/sycl/usm_memory_storage.cpp} (79%) rename src/{sycl/sycl_usm_memory_storage.hpp => hrt/sycl/usm_memory_storage.hpp} (86%) diff --git a/src/gpu/amd/miopen_batch_normalization_executor.hpp b/src/gpu/amd/miopen_batch_normalization_executor.hpp index 9887501c6e6..5d9ecf58c43 100644 --- a/src/gpu/amd/miopen_batch_normalization_executor.hpp +++ b/src/gpu/amd/miopen_batch_normalization_executor.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,7 +27,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" #include "sycl_hip_utils.hpp" namespace dnnl { @@ -47,19 +47,22 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> arg_dst, - 
impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + arg_dst, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_scale, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_shift, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_shift, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_shift_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_wkspace, bool use_scale, bool use_shift, bool init_global_stats, - impl::sycl::sycl_memory_arg_t arg_mean = {}, - impl::sycl::sycl_memory_arg_t arg_var = {}) const { + hrt::sycl::interop_memory_arg_t arg_mean = {}, + hrt::sycl::interop_memory_arg_t arg_var = {}) const { compat::host_task(cgh, [=](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast(engine); @@ -110,25 +113,26 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_diff_dst, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_scale, + 
hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wkspace, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> arg_temp_relu, bool use_scale, bool use_shift) const { compat::host_task(cgh, [=](const compat::interop_handle &ih) { @@ -187,7 +191,7 @@ struct bnorm_exec_base_t { void init_scaleshift(hip_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, amd::sycl_hip_stream_t *hip_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale, float val, const size_t n) const { @@ -211,8 +215,9 @@ struct bnorm_exec_base_t { void init_mean_var(hip_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, amd::sycl_hip_stream_t *hip_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access_mode::write> arg_mean, - impl::sycl::sycl_memory_arg_t<::sycl::access_mode::write> arg_var, + hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> + arg_mean, + hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, const size_t n) const { constexpr T mean_var_val = 0; hip_stream->interop_task([&](::sycl::handler &cgh) { @@ -246,14 +251,14 @@ struct bnorm_exec_fwd_t : public 
bnorm_exec_base_t { auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_shift = CTX_IN_SYCL_MEMORY(DNNL_ARG_SHIFT); - auto arg_shift_buf = impl::sycl::sycl_memory_arg_t< + auto arg_shift_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(shift_buf, cgh); auto arg_wkspace = bnorm_impl->is_training() ? CTX_OUT_SYCL_MEMORY(DNNL_ARG_WORKSPACE) - : impl::sycl::sycl_memory_arg_t< + : hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(); if (!use_global_stats) { @@ -295,13 +300,13 @@ struct bnorm_exec_bwd_t : public bnorm_exec_base_t { auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST); auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_diff_scale = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SCALE); - auto arg_diff_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_diff_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_scale_buf, cgh); auto arg_diff_shift = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SHIFT); - auto arg_diff_shift_buf = impl::sycl::sycl_memory_arg_t< + auto arg_diff_shift_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_shift_buf, cgh); auto arg_wkspace = CTX_IN_SYCL_MEMORY(DNNL_ARG_WORKSPACE); auto arg_temp_relu diff --git a/src/gpu/amd/miopen_binary.cpp b/src/gpu/amd/miopen_binary.cpp index b20b4f05220..ccb411cf121 100644 --- a/src/gpu/amd/miopen_binary.cpp +++ b/src/gpu/amd/miopen_binary.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 
2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_binary.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_convolution.cpp b/src/gpu/amd/miopen_convolution.cpp index 3e8e107b1ba..1f7a236b672 100644 --- a/src/gpu/amd/miopen_convolution.cpp +++ b/src/gpu/amd/miopen_convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -42,17 +42,17 @@ status_t miopen_convolution_fwd_t::execute_convolution( memory_tracking::names::key_conv_miopen_filter); auto arg_oscale = CTX_IN_SYCL_MEMORY(DNNL_ARG_ATTR_OUTPUT_SCALES); - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_dst; - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_reorder; if (pd()->use_temp_dst()) { memory_storage_t *temp_dst_mem = scratch_storage.get(); memory_storage_t *temp_reorder_mem = scratch_storage_2.get(); - temp_dst = impl::sycl::sycl_memory_arg_t< + temp_dst = 
hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_dst_mem, cgh); - temp_reorder = impl::sycl::sycl_memory_arg_t< + temp_reorder = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_reorder_mem, cgh); } @@ -149,7 +149,7 @@ status_t miopen_convolution_bwd_weights_t::execute_convolution( auto arg_filter_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_conv_miopen_filter); - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_bias; if (with_bias) { diff --git a/src/gpu/amd/miopen_convolution.hpp b/src/gpu/amd/miopen_convolution.hpp index 5ad85b28815..f1c20907e72 100644 --- a/src/gpu/amd/miopen_convolution.hpp +++ b/src/gpu/amd/miopen_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -262,7 +262,7 @@ struct miopen_convolution_fwd_t : public primitive_t { private: ::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } diff --git a/src/gpu/amd/miopen_deconvolution.cpp b/src/gpu/amd/miopen_deconvolution.cpp index 8ecc621bc7f..b60d109abe3 100644 --- a/src/gpu/amd/miopen_deconvolution.cpp +++ b/src/gpu/amd/miopen_deconvolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" 
+#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_eltwise.cpp b/src/gpu/amd/miopen_eltwise.cpp index 8855d9448fe..e14181e0c1b 100644 --- a/src/gpu/amd/miopen_eltwise.cpp +++ b/src/gpu/amd/miopen_eltwise.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_eltwise.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_inner_product.cpp b/src/gpu/amd/miopen_inner_product.cpp index bf3032ac068..3b72eb50b9e 100644 --- a/src/gpu/amd/miopen_inner_product.cpp +++ b/src/gpu/amd/miopen_inner_product.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,8 +19,8 @@ #include "gpu/amd/miopen_gemm_inner_product.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_lrn.cpp b/src/gpu/amd/miopen_lrn.cpp index fa08ce9b462..e9544de86f3 100644 --- a/src/gpu/amd/miopen_lrn.cpp +++ b/src/gpu/amd/miopen_lrn.cpp @@ -1,5 
+1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,11 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ + #include "gpu/amd/miopen_lrn.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" + namespace dnnl { namespace impl { namespace gpu { diff --git a/src/gpu/amd/miopen_matmul_executor.hpp b/src/gpu/amd/miopen_matmul_executor.hpp index 718e7372415..4fd4447f27a 100644 --- a/src/gpu/amd/miopen_matmul_executor.hpp +++ b/src/gpu/amd/miopen_matmul_executor.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,7 +23,7 @@ #include "gpu/amd/sycl_hip_engine.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" #include @@ -44,12 +44,13 @@ struct miopen_matmul_exec_base_t { void interop_task(std::shared_ptr matmul_impl_, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_weights, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - 
impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> arg_dst, - impl::sycl::sycl_memory_arg_t arg_bias, - impl::sycl::sycl_memory_arg_t arg_scratch) { + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + arg_dst, + hrt::sycl::interop_memory_arg_t arg_bias, + hrt::sycl::interop_memory_arg_t arg_scratch) { compat::host_task(cgh, [=](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast( @@ -105,7 +106,7 @@ struct miopen_matmul_scratch_runtime_args_bias_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -129,9 +130,9 @@ struct miopen_matmul_runtime_args_scratch_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -155,7 +156,7 @@ struct miopen_matmul_runtime_args_bias_exec_t auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -177,9 +178,9 @@ struct miopen_matmul_runtime_args_exec_t : public 
miopen_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -226,7 +227,7 @@ struct miopen_matmul_scratch_exec_t : public miopen_matmul_exec_base_t { auto arg_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_matmul_dst_in_acc_dt); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -249,7 +250,7 @@ struct miopen_matmul_bias_exec_t : public miopen_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -271,9 +272,9 @@ struct miopen_matmul_exec_t : public miopen_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, diff --git a/src/gpu/amd/miopen_pooling.cpp b/src/gpu/amd/miopen_pooling.cpp index 6e4436bf344..81b9984de28 100644 --- a/src/gpu/amd/miopen_pooling.cpp +++ b/src/gpu/amd/miopen_pooling.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,8 +19,8 @@ #include "common/nstl.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_reduction.cpp b/src/gpu/amd/miopen_reduction.cpp index b8a2a462f77..3eb10cfc217 100644 --- a/src/gpu/amd/miopen_reduction.cpp +++ b/src/gpu/amd/miopen_reduction.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_reduction.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_reorder.cpp b/src/gpu/amd/miopen_reorder.cpp index ee91d7d754b..d7e3b4545c6 100644 --- a/src/gpu/amd/miopen_reorder.cpp +++ b/src/gpu/amd/miopen_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ #include 
"gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_softmax.cpp b/src/gpu/amd/miopen_softmax.cpp index 7d7ef8d24bf..f82398aeefe 100644 --- a/src/gpu/amd/miopen_softmax.cpp +++ b/src/gpu/amd/miopen_softmax.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020-2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_softmax.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -85,4 +85,4 @@ status_t miopen_softmax_bwd_t::execute(const exec_ctx_t &ctx) const { } // namespace amd } // namespace gpu } // namespace impl -} // namespace dnnl \ No newline at end of file +} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp index 83d26e4ae16..b16926189ee 100644 --- a/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp +++ b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp @@ -27,7 +27,7 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" #include "sycl_cuda_utils.hpp" namespace dnnl { @@ -47,19 +47,22 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t 
*cuda_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> arg_dst, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + arg_dst, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_scale, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_shift, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_shift, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_shift_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_wkspace, bool use_scale, bool use_shift, bool init_global_stats, - impl::sycl::sycl_memory_arg_t arg_mean = {}, - impl::sycl::sycl_memory_arg_t arg_var = {}) const { + hrt::sycl::interop_memory_arg_t arg_mean = {}, + hrt::sycl::interop_memory_arg_t arg_var = {}) const { compat::host_task(cgh, [=, this](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast(engine); @@ -109,25 +112,26 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t *cuda_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_diff_dst, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_src, - 
impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + arg_scale, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift_buf, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wkspace, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> arg_temp_relu, bool use_scale, bool use_shift) const { compat::host_task(cgh, [=, this](const compat::interop_handle &ih) { @@ -186,7 +190,7 @@ struct bnorm_exec_base_t { void init_scaleshift(cuda_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, nvidia::sycl_cuda_stream_t *cuda_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale, float val, const size_t n) const { cuda_stream->interop_task([&](::sycl::handler &cgh) { @@ -210,8 +214,9 @@ struct bnorm_exec_base_t { void init_mean_var(cuda_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, nvidia::sycl_cuda_stream_t *cuda_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access_mode::write> arg_mean, - impl::sycl::sycl_memory_arg_t<::sycl::access_mode::write> arg_var, + 
hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> + arg_mean, + hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, const size_t n) const { constexpr T mean_var_val = 0; cuda_stream->interop_task([&](::sycl::handler &cgh) { @@ -245,14 +250,14 @@ struct bnorm_exec_fwd_t : public bnorm_exec_base_t { auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_shift = CTX_IN_SYCL_MEMORY(DNNL_ARG_SHIFT); - auto arg_shift_buf = impl::sycl::sycl_memory_arg_t< + auto arg_shift_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(shift_buf, cgh); auto arg_wkspace = bnorm_impl->is_training() ? CTX_OUT_SYCL_MEMORY(DNNL_ARG_WORKSPACE) - : impl::sycl::sycl_memory_arg_t< + : hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(); if (!use_global_stats) { @@ -294,13 +299,13 @@ struct bnorm_exec_bwd_t : public bnorm_exec_base_t { auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST); auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_diff_scale = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SCALE); - auto arg_diff_scale_buf = impl::sycl::sycl_memory_arg_t< + auto arg_diff_scale_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_scale_buf, cgh); auto arg_diff_shift = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SHIFT); - auto arg_diff_shift_buf = impl::sycl::sycl_memory_arg_t< + auto arg_diff_shift_buf = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_shift_buf, cgh); auto arg_wkspace = CTX_IN_SYCL_MEMORY(DNNL_ARG_WORKSPACE); auto 
arg_temp_relu diff --git a/src/gpu/nvidia/cudnn_binary.cpp b/src/gpu/nvidia/cudnn_binary.cpp index ce106b5d52a..015a8aa434f 100644 --- a/src/gpu/nvidia/cudnn_binary.cpp +++ b/src/gpu/nvidia/cudnn_binary.cpp @@ -20,8 +20,8 @@ #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_convolution.cpp b/src/gpu/nvidia/cudnn_convolution.cpp index b853e330da0..d6d1966f2e9 100644 --- a/src/gpu/nvidia/cudnn_convolution.cpp +++ b/src/gpu/nvidia/cudnn_convolution.cpp @@ -19,7 +19,7 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -47,27 +47,27 @@ status_t cudnn_convolution_fwd_t::execute_convolution( auto arg_dst_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST); - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_dst; - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_reorder; if (pd()->use_temp_dst()) { memory_storage_t *temp_dst_mem = scratch_storage.get(); memory_storage_t *temp_reorder_mem = scratch_storage_2.get(); - temp_dst = impl::sycl::sycl_memory_arg_t< + temp_dst = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_dst_mem, cgh); - temp_reorder = impl::sycl::sycl_memory_arg_t< + temp_reorder = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_reorder_mem, cgh); } - 
impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> y_fp32_data; if (!arg_dst_scale.empty() || !arg_src_scale.empty() || !arg_wei_scale.empty()) { memory_storage_t *y_fp32_data_mem = scratch_storage_3.get(); - y_fp32_data = impl::sycl::sycl_memory_arg_t< + y_fp32_data = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(y_fp32_data_mem, cgh); } @@ -167,7 +167,7 @@ status_t cudnn_convolution_bwd_weights_t::execute_convolution( auto arg_filter_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_conv_cudnn_filter); - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_bias; if (with_bias) { diff --git a/src/gpu/nvidia/cudnn_convolution.hpp b/src/gpu/nvidia/cudnn_convolution.hpp index 63c6a76c43b..741a01fde2e 100644 --- a/src/gpu/nvidia/cudnn_convolution.hpp +++ b/src/gpu/nvidia/cudnn_convolution.hpp @@ -208,7 +208,7 @@ struct cudnn_convolution_fwd_t : public primitive_t { private: ::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } diff --git a/src/gpu/nvidia/cudnn_deconvolution.cpp b/src/gpu/nvidia/cudnn_deconvolution.cpp index ef9c1df7ba9..1192184c2b7 100644 --- a/src/gpu/nvidia/cudnn_deconvolution.cpp +++ b/src/gpu/nvidia/cudnn_deconvolution.cpp @@ -19,7 +19,7 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_eltwise.cpp b/src/gpu/nvidia/cudnn_eltwise.cpp index 37183eea97b..0e21f4c5daa 100644 --- a/src/gpu/nvidia/cudnn_eltwise.cpp +++ b/src/gpu/nvidia/cudnn_eltwise.cpp @@ -18,8 +18,8 @@ #include "gpu/nvidia/cudnn_eltwise.hpp" #include 
"gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_inner_product.cpp b/src/gpu/nvidia/cudnn_inner_product.cpp index 60ddd9d0e53..d4f3c63a651 100644 --- a/src/gpu/nvidia/cudnn_inner_product.cpp +++ b/src/gpu/nvidia/cudnn_inner_product.cpp @@ -20,8 +20,8 @@ #include "gpu/nvidia/cudnn_gemm_inner_product.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_lrn.cpp b/src/gpu/nvidia/cudnn_lrn.cpp index 4457a25560d..d90c10ad95b 100644 --- a/src/gpu/nvidia/cudnn_lrn.cpp +++ b/src/gpu/nvidia/cudnn_lrn.cpp @@ -18,9 +18,9 @@ #include "gpu/nvidia/cudnn_lrn.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_matmul_executor.hpp b/src/gpu/nvidia/cudnn_matmul_executor.hpp index 382a1039bc6..1601cc57e76 100644 --- a/src/gpu/nvidia/cudnn_matmul_executor.hpp +++ b/src/gpu/nvidia/cudnn_matmul_executor.hpp @@ -23,7 +23,7 @@ #include "gpu/nvidia/sycl_cuda_engine.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" #include @@ -44,17 +44,18 @@ struct 
cudnn_matmul_exec_base_t { void interop_task(std::shared_ptr matmul_impl_, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t *cuda_stream, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_weights, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> arg_src, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write> arg_dst, - impl::sycl::sycl_memory_arg_t arg_bias, - impl::sycl::sycl_memory_arg_t arg_scratch, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + arg_dst, + hrt::sycl::interop_memory_arg_t arg_bias, + hrt::sycl::interop_memory_arg_t arg_scratch, + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wei_scale, - impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read> + hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_dst_scale) { compat::host_task(cgh, [=](const compat::interop_handle &ih) { @@ -131,7 +132,7 @@ struct cudnn_matmul_scratch_runtime_args_bias_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); auto arg_src_scale @@ -163,9 +164,9 @@ struct cudnn_matmul_runtime_args_scratch_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< 
::sycl::access::mode::read_write>(*scratch_buff_, cgh); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); auto arg_src_scale @@ -196,7 +197,7 @@ struct cudnn_matmul_runtime_args_bias_exec_t : public cudnn_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -226,9 +227,9 @@ struct cudnn_matmul_runtime_args_exec_t : public cudnn_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -291,7 +292,7 @@ struct cudnn_matmul_scratch_exec_t : public cudnn_matmul_exec_base_t { auto arg_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_matmul_dst_in_acc_dt); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); auto arg_src_scale @@ -322,7 +323,7 @@ struct cudnn_matmul_bias_exec_t : public cudnn_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -352,9 +353,9 @@ struct cudnn_matmul_exec_t : public cudnn_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = impl::sycl::sycl_memory_arg_t< + auto arg_bias = 
hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = impl::sycl::sycl_memory_arg_t< + auto arg_scratch = hrt::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale diff --git a/src/gpu/nvidia/cudnn_pooling.cpp b/src/gpu/nvidia/cudnn_pooling.cpp index b00eab40245..b1234ff9ef9 100644 --- a/src/gpu/nvidia/cudnn_pooling.cpp +++ b/src/gpu/nvidia/cudnn_pooling.cpp @@ -18,11 +18,11 @@ #include "gpu/nvidia/cudnn_pooling.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" #include "common/nstl.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_reduction.cpp b/src/gpu/nvidia/cudnn_reduction.cpp index 17bf749b565..9cef33721a9 100644 --- a/src/gpu/nvidia/cudnn_reduction.cpp +++ b/src/gpu/nvidia/cudnn_reduction.cpp @@ -18,7 +18,7 @@ #include "gpu/nvidia/cudnn_reduction.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_reorder.cpp b/src/gpu/nvidia/cudnn_reorder.cpp index 24e0303fe16..44166e28023 100644 --- a/src/gpu/nvidia/cudnn_reorder.cpp +++ b/src/gpu/nvidia/cudnn_reorder.cpp @@ -20,7 +20,7 @@ #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_resampling.cpp b/src/gpu/nvidia/cudnn_resampling.cpp index 394d5f8a5c3..dd9ee7be29d 100644 --- a/src/gpu/nvidia/cudnn_resampling.cpp +++ b/src/gpu/nvidia/cudnn_resampling.cpp @@ -15,12 +15,11 @@ * limitations 
under the License. *******************************************************************************/ -#include "sycl/sycl_buffer_memory_storage.hpp" - #include "gpu/nvidia/cudnn_resampling.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_resampling.hpp b/src/gpu/nvidia/cudnn_resampling.hpp index c221c6c9384..ac3648ddff8 100644 --- a/src/gpu/nvidia/cudnn_resampling.hpp +++ b/src/gpu/nvidia/cudnn_resampling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -57,12 +57,12 @@ struct cudnn_resampling_base_t : public primitive_t { }; ::sycl::buffer &buffer(memory_storage_t *mem_storage) { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } ::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } diff --git a/src/gpu/nvidia/cudnn_softmax.cpp b/src/gpu/nvidia/cudnn_softmax.cpp index 4bf4ac7a526..776c7c097b5 100644 --- a/src/gpu/nvidia/cudnn_softmax.cpp +++ b/src/gpu/nvidia/cudnn_softmax.cpp @@ -19,8 +19,8 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_stream_utils.hpp b/src/gpu/nvidia/sycl_cuda_stream_utils.hpp 
index 6066463f681..235c368e949 100644 --- a/src/gpu/nvidia/sycl_cuda_stream_utils.hpp +++ b/src/gpu/nvidia/sycl_cuda_stream_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,8 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_memory_storage_helper.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_utils.hpp b/src/gpu/nvidia/sycl_cuda_utils.hpp index 26009db58b7..6e6a81d2974 100644 --- a/src/gpu/nvidia/sycl_cuda_utils.hpp +++ b/src/gpu/nvidia/sycl_cuda_utils.hpp @@ -40,19 +40,19 @@ namespace gpu { namespace nvidia { #define CTX_OUT_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ &CTX_OUT_STORAGE(arg)) \ ->buffer() \ .get_access<::sycl::access::mode::write>(cgh) #define CTX_IN_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ &CTX_IN_STORAGE(arg)) \ ->buffer() \ .get_access<::sycl::access::mode::read>(cgh) #define CTX_SCRATCH_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ ctx.get_scratchpad_grantor().get_memory_storage(arg).get()) \ ->buffer() \ .get_access<::sycl::access::mode::read_write>(cgh) diff --git a/src/gpu/sycl/batch_normalizations_kernels.hpp b/src/gpu/sycl/batch_normalizations_kernels.hpp index 5cc325059dc..1bac580a832 100644 --- a/src/gpu/sycl/batch_normalizations_kernels.hpp +++ b/src/gpu/sycl/batch_normalizations_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under 
the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -50,10 +50,10 @@ inline float maybe_up_convert(bfloat16_t x) { struct batch_normalization_fwd_kernel_vec_t { batch_normalization_fwd_kernel_vec_t( const sycl_batch_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &scale, - sycl_in_memory_arg_t &shift, sycl_in_memory_arg_t &stat, - sycl_in_memory_arg_t &var, sycl_out_memory_arg_t &dst, - sycl_out_memory_arg_t &ws, sycl_in_memory_arg_t &src1) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::in_memory_arg_t &shift, hrt::sycl::in_memory_arg_t &stat, + hrt::sycl::in_memory_arg_t &var, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::out_memory_arg_t &ws, hrt::sycl::in_memory_arg_t &src1) : conf_(conf) , data_(data) , scale_(scale) @@ -72,19 +72,19 @@ struct batch_normalization_fwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &src1_md() const { return conf_.src1_md; } - const sycl_md_t &ws_md() const { return conf_.ws_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } + const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &stat_d() const { return conf_.stat_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_md; 
} + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const float epsilon() const { return conf_.batch_norm_epsilon; } - inline static dim_t DATA_OFF( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + inline static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); case 2: return mdw.off(n, c); @@ -161,7 +161,7 @@ struct batch_normalization_fwd_kernel_vec_t { if (data_md().data_type() == data_type::s8) { bn_res = ::dnnl::impl::sycl::qz_a1b0::type>()( + hrt::sycl::prec_traits::type>()( maybe_post_op(bn_res)); store_float_value( dst_md().data_type(), bn_res, dst_ptr(), d_off); @@ -185,23 +185,24 @@ struct batch_normalization_fwd_kernel_vec_t { sycl_batch_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_in_memory_arg_t scale_; - sycl_in_memory_arg_t shift_; - sycl_in_memory_arg_t stat_; - sycl_in_memory_arg_t var_; - sycl_out_memory_arg_t dst_; - sycl_out_memory_arg_t ws_; - sycl_in_memory_arg_t src1_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::in_memory_arg_t shift_; + hrt::sycl::in_memory_arg_t stat_; + hrt::sycl::in_memory_arg_t var_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::out_memory_arg_t ws_; + hrt::sycl::in_memory_arg_t src1_; }; struct batch_normalization_fwd_kernel_vec_t1 { batch_normalization_fwd_kernel_vec_t1( const sycl_batch_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &scale, - sycl_in_memory_arg_t &shift, sycl_out_memory_arg_t &dst, - sycl_out_memory_arg_t &mean_out, sycl_out_memory_arg_t &var_out, - sycl_out_memory_arg_t &ws, sycl_in_memory_arg_t &src1) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::in_memory_arg_t &shift, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::out_memory_arg_t &mean_out, + hrt::sycl::out_memory_arg_t &var_out, + hrt::sycl::out_memory_arg_t &ws, 
hrt::sycl::in_memory_arg_t &src1) : conf_(conf) , data_(data) , scale_(scale) @@ -220,19 +221,19 @@ struct batch_normalization_fwd_kernel_vec_t1 { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &src1_md() const { return conf_.src1_md; } - const sycl_md_t &ws_md() const { return conf_.ws_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } + const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &stat_d() const { return conf_.stat_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const float epsilon() const { return conf_.batch_norm_epsilon; } - inline static dim_t DATA_OFF( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + inline static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); case 2: return mdw.off(n, c); @@ -338,7 +339,7 @@ struct batch_normalization_fwd_kernel_vec_t1 { if (data_md().data_type() == data_type::s8) { bn_res = ::dnnl::impl::sycl::qz_a1b0::type>()( + hrt::sycl::prec_traits::type>()( maybe_post_op(bn_res)); store_float_value( dst_md().data_type(), bn_res, dst_ptr(), d_off); @@ -370,25 +371,28 @@ struct batch_normalization_fwd_kernel_vec_t1 { sycl_batch_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_in_memory_arg_t scale_; - sycl_in_memory_arg_t shift_; - sycl_out_memory_arg_t dst_; - sycl_out_memory_arg_t mean_out_; - sycl_out_memory_arg_t var_out_; - 
sycl_out_memory_arg_t ws_; - sycl_in_memory_arg_t src1_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::in_memory_arg_t shift_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::out_memory_arg_t mean_out_; + hrt::sycl::out_memory_arg_t var_out_; + hrt::sycl::out_memory_arg_t ws_; + hrt::sycl::in_memory_arg_t src1_; }; struct batch_normalization_bwd_kernel_vec_t { batch_normalization_bwd_kernel_vec_t( const sycl_batch_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &diff_data, - sycl_in_memory_arg_t &scale, sycl_out_memory_arg_t &diff_scale, - sycl_out_memory_arg_t &diff_shift, sycl_in_memory_arg_t &stat, - sycl_in_memory_arg_t &var, sycl_in_memory_arg_t &diff_dst, - sycl_in_memory_arg_t &dst, sycl_in_memory_arg_t &ws, - sycl_in_memory_arg_t &diff_src1) + hrt::sycl::in_memory_arg_t &data, + hrt::sycl::out_memory_arg_t &diff_data, + hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::out_memory_arg_t &diff_scale, + hrt::sycl::out_memory_arg_t &diff_shift, + hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::in_memory_arg_t &dst, hrt::sycl::in_memory_arg_t &ws, + hrt::sycl::in_memory_arg_t &diff_src1) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -410,20 +414,20 @@ struct batch_normalization_bwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &diff_data_md() const { return conf_.diff_data_md; } - const sycl_md_t &diff_src1_md() const { return conf_.diff_src1_md; } - const sycl_md_t &stat_d() const { return conf_.stat_md; } - const sycl_md_t &ws_md() const { return conf_.ws_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const hrt::sycl::md_t &diff_src1_md() const { return conf_.diff_src1_md; } + const hrt::sycl::md_t 
&stat_d() const { return conf_.stat_md; } + const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &diff_data_scaleshift_md() const { + const hrt::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.batch_norm_epsilon; } @@ -439,8 +443,8 @@ struct batch_normalization_bwd_kernel_vec_t { void *dst_ptr() const { return dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t DATA_OFF( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, + dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); case 2: return mdw.off(n, c); @@ -573,17 +577,17 @@ struct batch_normalization_bwd_kernel_vec_t { sycl_batch_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t diff_data_; - sycl_in_memory_arg_t scale_; - sycl_out_memory_arg_t diff_scale_; - sycl_out_memory_arg_t diff_shift_; - sycl_in_memory_arg_t stat_; - sycl_in_memory_arg_t var_; - sycl_in_memory_arg_t diff_dst_; - sycl_in_memory_arg_t dst_; - sycl_in_memory_arg_t ws_; - sycl_in_memory_arg_t diff_src1_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t diff_data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::out_memory_arg_t diff_scale_; + hrt::sycl::out_memory_arg_t diff_shift_; + hrt::sycl::in_memory_arg_t 
stat_; + hrt::sycl::in_memory_arg_t var_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::in_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t ws_; + hrt::sycl::in_memory_arg_t diff_src1_; }; } // namespace sycl diff --git a/src/gpu/sycl/binary_kernels.hpp b/src/gpu/sycl/binary_kernels.hpp index 7ed450e18fc..0be3af1fddf 100644 --- a/src/gpu/sycl/binary_kernels.hpp +++ b/src/gpu/sycl/binary_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,9 +33,10 @@ struct binary_kernel_vec_t { static constexpr int max_supported_ndims = 5; binary_kernel_vec_t(const sycl_binary_conf_t &conf, - sycl_in_memory_arg_t &src0, sycl_in_memory_arg_t &src1, - sycl_out_memory_arg_t &dst, sycl_in_memory_arg_t &src0_scale, - sycl_in_memory_arg_t &src1_scale, data_type_t scales_dt) + hrt::sycl::in_memory_arg_t &src0, hrt::sycl::in_memory_arg_t &src1, + hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::in_memory_arg_t &src0_scale, + hrt::sycl::in_memory_arg_t &src1_scale, data_type_t scales_dt) : conf_(conf) , src0_(src0) , src1_(src1) @@ -131,9 +132,9 @@ struct binary_kernel_vec_t { } private: - const sycl_md_t &src0_md() const { return conf_.src0_md; } - const sycl_md_t &src1_md() const { return conf_.src1_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &src0_md() const { return conf_.src0_md; } + const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src0_ptr() 
const { return src0_.get_pointer(); } void *src1_ptr() const { return src1_.get_pointer(); } @@ -192,11 +193,11 @@ struct binary_kernel_vec_t { sycl_binary_conf_t conf_; - sycl_in_memory_arg_t src0_; - sycl_in_memory_arg_t src1_; - sycl_out_memory_arg_t dst_; - sycl_in_memory_arg_t src0_scale_; - sycl_in_memory_arg_t src1_scale_; + hrt::sycl::in_memory_arg_t src0_; + hrt::sycl::in_memory_arg_t src1_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t src0_scale_; + hrt::sycl::in_memory_arg_t src1_scale_; data_type_t scales_dt_; }; diff --git a/src/gpu/sycl/eltwise_kernels.hpp b/src/gpu/sycl/eltwise_kernels.hpp index df05efad43b..867ac4b312b 100644 --- a/src/gpu/sycl/eltwise_kernels.hpp +++ b/src/gpu/sycl/eltwise_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,10 +29,12 @@ namespace sycl { struct eltwise_fwd_kernel_vec_t { eltwise_fwd_kernel_vec_t(const sycl_eltwise_conf_t &conf, - sycl_in_memory_arg_t &src, sycl_out_memory_arg_t &dst, - sycl_in_memory_arg_t &srcOp1, sycl_in_memory_arg_t &srcOp2, - sycl_in_memory_arg_t &srcOp3, sycl_in_memory_arg_t &srcOp4, - sycl_in_memory_arg_t &srcOp5) + hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::in_memory_arg_t &srcOp1, + hrt::sycl::in_memory_arg_t &srcOp2, + hrt::sycl::in_memory_arg_t &srcOp3, + hrt::sycl::in_memory_arg_t &srcOp4, + hrt::sycl::in_memory_arg_t &srcOp5) : conf_(conf) , src_(src) , srcOp1_(srcOp1) @@ -91,8 +93,8 @@ struct eltwise_fwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } @@ -214,8 +216,8 @@ struct eltwise_fwd_kernel_vec_t { return post_po_sr; } - inline dim_t data_offset(const sycl_md_t &mem, dim_t &n, dim_t &c, dim_t &d, - dim_t &h, dim_t &w) const { + inline dim_t data_offset(const hrt::sycl::md_t &mem, dim_t &n, dim_t &c, + dim_t &d, dim_t &h, dim_t &w) const { const auto ndims = mem.ndims(); switch (ndims) { case 1: return mem.off(n); @@ -228,8 +230,8 @@ struct eltwise_fwd_kernel_vec_t { return -1; } - float get_post_op_val(const sycl_in_memory_arg_t &bin_src_op, dim_t &idx, - dim_t &offset) const { + float get_post_op_val(const hrt::sycl::in_memory_arg_t &bin_src_op, + dim_t &idx, dim_t &offset) const { auto src1_desc = conf_.binary_src_arr[idx]; const auto off = get_binary_src1_off( @@ -240,17 +242,17 @@ struct eltwise_fwd_kernel_vec_t { return dst; } - dim_t get_binary_src1_off(const sycl_md_t &src1_md, const dim_t &l_offset, - const sycl_md_t::dims32_t &dst_dims, - const 
sycl_md_t::dim32_t &dst_ndims) const { + dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, + const dim_t &l_offset, const hrt::sycl::md_t::dims32_t &dst_dims, + const hrt::sycl::md_t::dim32_t &dst_ndims) const { const dim_t mask_binary_po = get_dims_mask(dst_dims, src1_md.dims(), dst_ndims); return get_po_tensor_off( src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - inline dim_t get_dims_mask(const sycl_md_t::dims32_t &dims1, - const sycl_md_t::dims32_t &dims2, const dim_t &ndims, + inline dim_t get_dims_mask(const hrt::sycl::md_t::dims32_t &dims1, + const hrt::sycl::md_t::dims32_t &dims2, const dim_t &ndims, bool skip_dim_of_one = false) const { dim_t mask = 0; for (dim_t d = 0; d < ndims; ++d) { @@ -261,8 +263,8 @@ struct eltwise_fwd_kernel_vec_t { return mask; } - inline dim_t get_po_tensor_off(const sycl_md_t &tensor_md, - const dim_t &l_offset, const sycl_md_t::dims32_t &dst_dims, + inline dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, + const dim_t &l_offset, const hrt::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, const dim_t &mask) const { dims_t l_dims_po {}; get_l_dims_po(l_dims_po, l_offset, dst_dims, dst_ndims, mask); @@ -271,7 +273,7 @@ struct eltwise_fwd_kernel_vec_t { } inline void get_l_dims_po(dims_t l_dims_po, dim_t l_offset, - const sycl_md_t::dims32_t &dst_dims, const dim_t &dst_ndims, + const hrt::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, const dim_t &mask) const { l_dims_by_l_offset(l_dims_po, l_offset, dst_dims, dst_ndims); @@ -279,7 +281,7 @@ struct eltwise_fwd_kernel_vec_t { } inline void l_dims_by_l_offset(dims_t dims_pos, dim_t l_offset, - const sycl_md_t::dims32_t &dims, const dim_t &ndims) const { + const hrt::sycl::md_t::dims32_t &dims, const dim_t &ndims) const { for (dim_t rd = 0; rd < ndims; ++rd) { const dim_t d = ndims - 1 - rd; /* switch to faster 32-bit division when possible. 
*/ @@ -294,19 +296,20 @@ struct eltwise_fwd_kernel_vec_t { } sycl_eltwise_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_in_memory_arg_t srcOp1_; - sycl_in_memory_arg_t srcOp2_; - sycl_in_memory_arg_t srcOp3_; - sycl_in_memory_arg_t srcOp4_; - sycl_in_memory_arg_t srcOp5_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::in_memory_arg_t srcOp1_; + hrt::sycl::in_memory_arg_t srcOp2_; + hrt::sycl::in_memory_arg_t srcOp3_; + hrt::sycl::in_memory_arg_t srcOp4_; + hrt::sycl::in_memory_arg_t srcOp5_; + hrt::sycl::out_memory_arg_t dst_; }; struct eltwise_bwd_kernel_vec_t { eltwise_bwd_kernel_vec_t(const sycl_eltwise_conf_t &conf, - sycl_in_memory_arg_t &diff_src, sycl_in_memory_arg_t &src, - sycl_out_memory_arg_t &diff_dst) + hrt::sycl::in_memory_arg_t &diff_src, + hrt::sycl::in_memory_arg_t &src, + hrt::sycl::out_memory_arg_t &diff_dst) : conf_(conf), src_(src), diff_src_(diff_src), diff_dst_(diff_dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -334,9 +337,9 @@ struct eltwise_bwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } @@ -445,9 +448,9 @@ struct eltwise_bwd_kernel_vec_t { } sycl_eltwise_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_in_memory_arg_t diff_src_; - sycl_out_memory_arg_t diff_dst_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::in_memory_arg_t diff_src_; + hrt::sycl::out_memory_arg_t diff_dst_; }; } // namespace sycl diff --git a/src/gpu/sycl/layer_normalizations_kernels.hpp 
b/src/gpu/sycl/layer_normalizations_kernels.hpp index 73f6059a488..1c3c55d7555 100644 --- a/src/gpu/sycl/layer_normalizations_kernels.hpp +++ b/src/gpu/sycl/layer_normalizations_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,10 +33,11 @@ namespace sycl { struct layer_normalization_fwd_kernel_vec_t { layer_normalization_fwd_kernel_vec_t( const sycl_layer_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &scale, - sycl_in_memory_arg_t &shift, sycl_in_memory_arg_t &stat, - sycl_in_memory_arg_t &var, sycl_out_memory_arg_t &dst, - sycl_in_memory_arg_t &rt_scale, sycl_in_memory_arg_t &dst_scale) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::in_memory_arg_t &shift, hrt::sycl::in_memory_arg_t &stat, + hrt::sycl::in_memory_arg_t &var, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::in_memory_arg_t &rt_scale, + hrt::sycl::in_memory_arg_t &dst_scale) : conf_(conf) , data_(data) , scale_(scale) @@ -61,14 +62,14 @@ struct layer_normalization_fwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &var_md() const { return 
conf_.var_md; } - const sycl_md_t &stat_d() const { return conf_.stat_d; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -127,23 +128,25 @@ struct layer_normalization_fwd_kernel_vec_t { } sycl_layer_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_in_memory_arg_t scale_; - sycl_in_memory_arg_t shift_; - sycl_in_memory_arg_t stat_; - sycl_in_memory_arg_t var_; - sycl_out_memory_arg_t dst_; - sycl_in_memory_arg_t rt_scale_; - sycl_in_memory_arg_t dst_scale_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::in_memory_arg_t shift_; + hrt::sycl::in_memory_arg_t stat_; + hrt::sycl::in_memory_arg_t var_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t rt_scale_; + hrt::sycl::in_memory_arg_t dst_scale_; }; struct layer_normalization_fwd_kernel_vec1_t { layer_normalization_fwd_kernel_vec1_t( const sycl_layer_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &scale, - sycl_in_memory_arg_t &shift, sycl_out_memory_arg_t &dst, - sycl_out_memory_arg_t &mean_out, sycl_out_memory_arg_t &var_out, - sycl_in_memory_arg_t &rt_scale, sycl_in_memory_arg_t &dst_scale) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::in_memory_arg_t &shift, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::out_memory_arg_t &mean_out, + hrt::sycl::out_memory_arg_t &var_out, + hrt::sycl::in_memory_arg_t &rt_scale, + hrt::sycl::in_memory_arg_t &dst_scale) : conf_(conf) , data_(data) , scale_(scale) @@ -169,14 +172,14 @@ struct layer_normalization_fwd_kernel_vec1_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t 
&stat_md() const { return conf_.stat_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &stat_d() const { return conf_.stat_d; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -264,23 +267,26 @@ struct layer_normalization_fwd_kernel_vec1_t { } sycl_layer_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_in_memory_arg_t scale_; - sycl_in_memory_arg_t shift_; - sycl_out_memory_arg_t dst_; - sycl_out_memory_arg_t mean_out_; - sycl_out_memory_arg_t var_out_; - sycl_in_memory_arg_t rt_scale_; - sycl_in_memory_arg_t dst_scale_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::in_memory_arg_t shift_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::out_memory_arg_t mean_out_; + hrt::sycl::out_memory_arg_t var_out_; + hrt::sycl::in_memory_arg_t rt_scale_; + hrt::sycl::in_memory_arg_t dst_scale_; }; struct layer_normalization_bwd_kernel_vec_t { layer_normalization_bwd_kernel_vec_t( const sycl_layer_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &diff_data, - sycl_in_memory_arg_t &scale, sycl_out_memory_arg_t &diff_scale, - sycl_out_memory_arg_t &diff_shift, sycl_in_memory_arg_t &stat, - sycl_in_memory_arg_t &var, sycl_in_memory_arg_t &diff_dst) + hrt::sycl::in_memory_arg_t &data, + hrt::sycl::out_memory_arg_t &diff_data, + hrt::sycl::in_memory_arg_t &scale, + 
hrt::sycl::out_memory_arg_t &diff_scale, + hrt::sycl::out_memory_arg_t &diff_shift, + hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, + hrt::sycl::in_memory_arg_t &diff_dst) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -299,19 +305,19 @@ struct layer_normalization_bwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &diff_data_md() const { return conf_.diff_data_md; } - const sycl_md_t &stat_d() const { return conf_.stat_d; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &diff_data_scaleshift_md() const { + const hrt::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -382,23 +388,26 @@ struct layer_normalization_bwd_kernel_vec_t { sycl_layer_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t diff_data_; - sycl_in_memory_arg_t scale_; - sycl_out_memory_arg_t diff_scale_; - sycl_out_memory_arg_t diff_shift_; - sycl_in_memory_arg_t stat_; - sycl_in_memory_arg_t var_; - 
sycl_in_memory_arg_t diff_dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t diff_data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::out_memory_arg_t diff_scale_; + hrt::sycl::out_memory_arg_t diff_shift_; + hrt::sycl::in_memory_arg_t stat_; + hrt::sycl::in_memory_arg_t var_; + hrt::sycl::in_memory_arg_t diff_dst_; }; struct layer_normalization_bwd_kernel_vec2_t { layer_normalization_bwd_kernel_vec2_t( const sycl_layer_normalization_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &diff_data, - sycl_in_memory_arg_t &scale, sycl_out_memory_arg_t &diff_scale, - sycl_out_memory_arg_t &diff_shift, sycl_in_memory_arg_t &stat, - sycl_in_memory_arg_t &var, sycl_in_memory_arg_t &diff_dst) + hrt::sycl::in_memory_arg_t &data, + hrt::sycl::out_memory_arg_t &diff_data, + hrt::sycl::in_memory_arg_t &scale, + hrt::sycl::out_memory_arg_t &diff_scale, + hrt::sycl::out_memory_arg_t &diff_shift, + hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, + hrt::sycl::in_memory_arg_t &diff_dst) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -417,18 +426,18 @@ struct layer_normalization_bwd_kernel_vec2_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &diff_data_md() const { return conf_.diff_data_md; } - const sycl_md_t &stat_d() const { return conf_.stat_d; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } - const sycl_md_t &data_scaleshift_md() const { + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const sycl_md_t &diff_data_scaleshift_md() const { + const hrt::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - 
const sycl_md_t &var_md() const { return conf_.var_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &var_md() const { return conf_.var_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -510,14 +519,14 @@ struct layer_normalization_bwd_kernel_vec2_t { sycl_layer_normalization_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t diff_data_; - sycl_in_memory_arg_t scale_; - sycl_out_memory_arg_t diff_scale_; - sycl_out_memory_arg_t diff_shift_; - sycl_in_memory_arg_t stat_; - sycl_in_memory_arg_t var_; - sycl_in_memory_arg_t diff_dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t diff_data_; + hrt::sycl::in_memory_arg_t scale_; + hrt::sycl::out_memory_arg_t diff_scale_; + hrt::sycl::out_memory_arg_t diff_shift_; + hrt::sycl::in_memory_arg_t stat_; + hrt::sycl::in_memory_arg_t var_; + hrt::sycl::in_memory_arg_t diff_dst_; }; } // namespace sycl diff --git a/src/gpu/sycl/lrn_kernels.hpp b/src/gpu/sycl/lrn_kernels.hpp index ce416364a61..dff003d1b6e 100644 --- a/src/gpu/sycl/lrn_kernels.hpp +++ b/src/gpu/sycl/lrn_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -27,8 +27,9 @@ namespace gpu { namespace sycl { struct lrn_fwd_kernel_vec_t { - lrn_fwd_kernel_vec_t(const sycl_lrn_conf_t &conf, sycl_in_memory_arg_t &src, - sycl_out_memory_arg_t &dst, const format_tag_t &tag) + lrn_fwd_kernel_vec_t(const sycl_lrn_conf_t &conf, + hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, + const format_tag_t &tag) : conf_(conf), src_(src), dst_(dst), tag_(tag) {} void operator()(::sycl::nd_item<1> item) const { @@ -136,22 +137,23 @@ struct lrn_fwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_lrn_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::out_memory_arg_t dst_; format_tag_t tag_; }; struct lrn_bwd_kernel_vec_t { - lrn_bwd_kernel_vec_t(const sycl_lrn_conf_t &conf, sycl_in_memory_arg_t &src, - sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &diff_src, - const format_tag_t &tag) + lrn_bwd_kernel_vec_t(const sycl_lrn_conf_t &conf, + hrt::sycl::in_memory_arg_t &src, + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &diff_src, const format_tag_t &tag) : conf_(conf) , src_(src) , diff_dst_(diff_dst) @@ -316,18 +318,18 @@ struct lrn_bwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &src_md() const 
{ return conf_.src_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } void *src_ptr() const { return src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } sycl_lrn_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t diff_src_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t diff_src_; format_tag_t tag_; }; diff --git a/src/gpu/sycl/pooling_kernels.hpp b/src/gpu/sycl/pooling_kernels.hpp index 292ccafda56..d23017b8a9d 100644 --- a/src/gpu/sycl/pooling_kernels.hpp +++ b/src/gpu/sycl/pooling_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -34,10 +34,12 @@ namespace sycl { using namespace nstl; struct pooling_fwd_kernel_vec_t { pooling_fwd_kernel_vec_t(const sycl_pooling_conf_t &conf, - sycl_in_memory_arg_t &src, sycl_out_memory_arg_t &dst, - sycl_out_memory_arg_t &ws, sycl_in_memory_arg_t &src_1, - sycl_in_memory_arg_t &src_2, sycl_in_memory_arg_t &src_3, - sycl_in_memory_arg_t &src_4, sycl_in_memory_arg_t &src_5) + hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::out_memory_arg_t &ws, hrt::sycl::in_memory_arg_t &src_1, + hrt::sycl::in_memory_arg_t &src_2, + hrt::sycl::in_memory_arg_t &src_3, + hrt::sycl::in_memory_arg_t &src_4, + hrt::sycl::in_memory_arg_t &src_5) : conf_(conf) , src_(src) , dst_(dst) @@ -96,19 +98,19 @@ struct pooling_fwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } - const sycl_md_t &ws_md() const { return conf_.ws_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } void *src_ptr() const { return src_.get_pointer(); } - void *gen_ptr(sycl_in_memory_arg_t gen_) const { + void *gen_ptr(hrt::sycl::in_memory_arg_t gen_) const { return gen_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t get_offset( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); case 4: return mdw.off(n, c, h, w); @@ -120,26 +122,28 @@ struct 
pooling_fwd_kernel_vec_t { float data_conv() const { switch (src_md().data_type()) { case data_type::bf16: - return (float)std::numeric_limits::lowest(); + return (float) + std::numeric_limits::lowest(); case data_type::s8: - return (float)numeric_limits< - typename prec_traits::type>::lowest(); + return (float)numeric_limits::type>::lowest(); case data_type::f16: - return (float)numeric_limits< - typename prec_traits::type>::lowest(); + return (float) + std::numeric_limits::type>::lowest(); case data_type::s32: - return (float)numeric_limits< - typename prec_traits::type>::lowest(); + return (float)numeric_limits::type>::lowest(); case data_type::u8: - return (float)numeric_limits< - typename prec_traits::type>::lowest(); + return (float)numeric_limits::type>::lowest(); default: - return (float)numeric_limits< - typename prec_traits::type>::lowest(); + return (float)numeric_limits::type>::lowest(); } } - float dst_Value(sycl_in_memory_arg_t arr, int idx, int offset) const { + float dst_Value(hrt::sycl::in_memory_arg_t arr, int idx, int offset) const { auto src1_desc = conf_.src1_md[idx]; dim_t src_dim[DNNL_MAX_NDIMS]; auto src_dim_ = src1_desc.dims(); @@ -153,8 +157,8 @@ struct pooling_fwd_kernel_vec_t { return dst; } - dim_t get_binary_src1_off(const sycl_md_t &src1_md, const dim_t *src_dim, - const dim_t l_offset, const dim_t *dst_dims, + dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, + const dim_t *src_dim, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims) const { const int mask_binary_po @@ -164,8 +168,9 @@ struct pooling_fwd_kernel_vec_t { src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - dim_t get_po_tensor_off(const sycl_md_t &tensor_md, const dim_t l_offset, - const dim_t *dst_dims, const int dst_ndims, int mask) const { + dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, + const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims, + int mask) const { dims_t l_dims_po {}; get_l_dims_po(l_dims_po, 
l_offset, dst_dims, dst_ndims, mask); @@ -284,20 +289,21 @@ struct pooling_fwd_kernel_vec_t { sycl_pooling_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_out_memory_arg_t dst_; - sycl_out_memory_arg_t ws_; - sycl_in_memory_arg_t src_1_; - sycl_in_memory_arg_t src_2_; - sycl_in_memory_arg_t src_3_; - sycl_in_memory_arg_t src_4_; - sycl_in_memory_arg_t src_5_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::out_memory_arg_t ws_; + hrt::sycl::in_memory_arg_t src_1_; + hrt::sycl::in_memory_arg_t src_2_; + hrt::sycl::in_memory_arg_t src_3_; + hrt::sycl::in_memory_arg_t src_4_; + hrt::sycl::in_memory_arg_t src_5_; }; struct pooling_bwd_kernel_vec_t { pooling_bwd_kernel_vec_t(const sycl_pooling_conf_t &conf, - sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &diff_src, - sycl_in_memory_arg_t &ws) + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &diff_src, + hrt::sycl::in_memory_arg_t &ws) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src), ws_(ws) {} void operator()(::sycl::nd_item<1> item) const { @@ -349,16 +355,16 @@ struct pooling_bwd_kernel_vec_t { } private: - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const sycl_md_t &ws_md() const { return conf_.ws_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t get_offset( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); case 4: 
return mdw.off(n, c, h, w); @@ -472,9 +478,9 @@ struct pooling_bwd_kernel_vec_t { } sycl_pooling_conf_t conf_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t diff_src_; - sycl_in_memory_arg_t ws_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t diff_src_; + hrt::sycl::in_memory_arg_t ws_; }; } // namespace sycl diff --git a/src/gpu/sycl/prelu_kernels.hpp b/src/gpu/sycl/prelu_kernels.hpp index 07edee007e2..7c81c6e9e70 100644 --- a/src/gpu/sycl/prelu_kernels.hpp +++ b/src/gpu/sycl/prelu_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -42,8 +42,9 @@ struct prelu_fwd_kernel_vec_t { static constexpr int vec_len = 8; prelu_fwd_kernel_vec_t(const sycl_prelu_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &weights, - sycl_out_memory_arg_t &dst) + hrt::sycl::in_memory_arg_t &data, + hrt::sycl::in_memory_arg_t &weights, + hrt::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), weights_(weights), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -113,15 +114,15 @@ struct prelu_fwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &weights_md() const { return conf_.weights_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &weights_md() const { return conf_.weights_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void 
*data_ptr() const { return data_.get_pointer(); } void *weights_ptr() const { return weights_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } - static dim_t offset(const sycl_md_t &mem, dims_t dims) { + static dim_t offset(const hrt::sycl::md_t &mem, dims_t dims) { const int ndims = mem.ndims(); switch (ndims) { case 1: return mem.off(dims[0]); @@ -135,7 +136,7 @@ struct prelu_fwd_kernel_vec_t { } static dim_t weights_offset( - const int mask, const sycl_md_t &mem, dims_t &dims) { + const int mask, const hrt::sycl::md_t &mem, dims_t &dims) { dims_t dims_w {}; std::copy(dims, dims + max_supported_ndims, dims_w); utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); @@ -143,18 +144,21 @@ struct prelu_fwd_kernel_vec_t { } sycl_prelu_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_in_memory_arg_t weights_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::in_memory_arg_t weights_; + hrt::sycl::out_memory_arg_t dst_; }; struct prelu_bwd_kernel_vec_t { static constexpr int vec_len = 8; prelu_bwd_kernel_vec_t(const sycl_prelu_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &diff_data, - sycl_in_memory_arg_t &weights, sycl_out_memory_arg_t &diff_weights, - sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &scratchpad) + hrt::sycl::in_memory_arg_t &data, + hrt::sycl::out_memory_arg_t &diff_data, + hrt::sycl::in_memory_arg_t &weights, + hrt::sycl::out_memory_arg_t &diff_weights, + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &scratchpad) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -233,11 +237,13 @@ struct prelu_bwd_kernel_vec_t { } private: - const sycl_md_t &data_md() const { return conf_.data_md; } - const sycl_md_t &weights_md() const { return conf_.weights_md; } - const sycl_md_t &diff_data_md() const { return conf_.diff_data_md; } - const sycl_md_t &diff_weights_md() const { return conf_.diff_weights_md; } - const sycl_md_t &diff_dst_md() const { return 
conf_.diff_dst_md; } + const hrt::sycl::md_t &data_md() const { return conf_.data_md; } + const hrt::sycl::md_t &weights_md() const { return conf_.weights_md; } + const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const hrt::sycl::md_t &diff_weights_md() const { + return conf_.diff_weights_md; + } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } float *data_ptr() const { return (float *)(data_.get_pointer()); } float *weights_ptr() const { return (float *)(weights_.get_pointer()); } @@ -250,7 +256,7 @@ struct prelu_bwd_kernel_vec_t { return (float *)(scratchpad_.get_pointer()); } - static dim_t offset(const sycl_md_t &mem, dims_t dims) { + static dim_t offset(const hrt::sycl::md_t &mem, dims_t dims) { const int ndims = mem.ndims(); switch (ndims) { case 1: return mem.off(dims[0]); @@ -264,7 +270,7 @@ struct prelu_bwd_kernel_vec_t { } static dim_t weights_offset( - const int mask, const sycl_md_t &mem, dims_t &dims) { + const int mask, const hrt::sycl::md_t &mem, dims_t &dims) { dims_t dims_w {}; std::copy(dims, dims + max_supported_ndims, dims_w); utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); @@ -538,12 +544,12 @@ struct prelu_bwd_kernel_vec_t { } sycl_prelu_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t diff_data_; - sycl_in_memory_arg_t weights_; - sycl_out_memory_arg_t diff_weights_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t scratchpad_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t diff_data_; + hrt::sycl::in_memory_arg_t weights_; + hrt::sycl::out_memory_arg_t diff_weights_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t scratchpad_; }; } // namespace sycl diff --git a/src/gpu/sycl/ref_batch_normalization.cpp b/src/gpu/sycl/ref_batch_normalization.cpp index 0238aa4997d..46b89b9fcae 100644 --- a/src/gpu/sycl/ref_batch_normalization.cpp +++ b/src/gpu/sycl/ref_batch_normalization.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "gpu/sycl/batch_normalizations_kernels.hpp" #include "gpu/sycl/ref_batch_normalization.hpp" @@ -33,20 +33,21 @@ status_t ref_batch_normalization_fwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems(); - conf_.src1_md = sycl_md_t(dst_md(3)); - conf_.dst1_md = sycl_md_t(dst_md(0)); + conf_.src1_md = hrt::sycl::md_t(dst_md(3)); + conf_.dst1_md = hrt::sycl::md_t(dst_md(0)); conf_.block_size = 16; conf_.wg_size = 32; conf_.dir = !is_fwd(); conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); - conf_.data_md = sycl_md_t(src_md(0)); - conf_.data_scaleshift_md = sycl_md_t(weights_md(0)); - conf_.stat_md - = stats_is_src() ? sycl_md_t(src_md(1)) : sycl_md_t(dst_md(1)); - conf_.dst_md = sycl_md_t(dst_md(0)); - conf_.var_md = stats_is_src() ? sycl_md_t(src_md(2)) : sycl_md_t(dst_md(2)); - conf_.ws_md = sycl_md_t(workspace_md(0)); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); + conf_.stat_md = stats_is_src() ? hrt::sycl::md_t(src_md(1)) + : hrt::sycl::md_t(dst_md(1)); + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.var_md = stats_is_src() ? 
hrt::sycl::md_t(src_md(2)) + : hrt::sycl::md_t(dst_md(2)); + conf_.ws_md = hrt::sycl::md_t(workspace_md(0)); int work_per_wg = conf_.wg_size * conf_.block_size; int n_wgs = (C() + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * conf_.wg_size; @@ -136,17 +137,17 @@ status_t ref_batch_normalization_bwd_t::pd_t::init_conf() { conf_.prop_kind = desc_.prop_kind; conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); - conf_.data_md = sycl_md_t(src_md(0)); - conf_.dst1_md = sycl_md_t(dst_md(0)); - conf_.diff_data_md = sycl_md_t(diff_src_md(0)); - conf_.diff_src1_md = sycl_md_t(diff_dst_md(1)); - conf_.data_scaleshift_md = sycl_md_t(weights_md(0)); - conf_.diff_data_scaleshift_md = sycl_md_t(diff_weights_md(0)); - conf_.diff_dst_md = sycl_md_t(diff_dst_md(0)); - conf_.stat_md = sycl_md_t(stat_md()); - conf_.var_md = sycl_md_t(src_md(2)); - conf_.dst_md = sycl_md_t(dst_md(0)); - conf_.ws_md = sycl_md_t(workspace_md(0)); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.dst1_md = hrt::sycl::md_t(dst_md(0)); + conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); + conf_.diff_src1_md = hrt::sycl::md_t(diff_dst_md(1)); + conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); + conf_.diff_data_scaleshift_md = hrt::sycl::md_t(diff_weights_md(0)); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); + conf_.stat_md = hrt::sycl::md_t(stat_md()); + conf_.var_md = hrt::sycl::md_t(src_md(2)); + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.ws_md = hrt::sycl::md_t(workspace_md(0)); int work_per_wg = conf_.wg_size * conf_.block_size; int n_wgs = (C() + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * conf_.wg_size; diff --git a/src/gpu/sycl/ref_batch_normalization.hpp b/src/gpu/sycl/ref_batch_normalization.hpp index 9071961b111..e0bf21f315b 100644 --- a/src/gpu/sycl/ref_batch_normalization.hpp +++ b/src/gpu/sycl/ref_batch_normalization.hpp @@ -25,7 +25,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include 
"gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_binary.cpp b/src/gpu/sycl/ref_binary.cpp index 1adcd3728c5..58f4c73778d 100644 --- a/src/gpu/sycl/ref_binary.cpp +++ b/src/gpu/sycl/ref_binary.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,9 +27,9 @@ using namespace impl::sycl; status_t ref_binary_t::pd_t::init_conf() { conf_ = sycl_binary_conf_t(); - conf_.src0_md = sycl_md_t(src_md(0)); - conf_.src1_md = sycl_md_t(src_md(1)); - conf_.dst_md = sycl_md_t(dst_md()); + conf_.src0_md = hrt::sycl::md_t(src_md(0)); + conf_.src1_md = hrt::sycl::md_t(src_md(1)); + conf_.dst_md = hrt::sycl::md_t(dst_md()); conf_.ndims = ndims(); // XXX: should probably be tuned. 
@@ -46,7 +46,7 @@ status_t ref_binary_t::pd_t::init_conf() { conf_.do_scale_src1 = !attr()->scales_.get(DNNL_ARG_SRC_1).has_default_values(); conf_.is_tensor_op = is_tensor_op(); - for (size_t i = 0; i < sycl_md_t::max_dims; i++) { + for (size_t i = 0; i < hrt::sycl::md_t::max_dims; i++) { conf_.broadcast_dims[i] = broadcast_dims()[i]; } diff --git a/src/gpu/sycl/ref_binary.hpp b/src/gpu/sycl/ref_binary.hpp index 17b19901676..9b47a047e27 100644 --- a/src/gpu/sycl/ref_binary.hpp +++ b/src/gpu/sycl/ref_binary.hpp @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_eltwise.cpp b/src/gpu/sycl/ref_eltwise.cpp index e932762a5b8..35b2f4d7be5 100644 --- a/src/gpu/sycl/ref_eltwise.cpp +++ b/src/gpu/sycl/ref_eltwise.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_eltwise_fwd_t::pd_t::init_conf() { conf_ = sycl_eltwise_conf_t(); - conf_.src_md = sycl_md_t(src_md()); - conf_.dst_md = sycl_md_t(dst_md()); + conf_.src_md = hrt::sycl::md_t(src_md()); + conf_.dst_md = hrt::sycl::md_t(dst_md()); conf_.wk_size = memory_desc_wrapper(src_md()).nelems(); conf_.alg_kind = desc()->alg_kind; conf_.alpha = desc()->alpha; @@ -43,7 +43,7 @@ status_t ref_sycl_eltwise_fwd_t::pd_t::init_conf() { conf_.post_ops = sycl_post_ops_t(attr()); for (auto i = 0; i < conf_.post_po_len; ++i) - conf_.binary_src_arr[i] = sycl_md_t( + conf_.binary_src_arr[i] = hrt::sycl::md_t( arg_md(DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1)); const int block_size = conf_.block_size; @@ -89,9 +89,9 @@ status_t ref_sycl_eltwise_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_eltwise_bwd_t::pd_t::init_conf() { conf_ = sycl_eltwise_conf_t(); - conf_.src_md = sycl_md_t(data_md(0)); - conf_.diff_src_md = sycl_md_t(diff_src_md()); - conf_.diff_dst_md = sycl_md_t(diff_dst_md()); + conf_.src_md = hrt::sycl::md_t(data_md(0)); + conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); conf_.block_size = 16; conf_.wg_size = 32; conf_.wk_size = memory_desc_wrapper(data_md(0)).nelems(); diff --git a/src/gpu/sycl/ref_eltwise.hpp b/src/gpu/sycl/ref_eltwise.hpp index 7b3dbbc1073..705ab57cf3e 100644 --- a/src/gpu/sycl/ref_eltwise.hpp +++ b/src/gpu/sycl/ref_eltwise.hpp @@ -21,7 +21,7 @@ #include "gpu/sycl/sycl_gpu_primitive.hpp" #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_layer_normalizations.cpp b/src/gpu/sycl/ref_layer_normalizations.cpp index 4bb13861c83..67bd51a9c96 100644 --- a/src/gpu/sycl/ref_layer_normalizations.cpp +++ 
b/src/gpu/sycl/ref_layer_normalizations.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" #include "gpu/sycl/layer_normalizations_kernels.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -29,13 +29,13 @@ using namespace impl::sycl; status_t ref_layer_normalization_fwd_t::pd_t::init_conf() { conf_ = sycl_layer_normalization_conf_t(); - conf_.var_md = stats_are_src() ? sycl_md_t(src_md(2)) - : is_training() ? sycl_md_t(dst_md(2)) - : sycl_md_t {}; + conf_.var_md = stats_are_src() ? hrt::sycl::md_t(src_md(2)) + : is_training() ? hrt::sycl::md_t(dst_md(2)) + : hrt::sycl::md_t {}; conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems(); - conf_.stat_d = sycl_md_t(stat_md()); + conf_.stat_d = hrt::sycl::md_t(stat_md()); conf_.block_size = 16; conf_.wg_size = 32; @@ -46,13 +46,13 @@ status_t ref_layer_normalization_fwd_t::pd_t::init_conf() { conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); conf_.use_ss = conf_.use_scale || conf_.use_shift; - conf_.data_md = sycl_md_t(src_md(0)); - conf_.data_scaleshift_md = sycl_md_t(weights_md(0)); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); - conf_.stat_md = stats_are_src() ? sycl_md_t(src_md(1)) - : is_training() ? sycl_md_t(dst_md(2)) - : sycl_md_t {}; - conf_.dst_md = sycl_md_t(dst_md(0)); + conf_.stat_md = stats_are_src() ? hrt::sycl::md_t(src_md(1)) + : is_training() ? 
hrt::sycl::md_t(dst_md(2)) + : hrt::sycl::md_t {}; + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); conf_.shift_off = conf_.use_ss && !has_zero_dim_memory() ? conf_.data_scaleshift_md.off(1, 0) : 0; @@ -138,7 +138,7 @@ status_t ref_layer_normalization_fwd_t::execute_forward( status_t ref_layer_normalization_bwd_t::pd_t::init_conf() { conf_ = sycl_layer_normalization_conf_t(); - conf_.var_md = sycl_md_t(src_md(2)); + conf_.var_md = hrt::sycl::md_t(src_md(2)); conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.block_size = (16); @@ -147,14 +147,15 @@ status_t ref_layer_normalization_bwd_t::pd_t::init_conf() { conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); conf_.use_ss = conf_.use_scale || conf_.use_shift; - conf_.data_md = sycl_md_t(src_md(0)); - conf_.diff_data_md = sycl_md_t(diff_src_md(0)); - conf_.data_scaleshift_md = sycl_md_t(weights_md(0)); - conf_.diff_data_scaleshift_md - = conf_.use_ss ? sycl_md_t(diff_weights_md(0)) : sycl_md_t {}; - conf_.stat_md = sycl_md_t(src_md(1)); - conf_.diff_dst_md = sycl_md_t(diff_dst_md(0)); - conf_.stat_d = sycl_md_t(stat_md()); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); + conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); + conf_.diff_data_scaleshift_md = conf_.use_ss + ? 
hrt::sycl::md_t(diff_weights_md(0)) + : hrt::sycl::md_t {}; + conf_.stat_md = hrt::sycl::md_t(src_md(1)); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); + conf_.stat_d = hrt::sycl::md_t(stat_md()); conf_.zero_dims = has_zero_dim_memory(); auto nelems_A = memory_desc_wrapper(src_md(0)).nelems(); conf_.diff_shift_off = conf_.use_ss && !conf_.zero_dims diff --git a/src/gpu/sycl/ref_layer_normalizations.hpp b/src/gpu/sycl/ref_layer_normalizations.hpp index 8c6793ace99..a8fccd89405 100644 --- a/src/gpu/sycl/ref_layer_normalizations.hpp +++ b/src/gpu/sycl/ref_layer_normalizations.hpp @@ -24,7 +24,7 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_lrn.cpp b/src/gpu/sycl/ref_lrn.cpp index 7af14e7e6e6..19199e31792 100644 --- a/src/gpu/sycl/ref_lrn.cpp +++ b/src/gpu/sycl/ref_lrn.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_lrn_fwd_t::pd_t::init_conf() { conf_ = sycl_lrn_conf_t(); - conf_.src_md = sycl_md_t(src_md()); - conf_.dst_md = sycl_md_t(dst_md()); + conf_.src_md = hrt::sycl::md_t(src_md()); + conf_.dst_md = hrt::sycl::md_t(dst_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; @@ -87,9 +87,9 @@ status_t ref_sycl_lrn_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_lrn_bwd_t::pd_t::init_conf() { conf_ = sycl_lrn_conf_t(); - conf_.src_md = sycl_md_t(src_md()); - conf_.diff_dst_md = sycl_md_t(diff_dst_md()); - conf_.diff_src_md = sycl_md_t(diff_src_md()); + conf_.src_md = hrt::sycl::md_t(src_md()); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); + conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; diff --git a/src/gpu/sycl/ref_lrn.hpp b/src/gpu/sycl/ref_lrn.hpp index 2768561d2ac..03ae0735d76 100644 --- a/src/gpu/sycl/ref_lrn.hpp +++ b/src/gpu/sycl/ref_lrn.hpp @@ -19,7 +19,7 @@ #include "gpu/gpu_lrn_pd.hpp" #include "gpu/sycl/sycl_gpu_primitive.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_pooling.cpp b/src/gpu/sycl/ref_pooling.cpp index 5f0551d5f69..a48fd9272c3 100644 --- a/src/gpu/sycl/ref_pooling.cpp +++ b/src/gpu/sycl/ref_pooling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" #include "gpu/sycl/pooling_kernels.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,11 +33,11 @@ status_t ref_pooling_fwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.block_size = 16; conf_.wg_size = 32; - conf_.src_md = sycl_md_t(src_md(0)); - conf_.dst_md = sycl_md_t(dst_md(0)); + conf_.src_md = hrt::sycl::md_t(src_md(0)); + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); conf_.ws_md = !types::is_zero_md(workspace_md()) - ? sycl_md_t(workspace_md(0)) - : sycl_md_t {}; + ? hrt::sycl::md_t(workspace_md(0)) + : hrt::sycl::md_t {}; conf_.zero_dims = has_zero_dim_memory(); for (int i = 0; i < DNNL_MAX_NDIMS; i++) { conf_.dst_dims[i] = dst_md()->dims[i]; @@ -75,7 +75,7 @@ status_t ref_pooling_fwd_t::pd_t::init_conf() { for (auto i = 0; i < attr_po.len(); ++i) { if (attr_po.contain(binary, i)) { dnnl::impl::memory_desc_t mem = attr_po.entry_[i].binary.src1_desc; - conf_.src1_md[i] = sycl_md_t(&mem); + conf_.src1_md[i] = hrt::sycl::md_t(&mem); } } conf_.post_ops = sycl_post_ops_t(attr()); @@ -122,11 +122,11 @@ status_t ref_pooling_bwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.block_size = 16; conf_.wg_size = 32; - conf_.diff_src_md = sycl_md_t(diff_src_md(0)); - conf_.diff_dst_md = sycl_md_t(diff_dst_md(0)); + conf_.diff_src_md = hrt::sycl::md_t(diff_src_md(0)); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); conf_.ws_md = !types::is_zero_md(workspace_md()) - ? sycl_md_t(workspace_md(0)) - : sycl_md_t {}; + ? 
hrt::sycl::md_t(workspace_md(0)) + : hrt::sycl::md_t {}; conf_.zero_dims = has_zero_dim_memory(); auto nelems_A = memory_desc_wrapper(diff_src_md(0)).nelems(); int work_per_wg = conf_.wg_size * conf_.block_size; diff --git a/src/gpu/sycl/ref_pooling.hpp b/src/gpu/sycl/ref_pooling.hpp index fbc5660adb0..5f23ea9ed94 100644 --- a/src/gpu/sycl/ref_pooling.hpp +++ b/src/gpu/sycl/ref_pooling.hpp @@ -26,7 +26,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_prelu.cpp b/src/gpu/sycl/ref_prelu.cpp index f2128694ecb..0d506159189 100644 --- a/src/gpu/sycl/ref_prelu.cpp +++ b/src/gpu/sycl/ref_prelu.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,9 +32,9 @@ status_t ref_prelu_fwd_t::pd_t::init_conf() { const memory_desc_wrapper data_d(src_md(0)); const memory_desc_wrapper weights_d(weights_md(0)); - conf_.data_md = sycl_md_t(src_md(0)); - conf_.weights_md = sycl_md_t(weights_md(0)); - conf_.dst_md = sycl_md_t(dst_md(0)); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.weights_md = hrt::sycl::md_t(weights_md(0)); + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); conf_.ndims = ndims(); conf_.mask = utils::get_dims_mask(data_d.dims(), weights_d.dims(), ndims()); @@ -77,11 +77,11 @@ status_t ref_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_prelu_bwd_t::pd_t::init_conf() { if (has_zero_dim_memory()) return status::success; conf_ = sycl_prelu_conf_t(); - conf_.data_md = sycl_md_t(src_md(0)); - conf_.weights_md = sycl_md_t(weights_md(0)); - conf_.diff_data_md = sycl_md_t(diff_src_md(0)); - conf_.diff_weights_md = sycl_md_t(diff_weights_md(0)); - conf_.diff_dst_md = sycl_md_t(diff_dst_md(0)); + conf_.data_md = hrt::sycl::md_t(src_md(0)); + conf_.weights_md = hrt::sycl::md_t(weights_md(0)); + conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); + conf_.diff_weights_md = hrt::sycl::md_t(diff_weights_md(0)); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); conf_.ndims = ndims(); const memory_desc_wrapper weights_d(weights_md(0)); diff --git a/src/gpu/sycl/ref_prelu.hpp b/src/gpu/sycl/ref_prelu.hpp index a9e655def35..046d91438c6 100644 --- a/src/gpu/sycl/ref_prelu.hpp +++ b/src/gpu/sycl/ref_prelu.hpp @@ -25,7 +25,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_resampling.cpp b/src/gpu/sycl/ref_resampling.cpp index 235d466d214..f1149f361da 100644 --- a/src/gpu/sycl/ref_resampling.cpp +++ b/src/gpu/sycl/ref_resampling.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,8 +50,8 @@ status_t ref_resampling_fwd_t::pd_t::init_conf() { int n_wgs = (nelems_A + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * conf_.wg_size; - conf_.src_md = sycl_md_t(src_md(0)); - conf_.dst_md = sycl_md_t(dst_md()); + conf_.src_md = hrt::sycl::md_t(src_md(0)); + conf_.dst_md = hrt::sycl::md_t(dst_md()); conf_.alg = desc()->alg_kind; const auto *att = attr(); @@ -61,7 +61,7 @@ status_t ref_resampling_fwd_t::pd_t::init_conf() { for (auto i = 0; i < attr_po.len(); ++i) { if (attr_po.contain(primitive_kind::binary, i)) { dnnl::impl::memory_desc_t mem = attr_po.entry_[i].binary.src1_desc; - conf_.src1_md[i] = sycl_md_t(&mem); + conf_.src1_md[i] = hrt::sycl::md_t(&mem); } } conf_.post_ops = sycl_post_ops_t(attr()); @@ -110,8 +110,8 @@ status_t ref_resampling_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_resampling_bwd_t::pd_t::init_conf() { conf_ = sycl_resampling_conf_t(); - conf_.diff_src_md = sycl_md_t(diff_src_md(0)); - conf_.diff_dst_md = sycl_md_t(diff_dst_md()); + conf_.diff_src_md = hrt::sycl::md_t(diff_src_md(0)); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); conf_.src_dt = src_md(0)->data_type; conf_.dst_dt = dst_md()->data_type; diff --git a/src/gpu/sycl/ref_resampling.hpp b/src/gpu/sycl/ref_resampling.hpp index 3e54f26b116..1a346048d43 100644 --- a/src/gpu/sycl/ref_resampling.hpp +++ b/src/gpu/sycl/ref_resampling.hpp @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_shuffle.cpp 
b/src/gpu/sycl/ref_shuffle.cpp index 44ba0b6ee2d..651f3d34b0c 100644 --- a/src/gpu/sycl/ref_shuffle.cpp +++ b/src/gpu/sycl/ref_shuffle.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,10 +51,10 @@ status_t ref_shuffle_t::pd_t::init_conf() { conf_.HW = conf_.H * conf_.W; conf_.SP = conf_.D * conf_.HW; } - conf_.stat_md = sycl_md_t(src_md(0)); + conf_.stat_md = hrt::sycl::md_t(src_md(0)); conf_.work_amount = memory_desc_wrapper(src_md()).nelems(); - conf_.src_md = sycl_md_t(src_md(0)); - conf_.dst_md = sycl_md_t(dst_md(0)); + conf_.src_md = hrt::sycl::md_t(src_md(0)); + conf_.dst_md = hrt::sycl::md_t(dst_md(0)); if (ndims() == 5) { const auto tag diff --git a/src/gpu/sycl/ref_shuffle.hpp b/src/gpu/sycl/ref_shuffle.hpp index d8f77e177f0..eefc8c3a4a5 100644 --- a/src/gpu/sycl/ref_shuffle.hpp +++ b/src/gpu/sycl/ref_shuffle.hpp @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { diff --git a/src/gpu/sycl/ref_softmax.cpp b/src/gpu/sycl/ref_softmax.cpp index 082e43e8168..4d4c76b902e 100644 --- a/src/gpu/sycl/ref_softmax.cpp +++ b/src/gpu/sycl/ref_softmax.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_softmax_fwd_t::pd_t::init_conf() { conf_ = sycl_softmax_conf_t(); - conf_.src_md = sycl_md_t(src_md()); - conf_.dst_md = sycl_md_t(dst_md()); + conf_.src_md = hrt::sycl::md_t(src_md()); + conf_.dst_md = hrt::sycl::md_t(dst_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; @@ -76,9 +76,9 @@ status_t ref_sycl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_softmax_bwd_t::pd_t::init_conf() { conf_ = sycl_softmax_conf_t(); - conf_.dst_md = sycl_md_t(dst_md()); - conf_.diff_dst_md = sycl_md_t(diff_dst_md()); - conf_.diff_src_md = sycl_md_t(diff_src_md()); + conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); + conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; diff --git a/src/gpu/sycl/resampling_kernels.hpp b/src/gpu/sycl/resampling_kernels.hpp index 95fd9625b58..fb00a832488 100644 --- a/src/gpu/sycl/resampling_kernels.hpp +++ b/src/gpu/sycl/resampling_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,10 +33,12 @@ namespace sycl { struct resampling_kernel_fwd_vec_t { resampling_kernel_fwd_vec_t(const sycl_resampling_conf_t &conf, - sycl_in_memory_arg_t &src, sycl_out_memory_arg_t &dst, - sycl_in_memory_arg_t &src_1, sycl_in_memory_arg_t &src_2, - sycl_in_memory_arg_t &src_3, sycl_in_memory_arg_t &src_4, - sycl_in_memory_arg_t &src_5) + hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, + hrt::sycl::in_memory_arg_t &src_1, + hrt::sycl::in_memory_arg_t &src_2, + hrt::sycl::in_memory_arg_t &src_3, + hrt::sycl::in_memory_arg_t &src_4, + hrt::sycl::in_memory_arg_t &src_5) : conf_(conf) , src_(src) , dst_(dst) @@ -140,8 +142,8 @@ struct resampling_kernel_fwd_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *src_1_ptr() const { return src_1_.get_pointer(); } @@ -151,12 +153,12 @@ struct resampling_kernel_fwd_vec_t { void *src_5_ptr() const { return src_5_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } - void *gen_ptr(sycl_in_memory_arg_t gen_) const { + void *gen_ptr(hrt::sycl::in_memory_arg_t gen_) const { return gen_.get_pointer(); } - static dim_t get_offset( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); case 4: return mdw.off(n, c, h, w); @@ -166,7 +168,7 @@ struct resampling_kernel_fwd_vec_t { return 0; } - float 
dst_value(sycl_in_memory_arg_t arr, int idx, int offset) const { + float dst_value(hrt::sycl::in_memory_arg_t arr, int idx, int offset) const { auto src1_desc = conf_.src1_md[idx]; dim_t src_dim[DNNL_MAX_NDIMS]; auto src_dim_ = src1_desc.dims(); @@ -180,8 +182,8 @@ struct resampling_kernel_fwd_vec_t { return dst; } - dim_t get_binary_src1_off(const sycl_md_t &src1_md, const dim_t *src_dim, - const dim_t l_offset, const dim_t *dst_dims, + dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, + const dim_t *src_dim, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims) const { const int mask_binary_po @@ -191,8 +193,9 @@ struct resampling_kernel_fwd_vec_t { src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - dim_t get_po_tensor_off(const sycl_md_t &tensor_md, const dim_t l_offset, - const dim_t *dst_dims, const int dst_ndims, int mask) const { + dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, + const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims, + int mask) const { dims_t l_dims_po {}; get_l_dims_po(l_dims_po, l_offset, dst_dims, dst_ndims, mask); @@ -208,18 +211,19 @@ struct resampling_kernel_fwd_vec_t { sycl_resampling_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_out_memory_arg_t dst_; - sycl_in_memory_arg_t src_1_; - sycl_in_memory_arg_t src_2_; - sycl_in_memory_arg_t src_3_; - sycl_in_memory_arg_t src_4_; - sycl_in_memory_arg_t src_5_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t src_1_; + hrt::sycl::in_memory_arg_t src_2_; + hrt::sycl::in_memory_arg_t src_3_; + hrt::sycl::in_memory_arg_t src_4_; + hrt::sycl::in_memory_arg_t src_5_; }; struct resampling_kernel_bwd_vec_t { resampling_kernel_bwd_vec_t(const sycl_resampling_conf_t &conf, - sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &diff_src) + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &diff_src) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src) {} void 
operator()(::sycl::nd_item<1> item) const { @@ -274,14 +278,14 @@ struct resampling_kernel_bwd_vec_t { } private: - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } - static dim_t get_offset( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { + static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); case 4: return mdw.off(n, c, h, w); @@ -292,13 +296,14 @@ struct resampling_kernel_bwd_vec_t { } sycl_resampling_conf_t conf_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t diff_src_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t diff_src_; }; struct resampling_kernel_bwd_vec1_t { resampling_kernel_bwd_vec1_t(const sycl_resampling_conf_t &conf, - sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &diff_src) + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &diff_src) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src) {} void operator()(::sycl::nd_item<1> item) const { @@ -351,14 +356,14 @@ struct resampling_kernel_bwd_vec1_t { } private: - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } - static dim_t get_offset( - const sycl_md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, 
dim_t w) { + static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); case 4: return mdw.off(n, c, h, w); @@ -369,8 +374,8 @@ struct resampling_kernel_bwd_vec1_t { } sycl_resampling_conf_t conf_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t diff_src_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t diff_src_; }; } // namespace sycl diff --git a/src/gpu/sycl/shuffle_kernels.hpp b/src/gpu/sycl/shuffle_kernels.hpp index 8bc3dc1931d..e4bf3c70330 100644 --- a/src/gpu/sycl/shuffle_kernels.hpp +++ b/src/gpu/sycl/shuffle_kernels.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -32,7 +32,7 @@ namespace sycl { struct shuffle_kernel_vec1_t { shuffle_kernel_vec1_t(const sycl_shuffle_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &dst) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -63,21 +63,21 @@ struct shuffle_kernel_vec1_t { } private: - const sycl_md_t &data_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } void 
*data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t dst_; }; struct shuffle_kernel_vec2_t { shuffle_kernel_vec2_t(const sycl_shuffle_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &dst) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -103,21 +103,21 @@ struct shuffle_kernel_vec2_t { } private: - const sycl_md_t &data_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } void *data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t dst_; }; struct shuffle_kernel_vec3_t { shuffle_kernel_vec3_t(const sycl_shuffle_conf_t &conf, - sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &dst) + hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -149,16 +149,16 @@ struct shuffle_kernel_vec3_t { } private: - const sycl_md_t &data_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } - const sycl_md_t &stat_md() const { return conf_.stat_md; } + const hrt::sycl::md_t &data_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &stat_md() 
const { return conf_.stat_md; } void *data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - sycl_in_memory_arg_t data_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t data_; + hrt::sycl::out_memory_arg_t dst_; }; } // namespace sycl diff --git a/src/gpu/sycl/softmax_kernels.hpp b/src/gpu/sycl/softmax_kernels.hpp index 34b51c01706..dd70308b919 100644 --- a/src/gpu/sycl/softmax_kernels.hpp +++ b/src/gpu/sycl/softmax_kernels.hpp @@ -29,8 +29,10 @@ namespace sycl { struct softmax_fwd_kernel_vec_t { softmax_fwd_kernel_vec_t(const sycl_softmax_conf_t &conf, - sycl_in_memory_arg_t &src, sycl_in_memory_arg_t &scale_src, - sycl_in_memory_arg_t &scale_dst, sycl_out_memory_arg_t &dst) + hrt::sycl::in_memory_arg_t &src, + hrt::sycl::in_memory_arg_t &scale_src, + hrt::sycl::in_memory_arg_t &scale_dst, + hrt::sycl::out_memory_arg_t &dst) : conf_(conf) , src_(src) , scale_src_(scale_src) @@ -111,8 +113,8 @@ struct softmax_fwd_kernel_vec_t { } private: - const sycl_md_t &src_md() const { return conf_.src_md; } - const sycl_md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &src_md() const { return conf_.src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } @@ -120,16 +122,17 @@ struct softmax_fwd_kernel_vec_t { void *scale_dst_ptr() const { return scale_dst_.get_pointer(); } sycl_softmax_conf_t conf_; - sycl_in_memory_arg_t src_; - sycl_in_memory_arg_t scale_src_; - sycl_in_memory_arg_t scale_dst_; - sycl_out_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t src_; + hrt::sycl::in_memory_arg_t scale_src_; + hrt::sycl::in_memory_arg_t scale_dst_; + hrt::sycl::out_memory_arg_t dst_; }; struct softmax_bwd_kernel_vec_t { softmax_bwd_kernel_vec_t(const sycl_softmax_conf_t &conf, - sycl_in_memory_arg_t &dst, sycl_in_memory_arg_t &diff_dst, - 
sycl_out_memory_arg_t &diff_src) + hrt::sycl::in_memory_arg_t &dst, + hrt::sycl::in_memory_arg_t &diff_dst, + hrt::sycl::out_memory_arg_t &diff_src) : conf_(conf), dst_(dst), diff_dst_(diff_dst), diff_src_(diff_src) {} void operator()(::sycl::nd_item<1> item) const { @@ -195,18 +198,18 @@ struct softmax_bwd_kernel_vec_t { } private: - const sycl_md_t &dst_md() const { return conf_.dst_md; } - const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const sycl_md_t &diff_src_md() const { return conf_.diff_src_md; } + const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } void *dst_ptr() const { return dst_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } sycl_softmax_conf_t conf_; - sycl_in_memory_arg_t dst_; - sycl_in_memory_arg_t diff_dst_; - sycl_out_memory_arg_t diff_src_; + hrt::sycl::in_memory_arg_t dst_; + hrt::sycl::in_memory_arg_t diff_dst_; + hrt::sycl::out_memory_arg_t diff_src_; }; } // namespace sycl diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index 022fbf5ded8..684dba1b064 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp @@ -24,8 +24,8 @@ #include "gpu/intel/sycl/l0/utils.hpp" #include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/utils.hpp" +#include "hrt/sycl/c_types_map.hpp" #include "hrt/utils.hpp" -#include "sycl/sycl_c_types_map.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { @@ -123,11 +123,12 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, = static_cast(arg.value()); if (*mem_storage) { auto *sycl_mem_storage = utils::downcast< - const sycl_memory_storage_base_t *>(mem_storage); + const hrt::sycl::memory_storage_base_t *>( + mem_storage); switch 
(sycl_mem_storage->memory_kind()) { - case memory_kind::buffer: { + case hrt::sycl::memory_kind::buffer: { auto *m = utils::downcast< - const sycl_buffer_memory_storage_t *>( + const hrt::sycl::buffer_memory_storage_t *>( mem_storage); auto &sycl_buf = m->buffer(); cgh.set_arg((int)i, @@ -136,9 +137,9 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, cgh)); break; } - case memory_kind::usm: { + case hrt::sycl::memory_kind::usm: { auto *m = utils::downcast< - const sycl_usm_memory_storage_t *>( + const hrt::sycl::usm_memory_storage_t *>( mem_storage); cgh.set_arg((int)i, m->usm_ptr()); break; diff --git a/src/gpu/sycl/sycl_io_helper.hpp b/src/gpu/sycl/sycl_io_helper.hpp index 8fc830b5f02..5aa7a223b1e 100644 --- a/src/gpu/sycl/sycl_io_helper.hpp +++ b/src/gpu/sycl/sycl_io_helper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,8 +31,7 @@ inline int load_int_value(data_type_t dt, const void *ptr, dim_t idx) { #define CASE(dt) \ case dt: \ return static_cast(reinterpret_cast< \ - const typename impl::gpu::sycl::sycl_prec_traits
::type *>( \ - ptr)[idx]); + const typename hrt::sycl::prec_traits
::type *>(ptr)[idx]); using namespace data_type; switch (dt) { CASE(s32); @@ -49,8 +48,7 @@ inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { #define CASE(dt) \ case dt: \ return static_cast(reinterpret_cast< \ - const typename impl::gpu::sycl::sycl_prec_traits
::type *>( \ - ptr)[idx]); + const typename hrt::sycl::prec_traits
::type *>(ptr)[idx]); using namespace data_type; switch (dt) { @@ -70,7 +68,7 @@ inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { inline void store_float_value(data_type_t dt, float val, void *ptr, dim_t idx) { #define CASE(dt) \ case dt: { \ - using type_ = typename impl::gpu::sycl::sycl_prec_traits
::type; \ + using type_ = typename hrt::sycl::prec_traits
::type; \ *(reinterpret_cast(ptr) + idx) \ = impl::sycl::saturate_and_round(val); \ } break; @@ -105,7 +103,8 @@ inline ::sycl::vec handle_bf16_load(void *ptr, dim_t offset) { ::sycl::vec vec_f32; for (int i = 0; i < width; i++) { // Convert u16 value to bfloat16_t. - const bfloat16_t bf16_val = static_cast(vec_u16[i]); + const hrt::sycl::bfloat16_t bf16_val + = static_cast(vec_u16[i]); // Convert bfloat16_t to float. const float f32_val = static_cast(bf16_val); // Write result to vector. @@ -122,7 +121,8 @@ inline void handle_bf16_store( for (int i = 0; i < width; i++) { // Convert float value to bfloat16_t. - const bfloat16_t bf16_val = static_cast(vec_f32[i]); + const hrt::sycl::bfloat16_t bf16_val + = static_cast(vec_f32[i]); // Convert bfloat16_t to uint16_t. const uint16_t u16_val = bf16_val.raw_bits_; // Write result to vector. @@ -137,7 +137,7 @@ inline ::sycl::vec load_float_vec( data_type_t dt, void *ptr, dim_t offset) { #define CASE(dt) \ case dt: { \ - using type = typename impl::gpu::sycl::sycl_prec_traits
::type; \ + using type = typename hrt::sycl::prec_traits
::type; \ global_ptr gptr_dt(reinterpret_cast(ptr)); \ ::sycl::vec vec_dt; \ vec_dt.load(offset, gptr_dt); \ @@ -163,7 +163,7 @@ inline void store_float_vec(data_type_t dt, ::sycl::vec vec_f32, void *ptr, dim_t offset) { #define CASE(dt) \ case dt: { \ - using type = typename impl::gpu::sycl::sycl_prec_traits
::type; \ + using type = typename hrt::sycl::prec_traits
::type; \ global_ptr gptr_dt(reinterpret_cast(ptr)); \ auto vec_dt = impl::sycl::saturate_and_round_vec(vec_f32); \ vec_dt.store(offset, gptr_dt); \ diff --git a/src/gpu/sycl/sycl_post_ops.hpp b/src/gpu/sycl/sycl_post_ops.hpp index 52382b43933..b88a79492d1 100644 --- a/src/gpu/sycl/sycl_post_ops.hpp +++ b/src/gpu/sycl/sycl_post_ops.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include "common/c_types_map.hpp" #include "common/primitive_attr.hpp" #include "gpu/sycl/sycl_math_utils.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -133,7 +133,7 @@ struct ref_binary_op_t { struct sycl_post_ops_t { // SYCL has a limitation on total size of kernel arguments. // This affects number of post ops, e.g. binary post op (which is not yet - // implemented) contains sycl_md_t which is large enough to limit + // implemented) contains hrt::sycl::md_t which is large enough to limit // the number of post ops. static constexpr int max_post_ops = 5; diff --git a/src/gpu/sycl/sycl_primitive_conf.hpp b/src/gpu/sycl/sycl_primitive_conf.hpp index e3d408ec71d..f5aee8cbc62 100644 --- a/src/gpu/sycl/sycl_primitive_conf.hpp +++ b/src/gpu/sycl/sycl_primitive_conf.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ #include "common/broadcast_strategy.hpp" #include "gpu/sycl/sycl_post_ops.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { @@ -27,16 +27,16 @@ namespace gpu { namespace sycl { struct sycl_binary_conf_t { - sycl_md_t src0_md; - sycl_md_t src1_md; - sycl_md_t dst_md; + hrt::sycl::md_t src0_md; + hrt::sycl::md_t src1_md; + hrt::sycl::md_t dst_md; alg_kind_t alg_kind; bool do_scale_src0; bool do_scale_src1; - int broadcast_dims[sycl_md_t::max_dims]; + int broadcast_dims[hrt::sycl::md_t::max_dims]; int ndims; bool is_tensor_op; @@ -49,10 +49,10 @@ struct sycl_binary_conf_t { struct sycl_eltwise_conf_t { prop_kind_t prop_kind; - sycl_md_t src_md; - sycl_md_t dst_md; - sycl_md_t diff_src_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t diff_src_md; + hrt::sycl::md_t diff_dst_md; alg_kind_t alg_kind; float alpha; float beta; @@ -65,18 +65,18 @@ struct sycl_eltwise_conf_t { dim_t wg_size; dim_t wk_size; dim_t post_po_len; - sycl_md_t binary_src_arr[8]; + hrt::sycl::md_t binary_src_arr[8]; sycl_post_ops_t post_ops; }; struct sycl_prelu_conf_t { prop_kind_t prop_kind; - sycl_md_t data_md; - sycl_md_t dst_md; - sycl_md_t weights_md; - sycl_md_t diff_data_md; - sycl_md_t diff_dst_md; - sycl_md_t diff_weights_md; + hrt::sycl::md_t data_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t weights_md; + hrt::sycl::md_t diff_data_md; + hrt::sycl::md_t diff_dst_md; + hrt::sycl::md_t diff_weights_md; dim_t work_amount; dim_t work_amount_wei; dim_t work_amount_src; @@ -93,10 +93,10 @@ struct sycl_prelu_conf_t { }; struct sycl_shuffle_conf_t { - sycl_md_t src_md; - sycl_md_t dst_md; - sycl_md_t stat_md; - sycl_md_t axis_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t stat_md; + hrt::sycl::md_t axis_md; dim_t transpose_col; dim_t transpose_row; dim_t group_size; @@ -142,16 +142,16 @@ struct sycl_resampling_conf_t { data_type_t src_dt; 
data_type_t dst_dt; - sycl_md_t src_md; - sycl_md_t src1_md[8]; - sycl_md_t dst_md; - sycl_md_t diff_src_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t src1_md[8]; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t diff_src_md; + hrt::sycl::md_t diff_dst_md; alg_kind_t alg; float src_scale; bool do_scale_src; - int broadcast_dims[sycl_md_t::max_dims]; + int broadcast_dims[hrt::sycl::md_t::max_dims]; int ndims; bool is_tensor_op; @@ -164,17 +164,17 @@ struct sycl_resampling_conf_t { struct sycl_layer_normalization_conf_t { prop_kind_t prop_kind; - sycl_md_t data_md; - sycl_md_t diff_data_md; - sycl_md_t data_scaleshift_md; - sycl_md_t diff_data_scaleshift_md; - sycl_md_t scale; - sycl_md_t shift; - sycl_md_t stat_md; - sycl_md_t stat_d; - sycl_md_t var_md; - sycl_md_t dst_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t data_md; + hrt::sycl::md_t diff_data_md; + hrt::sycl::md_t data_scaleshift_md; + hrt::sycl::md_t diff_data_scaleshift_md; + hrt::sycl::md_t scale; + hrt::sycl::md_t shift; + hrt::sycl::md_t stat_md; + hrt::sycl::md_t stat_d; + hrt::sycl::md_t var_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t diff_dst_md; dim_t wk_size; bool is_fwd; bool src_def; @@ -216,18 +216,18 @@ struct sycl_batch_normalization_conf_t { bool use_shift; float alpha; bool dir; - sycl_md_t data_md; - sycl_md_t src1_md; - sycl_md_t dst1_md; - sycl_md_t diff_data_md; - sycl_md_t diff_src1_md; - sycl_md_t data_scaleshift_md; - sycl_md_t diff_data_scaleshift_md; - sycl_md_t stat_md; - sycl_md_t var_md; - sycl_md_t ws_md; - sycl_md_t dst_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t data_md; + hrt::sycl::md_t src1_md; + hrt::sycl::md_t dst1_md; + hrt::sycl::md_t diff_data_md; + hrt::sycl::md_t diff_src1_md; + hrt::sycl::md_t data_scaleshift_md; + hrt::sycl::md_t diff_data_scaleshift_md; + hrt::sycl::md_t stat_md; + hrt::sycl::md_t var_md; + hrt::sycl::md_t ws_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t diff_dst_md; dim_t N; dim_t C; dim_t D; @@ -246,12 +246,12 @@ 
struct sycl_batch_normalization_conf_t { struct sycl_softmax_conf_t { prop_kind_t prop_kind; - sycl_md_t src_md; - sycl_md_t dst_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t dst_md; - sycl_md_t diff_md; - sycl_md_t diff_src_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t diff_md; + hrt::sycl::md_t diff_src_md; + hrt::sycl::md_t diff_dst_md; alg_kind_t alg_kind; dim_t block_size; dim_t wg_size; @@ -267,10 +267,10 @@ struct sycl_softmax_conf_t { }; struct sycl_lrn_conf_t { - sycl_md_t src_md; - sycl_md_t dst_md; - sycl_md_t diff_dst_md; - sycl_md_t diff_src_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t diff_dst_md; + hrt::sycl::md_t diff_src_md; alg_kind_t alg_kind; dim_t mb; @@ -293,12 +293,12 @@ struct sycl_lrn_conf_t { }; struct sycl_pooling_conf_t { - sycl_md_t src_md; - sycl_md_t src1_md[8]; - sycl_md_t dst_md; - sycl_md_t ws_md; - sycl_md_t diff_src_md; - sycl_md_t diff_dst_md; + hrt::sycl::md_t src_md; + hrt::sycl::md_t src1_md[8]; + hrt::sycl::md_t dst_md; + hrt::sycl::md_t ws_md; + hrt::sycl::md_t diff_src_md; + hrt::sycl::md_t diff_dst_md; int ndims; int po_len; bool zero_dims; diff --git a/src/gpu/sycl/sycl_q10n.hpp b/src/gpu/sycl/sycl_q10n.hpp index d9910ea68d6..5d3f2950fec 100644 --- a/src/gpu/sycl/sycl_q10n.hpp +++ b/src/gpu/sycl/sycl_q10n.hpp @@ -20,7 +20,7 @@ #include "common/c_types_map.hpp" #include "common/math_utils.hpp" #include "common/utils.hpp" -#include "gpu/sycl/sycl_types.hpp" +#include "hrt/sycl/types.hpp" #include "hrt/sycl/utils.hpp" namespace dnnl { diff --git a/src/sycl/sycl_buffer_memory_storage.cpp b/src/hrt/sycl/buffer_memory_storage.cpp similarity index 72% rename from src/sycl/sycl_buffer_memory_storage.cpp rename to src/hrt/sycl/buffer_memory_storage.cpp index 4fd8c442379..28de215a229 100644 --- a/src/sycl/sycl_buffer_memory_storage.cpp +++ b/src/hrt/sycl/buffer_memory_storage.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*******************************************************************************/ -#include "sycl/sycl_buffer_memory_storage.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" #include "sycl/sycl_engine_base.hpp" #include "common/memory.hpp" @@ -24,17 +24,18 @@ namespace dnnl { namespace impl { +namespace hrt { namespace sycl { namespace { template <::sycl::access_mode mode> -gpu::sycl::sycl_memory_arg_t get_memory_arg( - const sycl_buffer_memory_storage_t *storage, stream_t *stream, - ::sycl::handler &cgh) { +memory_arg_t get_memory_arg(const buffer_memory_storage_t *storage, + stream_t *stream, ::sycl::handler &cgh) { void *handle = nullptr; storage->get_data_handle(&handle); if (!handle) { - auto *sycl_stream = utils::downcast(stream); + auto *sycl_stream + = utils::downcast(stream); return {sycl_stream->get_dummy_accessor(cgh)}; } return {storage->buffer().get_access(cgh)}; @@ -44,14 +45,14 @@ gpu::sycl::sycl_memory_arg_t get_memory_arg( struct map_buffer_tag; -sycl_buffer_memory_storage_t::sycl_buffer_memory_storage_t(engine_t *engine) - : sycl_memory_storage_base_t(engine) {} +buffer_memory_storage_t::buffer_memory_storage_t(engine_t *engine) + : memory_storage_base_t(engine) {} -sycl_buffer_memory_storage_t::sycl_buffer_memory_storage_t( +buffer_memory_storage_t::buffer_memory_storage_t( engine_t *engine, const memory_storage_t *parent_storage) - : sycl_memory_storage_base_t(engine, parent_storage) {} + : memory_storage_base_t(engine, parent_storage) {} -status_t sycl_buffer_memory_storage_t::map_data( +status_t buffer_memory_storage_t::map_data( void **mapped_ptr, stream_t *stream, size_t) const { if (!buffer_) { *mapped_ptr = nullptr; @@ -70,7 +71,7 @@ status_t sycl_buffer_memory_storage_t::map_data( return map_manager.map(this, stream, *mapped_ptr, unmap_callback); } -status_t sycl_buffer_memory_storage_t::unmap_data( +status_t buffer_memory_storage_t::unmap_data( void *mapped_ptr, stream_t *stream) const { if (!mapped_ptr) return status::success; @@ 
-78,9 +79,9 @@ status_t sycl_buffer_memory_storage_t::unmap_data( return map_manager.unmap(this, stream, mapped_ptr); } -std::unique_ptr sycl_buffer_memory_storage_t::get_sub_storage( +std::unique_ptr buffer_memory_storage_t::get_sub_storage( size_t offset, size_t size) const { - auto storage = utils::make_unique( + auto storage = utils::make_unique( engine(), parent_storage()); if (!storage) return nullptr; @@ -93,7 +94,8 @@ std::unique_ptr sycl_buffer_memory_storage_t::get_sub_storage( } else { gpu_assert(IMPLICATION( hrt::sycl::is_intel_device( - utils::downcast(engine()) + utils::downcast( + engine()) ->device()), offset % gpu::intel::ocl::OCL_BUFFER_ALIGNMENT == 0)); hrt::sycl::buffer_u8_t *sub_buffer = buffer_ @@ -107,8 +109,8 @@ std::unique_ptr sycl_buffer_memory_storage_t::get_sub_storage( return storage; } -std::unique_ptr sycl_buffer_memory_storage_t::clone() const { - auto storage = utils::make_unique(engine()); +std::unique_ptr buffer_memory_storage_t::clone() const { + auto storage = utils::make_unique(engine()); if (!storage) return nullptr; status_t status @@ -120,9 +122,10 @@ std::unique_ptr sycl_buffer_memory_storage_t::clone() const { return storage; } -status_t sycl_buffer_memory_storage_t::init_allocate(size_t size) { +status_t buffer_memory_storage_t::init_allocate(size_t size) { const auto &device - = utils::downcast(engine())->device(); + = utils::downcast(engine()) + ->device(); if (size > device.get_info<::sycl::info::device::max_mem_alloc_size>()) { return status::out_of_memory; } @@ -132,29 +135,26 @@ status_t sycl_buffer_memory_storage_t::init_allocate(size_t size) { return status::success; } -hrt::sycl::buffer_u8_t &sycl_buffer_memory_storage_t::parent_buffer() const { - return utils::downcast( - parent_storage()) +hrt::sycl::buffer_u8_t &buffer_memory_storage_t::parent_buffer() const { + return utils::downcast(parent_storage()) ->buffer(); } -gpu::sycl::sycl_in_memory_arg_t sycl_buffer_memory_storage_t::get_in_memory_arg( 
+in_memory_arg_t buffer_memory_storage_t::get_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::read>(this, stream, cgh); } -gpu::sycl::sycl_out_memory_arg_t -sycl_buffer_memory_storage_t::get_out_memory_arg( +out_memory_arg_t buffer_memory_storage_t::get_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::write>(this, stream, cgh); } -gpu::sycl::sycl_inout_memory_arg_t -sycl_buffer_memory_storage_t::get_inout_memory_arg( +inout_memory_arg_t buffer_memory_storage_t::get_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::read_write>(this, stream, cgh); } - } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_buffer_memory_storage.hpp b/src/hrt/sycl/buffer_memory_storage.hpp similarity index 86% rename from src/sycl/sycl_buffer_memory_storage.hpp rename to src/hrt/sycl/buffer_memory_storage.hpp index 95ba98c955c..b359bbf7031 100644 --- a/src/sycl/sycl_buffer_memory_storage.hpp +++ b/src/hrt/sycl/buffer_memory_storage.hpp @@ -23,18 +23,19 @@ #include "common/memory_storage.hpp" #include "common/utils.hpp" #include "gpu/intel/sycl/utils.hpp" -#include "sycl/sycl_c_types_map.hpp" -#include "sycl/sycl_memory_storage_base.hpp" +#include "hrt/sycl/c_types_map.hpp" +#include "hrt/sycl/memory_storage_base.hpp" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { -class sycl_buffer_memory_storage_t : public sycl_memory_storage_base_t { +class buffer_memory_storage_t : public memory_storage_base_t { public: - sycl_buffer_memory_storage_t(engine_t *engine); + buffer_memory_storage_t(engine_t *engine); - sycl_buffer_memory_storage_t( + buffer_memory_storage_t( engine_t *engine, const memory_storage_t *parent_storage); hrt::sycl::buffer_u8_t &buffer() const { return *buffer_; } @@ -67,11 +68,11 @@ class sycl_buffer_memory_storage_t : public 
sycl_memory_storage_base_t { std::unique_ptr clone() const override; - gpu::sycl::sycl_in_memory_arg_t get_in_memory_arg( + in_memory_arg_t get_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; - gpu::sycl::sycl_out_memory_arg_t get_out_memory_arg( + out_memory_arg_t get_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; - gpu::sycl::sycl_inout_memory_arg_t get_inout_memory_arg( + inout_memory_arg_t get_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; protected: @@ -85,6 +86,7 @@ class sycl_buffer_memory_storage_t : public sycl_memory_storage_base_t { }; } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_c_types_map.hpp b/src/hrt/sycl/c_types_map.hpp similarity index 88% rename from src/sycl/sycl_c_types_map.hpp rename to src/hrt/sycl/c_types_map.hpp index b544508d9ff..a5dad3d5438 100644 --- a/src/sycl/sycl_c_types_map.hpp +++ b/src/hrt/sycl/c_types_map.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,14 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef SYCL_SYCL_C_TYPES_MAP_HPP -#define SYCL_SYCL_C_TYPES_MAP_HPP +#ifndef HRT_SYCL_C_TYPES_MAP_HPP +#define HRT_SYCL_C_TYPES_MAP_HPP #include "oneapi/dnnl/dnnl_sycl_types.h" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { using memory_kind_t = dnnl_sycl_interop_memory_kind_t; @@ -30,6 +31,7 @@ const memory_kind_t buffer = dnnl_sycl_interop_buffer; } // namespace memory_kind } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/capi/capi_memory.cpp b/src/hrt/sycl/capi/capi_memory.cpp index 218dcf42165..33b4bdc9121 100644 --- a/src/hrt/sycl/capi/capi_memory.cpp +++ b/src/hrt/sycl/capi/capi_memory.cpp @@ -21,11 +21,12 @@ #include "common/memory.hpp" #include "common/utils.hpp" -#include "sycl/sycl_c_types_map.hpp" +#include "hrt/sycl/c_types_map.hpp" +#include "hrt/sycl/memory_storage.hpp" #include "sycl/sycl_engine.hpp" -#include "sycl/sycl_memory_storage.hpp" using namespace dnnl::impl::sycl; +using namespace dnnl::impl::hrt::sycl; using dnnl::impl::engine_t; using dnnl::impl::memory_desc_t; @@ -66,9 +67,9 @@ status_t dnnl_sycl_interop_memory_create(memory_t **memory, return status::invalid_arguments; } - mem_storage.reset(new sycl_usm_memory_storage_t(engine)); + mem_storage.reset(new usm_memory_storage_t(engine)); } else - mem_storage.reset(new sycl_buffer_memory_storage_t(engine)); + mem_storage.reset(new buffer_memory_storage_t(engine)); if (!mem_storage) return status::out_of_memory; CHECK(mem_storage->init(flags, size, handle_ptr)); @@ -85,7 +86,7 @@ status_t dnnl_sycl_interop_memory_set_buffer(memory_t *memory, void *buffer) { if (!ok) return status::invalid_arguments; std::unique_ptr mem_storage( - new sycl_buffer_memory_storage_t(memory->engine())); + new buffer_memory_storage_t(memory->engine())); if (!mem_storage) return status::out_of_memory; size_t size = memory_desc_wrapper(memory->md()).size(); 
@@ -103,7 +104,7 @@ status_t dnnl_sycl_interop_memory_get_memory_kind( && memory->engine()->runtime_kind() == runtime_kind::sycl; if (!ok) return status::invalid_arguments; - *memory_kind = utils::downcast( + *memory_kind = utils::downcast( memory->memory_storage()) ->memory_kind(); return status::success; diff --git a/src/sycl/sycl_memory_storage.hpp b/src/hrt/sycl/memory_storage.hpp similarity index 78% rename from src/sycl/sycl_memory_storage.hpp rename to src/hrt/sycl/memory_storage.hpp index 550dd3dddf8..0a914ab1302 100644 --- a/src/sycl/sycl_memory_storage.hpp +++ b/src/hrt/sycl/memory_storage.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2020 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ * limitations under the License. *******************************************************************************/ -#ifndef SYCL_MEMORY_STORAGE_HPP -#define SYCL_MEMORY_STORAGE_HPP +#ifndef HRT_SYCL_MEMORY_STORAGE_HPP +#define HRT_SYCL_MEMORY_STORAGE_HPP -#include "sycl/sycl_buffer_memory_storage.hpp" -#include "sycl/sycl_usm_memory_storage.hpp" +#include "hrt/sycl/buffer_memory_storage.hpp" +#include "hrt/sycl/usm_memory_storage.hpp" #endif diff --git a/src/sycl/sycl_memory_storage_base.cpp b/src/hrt/sycl/memory_storage_base.cpp similarity index 73% rename from src/sycl/sycl_memory_storage_base.cpp rename to src/hrt/sycl/memory_storage_base.cpp index 1d8dd93ee1a..f7b409bb230 100644 --- a/src/sycl/sycl_memory_storage_base.cpp +++ b/src/hrt/sycl/memory_storage_base.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except 
in compliance with the License. @@ -14,40 +14,39 @@ * limitations under the License. *******************************************************************************/ -#include "sycl/sycl_memory_storage_base.hpp" +#include "hrt/sycl/memory_storage_base.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { namespace { template <::sycl::access_mode mode> -gpu::sycl::sycl_memory_arg_t get_empty_memory_arg( +memory_arg_t get_empty_memory_arg( stream_t *stream, ::sycl::handler &cgh) { - using arg_type = gpu::sycl::sycl_memory_arg_t; - auto *sycl_stream = utils::downcast(stream); + using arg_type = memory_arg_t; + auto *sycl_stream = utils::downcast(stream); return arg_type::create_empty(sycl_stream->get_dummy_accessor(cgh)); } } // namespace -gpu::sycl::sycl_in_memory_arg_t sycl_memory_storage_base_t::empty_in_memory_arg( +in_memory_arg_t memory_storage_base_t::empty_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) { return get_empty_memory_arg<::sycl::access::mode::read>(stream, cgh); } -gpu::sycl::sycl_out_memory_arg_t -sycl_memory_storage_base_t::empty_out_memory_arg( +out_memory_arg_t memory_storage_base_t::empty_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) { return get_empty_memory_arg<::sycl::access::mode::write>(stream, cgh); } -gpu::sycl::sycl_inout_memory_arg_t -sycl_memory_storage_base_t::empty_inout_memory_arg( +inout_memory_arg_t memory_storage_base_t::empty_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) { return get_empty_memory_arg<::sycl::access::mode::read_write>(stream, cgh); } - } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_memory_storage_base.hpp b/src/hrt/sycl/memory_storage_base.hpp similarity index 67% rename from src/sycl/sycl_memory_storage_base.hpp rename to src/hrt/sycl/memory_storage_base.hpp index bc4214270b8..0f0aab5e29c 100644 --- a/src/sycl/sycl_memory_storage_base.hpp +++ 
b/src/hrt/sycl/memory_storage_base.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,39 +14,41 @@ * limitations under the License. *******************************************************************************/ -#ifndef SYCL_MEMORY_STORAGE_BASE_HPP -#define SYCL_MEMORY_STORAGE_BASE_HPP +#ifndef HRT_SYCL_MEMORY_STORAGE_BASE_HPP +#define HRT_SYCL_MEMORY_STORAGE_BASE_HPP #include "common/memory_storage.hpp" -#include "gpu/sycl/sycl_types.hpp" -#include "sycl/sycl_c_types_map.hpp" +#include "hrt/sycl/c_types_map.hpp" +#include "hrt/sycl/types.hpp" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { -class sycl_memory_storage_base_t : public memory_storage_t { +class memory_storage_base_t : public memory_storage_t { public: using memory_storage_t::memory_storage_t; virtual memory_kind_t memory_kind() const = 0; - virtual gpu::sycl::sycl_in_memory_arg_t get_in_memory_arg( + virtual in_memory_arg_t get_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) const = 0; - virtual gpu::sycl::sycl_out_memory_arg_t get_out_memory_arg( + virtual out_memory_arg_t get_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) const = 0; - virtual gpu::sycl::sycl_inout_memory_arg_t get_inout_memory_arg( + virtual inout_memory_arg_t get_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) const = 0; - static gpu::sycl::sycl_in_memory_arg_t empty_in_memory_arg( + static in_memory_arg_t empty_in_memory_arg( stream_t *stream, ::sycl::handler &cgh); - static gpu::sycl::sycl_out_memory_arg_t empty_out_memory_arg( + static out_memory_arg_t empty_out_memory_arg( stream_t *stream, ::sycl::handler &cgh); - static gpu::sycl::sycl_inout_memory_arg_t empty_inout_memory_arg( + static inout_memory_arg_t 
empty_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh); }; } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_memory_storage_helper.hpp b/src/hrt/sycl/memory_storage_helper.hpp similarity index 76% rename from src/sycl/sycl_memory_storage_helper.hpp rename to src/hrt/sycl/memory_storage_helper.hpp index 8baac250d79..f6b29d131fa 100644 --- a/src/sycl/sycl_memory_storage_helper.hpp +++ b/src/hrt/sycl/memory_storage_helper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * Copyright 2022 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,11 +15,11 @@ * limitations under the License. *******************************************************************************/ -#ifndef SYCL_SYCL_MEMORY_STORAGE_HELPER_HPP -#define SYCL_SYCL_MEMORY_STORAGE_HELPER_HPP +#ifndef HRT_SYCL_MEMORY_STORAGE_HELPER_HPP +#define HRT_SYCL_MEMORY_STORAGE_HELPER_HPP #include -#include "sycl/sycl_memory_storage.hpp" +#include "hrt/sycl/memory_storage.hpp" #ifdef DNNL_SYCL_CUDA #include "gpu/nvidia/sycl_cuda_compat.hpp" @@ -31,22 +31,24 @@ namespace dnnl { namespace impl { +namespace hrt { namespace sycl { #define CTX_IN_SYCL_MEMORY(arg) \ - dnnl::impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read>( \ + dnnl::impl::hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read>( \ &CTX_IN_STORAGE(arg), cgh) #define CTX_OUT_SYCL_MEMORY(arg) \ - dnnl::impl::sycl::sycl_memory_arg_t<::sycl::access::mode::write>( \ + dnnl::impl::hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write>( \ &CTX_OUT_STORAGE(arg), cgh) #define CTX_SCRATCH_SYCL_MEMORY(arg) \ - dnnl::impl::sycl::sycl_memory_arg_t<::sycl::access::mode::read_write>( \ + dnnl::impl::hrt::sycl::interop_memory_arg_t< \ + ::sycl::access::mode::read_write>( \ 
ctx.get_scratchpad_grantor().get_memory_storage(arg).get(), cgh) template <::sycl::access_mode mode> -class sycl_memory_arg_t { +class interop_memory_arg_t { #if defined(DNNL_SYCL_CUDA) static constexpr auto be = ::sycl::backend::ext_oneapi_cuda; #elif defined(DNNL_SYCL_HIP) @@ -57,23 +59,20 @@ class sycl_memory_arg_t { #endif public: - sycl_memory_arg_t() = default; - sycl_memory_arg_t(memory_storage_t *raw_mem, ::sycl::handler &cgh) { + interop_memory_arg_t() = default; + interop_memory_arg_t(memory_storage_t *raw_mem, ::sycl::handler &cgh) { if (!raw_mem || raw_mem->is_null()) { return; } - auto *mem = static_cast( - raw_mem); + auto *mem = static_cast(raw_mem); switch (mem->memory_kind()) { case sycl::memory_kind::buffer: { auto *buffer_storage - = utils::downcast( - mem); + = utils::downcast(mem); acc_.emplace(buffer_storage->buffer(), cgh); offset_ = buffer_storage->base_offset(); break; } case sycl::memory_kind::usm: { - raw_ptr_ = utils::downcast< - const sycl::sycl_usm_memory_storage_t *>(mem) + raw_ptr_ = utils::downcast(mem) ->usm_ptr(); break; } @@ -81,7 +80,7 @@ class sycl_memory_arg_t { } } - sycl_memory_arg_t(::sycl::buffer buf, ::sycl::handler &cgh, + interop_memory_arg_t(::sycl::buffer buf, ::sycl::handler &cgh, size_t offset = 0) : offset_ {offset} { acc_.emplace(buf, cgh); @@ -117,6 +116,7 @@ class sycl_memory_arg_t { }; } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/gpu/sycl/sycl_types.hpp b/src/hrt/sycl/types.hpp similarity index 83% rename from src/gpu/sycl/sycl_types.hpp rename to src/hrt/sycl/types.hpp index 236c90fce46..b2a8fca17c9 100644 --- a/src/gpu/sycl/sycl_types.hpp +++ b/src/hrt/sycl/types.hpp @@ -14,36 +14,37 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef GPU_SYCL_SYCL_TYPES_HPP -#define GPU_SYCL_SYCL_TYPES_HPP +#ifndef HRT_SYCL_TYPES_HPP +#define HRT_SYCL_TYPES_HPP #include #include "common/c_types_map.hpp" +#include "common/memory_desc_wrapper.hpp" #include "common/utils.hpp" #include "hrt/sycl/compat.hpp" #include "hrt/sycl/utils.hpp" namespace dnnl { namespace impl { -namespace gpu { +namespace hrt { namespace sycl { // The macros are expected to be called within a command group function object // that is passed to `parallel_for`. #define CTX_IN_SYCL_KERNEL_MEMORY(arg) \ CTX_IN_STORAGE(arg).is_null() \ - ? sycl_memory_storage_base_t::empty_in_memory_arg( \ + ? hrt::sycl::memory_storage_base_t::empty_in_memory_arg( \ ctx.stream(), cgh) \ - : utils::downcast( \ + : utils::downcast( \ &CTX_IN_STORAGE(arg)) \ ->get_in_memory_arg(ctx.stream(), cgh) #define CTX_OUT_SYCL_KERNEL_MEMORY(arg) \ CTX_OUT_STORAGE(arg).is_null() \ - ? sycl_memory_storage_base_t::empty_out_memory_arg( \ + ? 
hrt::sycl::memory_storage_base_t::empty_out_memory_arg( \ ctx.stream(), cgh) \ - : utils::downcast( \ + : utils::downcast( \ &CTX_OUT_STORAGE(arg)) \ ->get_out_memory_arg(ctx.stream(), cgh) @@ -51,19 +52,18 @@ namespace sycl { static_assert(::sycl::is_device_copyable_v) template <::sycl::access_mode mode> -struct sycl_memory_arg_t { +struct memory_arg_t { using acc_dt = uint8_t; using acc_t = ::sycl::accessor; - static sycl_memory_arg_t create_empty(const acc_t &dummy_acc) { - sycl_memory_arg_t arg(nullptr, dummy_acc); + static memory_arg_t create_empty(const acc_t &dummy_acc) { + memory_arg_t arg(nullptr, dummy_acc); arg.empty_ = true; return arg; } - sycl_memory_arg_t(void *usm, const acc_t &dummy_acc) + memory_arg_t(void *usm, const acc_t &dummy_acc) : empty_(false), usm_(usm), acc_(dummy_acc) {} - sycl_memory_arg_t(const acc_t &acc) - : empty_(false), usm_(nullptr), acc_(acc) {} + memory_arg_t(const acc_t &acc) : empty_(false), usm_(nullptr), acc_(acc) {} // This method must be called only from inside a kernel. void *get_pointer() const { if (usm_) return usm_; @@ -81,15 +81,14 @@ struct sycl_memory_arg_t { }; // TODO: come up with better names? -using sycl_in_memory_arg_t = sycl_memory_arg_t<::sycl::access::mode::read>; -using sycl_out_memory_arg_t = sycl_memory_arg_t<::sycl::access::mode::write>; -using sycl_inout_memory_arg_t - = sycl_memory_arg_t<::sycl::access::mode::read_write>; +using in_memory_arg_t = memory_arg_t<::sycl::access::mode::read>; +using out_memory_arg_t = memory_arg_t<::sycl::access::mode::write>; +using inout_memory_arg_t = memory_arg_t<::sycl::access::mode::read_write>; // TODO: this class mimics memory_desc_t and makes sure it can be passed // to SYCL kernels as a kernel argument. SYCL puts restrictions on kernel // arguments, e.g. those cannot contain unions. -struct sycl_md_t { +struct md_t { // There is a limitation on total size of kernel arguments hence using // reduced number of supported dimensions and int32_t for dimensions. 
static constexpr int max_dims = 6; @@ -110,8 +109,8 @@ struct sycl_md_t { const dims32_t &inner_blks() const { return inner_blks_; } const dims32_t &inner_idxs() const { return inner_idxs_; } - sycl_md_t() = default; - sycl_md_t(const memory_desc_t *md) { + md_t() = default; + md_t(const memory_desc_t *md) { memory_desc_wrapper mdw(md); assert(mdw.format_kind() == format_kind::blocked); @@ -253,58 +252,58 @@ using float16_t = ::sycl::half; // Add a check for every SYCL kernel argument type. // -// Exception: sycl_memory_arg_t doesn't pass the check because it contains +// Exception: memory_arg_t doesn't pass the check because it contains // sycl::accessor which is not device copyable. However, it is treated by the // compiler in a special way allowing it not to satisfy the requirement. -CHECK_SYCL_KERNEL_ARG_TYPE(sycl_md_t); +CHECK_SYCL_KERNEL_ARG_TYPE(md_t); CHECK_SYCL_KERNEL_ARG_TYPE(bfloat16_t); template -struct sycl_prec_traits; +struct prec_traits; template <> -struct sycl_prec_traits { +struct prec_traits { using type = float16_t; }; template <> -struct sycl_prec_traits { +struct prec_traits { using type = bfloat16_t; }; template <> -struct sycl_prec_traits { +struct prec_traits { using type = float; }; template <> -struct sycl_prec_traits { +struct prec_traits { using type = int32_t; }; template <> -struct sycl_prec_traits { +struct prec_traits { using type = int8_t; }; template <> -struct sycl_prec_traits { +struct prec_traits { using type = uint8_t; }; } // namespace sycl -} // namespace gpu +} // namespace hrt } // namespace impl } // namespace dnnl namespace std { template <> -class numeric_limits { +class numeric_limits { public: - static constexpr dnnl::impl::gpu::sycl::bfloat16_t lowest() { + static constexpr dnnl::impl::hrt::sycl::bfloat16_t lowest() { return {uint16_t(0xff7f)}; } - static constexpr dnnl::impl::gpu::sycl::bfloat16_t max() { + static constexpr dnnl::impl::hrt::sycl::bfloat16_t max() { return {uint16_t(0x7f7f)}; } static constexpr int 
digits = 8; - static constexpr dnnl::impl::gpu::sycl::bfloat16_t epsilon() { + static constexpr dnnl::impl::hrt::sycl::bfloat16_t epsilon() { return {uint16_t((0x7f - (digits - 1)) << (digits - 1))}; } }; diff --git a/src/sycl/sycl_usm_memory_storage.cpp b/src/hrt/sycl/usm_memory_storage.cpp similarity index 79% rename from src/sycl/sycl_usm_memory_storage.cpp rename to src/hrt/sycl/usm_memory_storage.cpp index 558f439da55..7068e9af1a4 100644 --- a/src/sycl/sycl_usm_memory_storage.cpp +++ b/src/hrt/sycl/usm_memory_storage.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include "sycl/sycl_usm_memory_storage.hpp" +#include "hrt/sycl/usm_memory_storage.hpp" #include "common/memory.hpp" #include "common/memory_map_manager.hpp" @@ -25,14 +25,14 @@ namespace dnnl { namespace impl { +namespace hrt { namespace sycl { namespace { template <::sycl::access_mode mode> -gpu::sycl::sycl_memory_arg_t get_memory_arg( - const sycl_usm_memory_storage_t *storage, stream_t *stream, - ::sycl::handler &cgh) { - auto *sycl_stream = utils::downcast(stream); +memory_arg_t get_memory_arg(const usm_memory_storage_t *storage, + stream_t *stream, ::sycl::handler &cgh) { + auto *sycl_stream = utils::downcast(stream); return {storage->usm_ptr(), sycl_stream->get_dummy_accessor(cgh)}; } @@ -40,7 +40,7 @@ gpu::sycl::sycl_memory_arg_t get_memory_arg( struct map_usm_tag; -status_t sycl_usm_memory_storage_t::map_data( +status_t usm_memory_storage_t::map_data( void **mapped_ptr, stream_t *stream, size_t size) const { void *usm_ptr = this->usm_ptr(); // shadowing is bad @@ -57,7 +57,7 @@ status_t 
sycl_usm_memory_storage_t::map_data( if (!stream) CHECK(engine()->get_service_stream(stream)); ::sycl::queue sycl_queue - = utils::downcast(stream)->queue(); + = utils::downcast(stream)->queue(); void *host_ptr = ::sycl::malloc_host(size, sycl_queue.get_context()); if (!host_ptr) return status::out_of_memory; @@ -68,7 +68,7 @@ status_t sycl_usm_memory_storage_t::map_data( *mapped_ptr = host_ptr; auto unmap_callback = [usm_ptr, size](stream_t *stream, void *mapped_ptr) { ::sycl::queue sycl_queue - = utils::downcast(stream)->queue(); + = utils::downcast(stream)->queue(); sycl_queue.wait_and_throw(); sycl_queue.memcpy(usm_ptr, mapped_ptr, size).wait(); ::sycl::free(mapped_ptr, sycl_queue.get_context()); @@ -79,7 +79,7 @@ status_t sycl_usm_memory_storage_t::map_data( return map_manager.map(this, stream, *mapped_ptr, unmap_callback); } -status_t sycl_usm_memory_storage_t::unmap_data( +status_t usm_memory_storage_t::unmap_data( void *mapped_ptr, stream_t *stream) const { if (!mapped_ptr || is_host_accessible()) return status::success; @@ -88,22 +88,21 @@ status_t sycl_usm_memory_storage_t::unmap_data( return map_manager.unmap(this, stream, mapped_ptr); } -gpu::sycl::sycl_in_memory_arg_t sycl_usm_memory_storage_t::get_in_memory_arg( +in_memory_arg_t usm_memory_storage_t::get_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::read>(this, stream, cgh); } -gpu::sycl::sycl_out_memory_arg_t sycl_usm_memory_storage_t::get_out_memory_arg( +out_memory_arg_t usm_memory_storage_t::get_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::write>(this, stream, cgh); } -gpu::sycl::sycl_inout_memory_arg_t -sycl_usm_memory_storage_t::get_inout_memory_arg( +inout_memory_arg_t usm_memory_storage_t::get_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) const { return get_memory_arg<::sycl::access::mode::read_write>(this, stream, cgh); } - } // namespace sycl +} // namespace 
hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_usm_memory_storage.hpp b/src/hrt/sycl/usm_memory_storage.hpp similarity index 86% rename from src/sycl/sycl_usm_memory_storage.hpp rename to src/hrt/sycl/usm_memory_storage.hpp index c198a002ed4..6b66adabbe5 100644 --- a/src/sycl/sycl_usm_memory_storage.hpp +++ b/src/hrt/sycl/usm_memory_storage.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,21 +21,22 @@ #include "common/memory_storage.hpp" #include "common/utils.hpp" +#include "hrt/sycl/memory_storage_base.hpp" #include "sycl/sycl_engine_base.hpp" -#include "sycl/sycl_memory_storage_base.hpp" #include namespace dnnl { namespace impl { +namespace hrt { namespace sycl { -class sycl_usm_memory_storage_t : public sycl_memory_storage_base_t { +class usm_memory_storage_t : public memory_storage_base_t { public: - using sycl_memory_storage_base_t::sycl_memory_storage_base_t; + using memory_storage_base_t::memory_storage_base_t; - sycl_usm_memory_storage_t(engine_t *engine, ::sycl::usm::alloc usm_kind) - : sycl_memory_storage_base_t(engine), usm_kind_(usm_kind) {} + usm_memory_storage_t(engine_t *engine, ::sycl::usm::alloc usm_kind) + : memory_storage_base_t(engine), usm_kind_(usm_kind) {} uint8_t *usm_ptr() const { return static_cast(usm_ptr_.get()); } @@ -47,7 +48,8 @@ class sycl_usm_memory_storage_t : public sycl_memory_storage_base_t { } status_t set_data_handle(void *handle) override { - auto *sycl_engine = utils::downcast(engine()); + auto *sycl_engine + = utils::downcast(engine()); auto &sycl_ctx = sycl_engine->context(); usm_ptr_ = decltype(usm_ptr_)(handle, [](void *) {}); @@ -90,14 +92,14 @@ class sycl_usm_memory_storage_t : public sycl_memory_storage_base_t { void 
*sub_ptr = usm_ptr_.get() ? reinterpret_cast(usm_ptr_.get()) + offset : nullptr; - auto storage = utils::make_unique(engine()); + auto storage = utils::make_unique(engine()); if (!storage) return nullptr; storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr); return storage; } std::unique_ptr clone() const override { - auto storage = utils::make_unique(engine()); + auto storage = utils::make_unique(engine()); if (!storage) return nullptr; status_t status @@ -110,16 +112,17 @@ class sycl_usm_memory_storage_t : public sycl_memory_storage_base_t { return storage; } - gpu::sycl::sycl_in_memory_arg_t get_in_memory_arg( + in_memory_arg_t get_in_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; - gpu::sycl::sycl_out_memory_arg_t get_out_memory_arg( + out_memory_arg_t get_out_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; - gpu::sycl::sycl_inout_memory_arg_t get_inout_memory_arg( + inout_memory_arg_t get_inout_memory_arg( stream_t *stream, ::sycl::handler &cgh) const override; protected: status_t init_allocate(size_t size) override { - auto *sycl_engine = utils::downcast(engine()); + auto *sycl_engine + = utils::downcast(engine()); auto &sycl_dev = sycl_engine->device(); auto &sycl_ctx = sycl_engine->context(); using ::sycl::usm::alloc; @@ -153,6 +156,7 @@ class sycl_usm_memory_storage_t : public sycl_memory_storage_base_t { }; } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_engine_base.cpp b/src/sycl/sycl_engine_base.cpp index 3d32795f251..7d37972283c 100644 --- a/src/sycl/sycl_engine_base.cpp +++ b/src/sycl/sycl_engine_base.cpp @@ -19,8 +19,8 @@ #include "common/memory.hpp" #include "common/memory_storage.hpp" #include "gpu/intel/sycl/compat.hpp" +#include "hrt/sycl/memory_storage.hpp" #include "sycl/sycl_device_info.hpp" -#include "sycl/sycl_memory_storage.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { @@ -32,10 +32,10 @@ status_t 
sycl_engine_base_t::create_memory_storage( std::unique_ptr _storage; if (flags & memory_flags_t::prefer_device_usm) { - _storage.reset(new sycl_usm_memory_storage_t( + _storage.reset(new hrt::sycl::usm_memory_storage_t( this, ::sycl::usm::alloc::device)); } else - _storage.reset(new sycl_buffer_memory_storage_t(this)); + _storage.reset(new hrt::sycl::buffer_memory_storage_t(this)); if (!_storage) return status::out_of_memory; diff --git a/src/sycl/sycl_stream.hpp b/src/sycl/sycl_stream.hpp index 2d2e0a9d8ed..4fdec09e4eb 100644 --- a/src/sycl/sycl_stream.hpp +++ b/src/sycl/sycl_stream.hpp @@ -26,9 +26,9 @@ #include "gpu/intel/compute/compute_stream.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" #include "gpu/sycl/sycl_gpu_engine.hpp" +#include "hrt/sycl/memory_storage.hpp" #include "sycl/stream_profiler.hpp" #include "sycl/sycl_context.hpp" -#include "sycl/sycl_memory_storage.hpp" #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL #include "sycl/sycl_stream_cpu_thunk.hpp" @@ -162,28 +162,32 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { // Handle all other cases. 
auto *sycl_src - = utils::downcast(&src); + = utils::downcast( + &src); auto *sycl_dst - = utils::downcast(&dst); - bool usm_src = sycl_src->memory_kind() == memory_kind::usm; - bool usm_dst = sycl_dst->memory_kind() == memory_kind::usm; + = utils::downcast( + &dst); + bool usm_src = sycl_src->memory_kind() == hrt::sycl::memory_kind::usm; + bool usm_dst = sycl_dst->memory_kind() == hrt::sycl::memory_kind::usm; ::sycl::event e; if (usm_src && usm_dst) { auto *usm_src - = utils::downcast(&src); + = utils::downcast( + &src); auto *usm_dst - = utils::downcast(&dst); + = utils::downcast( + &dst); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); cgh.memcpy(usm_dst->usm_ptr(), usm_src->usm_ptr(), size); }); } else if (usm_src && !usm_dst) { auto *usm_src - = utils::downcast(&src); - auto *buffer_dst - = utils::downcast( - &dst); + = utils::downcast( + &src); + auto *buffer_dst = utils::downcast< + const hrt::sycl::buffer_memory_storage_t *>(&dst); auto &b_dst = buffer_dst->buffer(); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); @@ -192,12 +196,12 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { cgh.copy(usm_src->usm_ptr(), acc_dst); }); } else if (!usm_src && usm_dst) { - auto *buffer_src - = utils::downcast( - &src); + auto *buffer_src = utils::downcast< + const hrt::sycl::buffer_memory_storage_t *>(&src); auto &b_src = buffer_src->buffer(); auto *usm_dst - = utils::downcast(&dst); + = utils::downcast( + &dst); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); auto acc_src @@ -206,12 +210,10 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { }); } else { // if (!usm_src && !usm_dst) assert(!usm_src && !usm_dst && "USM is not supported yet"); - auto *buffer_src - = utils::downcast( - &src); - auto *buffer_dst - = utils::downcast( - &dst); + auto *buffer_src = utils::downcast< + const 
hrt::sycl::buffer_memory_storage_t *>(&src); + auto *buffer_dst = utils::downcast< + const hrt::sycl::buffer_memory_storage_t *>(&dst); auto &b_src = buffer_src->buffer(); auto &b_dst = buffer_dst->buffer(); e = queue_->submit([&](::sycl::handler &cgh) { @@ -239,14 +241,16 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { const gpu::intel::compute::event_t &deps, gpu::intel::compute::event_t &out_dep) override { auto *sycl_dst - = utils::downcast(&dst); - bool usm = sycl_dst->memory_kind() == memory_kind::usm; + = utils::downcast( + &dst); + bool usm = sycl_dst->memory_kind() == hrt::sycl::memory_kind::usm; ::sycl::event out_event; if (usm) { auto *usm_dst - = utils::downcast(&dst); + = utils::downcast( + &dst); auto dst_ptr = static_cast(usm_dst->usm_ptr()); // Note: we cannot use queue_.fill since it cannot handle // events as input @@ -255,9 +259,8 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { cgh.memset(dst_ptr, pattern, size); }); } else { - auto *buffer_dst - = utils::downcast( - &dst); + auto *buffer_dst = utils::downcast< + const hrt::sycl::buffer_memory_storage_t *>(&dst); out_event = queue_->submit([&](::sycl::handler &cgh) { // need a u8 accessor to get the proper range ::sycl::accessor #include @@ -87,7 +87,8 @@ void fast_dispatch_by_size(submit_ctx_t *submit_ctx, ::sycl::handler &cgh, constexpr size_t nparams = sizeof...(storage_types); auto params_tp = std::make_tuple( - utils::downcast(storages) + utils::downcast( + storages) ->buffer()...); submit_cpu_primitive_with_params_impl( submit_ctx, cgh, params_tp, nstl::make_index_sequence {}); @@ -120,11 +121,10 @@ void submit_cpu_primitive(stream_t *stream, const primitive_iface_t *prim_iface, if (!mem_storage->is_null()) { // Skip USM memory storages as they do not require special // handling and can be accessed directly - auto mem_api_kind - = utils::downcast( - mem_storage) - ->memory_kind(); - if (mem_api_kind == memory_kind::usm) continue; + auto 
mem_api_kind = utils::downcast< + const hrt::sycl::memory_storage_base_t *>(mem_storage) + ->memory_kind(); + if (mem_api_kind == hrt::sycl::memory_kind::usm) continue; sycl_mem_storages.push_back(mem_storage); } } From f415c903d9352a88cef27653bc069367522bebe0 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 7 May 2024 00:45:37 -0700 Subject: [PATCH 017/187] sycl: move engine id to hrt --- .../sycl/engine_id.hpp} | 21 +++++++++++-------- src/sycl/sycl_engine_base.hpp | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) rename src/{sycl/sycl_engine_id.hpp => hrt/sycl/engine_id.hpp} (73%) diff --git a/src/sycl/sycl_engine_id.hpp b/src/hrt/sycl/engine_id.hpp similarity index 73% rename from src/sycl/sycl_engine_id.hpp rename to src/hrt/sycl/engine_id.hpp index edd48be9dad..cea6a7d7db5 100644 --- a/src/sycl/sycl_engine_id.hpp +++ b/src/hrt/sycl/engine_id.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2022 Intel Corporation +* Copyright 2021-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,30 +14,32 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef SYCL_SYCL_ENGINE_ID_HPP -#define SYCL_SYCL_ENGINE_ID_HPP +#ifndef HRT_SYCL_ENGINE_ID_HPP +#define HRT_SYCL_ENGINE_ID_HPP #include "common/utils.hpp" namespace dnnl { namespace impl { +namespace hrt { namespace sycl { -struct sycl_engine_id_impl_t : public engine_id_impl_t { +struct engine_id_impl_t : public impl::engine_id_impl_t { - sycl_engine_id_impl_t(const ::sycl::device &device, + engine_id_impl_t(const ::sycl::device &device, const ::sycl::context &context, engine_kind_t kind, runtime_kind_t runtime_kind, size_t index) - : engine_id_impl_t(kind, runtime_kind, index) + : impl::engine_id_impl_t(kind, runtime_kind, index) , device_(device) , context_(context) {} - ~sycl_engine_id_impl_t() override = default; + ~engine_id_impl_t() override = default; private: - bool compare_resource(const engine_id_impl_t *id_impl) const override { + bool compare_resource( + const impl::engine_id_impl_t *id_impl) const override { const auto *typed_id - = utils::downcast(id_impl); + = utils::downcast(id_impl); return device_ == typed_id->device_ && context_ == typed_id->context_; } @@ -53,6 +55,7 @@ struct sycl_engine_id_impl_t : public engine_id_impl_t { }; } // namespace sycl +} // namespace hrt } // namespace impl } // namespace dnnl diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index 31142592d29..609c7de87c2 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -28,7 +28,7 @@ #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/utils.hpp" #include "gpu/sycl/sycl_interop_gpu_kernel.hpp" -#include "sycl_engine_id.hpp" +#include "hrt/sycl/engine_id.hpp" namespace dnnl { namespace impl { @@ -192,7 +192,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { } engine_id_t engine_id() const override { - return engine_id_t(new sycl_engine_id_impl_t( + return engine_id_t(new hrt::sycl::engine_id_impl_t( 
device(), context(), kind(), runtime_kind(), index())); } From b5ab7a72d258f9cd66a29fe8e5486f652273158a Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 7 May 2024 17:55:57 -0700 Subject: [PATCH 018/187] common, gpu: rename hrt to xpu --- src/CMakeLists.txt | 2 +- .../miopen_batch_normalization_executor.hpp | 60 +++--- src/gpu/amd/miopen_binary.cpp | 4 +- src/gpu/amd/miopen_convolution.cpp | 12 +- src/gpu/amd/miopen_convolution.hpp | 2 +- src/gpu/amd/miopen_deconvolution.cpp | 2 +- src/gpu/amd/miopen_eltwise.cpp | 4 +- src/gpu/amd/miopen_inner_product.cpp | 4 +- src/gpu/amd/miopen_lrn.cpp | 4 +- src/gpu/amd/miopen_matmul_executor.hpp | 32 ++-- src/gpu/amd/miopen_pooling.cpp | 4 +- src/gpu/amd/miopen_reduction.cpp | 4 +- src/gpu/amd/miopen_reorder.cpp | 2 +- src/gpu/amd/miopen_softmax.cpp | 4 +- src/gpu/amd/sycl_hip_compat.hpp | 2 +- src/gpu/amd/sycl_hip_engine.cpp | 4 +- src/gpu/amd/sycl_hip_utils.hpp | 2 +- src/gpu/intel/compute/compute_engine.hpp | 4 +- src/gpu/intel/compute/device_info.cpp | 2 +- src/gpu/intel/compute/kernel.hpp | 6 +- src/gpu/intel/gpu_primitive.hpp | 4 +- src/gpu/intel/jit/codegen/kernel.hpp | 10 +- src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp | 2 +- src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp | 4 +- src/gpu/intel/jit/jit_generator.hpp | 4 +- src/gpu/intel/jit/jit_generator_base.hpp | 4 +- src/gpu/intel/ocl/mdapi_utils.cpp | 4 +- .../intel/ocl/ocl_buffer_memory_storage.cpp | 4 +- .../intel/ocl/ocl_buffer_memory_storage.hpp | 4 +- src/gpu/intel/ocl/ocl_context.hpp | 16 +- src/gpu/intel/ocl/ocl_engine.hpp | 4 +- src/gpu/intel/ocl/ocl_gpu_engine.cpp | 28 +-- src/gpu/intel/ocl/ocl_gpu_engine.hpp | 8 +- src/gpu/intel/ocl/ocl_gpu_engine_id.hpp | 4 +- src/gpu/intel/ocl/ocl_gpu_kernel.cpp | 14 +- src/gpu/intel/ocl/ocl_gpu_kernel.hpp | 4 +- src/gpu/intel/ocl/ocl_stream.cpp | 8 +- src/gpu/intel/ocl/ocl_stream.hpp | 2 +- src/gpu/intel/ocl/ocl_usm_utils.cpp | 24 +-- src/gpu/intel/ocl/ocl_utils.cpp | 10 +- src/gpu/intel/ocl/ocl_utils.hpp | 8 +- 
src/gpu/intel/sycl/compat.cpp | 12 +- src/gpu/intel/sycl/compat.hpp | 4 +- src/gpu/intel/sycl/l0/utils.cpp | 16 +- src/gpu/intel/sycl/l0/utils.hpp | 4 +- src/gpu/intel/sycl/utils.cpp | 72 ++++---- src/gpu/intel/sycl/utils.hpp | 4 +- .../cudnn_batch_normalization_executor.hpp | 60 +++--- src/gpu/nvidia/cudnn_binary.cpp | 4 +- src/gpu/nvidia/cudnn_convolution.cpp | 16 +- src/gpu/nvidia/cudnn_convolution.hpp | 2 +- src/gpu/nvidia/cudnn_deconvolution.cpp | 2 +- src/gpu/nvidia/cudnn_eltwise.cpp | 4 +- src/gpu/nvidia/cudnn_inner_product.cpp | 4 +- src/gpu/nvidia/cudnn_lrn.cpp | 4 +- src/gpu/nvidia/cudnn_matmul_executor.hpp | 38 ++-- src/gpu/nvidia/cudnn_pooling.cpp | 4 +- src/gpu/nvidia/cudnn_reduction.cpp | 2 +- src/gpu/nvidia/cudnn_reorder.cpp | 2 +- src/gpu/nvidia/cudnn_resampling.cpp | 4 +- src/gpu/nvidia/cudnn_resampling.hpp | 4 +- src/gpu/nvidia/cudnn_softmax.cpp | 4 +- src/gpu/nvidia/sycl_cuda_compat.hpp | 2 +- src/gpu/nvidia/sycl_cuda_engine.cpp | 4 +- src/gpu/nvidia/sycl_cuda_stream_utils.hpp | 4 +- src/gpu/nvidia/sycl_cuda_utils.hpp | 12 +- src/gpu/sycl/batch_normalizations_kernels.hpp | 150 +++++++-------- src/gpu/sycl/binary_kernels.hpp | 26 +-- src/gpu/sycl/eltwise_kernels.hpp | 70 +++---- src/gpu/sycl/layer_normalizations_kernels.hpp | 174 +++++++++--------- src/gpu/sycl/lrn_kernels.hpp | 30 +-- src/gpu/sycl/pooling_kernels.hpp | 78 ++++---- src/gpu/sycl/prelu_kernels.hpp | 62 +++---- src/gpu/sycl/ref_batch_normalization.cpp | 44 ++--- src/gpu/sycl/ref_batch_normalization.hpp | 2 +- src/gpu/sycl/ref_binary.cpp | 8 +- src/gpu/sycl/ref_binary.hpp | 2 +- src/gpu/sycl/ref_eltwise.cpp | 12 +- src/gpu/sycl/ref_eltwise.hpp | 2 +- src/gpu/sycl/ref_layer_normalizations.cpp | 40 ++-- src/gpu/sycl/ref_layer_normalizations.hpp | 2 +- src/gpu/sycl/ref_lrn.cpp | 10 +- src/gpu/sycl/ref_lrn.hpp | 2 +- src/gpu/sycl/ref_pooling.cpp | 20 +- src/gpu/sycl/ref_pooling.hpp | 2 +- src/gpu/sycl/ref_prelu.cpp | 16 +- src/gpu/sycl/ref_prelu.hpp | 2 +- src/gpu/sycl/ref_resampling.cpp 
| 10 +- src/gpu/sycl/ref_resampling.hpp | 2 +- src/gpu/sycl/ref_shuffle.cpp | 6 +- src/gpu/sycl/ref_shuffle.hpp | 2 +- src/gpu/sycl/ref_softmax.cpp | 10 +- src/gpu/sycl/resampling_kernels.hpp | 70 +++---- src/gpu/sycl/shuffle_kernels.hpp | 38 ++-- src/gpu/sycl/softmax_kernels.hpp | 38 ++-- src/gpu/sycl/sycl_gpu_kernel.hpp | 2 +- src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 20 +- src/gpu/sycl/sycl_io_helper.hpp | 18 +- src/gpu/sycl/sycl_math_utils.hpp | 2 +- src/gpu/sycl/sycl_post_ops.hpp | 4 +- src/gpu/sycl/sycl_primitive_conf.hpp | 128 ++++++------- src/gpu/sycl/sycl_q10n.hpp | 4 +- src/sycl/stream_profiler.cpp | 2 +- src/sycl/sycl_cpu_engine.hpp | 2 +- src/sycl/sycl_device_info.cpp | 26 +-- src/sycl/sycl_engine.cpp | 6 +- src/sycl/sycl_engine.hpp | 2 +- src/sycl/sycl_engine_base.cpp | 6 +- src/sycl/sycl_engine_base.hpp | 44 ++--- src/sycl/sycl_stream.cpp | 2 +- src/sycl/sycl_stream.hpp | 38 ++-- src/sycl/sycl_stream_submit_cpu_primitive.cpp | 12 +- src/sycl/sycl_stream_submit_cpu_primitive.hpp | 2 +- src/{hrt => xpu}/CMakeLists.txt | 2 +- src/{hrt => xpu}/ocl/CMakeLists.txt | 2 +- src/{hrt => xpu}/ocl/capi/engine.cpp | 6 +- src/{hrt => xpu}/ocl/capi/memory.cpp | 0 src/{hrt => xpu}/ocl/capi/primitive.cpp | 6 +- src/{hrt => xpu}/ocl/capi/stream.cpp | 0 src/{hrt => xpu}/ocl/utils.cpp | 12 +- src/{hrt => xpu}/ocl/utils.hpp | 18 +- src/{hrt => xpu}/sycl/CMakeLists.txt | 2 +- .../sycl/buffer_memory_storage.cpp | 16 +- .../sycl/buffer_memory_storage.hpp | 18 +- src/{hrt => xpu}/sycl/c_types_map.hpp | 8 +- src/{hrt => xpu}/sycl/capi/capi_engine.cpp | 6 +- src/{hrt => xpu}/sycl/capi/capi_memory.cpp | 6 +- src/{hrt => xpu}/sycl/capi/capi_primitive.cpp | 0 src/{hrt => xpu}/sycl/capi/capi_stream.cpp | 0 src/{hrt => xpu}/sycl/compat.cpp | 8 +- src/{hrt => xpu}/sycl/compat.hpp | 10 +- src/{hrt => xpu}/sycl/engine_id.hpp | 8 +- src/{hrt => xpu}/sycl/memory_storage.hpp | 8 +- src/{hrt => xpu}/sycl/memory_storage_base.cpp | 6 +- src/{hrt => xpu}/sycl/memory_storage_base.hpp | 12 +- 
.../sycl/memory_storage_helper.hpp | 16 +- src/{hrt => xpu}/sycl/types.hpp | 28 +-- src/{hrt => xpu}/sycl/usm_memory_storage.cpp | 6 +- src/{hrt => xpu}/sycl/usm_memory_storage.hpp | 6 +- src/{hrt => xpu}/sycl/utils.cpp | 10 +- src/{hrt => xpu}/sycl/utils.hpp | 10 +- src/{hrt => xpu}/utils.cpp | 6 +- src/{hrt => xpu}/utils.hpp | 8 +- .../sycl/test_cpp_api_compiled_partition.cpp | 8 +- .../graph/api/sycl/test_cpp_api_engine.cpp | 4 +- .../graph/api/sycl/test_cpp_api_tensor.cpp | 2 +- tests/gtests/graph/api/test_api_common.cpp | 8 +- tests/gtests/graph/api/test_api_common.hpp | 2 +- .../unit/interface/sycl/test_allocator.cpp | 8 +- tests/gtests/graph/unit/unit_test_common.cpp | 4 +- tests/gtests/sycl/api/test_engine.cpp | 4 +- tests/gtests/sycl/api/test_memory_buffer.cpp | 6 +- 152 files changed, 1079 insertions(+), 1079 deletions(-) rename src/{hrt => xpu}/CMakeLists.txt (97%) rename src/{hrt => xpu}/ocl/CMakeLists.txt (96%) rename src/{hrt => xpu}/ocl/capi/engine.cpp (97%) rename src/{hrt => xpu}/ocl/capi/memory.cpp (100%) rename src/{hrt => xpu}/ocl/capi/primitive.cpp (94%) rename src/{hrt => xpu}/ocl/capi/stream.cpp (100%) rename src/{hrt => xpu}/ocl/utils.cpp (98%) rename src/{hrt => xpu}/ocl/utils.hpp (95%) rename src/{hrt => xpu}/sycl/CMakeLists.txt (96%) rename src/{hrt => xpu}/sycl/buffer_memory_storage.cpp (93%) rename src/{hrt => xpu}/sycl/buffer_memory_storage.hpp (86%) rename src/{hrt => xpu}/sycl/c_types_map.hpp (91%) rename src/{hrt => xpu}/sycl/capi/capi_engine.cpp (95%) rename src/{hrt => xpu}/sycl/capi/capi_memory.cpp (97%) rename src/{hrt => xpu}/sycl/capi/capi_primitive.cpp (100%) rename src/{hrt => xpu}/sycl/capi/capi_stream.cpp (100%) rename src/{hrt => xpu}/sycl/compat.cpp (95%) rename src/{hrt => xpu}/sycl/compat.hpp (95%) rename src/{hrt => xpu}/sycl/engine_id.hpp (94%) rename src/{hrt => xpu}/sycl/memory_storage.hpp (82%) rename src/{hrt => xpu}/sycl/memory_storage_base.cpp (95%) rename src/{hrt => xpu}/sycl/memory_storage_base.hpp (90%) 
rename src/{hrt => xpu}/sycl/memory_storage_helper.hpp (91%) rename src/{hrt => xpu}/sycl/types.hpp (93%) rename src/{hrt => xpu}/sycl/usm_memory_storage.cpp (97%) rename src/{hrt => xpu}/sycl/usm_memory_storage.hpp (98%) rename src/{hrt => xpu}/sycl/utils.cpp (98%) rename src/{hrt => xpu}/sycl/utils.hpp (95%) rename src/{hrt => xpu}/utils.cpp (94%) rename src/{hrt => xpu}/utils.hpp (92%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a681ce19e6e..0b74e33d563 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -123,7 +123,7 @@ if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE") endif() if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL") - add_subdirectory(hrt) + add_subdirectory(xpu) endif() if(DNNL_WITH_SYCL) diff --git a/src/gpu/amd/miopen_batch_normalization_executor.hpp b/src/gpu/amd/miopen_batch_normalization_executor.hpp index 5d9ecf58c43..bc67d981be7 100644 --- a/src/gpu/amd/miopen_batch_normalization_executor.hpp +++ b/src/gpu/amd/miopen_batch_normalization_executor.hpp @@ -27,8 +27,8 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" #include "sycl_hip_utils.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -47,22 +47,22 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_dst, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> 
arg_scale_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_shift, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_shift_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_wkspace, bool use_scale, bool use_shift, bool init_global_stats, - hrt::sycl::interop_memory_arg_t arg_mean = {}, - hrt::sycl::interop_memory_arg_t arg_var = {}) const { + xpu::sycl::interop_memory_arg_t arg_mean = {}, + xpu::sycl::interop_memory_arg_t arg_var = {}) const { compat::host_task(cgh, [=](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast(engine); @@ -113,26 +113,26 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_diff_dst, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale_buf, - 
hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wkspace, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> arg_temp_relu, bool use_scale, bool use_shift) const { compat::host_task(cgh, [=](const compat::interop_handle &ih) { @@ -191,7 +191,7 @@ struct bnorm_exec_base_t { void init_scaleshift(hip_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, amd::sycl_hip_stream_t *hip_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale, float val, const size_t n) const { @@ -215,9 +215,9 @@ struct bnorm_exec_base_t { void init_mean_var(hip_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, amd::sycl_hip_stream_t *hip_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_mean, - hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, + xpu::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, const size_t n) const { constexpr T mean_var_val = 0; hip_stream->interop_task([&](::sycl::handler &cgh) { @@ -251,14 +251,14 @@ struct bnorm_exec_fwd_t : public bnorm_exec_base_t { auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_shift = 
CTX_IN_SYCL_MEMORY(DNNL_ARG_SHIFT); - auto arg_shift_buf = hrt::sycl::interop_memory_arg_t< + auto arg_shift_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(shift_buf, cgh); auto arg_wkspace = bnorm_impl->is_training() ? CTX_OUT_SYCL_MEMORY(DNNL_ARG_WORKSPACE) - : hrt::sycl::interop_memory_arg_t< + : xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(); if (!use_global_stats) { @@ -300,13 +300,13 @@ struct bnorm_exec_bwd_t : public bnorm_exec_base_t { auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST); auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_diff_scale = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SCALE); - auto arg_diff_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_diff_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_scale_buf, cgh); auto arg_diff_shift = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SHIFT); - auto arg_diff_shift_buf = hrt::sycl::interop_memory_arg_t< + auto arg_diff_shift_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_shift_buf, cgh); auto arg_wkspace = CTX_IN_SYCL_MEMORY(DNNL_ARG_WORKSPACE); auto arg_temp_relu diff --git a/src/gpu/amd/miopen_binary.cpp b/src/gpu/amd/miopen_binary.cpp index ccb411cf121..8a9af7f2c04 100644 --- a/src/gpu/amd/miopen_binary.cpp +++ b/src/gpu/amd/miopen_binary.cpp @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_binary.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_convolution.cpp 
b/src/gpu/amd/miopen_convolution.cpp index 1f7a236b672..1641e05d6ad 100644 --- a/src/gpu/amd/miopen_convolution.cpp +++ b/src/gpu/amd/miopen_convolution.cpp @@ -19,7 +19,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -42,17 +42,17 @@ status_t miopen_convolution_fwd_t::execute_convolution( memory_tracking::names::key_conv_miopen_filter); auto arg_oscale = CTX_IN_SYCL_MEMORY(DNNL_ARG_ATTR_OUTPUT_SCALES); - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_dst; - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_reorder; if (pd()->use_temp_dst()) { memory_storage_t *temp_dst_mem = scratch_storage.get(); memory_storage_t *temp_reorder_mem = scratch_storage_2.get(); - temp_dst = hrt::sycl::interop_memory_arg_t< + temp_dst = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_dst_mem, cgh); - temp_reorder = hrt::sycl::interop_memory_arg_t< + temp_reorder = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_reorder_mem, cgh); } @@ -149,7 +149,7 @@ status_t miopen_convolution_bwd_weights_t::execute_convolution( auto arg_filter_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_conv_miopen_filter); - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_bias; if (with_bias) { diff --git a/src/gpu/amd/miopen_convolution.hpp b/src/gpu/amd/miopen_convolution.hpp index f1c20907e72..ab8b1aa391d 100644 --- a/src/gpu/amd/miopen_convolution.hpp +++ b/src/gpu/amd/miopen_convolution.hpp @@ -262,7 +262,7 @@ struct miopen_convolution_fwd_t : public primitive_t { private: 
::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } diff --git a/src/gpu/amd/miopen_deconvolution.cpp b/src/gpu/amd/miopen_deconvolution.cpp index b60d109abe3..54350a5bef4 100644 --- a/src/gpu/amd/miopen_deconvolution.cpp +++ b/src/gpu/amd/miopen_deconvolution.cpp @@ -19,7 +19,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" #include "gpu/amd/sycl_hip_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_eltwise.cpp b/src/gpu/amd/miopen_eltwise.cpp index e14181e0c1b..8077f962bc9 100644 --- a/src/gpu/amd/miopen_eltwise.cpp +++ b/src/gpu/amd/miopen_eltwise.cpp @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_eltwise.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_inner_product.cpp b/src/gpu/amd/miopen_inner_product.cpp index 3b72eb50b9e..de83d8065e4 100644 --- a/src/gpu/amd/miopen_inner_product.cpp +++ b/src/gpu/amd/miopen_inner_product.cpp @@ -19,8 +19,8 @@ #include "gpu/amd/miopen_gemm_inner_product.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_lrn.cpp b/src/gpu/amd/miopen_lrn.cpp index e9544de86f3..547b9687b6a 100644 --- a/src/gpu/amd/miopen_lrn.cpp +++ b/src/gpu/amd/miopen_lrn.cpp @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_lrn.hpp" #include 
"gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_matmul_executor.hpp b/src/gpu/amd/miopen_matmul_executor.hpp index 4fd4447f27a..d6638f199eb 100644 --- a/src/gpu/amd/miopen_matmul_executor.hpp +++ b/src/gpu/amd/miopen_matmul_executor.hpp @@ -23,7 +23,7 @@ #include "gpu/amd/sycl_hip_engine.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" #include @@ -44,13 +44,13 @@ struct miopen_matmul_exec_base_t { void interop_task(std::shared_ptr matmul_impl_, engine_t *engine, ::sycl::handler &cgh, amd::sycl_hip_stream_t *hip_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_weights, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_dst, - hrt::sycl::interop_memory_arg_t arg_bias, - hrt::sycl::interop_memory_arg_t arg_scratch) { + xpu::sycl::interop_memory_arg_t arg_bias, + xpu::sycl::interop_memory_arg_t arg_scratch) { compat::host_task(cgh, [=](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast( @@ -106,7 +106,7 @@ struct miopen_matmul_scratch_runtime_args_bias_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< 
::sycl::access::mode::read_write>(*scratch_buff_, cgh); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -130,9 +130,9 @@ struct miopen_matmul_runtime_args_scratch_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -156,7 +156,7 @@ struct miopen_matmul_runtime_args_bias_exec_t auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -178,9 +178,9 @@ struct miopen_matmul_runtime_args_exec_t : public miopen_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -227,7 +227,7 @@ struct miopen_matmul_scratch_exec_t : public miopen_matmul_exec_base_t { auto arg_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_matmul_dst_in_acc_dt); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); interop_task(matmul_impl_, engine, cgh, 
hip_stream, arg_wt, arg_src, @@ -250,7 +250,7 @@ struct miopen_matmul_bias_exec_t : public miopen_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, @@ -272,9 +272,9 @@ struct miopen_matmul_exec_t : public miopen_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); interop_task(matmul_impl_, engine, cgh, hip_stream, arg_wt, arg_src, diff --git a/src/gpu/amd/miopen_pooling.cpp b/src/gpu/amd/miopen_pooling.cpp index 81b9984de28..9cac94827be 100644 --- a/src/gpu/amd/miopen_pooling.cpp +++ b/src/gpu/amd/miopen_pooling.cpp @@ -19,8 +19,8 @@ #include "common/nstl.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_reduction.cpp b/src/gpu/amd/miopen_reduction.cpp index 3eb10cfc217..aa4e596b557 100644 --- a/src/gpu/amd/miopen_reduction.cpp +++ b/src/gpu/amd/miopen_reduction.cpp @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_reduction.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" 
+#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_reorder.cpp b/src/gpu/amd/miopen_reorder.cpp index d7e3b4545c6..c60007f593c 100644 --- a/src/gpu/amd/miopen_reorder.cpp +++ b/src/gpu/amd/miopen_reorder.cpp @@ -19,7 +19,7 @@ #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/miopen_softmax.cpp b/src/gpu/amd/miopen_softmax.cpp index f82398aeefe..ca5c6032cc5 100644 --- a/src/gpu/amd/miopen_softmax.cpp +++ b/src/gpu/amd/miopen_softmax.cpp @@ -18,8 +18,8 @@ #include "gpu/amd/miopen_softmax.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" #include "gpu/amd/sycl_hip_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/amd/sycl_hip_compat.hpp b/src/gpu/amd/sycl_hip_compat.hpp index ccdcf51f09f..1c3f186d56f 100644 --- a/src/gpu/amd/sycl_hip_compat.hpp +++ b/src/gpu/amd/sycl_hip_compat.hpp @@ -19,7 +19,7 @@ #include -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" #include "gpu/amd/sycl_hip_utils.hpp" diff --git a/src/gpu/amd/sycl_hip_engine.cpp b/src/gpu/amd/sycl_hip_engine.cpp index 8399789f0d6..7e9cef7526e 100644 --- a/src/gpu/amd/sycl_hip_engine.cpp +++ b/src/gpu/amd/sycl_hip_engine.cpp @@ -19,8 +19,8 @@ #include "common/utils.hpp" #include "hip/hip_runtime.h" -#include "hrt/sycl/utils.hpp" #include "miopen/miopen.h" +#include "xpu/sycl/utils.hpp" #include "gpu/amd/miopen_batch_normalization.hpp" #include "gpu/amd/miopen_binary.hpp" @@ -136,7 +136,7 @@ rocblas_handle *sycl_hip_engine_t::get_rocblas_handle() { } device_id_t sycl_hip_engine_t::device_id() const { - return 
device_id_t(static_cast(hrt::sycl::backend_t::amd), + return device_id_t(static_cast(xpu::sycl::backend_t::amd), static_cast(compat::get_native(device())), static_cast(0)); } diff --git a/src/gpu/amd/sycl_hip_utils.hpp b/src/gpu/amd/sycl_hip_utils.hpp index a60423a5928..24340313872 100644 --- a/src/gpu/amd/sycl_hip_utils.hpp +++ b/src/gpu/amd/sycl_hip_utils.hpp @@ -29,7 +29,7 @@ #include "common/utils.hpp" #include "common/z_magic.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" #include "gpu/amd/sycl_hip_compat.hpp" diff --git a/src/gpu/intel/compute/compute_engine.hpp b/src/gpu/intel/compute/compute_engine.hpp index 0c108ffa167..fcbf9289197 100644 --- a/src/gpu/intel/compute/compute_engine.hpp +++ b/src/gpu/intel/compute/compute_engine.hpp @@ -33,7 +33,7 @@ #include "gpu/intel/compute/kernel.hpp" #include "gpu/intel/compute/kernel_ctx.hpp" #include "gpu/intel/jit/jit_generator_base.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -81,7 +81,7 @@ class compute_engine_t : public engine_t { }; virtual status_t create_kernel_from_binary(compute::kernel_t &kernel, - const hrt::binary_t &binary, const char *kernel_name) const = 0; + const xpu::binary_t &binary, const char *kernel_name) const = 0; virtual status_t create_kernels_from_cache_blob( const cache_blob_t &cache_blob, diff --git a/src/gpu/intel/compute/device_info.cpp b/src/gpu/intel/compute/device_info.cpp index 3bfb6b59570..c592e5714ed 100644 --- a/src/gpu/intel/compute/device_info.cpp +++ b/src/gpu/intel/compute/device_info.cpp @@ -253,7 +253,7 @@ status_t device_info_t::init_attributes_common(engine_t *engine) { using namespace impl::sycl; if (engine->runtime_kind() == runtime_kind::sycl) { auto *sycl_engine = utils::downcast(engine); - ocl_backend = (sycl_engine->backend() == hrt::sycl::backend_t::opencl); + ocl_backend = (sycl_engine->backend() == xpu::sycl::backend_t::opencl); } #endif diff --git a/src/gpu/intel/compute/kernel.hpp 
b/src/gpu/intel/compute/kernel.hpp index 1350eedd132..03f60dc511d 100644 --- a/src/gpu/intel/compute/kernel.hpp +++ b/src/gpu/intel/compute/kernel.hpp @@ -26,7 +26,7 @@ #include "gpu/intel/compute/kernel_arg_list.hpp" #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/utils.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -61,7 +61,7 @@ class kernel_impl_t { return status::runtime_error; } virtual status_t get_binary( - const engine_t *engine, hrt::binary_t &binary) const { + const engine_t *engine, xpu::binary_t &binary) const { gpu_assert(false) << "unimplemented function get_binary() called"; return status::runtime_error; } @@ -145,7 +145,7 @@ class kernel_t { return impl_->get_binary_size(engine, binary_size); } - status_t get_binary(const engine_t *engine, hrt::binary_t &binary) const { + status_t get_binary(const engine_t *engine, xpu::binary_t &binary) const { return impl_->get_binary(engine, binary); } diff --git a/src/gpu/intel/gpu_primitive.hpp b/src/gpu/intel/gpu_primitive.hpp index 03e14c41d9e..501fa4cb7ac 100644 --- a/src/gpu/intel/gpu_primitive.hpp +++ b/src/gpu/intel/gpu_primitive.hpp @@ -31,7 +31,7 @@ #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/kernel_cache.hpp" #include "gpu/intel/ocl/types_interop.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" #define CTX_GPU_RES_STORAGE(arg) \ (*(ctx.get_resource_mapper() \ @@ -120,7 +120,7 @@ struct gpu_primitive_t : public primitive_t { switch (cb.kind()) { case compute_block_t::kind_t::kernel: { // Get a binary for each kernel within current primitive. 
- hrt::binary_t binary; + xpu::binary_t binary; CHECK(cb.kernel().get_binary(engine, binary)); CHECK(blob.add_binary(binary.data(), binary.size())); break; diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index 94a6101df4a..bf02a62ecc0 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -32,7 +32,7 @@ #include "gpu/intel/jit/jit_generator.hpp" #include "gpu/intel/jit/ngen/ngen.hpp" #include "gpu/intel/jit/ngen/ngen_register_allocator.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -47,10 +47,10 @@ struct ir_generator_t : public jit_generator_base { const char *kernel_name() const override { return kernel_name_.c_str(); } - hrt::binary_t get_binary(cl_context context, cl_device_id device) override { + xpu::binary_t get_binary(cl_context context, cl_device_id device) override { kernel_info_t kernel_info; auto status = kernel_desc_.init_kernel_info(kernel_info); - if (status != status::success) return hrt::binary_t(); + if (status != status::success) return xpu::binary_t(); try { #define CASE(hw) \ case ngen::HW::hw: { \ @@ -68,8 +68,8 @@ struct ir_generator_t : public jit_generator_base { default: gpu_assert(false) << "Unexpected GPU architecture"; } #undef CASE - } catch (ngen::out_of_registers_exception &) { return hrt::binary_t(); } - return hrt::binary_t(); + } catch (ngen::out_of_registers_exception &) { return xpu::binary_t(); } + return xpu::binary_t(); } private: diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index d8642c849fa..4db55c87876 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -703,7 +703,7 @@ void gen_gemm_kernel_t::init_interface() { interface_.externalName(kernel_name()); } -hrt::binary_t gen_gemm_kernel_t::get_binary( +xpu::binary_t gen_gemm_kernel_t::get_binary( cl_context context, cl_device_id device) { 
init_interface(); diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp index 9f1c27a6757..dc1877820d9 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp @@ -25,7 +25,7 @@ #include "gpu/intel/jit/gemm/kernel_evaluator.hpp" #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/kernel_cache.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -160,7 +160,7 @@ struct gen_gemm_kernel_t : public jit_generator_base { : desc_(desc) {} const char *kernel_name() const override { return "gemm_kernel"; } - hrt::binary_t get_binary(cl_context context, cl_device_id device) override; + xpu::binary_t get_binary(cl_context context, cl_device_id device) override; const gen_gemm_kernel_desc_t *desc() const { return &desc_; } diff --git a/src/gpu/intel/jit/jit_generator.hpp b/src/gpu/intel/jit/jit_generator.hpp index 94445fecb0c..81d8012fd6d 100644 --- a/src/gpu/intel/jit/jit_generator.hpp +++ b/src/gpu/intel/jit/jit_generator.hpp @@ -30,7 +30,7 @@ #include "gpu/intel/jit/jit_generator_base.hpp" #include "gpu/intel/jit/utils/ngen_type_bridge.hpp" #include "gpu/intel/jit/utils/utils.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" #include "gpu/intel/jit/ngen/ngen_opencl.hpp" @@ -138,7 +138,7 @@ class jit_generator : public ngen::OpenCLCodeGenerator, return ngen::OpenCLCodeGenerator::getExternalName().c_str(); } - hrt::binary_t get_binary(cl_context context, cl_device_id device) override { + xpu::binary_t get_binary(cl_context context, cl_device_id device) override { return ngen::OpenCLCodeGenerator::getBinary(context, device); } diff --git a/src/gpu/intel/jit/jit_generator_base.hpp b/src/gpu/intel/jit/jit_generator_base.hpp index 592aa99c9ba..883d8fbb69e 100644 --- a/src/gpu/intel/jit/jit_generator_base.hpp +++ b/src/gpu/intel/jit/jit_generator_base.hpp @@ -20,7 +20,7 @@ #include #include -#include "hrt/utils.hpp" 
+#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -31,7 +31,7 @@ namespace jit { struct jit_generator_base { virtual ~jit_generator_base() = default; virtual const char *kernel_name() const = 0; - virtual hrt::binary_t get_binary(cl_context context, cl_device_id device) + virtual xpu::binary_t get_binary(cl_context context, cl_device_id device) = 0; }; diff --git a/src/gpu/intel/ocl/mdapi_utils.cpp b/src/gpu/intel/ocl/mdapi_utils.cpp index 72b984cafbc..385c10e8650 100644 --- a/src/gpu/intel/ocl/mdapi_utils.cpp +++ b/src/gpu/intel/ocl/mdapi_utils.cpp @@ -91,12 +91,12 @@ class mdapi_helper_impl_t { using clCreatePerfCountersCommandQueueINTEL_func_t = cl_command_queue (*)(cl_context, cl_device_id, cl_command_queue_properties, cl_uint, cl_int *); - static hrt::ocl::ext_func_t< + static xpu::ocl::ext_func_t< clCreatePerfCountersCommandQueueINTEL_func_t> create_queue_with_perf_counters( "clCreatePerfCountersCommandQueueINTEL"); auto func = create_queue_with_perf_counters.get_func( - hrt::ocl::get_platform(dev)); + xpu::ocl::get_platform(dev)); if (!func) { *err = CL_INVALID_VALUE; return nullptr; diff --git a/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp b/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp index 2bc08ccf521..07cae4ca57f 100644 --- a/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp +++ b/src/gpu/intel/ocl/ocl_buffer_memory_storage.cpp @@ -84,7 +84,7 @@ status_t ocl_buffer_memory_storage_t::map_data( cl_int err; *mapped_ptr = clEnqueueMapBuffer(queue, mem_object(), CL_TRUE, map_flags, 0, mem_bytes, 0, nullptr, nullptr, &err); - return hrt::ocl::convert_to_dnnl(err); + return xpu::ocl::convert_to_dnnl(err); } status_t ocl_buffer_memory_storage_t::unmap_data( @@ -116,7 +116,7 @@ std::unique_ptr ocl_buffer_memory_storage_t::get_sub_storage( gpu_assert(offset % OCL_BUFFER_ALIGNMENT == 0); cl_buffer_region buffer_region = {base_offset_ + offset, size}; - hrt::ocl::wrapper_t sub_buffer + xpu::ocl::wrapper_t sub_buffer = 
clCreateSubBuffer(parent_mem_object(), mem_flags, CL_BUFFER_CREATE_TYPE_REGION, &buffer_region, &err); gpu_assert(err == CL_SUCCESS); diff --git a/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp b/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp index c242e2e3211..c6c2623caa8 100644 --- a/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp +++ b/src/gpu/intel/ocl/ocl_buffer_memory_storage.hpp @@ -45,7 +45,7 @@ class ocl_buffer_memory_storage_t : public ocl_memory_storage_base_t { } status_t set_data_handle(void *handle) override { - mem_object_ = hrt::ocl::wrapper_t( + mem_object_ = xpu::ocl::wrapper_t( static_cast(handle), true); return status::success; } @@ -71,7 +71,7 @@ class ocl_buffer_memory_storage_t : public ocl_memory_storage_base_t { private: cl_mem parent_mem_object() const; - hrt::ocl::wrapper_t mem_object_; + xpu::ocl::wrapper_t mem_object_; size_t base_offset_ = 0; DNNL_DISALLOW_COPY_AND_ASSIGN(ocl_buffer_memory_storage_t); diff --git a/src/gpu/intel/ocl/ocl_context.hpp b/src/gpu/intel/ocl/ocl_context.hpp index 44820bbe379..81361d89470 100644 --- a/src/gpu/intel/ocl/ocl_context.hpp +++ b/src/gpu/intel/ocl/ocl_context.hpp @@ -28,18 +28,18 @@ namespace ocl { struct ocl_event_t final : compute::event_t { ocl_event_t() = default; - ocl_event_t(const std::vector> &events) + ocl_event_t(const std::vector> &events) : events(events) {} - ocl_event_t(std::vector> &&events) + ocl_event_t(std::vector> &&events) : events(std::move(events)) {} - ocl_event_t(hrt::ocl::wrapper_t &&event) { + ocl_event_t(xpu::ocl::wrapper_t &&event) { events.emplace_back(std::move(event)); } - const hrt::ocl::wrapper_t &operator[](size_t i) const { + const xpu::ocl::wrapper_t &operator[](size_t i) const { return events[i]; } - hrt::ocl::wrapper_t &operator[](size_t i) { return events[i]; } + xpu::ocl::wrapper_t &operator[](size_t i) { return events[i]; } size_t size() const { return events.size(); } static ocl_event_t &from(compute::event_t &event) { @@ -57,12 +57,12 @@ struct ocl_event_t 
final : compute::event_t { events.insert(events.end(), other.events.begin(), other.events.end()); }; - std::vector> events; + std::vector> events; }; struct ocl_context_t final : public gpu::intel::compute::context_t { ocl_context_t() = default; - ocl_context_t(const std::vector> &&events) + ocl_context_t(const std::vector> &&events) : events_(std::move(events)) {}; ocl_context_t(const ocl_context_t &) = default; ~ocl_context_t() = default; @@ -79,7 +79,7 @@ struct ocl_context_t final : public gpu::intel::compute::context_t { return events_; } - void set_deps(std::vector> &&event) { + void set_deps(std::vector> &&event) { events_ = ocl_event_t(std::move(event)); } void set_deps(ocl_event_t &&events) { events_ = std::move(events); }; diff --git a/src/gpu/intel/ocl/ocl_engine.hpp b/src/gpu/intel/ocl/ocl_engine.hpp index ae81b63f7b6..1a5f3e76e6d 100644 --- a/src/gpu/intel/ocl/ocl_engine.hpp +++ b/src/gpu/intel/ocl/ocl_engine.hpp @@ -35,7 +35,7 @@ class ocl_engine_factory_t : public engine_factory_t { size_t count() const override { std::vector ocl_devices; status_t status - = hrt::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + = xpu::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); if (status != status::success) return status; return ocl_devices.size(); } @@ -44,7 +44,7 @@ class ocl_engine_factory_t : public engine_factory_t { status_t status; std::vector ocl_devices; - status = hrt::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); + status = xpu::ocl::get_devices(&ocl_devices, CL_DEVICE_TYPE_GPU); VERROR_ENGINE( status == status::success, status, "no ocl devices found"); diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.cpp b/src/gpu/intel/ocl/ocl_gpu_engine.cpp index ce9146f77d6..2b8bc720247 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.cpp @@ -86,7 +86,7 @@ status_t ocl_gpu_engine_t::init(const std::vector &cache_blob) { OCL_CHECK(err); - CHECK(hrt::ocl::check_device(engine_kind::gpu, device_, context_)); + 
CHECK(xpu::ocl::check_device(engine_kind::gpu, device_, context_)); compute::compute_engine_t::init(cache_blob); return status::success; @@ -139,7 +139,7 @@ status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine, CHECK(cache_blob.get_binary(&binary, &binary_size)); - auto program = hrt::ocl::make_wrapper(clCreateProgramWithBinary( + auto program = xpu::ocl::make_wrapper(clCreateProgramWithBinary( ctx, 1, &dev, &binary_size, &binary, nullptr, &err)); OCL_CHECK(err); err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr); @@ -165,7 +165,7 @@ status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine, // Remove the null terminator as std::string already includes it. kernel_name.pop_back(); } - auto ocl_kernel = hrt::ocl::make_wrapper( + auto ocl_kernel = xpu::ocl::make_wrapper( clCreateKernel(program, kernel_name.c_str(), &err)); OCL_CHECK(err); @@ -229,7 +229,7 @@ inline status_t preprocess_headers( } // namespace status_t ocl_gpu_engine_t::build_program_from_source( - hrt::ocl::wrapper_t &program, const char *code_string, + xpu::ocl::wrapper_t &program, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const { std::string options = kernel_ctx.options(); @@ -252,7 +252,7 @@ status_t ocl_gpu_engine_t::build_program_from_source( debugdump_processed_source( pp_code_str, options, dev_info->get_cl_ext_options()); - program = hrt::ocl::make_wrapper(clCreateProgramWithSource( + program = xpu::ocl::make_wrapper(clCreateProgramWithSource( context(), 1, &pp_code_str_ptr, nullptr, &err)); OCL_CHECK(err); @@ -262,10 +262,10 @@ status_t ocl_gpu_engine_t::build_program_from_source( return status::success; } -status_t ocl_gpu_engine_t::create_binary_from_ocl_source(hrt::binary_t &binary, +status_t ocl_gpu_engine_t::create_binary_from_ocl_source(xpu::binary_t &binary, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const { - hrt::ocl::wrapper_t program; + xpu::ocl::wrapper_t program; 
CHECK(build_program_from_source(program, code_string, kernel_ctx)); CHECK(get_ocl_program_binary(program, device(), binary)); @@ -273,13 +273,13 @@ status_t ocl_gpu_engine_t::create_binary_from_ocl_source(hrt::binary_t &binary, } status_t ocl_gpu_engine_t::create_kernel_from_binary(compute::kernel_t &kernel, - const hrt::binary_t &binary, const char *kernel_name) const { - hrt::ocl::wrapper_t program; - CHECK(hrt::ocl::create_program( + const xpu::binary_t &binary, const char *kernel_name) const { + xpu::ocl::wrapper_t program; + CHECK(xpu::ocl::create_program( program, this->device(), this->context(), binary)); cl_int err; - auto ocl_kernel = hrt::ocl::make_wrapper( + auto ocl_kernel = xpu::ocl::make_wrapper( clCreateKernel(program, kernel_name, &err)); OCL_CHECK(err); @@ -315,7 +315,7 @@ status_t ocl_gpu_engine_t::create_kernel(compute::kernel_t *kernel, return status::success; } - hrt::binary_t binary = jitter->get_binary(context(), device()); + xpu::binary_t binary = jitter->get_binary(context(), device()); if (binary.empty()) return status::runtime_error; return create_kernel_from_binary(*kernel, binary, kernel_name); } @@ -346,14 +346,14 @@ status_t ocl_gpu_engine_t::create_kernels_from_ocl_source( std::vector *kernels, const std::vector &kernel_names, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const { - hrt::ocl::wrapper_t program; + xpu::ocl::wrapper_t program; CHECK(build_program_from_source(program, code_string, kernel_ctx)); *kernels = std::vector(kernel_names.size()); for (size_t i = 0; i < kernel_names.size(); ++i) { if (!kernel_names[i]) continue; cl_int err; - hrt::ocl::wrapper_t ocl_kernel + xpu::ocl::wrapper_t ocl_kernel = clCreateKernel(program, kernel_names[i], &err); OCL_CHECK(err); std::vector arg_types; diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.hpp b/src/gpu/intel/ocl/ocl_gpu_engine.hpp index b5d17fdb746..7cb6882f705 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.hpp @@ -23,7 
+23,7 @@ #include "gpu/intel/compute/compute_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_engine_id.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -48,12 +48,12 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { status_t create_stream(stream_t **stream, unsigned flags) override; status_t create_stream(stream_t **stream, cl_command_queue queue); - status_t create_binary_from_ocl_source(hrt::binary_t &binary, + status_t create_binary_from_ocl_source(xpu::binary_t &binary, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const; status_t create_kernel_from_binary(compute::kernel_t &kernel, - const hrt::binary_t &binary, + const xpu::binary_t &binary, const char *kernel_name) const override; status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob, @@ -118,7 +118,7 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { } protected: - status_t build_program_from_source(hrt::ocl::wrapper_t &program, + status_t build_program_from_source(xpu::ocl::wrapper_t &program, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const; diff --git a/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp b/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp index af6b71804fa..5f471c1722d 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine_id.hpp @@ -49,8 +49,8 @@ struct ocl_gpu_engine_id_impl_t : public engine_id_impl_t { return seed; } - hrt::ocl::wrapper_t device_; - hrt::ocl::wrapper_t context_; + xpu::ocl::wrapper_t device_; + xpu::ocl::wrapper_t context_; }; } // namespace ocl diff --git a/src/gpu/intel/ocl/ocl_gpu_kernel.cpp b/src/gpu/intel/ocl/ocl_gpu_kernel.cpp index 0e0b4a4da46..ff17e3b7701 100644 --- a/src/gpu/intel/ocl/ocl_gpu_kernel.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_kernel.cpp @@ -44,13 +44,13 @@ class kernel_wrapper_t { status_t set_arg(int arg_index, size_t arg_size, const void *arg_value) { cl_int err = 
clSetKernelArg(kernel_, arg_index, arg_size, arg_value); - return hrt::ocl::convert_to_dnnl(err); + return xpu::ocl::convert_to_dnnl(err); } status_t set_svm_arg(int arg_index, const void *arg_value) { #ifdef CL_VERSION_2_0 cl_int err = clSetKernelArgSVMPointer(kernel_, arg_index, arg_value); - return hrt::ocl::convert_to_dnnl(err); + return xpu::ocl::convert_to_dnnl(err); #else // SVM is not supported. UNUSED(arg_index); @@ -92,7 +92,7 @@ class ocl_gpu_kernel_cache_t { // No copy for this thread, clone the original kernel and save the // copy. cl_kernel cloned_kernel; - CHECK(hrt::ocl::clone_kernel(main_kernel_, &cloned_kernel)); + CHECK(xpu::ocl::clone_kernel(main_kernel_, &cloned_kernel)); utils::lock_write_t lock_write(mutex_); auto ret = kernels_.emplace(id, cloned_kernel); @@ -118,7 +118,7 @@ ocl_gpu_kernel_t::~ocl_gpu_kernel_t() { } status_t ocl_gpu_kernel_t::get_binary( - const engine_t *engine, hrt::binary_t &binary) const { + const engine_t *engine, xpu::binary_t &binary) const { auto *ocl_engine = utils::downcast(engine); return get_ocl_program_binary(ocl_kernel(), ocl_engine->device(), binary); } @@ -205,7 +205,7 @@ status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream, cl_uint ndims = static_cast(range.ndims()); if (range.is_zero()) { return status::success; } - hrt::ocl::wrapper_t event; + xpu::ocl::wrapper_t event; if (ocl_stream->flags() & stream_flags::out_of_order) { const auto &event_wrappers = ocl_event_t::from(deps).events; std::vector events( @@ -237,14 +237,14 @@ status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream, } status_t ocl_gpu_kernel_t::dump() const { - hrt::binary_t binary; + xpu::binary_t binary; CHECK(get_ocl_kernel_binary(ocl_kernel(), binary)); CHECK(gpu_utils::dump_kernel_binary(binary, name())); return status::success; } std::string ocl_gpu_kernel_t::name() const { - return hrt::ocl::get_kernel_name(ocl_kernel()); + return xpu::ocl::get_kernel_name(ocl_kernel()); } } // namespace ocl diff --git 
a/src/gpu/intel/ocl/ocl_gpu_kernel.hpp b/src/gpu/intel/ocl/ocl_gpu_kernel.hpp index 36237c11194..86ae7264402 100644 --- a/src/gpu/intel/ocl/ocl_gpu_kernel.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_kernel.hpp @@ -21,7 +21,7 @@ #include #include "gpu/intel/compute/kernel.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -40,7 +40,7 @@ class ocl_gpu_kernel_t : public compute::kernel_impl_t { cl_kernel ocl_kernel() const { return ocl_kernel_; } status_t get_binary( - const engine_t *engine, hrt::binary_t &binary) const override; + const engine_t *engine, xpu::binary_t &binary) const override; status_t get_binary_size( const engine_t *engine, size_t *binary_size) const override; diff --git a/src/gpu/intel/ocl/ocl_stream.cpp b/src/gpu/intel/ocl/ocl_stream.cpp index d3a997d993b..ae1f665bb47 100644 --- a/src/gpu/intel/ocl/ocl_stream.cpp +++ b/src/gpu/intel/ocl/ocl_stream.cpp @@ -131,7 +131,7 @@ status_t ocl_stream_t::copy(const memory_storage_t &src, cl_uint num_events = (cl_uint)events.size(); const cl_event *events_ptr = events.data(); - hrt::ocl::wrapper_t out_event; + xpu::ocl::wrapper_t out_event; bool need_out_event = is_profiling_enabled() || flags() & stream_flags::out_of_order; cl_event *out_event_ptr = need_out_event ? &out_event.unwrap() : nullptr; @@ -247,7 +247,7 @@ status_t ocl_stream_t::copy(const memory_storage_t &src, if (is_profiling_enabled()) { auto ocl_event = utils::make_unique( - std::vector> {out_event}); + std::vector> {out_event}); profiler_->register_event(std::move(ocl_event)); } @@ -274,7 +274,7 @@ status_t ocl_stream_t::fill(const memory_storage_t &dst, uint8_t pattern, cl_uint num_events = (cl_uint)events.size(); const cl_event *events_ptr = events.data(); - hrt::ocl::wrapper_t out_event; + xpu::ocl::wrapper_t out_event; bool need_out_event = is_profiling_enabled() || flags() & stream_flags::out_of_order; cl_event *out_event_ptr = need_out_event ? 
&out_event.unwrap() : nullptr; @@ -295,7 +295,7 @@ status_t ocl_stream_t::fill(const memory_storage_t &dst, uint8_t pattern, if (is_profiling_enabled()) { auto ocl_event = utils::make_unique( - std::vector> {out_event}); + std::vector> {out_event}); profiler_->register_event(std::move(ocl_event)); } diff --git a/src/gpu/intel/ocl/ocl_stream.hpp b/src/gpu/intel/ocl/ocl_stream.hpp index dc1d38d8212..3c0ccd4f01e 100644 --- a/src/gpu/intel/ocl/ocl_stream.hpp +++ b/src/gpu/intel/ocl/ocl_stream.hpp @@ -114,7 +114,7 @@ struct ocl_stream_t : public compute::compute_stream_t { return ocl_ctx(); } - const hrt::ocl::wrapper_t &get_output_event() const { + const xpu::ocl::wrapper_t &get_output_event() const { auto &deps = ocl_event_t::from(ctx().get_deps()); assert(deps.size() == 1); return deps[0]; diff --git a/src/gpu/intel/ocl/ocl_usm_utils.cpp b/src/gpu/intel/ocl/ocl_usm_utils.cpp index 7a239a9f59e..1987155dfaf 100644 --- a/src/gpu/intel/ocl/ocl_usm_utils.cpp +++ b/src/gpu/intel/ocl/ocl_usm_utils.cpp @@ -53,7 +53,7 @@ cl_command_queue get_ocl_queue(stream_t *stream) { bool is_usm_supported(engine_t *engine) { using clSharedMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id, cl_ulong *, size_t, cl_uint, cl_int *); - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clSharedMemAllocINTEL"); return (bool)ext_func.get_func(engine); } @@ -64,7 +64,7 @@ void *malloc_host(engine_t *engine, size_t size) { if (size == 0) return nullptr; - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clHostMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err); @@ -79,7 +79,7 @@ void *malloc_device(engine_t *engine, size_t size) { if (size == 0) return nullptr; - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clDeviceMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine), @@ -95,7 +95,7 @@ void 
*malloc_shared(engine_t *engine, size_t size) { if (size == 0) return nullptr; - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clSharedMemAllocINTEL"); cl_int err; void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine), @@ -109,7 +109,7 @@ void free(engine_t *engine, void *ptr) { using clMemFreeINTEL_func_t = cl_int (*)(cl_context, void *); if (!ptr) return; - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clMemFreeINTEL"); cl_int err = ext_func(engine, get_ocl_context(engine), ptr); assert(err == CL_SUCCESS); @@ -120,9 +120,9 @@ status_t set_kernel_arg_usm(engine_t *engine, cl_kernel kernel, int arg_index, const void *arg_value) { using clSetKernelArgMemPointerINTEL_func_t = cl_int (*)(cl_kernel, cl_uint, const void *); - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clSetKernelArgMemPointerINTEL"); - return hrt::ocl::convert_to_dnnl( + return xpu::ocl::convert_to_dnnl( ext_func(engine, kernel, arg_index, arg_value)); } @@ -131,9 +131,9 @@ status_t memcpy(stream_t *stream, void *dst, const void *src, size_t size, using clEnqueueMemcpyINTEL_func_t = cl_int (*)(cl_command_queue, cl_bool, void *, const void *, size_t, cl_uint, const cl_event *, cl_event *); - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clEnqueueMemcpyINTEL"); - return hrt::ocl::convert_to_dnnl( + return xpu::ocl::convert_to_dnnl( ext_func(stream->engine(), get_ocl_queue(stream), /* blocking */ CL_FALSE, dst, src, size, num_events, events, out_event)); @@ -149,9 +149,9 @@ status_t fill(stream_t *stream, void *ptr, const void *pattern, using clEnqueueMemFillINTEL_func_t = cl_int (*)(cl_command_queue, void *, const void *, size_t, size_t, cl_uint, const cl_event *, cl_event *); - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clEnqueueMemFillINTEL"); - return hrt::ocl::convert_to_dnnl( + return xpu::ocl::convert_to_dnnl( 
ext_func(stream->engine(), get_ocl_queue(stream), ptr, pattern, pattern_size, size, num_events, events, out_event)); } @@ -174,7 +174,7 @@ ocl_usm_kind_t get_pointer_type(engine_t *engine, const void *ptr) { static constexpr cl_uint cl_mem_alloc_type_intel = 0x419A; - static hrt::ocl::ext_func_t ext_func( + static xpu::ocl::ext_func_t ext_func( "clGetMemAllocInfoINTEL"); if (!ptr) return ocl_usm_kind_t::unknown; diff --git a/src/gpu/intel/ocl/ocl_utils.cpp b/src/gpu/intel/ocl/ocl_utils.cpp index b85ba1d7e6c..e1538766651 100644 --- a/src/gpu/intel/ocl/ocl_utils.cpp +++ b/src/gpu/intel/ocl/ocl_utils.cpp @@ -153,7 +153,7 @@ status_t get_ocl_program_binary_size( } status_t get_ocl_program_binary( - cl_program program, cl_device_id device, hrt::binary_t &binary) { + cl_program program, cl_device_id device, xpu::binary_t &binary) { size_t n_devices = 0; CHECK(get_number_devices(program, &n_devices)); @@ -170,9 +170,9 @@ status_t get_ocl_program_binary( size_t device_idx = std::distance( devices.begin(), std::find(devices.begin(), devices.end(), device)); std::vector binary_pointers(n_devices); - std::vector binaries(n_devices); + std::vector binaries(n_devices); for (size_t i = 0; i < n_devices; ++i) { - binaries[i] = hrt::binary_t(binarySize[i]); + binaries[i] = xpu::binary_t(binarySize[i]); binary_pointers[i] = binaries[i].data(); } @@ -184,7 +184,7 @@ status_t get_ocl_program_binary( } status_t get_ocl_program_binary( - cl_kernel kernel, cl_device_id device, hrt::binary_t &binary) { + cl_kernel kernel, cl_device_id device, xpu::binary_t &binary) { cl_int err; cl_program program; @@ -195,7 +195,7 @@ status_t get_ocl_program_binary( return get_ocl_program_binary(program, device, binary); } -status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, hrt::binary_t &binary) { +status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) { binary.clear(); size_t binary_size; OCL_CHECK(clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, 0, diff --git 
a/src/gpu/intel/ocl/ocl_utils.hpp b/src/gpu/intel/ocl/ocl_utils.hpp index 8c24d608ec3..920828349e4 100644 --- a/src/gpu/intel/ocl/ocl_utils.hpp +++ b/src/gpu/intel/ocl/ocl_utils.hpp @@ -30,7 +30,7 @@ #include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/compute/kernel_arg_list.hpp" #include "gpu/intel/compute/utils.hpp" -#include "hrt/ocl/utils.hpp" +#include "xpu/ocl/utils.hpp" namespace dnnl { namespace impl { @@ -52,12 +52,12 @@ cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags, #endif status_t get_ocl_program_binary( - cl_program program, cl_device_id device, hrt::binary_t &binary); + cl_program program, cl_device_id device, xpu::binary_t &binary); status_t get_ocl_program_binary( - cl_kernel kernel, cl_device_id device, hrt::binary_t &binary); + cl_kernel kernel, cl_device_id device, xpu::binary_t &binary); -status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, hrt::binary_t &binary); +status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary); status_t get_ocl_program_binary_size( cl_kernel kernel, cl_device_id device, size_t *size); diff --git a/src/gpu/intel/sycl/compat.cpp b/src/gpu/intel/sycl/compat.cpp index c0f8615fba9..e7e79e09bb8 100644 --- a/src/gpu/intel/sycl/compat.cpp +++ b/src/gpu/intel/sycl/compat.cpp @@ -52,11 +52,11 @@ using namespace gpu::intel::compute; status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, const impl::sycl::sycl_engine_base_t *sycl_engine, - const hrt::binary_t &binary, const char *kernel_name) { - auto backend = hrt::sycl::get_backend(sycl_engine->device()); - if (backend == hrt::sycl::backend_t::opencl) { - hrt::ocl::wrapper_t ocl_program; - CHECK(hrt::ocl::create_program(ocl_program, sycl_engine->ocl_device(), + const xpu::binary_t &binary, const char *kernel_name) { + auto backend = xpu::sycl::get_backend(sycl_engine->device()); + if (backend == xpu::sycl::backend_t::opencl) { + xpu::ocl::wrapper_t ocl_program; + CHECK(xpu::ocl::create_program(ocl_program, 
sycl_engine->ocl_device(), sycl_engine->ocl_context(), binary)); cl_int err; cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err); @@ -64,7 +64,7 @@ status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, sycl_kernel = utils::make_unique<::sycl::kernel>( ::sycl::make_kernel<::sycl::backend::opencl>( ocl_kernel, sycl_engine->context())); - } else if (backend == hrt::sycl::backend_t::level0) { + } else if (backend == xpu::sycl::backend_t::level0) { CHECK(sycl_create_kernel_with_level_zero( sycl_kernel, kernel_name, sycl_engine, binary)); } else { diff --git a/src/gpu/intel/sycl/compat.hpp b/src/gpu/intel/sycl/compat.hpp index a25362e1350..455dcc6fc13 100644 --- a/src/gpu/intel/sycl/compat.hpp +++ b/src/gpu/intel/sycl/compat.hpp @@ -17,7 +17,7 @@ #ifndef GPU_INTEL_SYCL_COMPAT_HPP #define GPU_INTEL_SYCL_COMPAT_HPP -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" #include "gpu/intel/sycl/utils.hpp" @@ -41,7 +41,7 @@ namespace compat { status_t make_kernel(std::unique_ptr<::sycl::kernel> &sycl_kernel, const impl::sycl::sycl_engine_base_t *sycl_engine, - const hrt::binary_t &binary, const char *kernel_name); + const xpu::binary_t &binary, const char *kernel_name); uint64_t init_extensions(const ::sycl::device &dev); diff --git a/src/gpu/intel/sycl/l0/utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp index abaed7640f8..88c837e08cb 100644 --- a/src/gpu/intel/sycl/l0/utils.cpp +++ b/src/gpu/intel/sycl/l0/utils.cpp @@ -152,14 +152,14 @@ status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize, // we query it directly from Level0 with the zeDeviceGetProperties function. // The `get_device_uuid` function packs 128 bits of the device UUID, which are // represented as an uint8_t array of size 16, to 2 uint64_t values. 
-hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { +xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) { static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16, "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16"); auto ze_device_properties = ze_device_properties_t(); ze_device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - auto ze_device = hrt::sycl::compat::get_native(dev); + auto ze_device = xpu::sycl::compat::get_native(dev); auto status = func_zeDeviceGetProperties(ze_device, &ze_device_properties); MAYBE_UNUSED(status); assert(status == status::success); @@ -171,14 +171,14 @@ hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev) { size_t shift = i % sizeof(uint64_t) * CHAR_BIT; uuid[i / sizeof(uint64_t)] |= (((uint64_t)ze_device_id[i]) << shift); } - return hrt::device_uuid_t(uuid[0], uuid[1]); + return xpu::device_uuid_t(uuid[0], uuid[1]); } status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, const std::string &kernel_name, const impl::sycl::sycl_engine_base_t *sycl_engine, - const hrt::binary_t &binary) { + const xpu::binary_t &binary) { auto desc = ze_module_desc_t(); desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; desc.format = ZE_MODULE_FORMAT_NATIVE; @@ -189,9 +189,9 @@ status_t sycl_create_kernel_with_level_zero( ze_module_handle_t ze_module; - auto ze_device = hrt::sycl::compat::get_native( + auto ze_device = xpu::sycl::compat::get_native( sycl_engine->device()); - auto ze_ctx = hrt::sycl::compat::get_native( + auto ze_ctx = xpu::sycl::compat::get_native( sycl_engine->context()); CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr)); @@ -212,8 +212,8 @@ status_t sycl_create_kernel_with_level_zero( } bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) { - auto lhs_ze_handle = hrt::sycl::compat::get_native(lhs); - auto rhs_ze_handle = hrt::sycl::compat::get_native(rhs); + auto lhs_ze_handle = xpu::sycl::compat::get_native(lhs); + auto 
rhs_ze_handle = xpu::sycl::compat::get_native(rhs); return lhs_ze_handle == rhs_ze_handle; } diff --git a/src/gpu/intel/sycl/l0/utils.hpp b/src/gpu/intel/sycl/l0/utils.hpp index c3bcb8c150a..46db51d2313 100644 --- a/src/gpu/intel/sycl/l0/utils.hpp +++ b/src/gpu/intel/sycl/l0/utils.hpp @@ -39,13 +39,13 @@ namespace gpu { namespace intel { namespace sycl { -hrt::device_uuid_t get_device_uuid(const ::sycl::device &dev); +xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev); status_t sycl_create_kernel_with_level_zero( std::unique_ptr<::sycl::kernel> &sycl_kernel, const std::string &kernel_name, const impl::sycl::sycl_engine_base_t *sycl_engine, - const hrt::binary_t &binary); + const xpu::binary_t &binary); bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); diff --git a/src/gpu/intel/sycl/utils.cpp b/src/gpu/intel/sycl/utils.cpp index 0651c3262be..5f687d15f2a 100644 --- a/src/gpu/intel/sycl/utils.cpp +++ b/src/gpu/intel/sycl/utils.cpp @@ -20,7 +20,7 @@ #include "sycl/sycl_engine_base.hpp" #include "gpu/intel/sycl/l0/utils.hpp" -#include "hrt/ocl/utils.hpp" +#include "xpu/ocl/utils.hpp" #include @@ -55,14 +55,14 @@ ::sycl::nd_range<3> to_sycl_nd_range( struct uuid2ocl_dev_t { uuid2ocl_dev_t() = default; - status_t add(hrt::device_uuid_t uuid, - const hrt::ocl::wrapper_t &d) { + status_t add(xpu::device_uuid_t uuid, + const xpu::ocl::wrapper_t &d) { auto it = mapper_.insert(std::make_pair(uuid, d)); if (!it.second) return status::runtime_error; return status::success; } - cl_device_id get(hrt::device_uuid_t uuid) const { + cl_device_id get(xpu::device_uuid_t uuid) const { auto it = mapper_.find(uuid); if (it == mapper_.end()) return nullptr; return it->second; @@ -78,8 +78,8 @@ struct uuid2ocl_dev_t { } private: - using mapper_t = std::unordered_map, hrt::device_uuid_hasher_t>; + using mapper_t = std::unordered_map, xpu::device_uuid_hasher_t>; void release() { auto t = utils::make_unique(); @@ -94,25 +94,25 @@ status_t 
sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { #error "cl_khr_device_uuid is required" #endif using namespace gpu::intel::compute; - assert(hrt::sycl::get_backend(dev) == hrt::sycl::backend_t::level0); - if (hrt::sycl::get_backend(dev) != hrt::sycl::backend_t::level0) + assert(xpu::sycl::get_backend(dev) == xpu::sycl::backend_t::level0); + if (xpu::sycl::get_backend(dev) != xpu::sycl::backend_t::level0) return status::runtime_error; static const uuid2ocl_dev_t uuid2ocl_dev = []() { auto uuid2ocl_dev_tmp = uuid2ocl_dev_t(); std::vector ocl_devices; - std::vector> ocl_sub_devices; - auto st = hrt::ocl::get_devices( + std::vector> ocl_sub_devices; + auto st = xpu::ocl::get_devices( &ocl_devices, &ocl_sub_devices, CL_DEVICE_TYPE_GPU); assert(st == status::success); MAYBE_UNUSED(st); const auto register_ocl_dev = [&uuid2ocl_dev_tmp]( - const hrt::ocl::wrapper_t &d) { - hrt::device_uuid_t ocl_dev_uuid; - auto st = hrt::ocl::get_device_uuid(ocl_dev_uuid, d); + const xpu::ocl::wrapper_t &d) { + xpu::device_uuid_t ocl_dev_uuid; + auto st = xpu::ocl::get_device_uuid(ocl_dev_uuid, d); assert(st == status::success); st = uuid2ocl_dev_tmp.add(ocl_dev_uuid, d); assert(st == status::success); @@ -120,7 +120,7 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { }; for (cl_device_id d : ocl_devices) { - register_ocl_dev(hrt::ocl::make_wrapper(d)); + register_ocl_dev(xpu::ocl::make_wrapper(d)); } for (const auto &sd_wrapper : ocl_sub_devices) { register_ocl_dev(sd_wrapper); @@ -131,7 +131,7 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) { if (uuid2ocl_dev.empty()) return status::runtime_error; - const hrt::device_uuid_t l0_dev_uuid + const xpu::device_uuid_t l0_dev_uuid = gpu::intel::sycl::get_device_uuid(dev); auto d = uuid2ocl_dev.get(l0_dev_uuid); @@ -148,28 +148,28 @@ static status_t create_ocl_engine( const ::sycl::device &sycl_dev, const ::sycl::context *sycl_ctx = nullptr) { 
gpu::intel::ocl::ocl_engine_factory_t f(engine_kind::gpu); - const auto backend = hrt::sycl::get_backend(sycl_dev); + const auto backend = xpu::sycl::get_backend(sycl_dev); // The SYCL context is always provided for OpenCL backend. - if (backend == hrt::sycl::backend_t::opencl && !sycl_ctx) + if (backend == xpu::sycl::backend_t::opencl && !sycl_ctx) return status::runtime_error; - hrt::ocl::wrapper_t ocl_dev; - hrt::ocl::wrapper_t ocl_ctx; + xpu::ocl::wrapper_t ocl_dev; + xpu::ocl::wrapper_t ocl_ctx; switch (backend) { - case hrt::sycl::backend_t::opencl: - ocl_dev = hrt::ocl::make_wrapper( - hrt::sycl::compat::get_native(sycl_dev)); - ocl_ctx = hrt::ocl::make_wrapper( - hrt::sycl::compat::get_native(*sycl_ctx)); + case xpu::sycl::backend_t::opencl: + ocl_dev = xpu::ocl::make_wrapper( + xpu::sycl::compat::get_native(sycl_dev)); + ocl_ctx = xpu::ocl::make_wrapper( + xpu::sycl::compat::get_native(*sycl_ctx)); break; - case hrt::sycl::backend_t::level0: { + case xpu::sycl::backend_t::level0: { cl_device_id d {nullptr}; CHECK(sycl_dev2ocl_dev(&d, sycl_dev)); - ocl_dev = hrt::ocl::make_wrapper(d, true); + ocl_dev = xpu::ocl::make_wrapper(d, true); cl_int err; - ocl_ctx = hrt::ocl::make_wrapper( + ocl_ctx = xpu::ocl::make_wrapper( clCreateContext(nullptr, 1, &d, nullptr, nullptr, &err)); OCL_CHECK(err); break; @@ -178,7 +178,7 @@ static status_t create_ocl_engine( } engine_t *ocl_engine_ptr; size_t index; - CHECK(hrt::ocl::get_device_index(&index, ocl_dev)); + CHECK(xpu::ocl::get_device_index(&index, ocl_dev)); CHECK(f.engine_create(&ocl_engine_ptr, ocl_dev, ocl_ctx, index)); ocl_engine->reset(utils::downcast( ocl_engine_ptr)); @@ -194,17 +194,17 @@ status_t create_ocl_engine( } status_t get_kernel_binary( - const ::sycl::kernel &kernel, hrt::binary_t &binary) { + const ::sycl::kernel &kernel, xpu::binary_t &binary) { auto devs = kernel.get_context().get_devices(); assert(!devs.empty()); - switch (hrt::sycl::get_backend(devs[0])) { - case hrt::sycl::backend_t::level0: { 
+ switch (xpu::sycl::get_backend(devs[0])) { + case xpu::sycl::backend_t::level0: { auto bundle = kernel.get_kernel_bundle(); auto module_vec = ::sycl::get_native< ::sycl::backend::ext_oneapi_level_zero>(bundle); auto module = module_vec[0]; size_t module_binary_size; - hrt::binary_t module_binary; + xpu::binary_t module_binary; CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary( module, &module_binary_size, nullptr)); module_binary.resize(module_binary_size); @@ -215,15 +215,15 @@ status_t get_kernel_binary( engine_deleter_t> ocl_engine; CHECK(create_ocl_engine(&ocl_engine, devs[0])); - hrt::ocl::wrapper_t ocl_program; - CHECK(hrt::ocl::create_program(ocl_program, + xpu::ocl::wrapper_t ocl_program; + CHECK(xpu::ocl::create_program(ocl_program, ocl_engine->device(), ocl_engine->context(), module_binary)); cl_int err; auto name = kernel.get_info< ::sycl::info::kernel::function_name>(); - auto ocl_kernel = hrt::ocl::make_wrapper( + auto ocl_kernel = xpu::ocl::make_wrapper( clCreateKernel(ocl_program, name.c_str(), &err)); OCL_CHECK(err); CHECK(gpu::intel::ocl::get_ocl_kernel_binary( @@ -231,7 +231,7 @@ status_t get_kernel_binary( } return status::success; } - case hrt::sycl::backend_t::opencl: { + case xpu::sycl::backend_t::opencl: { auto ocl_kernel = ::sycl::get_native<::sycl::backend::opencl>(kernel); CHECK(gpu::intel::ocl::get_ocl_kernel_binary(ocl_kernel, binary)); diff --git a/src/gpu/intel/sycl/utils.hpp b/src/gpu/intel/sycl/utils.hpp index 49a2787b1d6..23915a3b913 100644 --- a/src/gpu/intel/sycl/utils.hpp +++ b/src/gpu/intel/sycl/utils.hpp @@ -19,7 +19,7 @@ #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/ocl/ocl_gpu_engine.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { @@ -43,7 +43,7 @@ status_t create_ocl_engine( *ocl_engine, const impl::sycl::sycl_engine_base_t *engine); -status_t get_kernel_binary(const ::sycl::kernel &kernel, hrt::binary_t &binary); +status_t get_kernel_binary(const 
::sycl::kernel &kernel, xpu::binary_t &binary); status_t create_ocl_engine( std::unique_ptr diff --git a/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp index b16926189ee..45cc05edbde 100644 --- a/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp +++ b/src/gpu/nvidia/cudnn_batch_normalization_executor.hpp @@ -27,8 +27,8 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" #include "sycl_cuda_utils.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -47,22 +47,22 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t *cuda_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_dst, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_shift, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_shift_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_wkspace, bool use_scale, bool use_shift, bool init_global_stats, - hrt::sycl::interop_memory_arg_t arg_mean = {}, - hrt::sycl::interop_memory_arg_t arg_var = {}) const { + 
xpu::sycl::interop_memory_arg_t arg_mean = {}, + xpu::sycl::interop_memory_arg_t arg_var = {}) const { compat::host_task(cgh, [=, this](const compat::interop_handle &ih) { auto &sycl_engine = *utils::downcast(engine); @@ -112,26 +112,26 @@ struct bnorm_exec_base_t { std::shared_ptr bnorm_impl, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t *cuda_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_diff_dst, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_scale_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_shift_buf, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wkspace, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> arg_temp_relu, bool use_scale, bool use_shift) const { compat::host_task(cgh, 
[=, this](const compat::interop_handle &ih) { @@ -190,7 +190,7 @@ struct bnorm_exec_base_t { void init_scaleshift(cuda_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, nvidia::sycl_cuda_stream_t *cuda_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_scale, float val, const size_t n) const { cuda_stream->interop_task([&](::sycl::handler &cgh) { @@ -214,9 +214,9 @@ struct bnorm_exec_base_t { void init_mean_var(cuda_sycl_scoped_context_handler_t &sc, const compat::interop_handle &ih, nvidia::sycl_cuda_stream_t *cuda_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_mean, - hrt::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, + xpu::sycl::interop_memory_arg_t<::sycl::access_mode::write> arg_var, const size_t n) const { constexpr T mean_var_val = 0; cuda_stream->interop_task([&](::sycl::handler &cgh) { @@ -250,14 +250,14 @@ struct bnorm_exec_fwd_t : public bnorm_exec_base_t { auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_shift = CTX_IN_SYCL_MEMORY(DNNL_ARG_SHIFT); - auto arg_shift_buf = hrt::sycl::interop_memory_arg_t< + auto arg_shift_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(shift_buf, cgh); auto arg_wkspace = bnorm_impl->is_training() ? 
CTX_OUT_SYCL_MEMORY(DNNL_ARG_WORKSPACE) - : hrt::sycl::interop_memory_arg_t< + : xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(); if (!use_global_stats) { @@ -299,13 +299,13 @@ struct bnorm_exec_bwd_t : public bnorm_exec_base_t { auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST); auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC); auto arg_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_SCALE); - auto arg_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(scale_buf, cgh); auto arg_diff_scale = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SCALE); - auto arg_diff_scale_buf = hrt::sycl::interop_memory_arg_t< + auto arg_diff_scale_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_scale_buf, cgh); auto arg_diff_shift = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SHIFT); - auto arg_diff_shift_buf = hrt::sycl::interop_memory_arg_t< + auto arg_diff_shift_buf = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::write>(diff_shift_buf, cgh); auto arg_wkspace = CTX_IN_SYCL_MEMORY(DNNL_ARG_WORKSPACE); auto arg_temp_relu diff --git a/src/gpu/nvidia/cudnn_binary.cpp b/src/gpu/nvidia/cudnn_binary.cpp index 015a8aa434f..31f38aac3ad 100644 --- a/src/gpu/nvidia/cudnn_binary.cpp +++ b/src/gpu/nvidia/cudnn_binary.cpp @@ -20,8 +20,8 @@ #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_convolution.cpp b/src/gpu/nvidia/cudnn_convolution.cpp index d6d1966f2e9..4913f494bda 100644 --- a/src/gpu/nvidia/cudnn_convolution.cpp +++ b/src/gpu/nvidia/cudnn_convolution.cpp @@ -19,7 +19,7 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include 
"gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { @@ -47,27 +47,27 @@ status_t cudnn_convolution_fwd_t::execute_convolution( auto arg_dst_scale = CTX_IN_SYCL_MEMORY(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST); - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_dst; - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> temp_reorder; if (pd()->use_temp_dst()) { memory_storage_t *temp_dst_mem = scratch_storage.get(); memory_storage_t *temp_reorder_mem = scratch_storage_2.get(); - temp_dst = hrt::sycl::interop_memory_arg_t< + temp_dst = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_dst_mem, cgh); - temp_reorder = hrt::sycl::interop_memory_arg_t< + temp_reorder = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(temp_reorder_mem, cgh); } - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write> y_fp32_data; if (!arg_dst_scale.empty() || !arg_src_scale.empty() || !arg_wei_scale.empty()) { memory_storage_t *y_fp32_data_mem = scratch_storage_3.get(); - y_fp32_data = hrt::sycl::interop_memory_arg_t< + y_fp32_data = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(y_fp32_data_mem, cgh); } @@ -167,7 +167,7 @@ status_t cudnn_convolution_bwd_weights_t::execute_convolution( auto arg_filter_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_conv_cudnn_filter); - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_diff_bias; if (with_bias) { diff --git a/src/gpu/nvidia/cudnn_convolution.hpp b/src/gpu/nvidia/cudnn_convolution.hpp index 
741a01fde2e..fa79a5d4f49 100644 --- a/src/gpu/nvidia/cudnn_convolution.hpp +++ b/src/gpu/nvidia/cudnn_convolution.hpp @@ -208,7 +208,7 @@ struct cudnn_convolution_fwd_t : public primitive_t { private: ::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } diff --git a/src/gpu/nvidia/cudnn_deconvolution.cpp b/src/gpu/nvidia/cudnn_deconvolution.cpp index 1192184c2b7..56b4d92753b 100644 --- a/src/gpu/nvidia/cudnn_deconvolution.cpp +++ b/src/gpu/nvidia/cudnn_deconvolution.cpp @@ -19,7 +19,7 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_eltwise.cpp b/src/gpu/nvidia/cudnn_eltwise.cpp index 0e21f4c5daa..9d632e27fc3 100644 --- a/src/gpu/nvidia/cudnn_eltwise.cpp +++ b/src/gpu/nvidia/cudnn_eltwise.cpp @@ -18,8 +18,8 @@ #include "gpu/nvidia/cudnn_eltwise.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_inner_product.cpp b/src/gpu/nvidia/cudnn_inner_product.cpp index d4f3c63a651..4408f11d3bf 100644 --- a/src/gpu/nvidia/cudnn_inner_product.cpp +++ b/src/gpu/nvidia/cudnn_inner_product.cpp @@ -20,8 +20,8 @@ #include "gpu/nvidia/cudnn_gemm_inner_product.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace 
dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_lrn.cpp b/src/gpu/nvidia/cudnn_lrn.cpp index d90c10ad95b..cc47af6dcfb 100644 --- a/src/gpu/nvidia/cudnn_lrn.cpp +++ b/src/gpu/nvidia/cudnn_lrn.cpp @@ -18,9 +18,9 @@ #include "gpu/nvidia/cudnn_lrn.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_matmul_executor.hpp b/src/gpu/nvidia/cudnn_matmul_executor.hpp index 1601cc57e76..e26a5ded117 100644 --- a/src/gpu/nvidia/cudnn_matmul_executor.hpp +++ b/src/gpu/nvidia/cudnn_matmul_executor.hpp @@ -23,7 +23,7 @@ #include "gpu/nvidia/sycl_cuda_engine.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" #include @@ -44,18 +44,18 @@ struct cudnn_matmul_exec_base_t { void interop_task(std::shared_ptr matmul_impl_, engine_t *engine, ::sycl::handler &cgh, nvidia::sycl_cuda_stream_t *cuda_stream, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_weights, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write> arg_dst, - hrt::sycl::interop_memory_arg_t arg_bias, - hrt::sycl::interop_memory_arg_t arg_scratch, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t arg_bias, + xpu::sycl::interop_memory_arg_t arg_scratch, + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_src_scale, - 
hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_wei_scale, - hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read> + xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read> arg_dst_scale) { compat::host_task(cgh, [=](const compat::interop_handle &ih) { @@ -132,7 +132,7 @@ struct cudnn_matmul_scratch_runtime_args_bias_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); auto arg_src_scale @@ -164,9 +164,9 @@ struct cudnn_matmul_runtime_args_scratch_exec_t auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(*scratch_buff_, cgh); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); auto arg_src_scale @@ -197,7 +197,7 @@ struct cudnn_matmul_runtime_args_bias_exec_t : public cudnn_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -227,9 +227,9 @@ struct cudnn_matmul_runtime_args_exec_t : public cudnn_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto 
arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -292,7 +292,7 @@ struct cudnn_matmul_scratch_exec_t : public cudnn_matmul_exec_base_t { auto arg_scratch = CTX_SCRATCH_SYCL_MEMORY( memory_tracking::names::key_matmul_dst_in_acc_dt); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); auto arg_src_scale @@ -323,7 +323,7 @@ struct cudnn_matmul_bias_exec_t : public cudnn_matmul_exec_base_t { auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); auto arg_bias = CTX_IN_SYCL_MEMORY(DNNL_ARG_BIAS); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale @@ -353,9 +353,9 @@ struct cudnn_matmul_exec_t : public cudnn_matmul_exec_base_t { auto arg_wt = CTX_IN_SYCL_MEMORY(DNNL_ARG_WEIGHTS); auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST); - auto arg_bias = hrt::sycl::interop_memory_arg_t< + auto arg_bias = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read>(); - auto arg_scratch = hrt::sycl::interop_memory_arg_t< + auto arg_scratch = xpu::sycl::interop_memory_arg_t< ::sycl::access::mode::read_write>(); auto arg_src_scale diff --git a/src/gpu/nvidia/cudnn_pooling.cpp b/src/gpu/nvidia/cudnn_pooling.cpp index b1234ff9ef9..d0ef4040131 100644 --- a/src/gpu/nvidia/cudnn_pooling.cpp +++ b/src/gpu/nvidia/cudnn_pooling.cpp @@ -18,11 +18,11 @@ #include "gpu/nvidia/cudnn_pooling.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" #include "common/nstl.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_reduction.cpp 
b/src/gpu/nvidia/cudnn_reduction.cpp index 9cef33721a9..771f2b851ca 100644 --- a/src/gpu/nvidia/cudnn_reduction.cpp +++ b/src/gpu/nvidia/cudnn_reduction.cpp @@ -18,7 +18,7 @@ #include "gpu/nvidia/cudnn_reduction.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_reorder.cpp b/src/gpu/nvidia/cudnn_reorder.cpp index 44166e28023..57458a2117f 100644 --- a/src/gpu/nvidia/cudnn_reorder.cpp +++ b/src/gpu/nvidia/cudnn_reorder.cpp @@ -20,7 +20,7 @@ #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_resampling.cpp b/src/gpu/nvidia/cudnn_resampling.cpp index dd9ee7be29d..e825d73075b 100644 --- a/src/gpu/nvidia/cudnn_resampling.cpp +++ b/src/gpu/nvidia/cudnn_resampling.cpp @@ -18,8 +18,8 @@ #include "gpu/nvidia/cudnn_resampling.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/cudnn_resampling.hpp b/src/gpu/nvidia/cudnn_resampling.hpp index ac3648ddff8..1b02240bbb1 100644 --- a/src/gpu/nvidia/cudnn_resampling.hpp +++ b/src/gpu/nvidia/cudnn_resampling.hpp @@ -57,12 +57,12 @@ struct cudnn_resampling_base_t : public primitive_t { }; ::sycl::buffer &buffer(memory_storage_t *mem_storage) { - return utils::downcast( + return utils::downcast( mem_storage) ->buffer(); } ::sycl::buffer &buffer(memory_storage_t *mem_storage) const { - return utils::downcast( + return utils::downcast( 
mem_storage) ->buffer(); } diff --git a/src/gpu/nvidia/cudnn_softmax.cpp b/src/gpu/nvidia/cudnn_softmax.cpp index 776c7c097b5..7abaa36a7cb 100644 --- a/src/gpu/nvidia/cudnn_softmax.cpp +++ b/src/gpu/nvidia/cudnn_softmax.cpp @@ -19,8 +19,8 @@ #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" #include "gpu/nvidia/sycl_cuda_stream_utils.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_compat.hpp b/src/gpu/nvidia/sycl_cuda_compat.hpp index 46738790b46..197008123c0 100644 --- a/src/gpu/nvidia/sycl_cuda_compat.hpp +++ b/src/gpu/nvidia/sycl_cuda_compat.hpp @@ -20,7 +20,7 @@ #include -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp index 3c5cac94234..e515cda7791 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.cpp +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -18,7 +18,7 @@ #include "common/impl_list_item.hpp" #include "common/utils.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" #include "gpu/nvidia/cudnn_batch_normalization.hpp" #include "gpu/nvidia/cudnn_binary.hpp" @@ -147,7 +147,7 @@ cublasHandle_t *sycl_cuda_engine_t::get_cublas_handle() { } device_id_t sycl_cuda_engine_t::device_id() const { - return device_id_t(static_cast(hrt::sycl::backend_t::nvidia), + return device_id_t(static_cast(xpu::sycl::backend_t::nvidia), static_cast(compat::get_native(device())), static_cast(0)); } diff --git a/src/gpu/nvidia/sycl_cuda_stream_utils.hpp b/src/gpu/nvidia/sycl_cuda_stream_utils.hpp index 235c368e949..e8cffbe31a8 100644 --- a/src/gpu/nvidia/sycl_cuda_stream_utils.hpp +++ b/src/gpu/nvidia/sycl_cuda_stream_utils.hpp @@ -19,8 +19,8 @@ #include 
"gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/memory_storage_helper.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/memory_storage_helper.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/nvidia/sycl_cuda_utils.hpp b/src/gpu/nvidia/sycl_cuda_utils.hpp index 6e6a81d2974..a4baad172c3 100644 --- a/src/gpu/nvidia/sycl_cuda_utils.hpp +++ b/src/gpu/nvidia/sycl_cuda_utils.hpp @@ -30,7 +30,7 @@ #include "common/primitive_attr.hpp" #include "common/z_magic.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" #include "gpu/nvidia/sycl_cuda_compat.hpp" @@ -40,19 +40,19 @@ namespace gpu { namespace nvidia { #define CTX_OUT_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ &CTX_OUT_STORAGE(arg)) \ ->buffer() \ .get_access<::sycl::access::mode::write>(cgh) #define CTX_IN_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ &CTX_IN_STORAGE(arg)) \ ->buffer() \ .get_access<::sycl::access::mode::read>(cgh) #define CTX_SCRATCH_ACCESSOR(arg) \ - utils::downcast( \ + utils::downcast( \ ctx.get_scratchpad_grantor().get_memory_storage(arg).get()) \ ->buffer() \ .get_access<::sycl::access::mode::read_write>(cgh) @@ -325,7 +325,7 @@ ::sycl::event copy(::sycl::queue &q, T *src, ::sycl::buffer &dst) { auto event = q.submit([&, src](::sycl::handler &cgh) { // Retrieve a write accessor to a global buffer auto acc = dst.template get_access<::sycl::access::mode::write, - hrt::sycl::compat::target_device>(cgh); + xpu::sycl::compat::target_device>(cgh); // Copy from the input pointer into the buffer associated with the // accessor cgh.copy(src, acc); @@ -339,7 +339,7 @@ ::sycl::event copy(::sycl::queue &q, ::sycl::buffer &src, T *dst) { auto event = q.submit([&, dst](::sycl::handler &cgh) { // Retrieve a read accessor to a global buffer auto acc = src.template get_access<::sycl::access::mode::read, - 
hrt::sycl::compat::target_device>(cgh); + xpu::sycl::compat::target_device>(cgh); // Copy from the buffer associated with the accessor into the output // pointer cgh.copy(acc, dst); diff --git a/src/gpu/sycl/batch_normalizations_kernels.hpp b/src/gpu/sycl/batch_normalizations_kernels.hpp index 1bac580a832..b0a9c19ff5d 100644 --- a/src/gpu/sycl/batch_normalizations_kernels.hpp +++ b/src/gpu/sycl/batch_normalizations_kernels.hpp @@ -24,7 +24,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -50,10 +50,10 @@ inline float maybe_up_convert(bfloat16_t x) { struct batch_normalization_fwd_kernel_vec_t { batch_normalization_fwd_kernel_vec_t( const sycl_batch_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::in_memory_arg_t &shift, hrt::sycl::in_memory_arg_t &stat, - hrt::sycl::in_memory_arg_t &var, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::out_memory_arg_t &ws, hrt::sycl::in_memory_arg_t &src1) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::in_memory_arg_t &shift, xpu::sycl::in_memory_arg_t &stat, + xpu::sycl::in_memory_arg_t &var, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::out_memory_arg_t &ws, xpu::sycl::in_memory_arg_t &src1) : conf_(conf) , data_(data) , scale_(scale) @@ -72,18 +72,18 @@ struct batch_normalization_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } - const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &src1_md() const { return conf_.src1_md; } + const xpu::sycl::md_t &ws_md() const { return conf_.ws_md; } + const 
xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const float epsilon() const { return conf_.batch_norm_epsilon; } - inline static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + inline static dim_t DATA_OFF(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); @@ -161,7 +161,7 @@ struct batch_normalization_fwd_kernel_vec_t { if (data_md().data_type() == data_type::s8) { bn_res = ::dnnl::impl::sycl::qz_a1b0::type>()( + xpu::sycl::prec_traits::type>()( maybe_post_op(bn_res)); store_float_value( dst_md().data_type(), bn_res, dst_ptr(), d_off); @@ -185,24 +185,24 @@ struct batch_normalization_fwd_kernel_vec_t { sycl_batch_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::in_memory_arg_t shift_; - hrt::sycl::in_memory_arg_t stat_; - hrt::sycl::in_memory_arg_t var_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::out_memory_arg_t ws_; - hrt::sycl::in_memory_arg_t src1_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::in_memory_arg_t shift_; + xpu::sycl::in_memory_arg_t stat_; + xpu::sycl::in_memory_arg_t var_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::out_memory_arg_t ws_; + xpu::sycl::in_memory_arg_t src1_; }; struct batch_normalization_fwd_kernel_vec_t1 { batch_normalization_fwd_kernel_vec_t1( const sycl_batch_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::in_memory_arg_t &shift, 
hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::out_memory_arg_t &mean_out, - hrt::sycl::out_memory_arg_t &var_out, - hrt::sycl::out_memory_arg_t &ws, hrt::sycl::in_memory_arg_t &src1) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::in_memory_arg_t &shift, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::out_memory_arg_t &mean_out, + xpu::sycl::out_memory_arg_t &var_out, + xpu::sycl::out_memory_arg_t &ws, xpu::sycl::in_memory_arg_t &src1) : conf_(conf) , data_(data) , scale_(scale) @@ -221,18 +221,18 @@ struct batch_normalization_fwd_kernel_vec_t1 { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } - const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &src1_md() const { return conf_.src1_md; } + const xpu::sycl::md_t &ws_md() const { return conf_.ws_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const float epsilon() const { return conf_.batch_norm_epsilon; } - inline static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + inline static dim_t DATA_OFF(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); @@ -339,7 +339,7 @@ struct batch_normalization_fwd_kernel_vec_t1 { if (data_md().data_type() == data_type::s8) { bn_res = ::dnnl::impl::sycl::qz_a1b0::type>()( + 
xpu::sycl::prec_traits::type>()( maybe_post_op(bn_res)); store_float_value( dst_md().data_type(), bn_res, dst_ptr(), d_off); @@ -371,28 +371,28 @@ struct batch_normalization_fwd_kernel_vec_t1 { sycl_batch_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::in_memory_arg_t shift_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::out_memory_arg_t mean_out_; - hrt::sycl::out_memory_arg_t var_out_; - hrt::sycl::out_memory_arg_t ws_; - hrt::sycl::in_memory_arg_t src1_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::in_memory_arg_t shift_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::out_memory_arg_t mean_out_; + xpu::sycl::out_memory_arg_t var_out_; + xpu::sycl::out_memory_arg_t ws_; + xpu::sycl::in_memory_arg_t src1_; }; struct batch_normalization_bwd_kernel_vec_t { batch_normalization_bwd_kernel_vec_t( const sycl_batch_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, - hrt::sycl::out_memory_arg_t &diff_data, - hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::out_memory_arg_t &diff_scale, - hrt::sycl::out_memory_arg_t &diff_shift, - hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::in_memory_arg_t &dst, hrt::sycl::in_memory_arg_t &ws, - hrt::sycl::in_memory_arg_t &diff_src1) + xpu::sycl::in_memory_arg_t &data, + xpu::sycl::out_memory_arg_t &diff_data, + xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::out_memory_arg_t &diff_scale, + xpu::sycl::out_memory_arg_t &diff_shift, + xpu::sycl::in_memory_arg_t &stat, xpu::sycl::in_memory_arg_t &var, + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::in_memory_arg_t &dst, xpu::sycl::in_memory_arg_t &ws, + xpu::sycl::in_memory_arg_t &diff_src1) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -414,20 +414,20 @@ struct batch_normalization_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - 
const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } - const hrt::sycl::md_t &diff_src1_md() const { return conf_.diff_src1_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_md; } - const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const xpu::sycl::md_t &diff_src1_md() const { return conf_.diff_src1_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_md; } + const xpu::sycl::md_t &ws_md() const { return conf_.ws_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &diff_data_scaleshift_md() const { + const xpu::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.batch_norm_epsilon; } @@ -443,7 +443,7 @@ struct batch_normalization_bwd_kernel_vec_t { void *dst_ptr() const { return dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t DATA_OFF(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, + static dim_t DATA_OFF(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 1: return mdw.off(n); @@ -577,17 +577,17 @@ struct batch_normalization_bwd_kernel_vec_t { sycl_batch_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t 
data_; - hrt::sycl::out_memory_arg_t diff_data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::out_memory_arg_t diff_scale_; - hrt::sycl::out_memory_arg_t diff_shift_; - hrt::sycl::in_memory_arg_t stat_; - hrt::sycl::in_memory_arg_t var_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::in_memory_arg_t dst_; - hrt::sycl::in_memory_arg_t ws_; - hrt::sycl::in_memory_arg_t diff_src1_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t diff_data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::out_memory_arg_t diff_scale_; + xpu::sycl::out_memory_arg_t diff_shift_; + xpu::sycl::in_memory_arg_t stat_; + xpu::sycl::in_memory_arg_t var_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::in_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t ws_; + xpu::sycl::in_memory_arg_t diff_src1_; }; } // namespace sycl diff --git a/src/gpu/sycl/binary_kernels.hpp b/src/gpu/sycl/binary_kernels.hpp index 0be3af1fddf..8fb556fab63 100644 --- a/src/gpu/sycl/binary_kernels.hpp +++ b/src/gpu/sycl/binary_kernels.hpp @@ -21,7 +21,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,10 +33,10 @@ struct binary_kernel_vec_t { static constexpr int max_supported_ndims = 5; binary_kernel_vec_t(const sycl_binary_conf_t &conf, - hrt::sycl::in_memory_arg_t &src0, hrt::sycl::in_memory_arg_t &src1, - hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::in_memory_arg_t &src0_scale, - hrt::sycl::in_memory_arg_t &src1_scale, data_type_t scales_dt) + xpu::sycl::in_memory_arg_t &src0, xpu::sycl::in_memory_arg_t &src1, + xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::in_memory_arg_t &src0_scale, + xpu::sycl::in_memory_arg_t &src1_scale, data_type_t scales_dt) : conf_(conf) , src0_(src0) , src1_(src1) @@ -132,9 +132,9 @@ struct binary_kernel_vec_t { } private: - const hrt::sycl::md_t &src0_md() const { return 
conf_.src0_md; } - const hrt::sycl::md_t &src1_md() const { return conf_.src1_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &src0_md() const { return conf_.src0_md; } + const xpu::sycl::md_t &src1_md() const { return conf_.src1_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src0_ptr() const { return src0_.get_pointer(); } void *src1_ptr() const { return src1_.get_pointer(); } @@ -193,11 +193,11 @@ struct binary_kernel_vec_t { sycl_binary_conf_t conf_; - hrt::sycl::in_memory_arg_t src0_; - hrt::sycl::in_memory_arg_t src1_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::in_memory_arg_t src0_scale_; - hrt::sycl::in_memory_arg_t src1_scale_; + xpu::sycl::in_memory_arg_t src0_; + xpu::sycl::in_memory_arg_t src1_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t src0_scale_; + xpu::sycl::in_memory_arg_t src1_scale_; data_type_t scales_dt_; }; diff --git a/src/gpu/sycl/eltwise_kernels.hpp b/src/gpu/sycl/eltwise_kernels.hpp index 867ac4b312b..999e73a1785 100644 --- a/src/gpu/sycl/eltwise_kernels.hpp +++ b/src/gpu/sycl/eltwise_kernels.hpp @@ -29,12 +29,12 @@ namespace sycl { struct eltwise_fwd_kernel_vec_t { eltwise_fwd_kernel_vec_t(const sycl_eltwise_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::in_memory_arg_t &srcOp1, - hrt::sycl::in_memory_arg_t &srcOp2, - hrt::sycl::in_memory_arg_t &srcOp3, - hrt::sycl::in_memory_arg_t &srcOp4, - hrt::sycl::in_memory_arg_t &srcOp5) + xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::in_memory_arg_t &srcOp1, + xpu::sycl::in_memory_arg_t &srcOp2, + xpu::sycl::in_memory_arg_t &srcOp3, + xpu::sycl::in_memory_arg_t &srcOp4, + xpu::sycl::in_memory_arg_t &srcOp5) : conf_(conf) , src_(src) , srcOp1_(srcOp1) @@ -93,8 +93,8 @@ struct eltwise_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() 
const { return conf_.dst_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } @@ -216,7 +216,7 @@ struct eltwise_fwd_kernel_vec_t { return post_po_sr; } - inline dim_t data_offset(const hrt::sycl::md_t &mem, dim_t &n, dim_t &c, + inline dim_t data_offset(const xpu::sycl::md_t &mem, dim_t &n, dim_t &c, dim_t &d, dim_t &h, dim_t &w) const { const auto ndims = mem.ndims(); switch (ndims) { @@ -230,7 +230,7 @@ struct eltwise_fwd_kernel_vec_t { return -1; } - float get_post_op_val(const hrt::sycl::in_memory_arg_t &bin_src_op, + float get_post_op_val(const xpu::sycl::in_memory_arg_t &bin_src_op, dim_t &idx, dim_t &offset) const { auto src1_desc = conf_.binary_src_arr[idx]; @@ -242,17 +242,17 @@ struct eltwise_fwd_kernel_vec_t { return dst; } - dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, - const dim_t &l_offset, const hrt::sycl::md_t::dims32_t &dst_dims, - const hrt::sycl::md_t::dim32_t &dst_ndims) const { + dim_t get_binary_src1_off(const xpu::sycl::md_t &src1_md, + const dim_t &l_offset, const xpu::sycl::md_t::dims32_t &dst_dims, + const xpu::sycl::md_t::dim32_t &dst_ndims) const { const dim_t mask_binary_po = get_dims_mask(dst_dims, src1_md.dims(), dst_ndims); return get_po_tensor_off( src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - inline dim_t get_dims_mask(const hrt::sycl::md_t::dims32_t &dims1, - const hrt::sycl::md_t::dims32_t &dims2, const dim_t &ndims, + inline dim_t get_dims_mask(const xpu::sycl::md_t::dims32_t &dims1, + const xpu::sycl::md_t::dims32_t &dims2, const dim_t &ndims, bool skip_dim_of_one = false) const { dim_t mask = 0; for (dim_t d = 0; d < ndims; ++d) { @@ -263,8 +263,8 @@ struct eltwise_fwd_kernel_vec_t { return mask; } - inline dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, - const dim_t &l_offset, const 
hrt::sycl::md_t::dims32_t &dst_dims, + inline dim_t get_po_tensor_off(const xpu::sycl::md_t &tensor_md, + const dim_t &l_offset, const xpu::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, const dim_t &mask) const { dims_t l_dims_po {}; get_l_dims_po(l_dims_po, l_offset, dst_dims, dst_ndims, mask); @@ -273,7 +273,7 @@ struct eltwise_fwd_kernel_vec_t { } inline void get_l_dims_po(dims_t l_dims_po, dim_t l_offset, - const hrt::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, + const xpu::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, const dim_t &mask) const { l_dims_by_l_offset(l_dims_po, l_offset, dst_dims, dst_ndims); @@ -281,7 +281,7 @@ struct eltwise_fwd_kernel_vec_t { } inline void l_dims_by_l_offset(dims_t dims_pos, dim_t l_offset, - const hrt::sycl::md_t::dims32_t &dims, const dim_t &ndims) const { + const xpu::sycl::md_t::dims32_t &dims, const dim_t &ndims) const { for (dim_t rd = 0; rd < ndims; ++rd) { const dim_t d = ndims - 1 - rd; /* switch to faster 32-bit division when possible. 
*/ @@ -296,20 +296,20 @@ struct eltwise_fwd_kernel_vec_t { } sycl_eltwise_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::in_memory_arg_t srcOp1_; - hrt::sycl::in_memory_arg_t srcOp2_; - hrt::sycl::in_memory_arg_t srcOp3_; - hrt::sycl::in_memory_arg_t srcOp4_; - hrt::sycl::in_memory_arg_t srcOp5_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::in_memory_arg_t srcOp1_; + xpu::sycl::in_memory_arg_t srcOp2_; + xpu::sycl::in_memory_arg_t srcOp3_; + xpu::sycl::in_memory_arg_t srcOp4_; + xpu::sycl::in_memory_arg_t srcOp5_; + xpu::sycl::out_memory_arg_t dst_; }; struct eltwise_bwd_kernel_vec_t { eltwise_bwd_kernel_vec_t(const sycl_eltwise_conf_t &conf, - hrt::sycl::in_memory_arg_t &diff_src, - hrt::sycl::in_memory_arg_t &src, - hrt::sycl::out_memory_arg_t &diff_dst) + xpu::sycl::in_memory_arg_t &diff_src, + xpu::sycl::in_memory_arg_t &src, + xpu::sycl::out_memory_arg_t &diff_dst) : conf_(conf), src_(src), diff_src_(diff_src), diff_dst_(diff_dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -337,9 +337,9 @@ struct eltwise_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } @@ -448,9 +448,9 @@ struct eltwise_bwd_kernel_vec_t { } sycl_eltwise_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::in_memory_arg_t diff_src_; - hrt::sycl::out_memory_arg_t diff_dst_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::in_memory_arg_t diff_src_; + xpu::sycl::out_memory_arg_t diff_dst_; }; } // namespace 
sycl diff --git a/src/gpu/sycl/layer_normalizations_kernels.hpp b/src/gpu/sycl/layer_normalizations_kernels.hpp index 1c3c55d7555..e34ac951e33 100644 --- a/src/gpu/sycl/layer_normalizations_kernels.hpp +++ b/src/gpu/sycl/layer_normalizations_kernels.hpp @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,11 +33,11 @@ namespace sycl { struct layer_normalization_fwd_kernel_vec_t { layer_normalization_fwd_kernel_vec_t( const sycl_layer_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::in_memory_arg_t &shift, hrt::sycl::in_memory_arg_t &stat, - hrt::sycl::in_memory_arg_t &var, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::in_memory_arg_t &rt_scale, - hrt::sycl::in_memory_arg_t &dst_scale) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::in_memory_arg_t &shift, xpu::sycl::in_memory_arg_t &stat, + xpu::sycl::in_memory_arg_t &var, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::in_memory_arg_t &rt_scale, + xpu::sycl::in_memory_arg_t &dst_scale) : conf_(conf) , data_(data) , scale_(scale) @@ -62,14 +62,14 @@ struct layer_normalization_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() 
const { return conf_.var_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_d; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -128,25 +128,25 @@ struct layer_normalization_fwd_kernel_vec_t { } sycl_layer_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::in_memory_arg_t shift_; - hrt::sycl::in_memory_arg_t stat_; - hrt::sycl::in_memory_arg_t var_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::in_memory_arg_t rt_scale_; - hrt::sycl::in_memory_arg_t dst_scale_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::in_memory_arg_t shift_; + xpu::sycl::in_memory_arg_t stat_; + xpu::sycl::in_memory_arg_t var_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t rt_scale_; + xpu::sycl::in_memory_arg_t dst_scale_; }; struct layer_normalization_fwd_kernel_vec1_t { layer_normalization_fwd_kernel_vec1_t( const sycl_layer_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::in_memory_arg_t &shift, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::out_memory_arg_t &mean_out, - hrt::sycl::out_memory_arg_t &var_out, - hrt::sycl::in_memory_arg_t &rt_scale, - hrt::sycl::in_memory_arg_t &dst_scale) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::in_memory_arg_t &shift, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::out_memory_arg_t &mean_out, + xpu::sycl::out_memory_arg_t &var_out, + xpu::sycl::in_memory_arg_t &rt_scale, + xpu::sycl::in_memory_arg_t &dst_scale) : conf_(conf) , data_(data) , scale_(scale) @@ -172,14 +172,14 @@ struct layer_normalization_fwd_kernel_vec1_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } - const 
hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_d; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -267,26 +267,26 @@ struct layer_normalization_fwd_kernel_vec1_t { } sycl_layer_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::in_memory_arg_t shift_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::out_memory_arg_t mean_out_; - hrt::sycl::out_memory_arg_t var_out_; - hrt::sycl::in_memory_arg_t rt_scale_; - hrt::sycl::in_memory_arg_t dst_scale_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::in_memory_arg_t shift_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::out_memory_arg_t mean_out_; + xpu::sycl::out_memory_arg_t var_out_; + xpu::sycl::in_memory_arg_t rt_scale_; + xpu::sycl::in_memory_arg_t dst_scale_; }; struct layer_normalization_bwd_kernel_vec_t { layer_normalization_bwd_kernel_vec_t( const sycl_layer_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, - hrt::sycl::out_memory_arg_t &diff_data, - hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::out_memory_arg_t &diff_scale, - hrt::sycl::out_memory_arg_t &diff_shift, - hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, - hrt::sycl::in_memory_arg_t &diff_dst) + xpu::sycl::in_memory_arg_t &data, + 
xpu::sycl::out_memory_arg_t &diff_data, + xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::out_memory_arg_t &diff_scale, + xpu::sycl::out_memory_arg_t &diff_shift, + xpu::sycl::in_memory_arg_t &stat, xpu::sycl::in_memory_arg_t &var, + xpu::sycl::in_memory_arg_t &diff_dst) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -305,19 +305,19 @@ struct layer_normalization_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_d; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &diff_data_scaleshift_md() const { + const xpu::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -388,26 +388,26 @@ struct layer_normalization_bwd_kernel_vec_t { sycl_layer_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t diff_data_; - hrt::sycl::in_memory_arg_t 
scale_; - hrt::sycl::out_memory_arg_t diff_scale_; - hrt::sycl::out_memory_arg_t diff_shift_; - hrt::sycl::in_memory_arg_t stat_; - hrt::sycl::in_memory_arg_t var_; - hrt::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t diff_data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::out_memory_arg_t diff_scale_; + xpu::sycl::out_memory_arg_t diff_shift_; + xpu::sycl::in_memory_arg_t stat_; + xpu::sycl::in_memory_arg_t var_; + xpu::sycl::in_memory_arg_t diff_dst_; }; struct layer_normalization_bwd_kernel_vec2_t { layer_normalization_bwd_kernel_vec2_t( const sycl_layer_normalization_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, - hrt::sycl::out_memory_arg_t &diff_data, - hrt::sycl::in_memory_arg_t &scale, - hrt::sycl::out_memory_arg_t &diff_scale, - hrt::sycl::out_memory_arg_t &diff_shift, - hrt::sycl::in_memory_arg_t &stat, hrt::sycl::in_memory_arg_t &var, - hrt::sycl::in_memory_arg_t &diff_dst) + xpu::sycl::in_memory_arg_t &data, + xpu::sycl::out_memory_arg_t &diff_data, + xpu::sycl::in_memory_arg_t &scale, + xpu::sycl::out_memory_arg_t &diff_scale, + xpu::sycl::out_memory_arg_t &diff_shift, + xpu::sycl::in_memory_arg_t &stat, xpu::sycl::in_memory_arg_t &var, + xpu::sycl::in_memory_arg_t &diff_dst) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -426,18 +426,18 @@ struct layer_normalization_bwd_kernel_vec2_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } - const hrt::sycl::md_t &stat_d() const { return conf_.stat_d; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } - const hrt::sycl::md_t &data_scaleshift_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const xpu::sycl::md_t &stat_d() const { return conf_.stat_d; } + const xpu::sycl::md_t &stat_md() const { return 
conf_.stat_md; } + const xpu::sycl::md_t &data_scaleshift_md() const { return conf_.data_scaleshift_md; } - const hrt::sycl::md_t &diff_data_scaleshift_md() const { + const xpu::sycl::md_t &diff_data_scaleshift_md() const { return conf_.diff_data_scaleshift_md; } - const hrt::sycl::md_t &var_md() const { return conf_.var_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &var_md() const { return conf_.var_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } const unsigned flags() const { return conf_.flags; } const float epsilon() const { return conf_.layer_norm_epsilon; } @@ -519,14 +519,14 @@ struct layer_normalization_bwd_kernel_vec2_t { sycl_layer_normalization_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t diff_data_; - hrt::sycl::in_memory_arg_t scale_; - hrt::sycl::out_memory_arg_t diff_scale_; - hrt::sycl::out_memory_arg_t diff_shift_; - hrt::sycl::in_memory_arg_t stat_; - hrt::sycl::in_memory_arg_t var_; - hrt::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t diff_data_; + xpu::sycl::in_memory_arg_t scale_; + xpu::sycl::out_memory_arg_t diff_scale_; + xpu::sycl::out_memory_arg_t diff_shift_; + xpu::sycl::in_memory_arg_t stat_; + xpu::sycl::in_memory_arg_t var_; + xpu::sycl::in_memory_arg_t diff_dst_; }; } // namespace sycl diff --git a/src/gpu/sycl/lrn_kernels.hpp b/src/gpu/sycl/lrn_kernels.hpp index dff003d1b6e..27ef5b34bc1 100644 --- a/src/gpu/sycl/lrn_kernels.hpp +++ b/src/gpu/sycl/lrn_kernels.hpp @@ -19,7 +19,7 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -28,7 +28,7 @@ namespace sycl { struct lrn_fwd_kernel_vec_t { lrn_fwd_kernel_vec_t(const sycl_lrn_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, + 
xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst, const format_tag_t &tag) : conf_(conf), src_(src), dst_(dst), tag_(tag) {} @@ -137,23 +137,23 @@ struct lrn_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_lrn_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::out_memory_arg_t dst_; format_tag_t tag_; }; struct lrn_bwd_kernel_vec_t { lrn_bwd_kernel_vec_t(const sycl_lrn_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &diff_src, const format_tag_t &tag) + xpu::sycl::in_memory_arg_t &src, + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &diff_src, const format_tag_t &tag) : conf_(conf) , src_(src) , diff_dst_(diff_dst) @@ -318,18 +318,18 @@ struct lrn_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } void *src_ptr() const { return src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } sycl_lrn_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t diff_src_; + 
xpu::sycl::in_memory_arg_t src_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t diff_src_; format_tag_t tag_; }; diff --git a/src/gpu/sycl/pooling_kernels.hpp b/src/gpu/sycl/pooling_kernels.hpp index d23017b8a9d..e56b761a805 100644 --- a/src/gpu/sycl/pooling_kernels.hpp +++ b/src/gpu/sycl/pooling_kernels.hpp @@ -25,7 +25,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -34,12 +34,12 @@ namespace sycl { using namespace nstl; struct pooling_fwd_kernel_vec_t { pooling_fwd_kernel_vec_t(const sycl_pooling_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::out_memory_arg_t &ws, hrt::sycl::in_memory_arg_t &src_1, - hrt::sycl::in_memory_arg_t &src_2, - hrt::sycl::in_memory_arg_t &src_3, - hrt::sycl::in_memory_arg_t &src_4, - hrt::sycl::in_memory_arg_t &src_5) + xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::out_memory_arg_t &ws, xpu::sycl::in_memory_arg_t &src_1, + xpu::sycl::in_memory_arg_t &src_2, + xpu::sycl::in_memory_arg_t &src_3, + xpu::sycl::in_memory_arg_t &src_4, + xpu::sycl::in_memory_arg_t &src_5) : conf_(conf) , src_(src) , dst_(dst) @@ -98,18 +98,18 @@ struct pooling_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } - const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &ws_md() const { return conf_.ws_md; } void *src_ptr() const { return src_.get_pointer(); } - void *gen_ptr(hrt::sycl::in_memory_arg_t gen_) const { + void *gen_ptr(xpu::sycl::in_memory_arg_t gen_) const { return gen_.get_pointer(); } void *dst_ptr() const { 
return dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + static dim_t get_offset(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); @@ -123,27 +123,27 @@ struct pooling_fwd_kernel_vec_t { switch (src_md().data_type()) { case data_type::bf16: return (float) - std::numeric_limits::lowest(); + std::numeric_limits::lowest(); case data_type::s8: - return (float)numeric_limits::type>::lowest(); case data_type::f16: return (float) - std::numeric_limits::type>::lowest(); case data_type::s32: - return (float)numeric_limits::type>::lowest(); case data_type::u8: - return (float)numeric_limits::type>::lowest(); default: - return (float)numeric_limits::type>::lowest(); } } - float dst_Value(hrt::sycl::in_memory_arg_t arr, int idx, int offset) const { + float dst_Value(xpu::sycl::in_memory_arg_t arr, int idx, int offset) const { auto src1_desc = conf_.src1_md[idx]; dim_t src_dim[DNNL_MAX_NDIMS]; auto src_dim_ = src1_desc.dims(); @@ -157,7 +157,7 @@ struct pooling_fwd_kernel_vec_t { return dst; } - dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, + dim_t get_binary_src1_off(const xpu::sycl::md_t &src1_md, const dim_t *src_dim, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims) const { @@ -168,7 +168,7 @@ struct pooling_fwd_kernel_vec_t { src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, + dim_t get_po_tensor_off(const xpu::sycl::md_t &tensor_md, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims, int mask) const { @@ -289,21 +289,21 @@ struct pooling_fwd_kernel_vec_t { sycl_pooling_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::out_memory_arg_t ws_; - hrt::sycl::in_memory_arg_t src_1_; - hrt::sycl::in_memory_arg_t src_2_; - hrt::sycl::in_memory_arg_t 
src_3_; - hrt::sycl::in_memory_arg_t src_4_; - hrt::sycl::in_memory_arg_t src_5_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::out_memory_arg_t ws_; + xpu::sycl::in_memory_arg_t src_1_; + xpu::sycl::in_memory_arg_t src_2_; + xpu::sycl::in_memory_arg_t src_3_; + xpu::sycl::in_memory_arg_t src_4_; + xpu::sycl::in_memory_arg_t src_5_; }; struct pooling_bwd_kernel_vec_t { pooling_bwd_kernel_vec_t(const sycl_pooling_conf_t &conf, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &diff_src, - hrt::sycl::in_memory_arg_t &ws) + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &diff_src, + xpu::sycl::in_memory_arg_t &ws) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src), ws_(ws) {} void operator()(::sycl::nd_item<1> item) const { @@ -355,15 +355,15 @@ struct pooling_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const hrt::sycl::md_t &ws_md() const { return conf_.ws_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &ws_md() const { return conf_.ws_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *ws_ptr() const { return ws_.get_pointer(); } - static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + static dim_t get_offset(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); @@ -478,9 +478,9 @@ struct pooling_bwd_kernel_vec_t { } sycl_pooling_conf_t conf_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t diff_src_; - hrt::sycl::in_memory_arg_t ws_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t diff_src_; + 
xpu::sycl::in_memory_arg_t ws_; }; } // namespace sycl diff --git a/src/gpu/sycl/prelu_kernels.hpp b/src/gpu/sycl/prelu_kernels.hpp index 7c81c6e9e70..864af311715 100644 --- a/src/gpu/sycl/prelu_kernels.hpp +++ b/src/gpu/sycl/prelu_kernels.hpp @@ -29,7 +29,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -42,9 +42,9 @@ struct prelu_fwd_kernel_vec_t { static constexpr int vec_len = 8; prelu_fwd_kernel_vec_t(const sycl_prelu_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, - hrt::sycl::in_memory_arg_t &weights, - hrt::sycl::out_memory_arg_t &dst) + xpu::sycl::in_memory_arg_t &data, + xpu::sycl::in_memory_arg_t &weights, + xpu::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), weights_(weights), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -114,15 +114,15 @@ struct prelu_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &weights_md() const { return conf_.weights_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &weights_md() const { return conf_.weights_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *data_ptr() const { return data_.get_pointer(); } void *weights_ptr() const { return weights_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } - static dim_t offset(const hrt::sycl::md_t &mem, dims_t dims) { + static dim_t offset(const xpu::sycl::md_t &mem, dims_t dims) { const int ndims = mem.ndims(); switch (ndims) { case 1: return mem.off(dims[0]); @@ -136,7 +136,7 @@ struct prelu_fwd_kernel_vec_t { } static dim_t weights_offset( - const int mask, const hrt::sycl::md_t &mem, dims_t &dims) { + const int mask, const xpu::sycl::md_t &mem, dims_t 
&dims) { dims_t dims_w {}; std::copy(dims, dims + max_supported_ndims, dims_w); utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); @@ -144,21 +144,21 @@ struct prelu_fwd_kernel_vec_t { } sycl_prelu_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::in_memory_arg_t weights_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::in_memory_arg_t weights_; + xpu::sycl::out_memory_arg_t dst_; }; struct prelu_bwd_kernel_vec_t { static constexpr int vec_len = 8; prelu_bwd_kernel_vec_t(const sycl_prelu_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, - hrt::sycl::out_memory_arg_t &diff_data, - hrt::sycl::in_memory_arg_t &weights, - hrt::sycl::out_memory_arg_t &diff_weights, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &scratchpad) + xpu::sycl::in_memory_arg_t &data, + xpu::sycl::out_memory_arg_t &diff_data, + xpu::sycl::in_memory_arg_t &weights, + xpu::sycl::out_memory_arg_t &diff_weights, + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &scratchpad) : conf_(conf) , data_(data) , diff_data_(diff_data) @@ -237,13 +237,13 @@ struct prelu_bwd_kernel_vec_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.data_md; } - const hrt::sycl::md_t &weights_md() const { return conf_.weights_md; } - const hrt::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } - const hrt::sycl::md_t &diff_weights_md() const { + const xpu::sycl::md_t &data_md() const { return conf_.data_md; } + const xpu::sycl::md_t &weights_md() const { return conf_.weights_md; } + const xpu::sycl::md_t &diff_data_md() const { return conf_.diff_data_md; } + const xpu::sycl::md_t &diff_weights_md() const { return conf_.diff_weights_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } float *data_ptr() const { return (float *)(data_.get_pointer()); } float *weights_ptr() const { return (float 
*)(weights_.get_pointer()); } @@ -256,7 +256,7 @@ struct prelu_bwd_kernel_vec_t { return (float *)(scratchpad_.get_pointer()); } - static dim_t offset(const hrt::sycl::md_t &mem, dims_t dims) { + static dim_t offset(const xpu::sycl::md_t &mem, dims_t dims) { const int ndims = mem.ndims(); switch (ndims) { case 1: return mem.off(dims[0]); @@ -270,7 +270,7 @@ struct prelu_bwd_kernel_vec_t { } static dim_t weights_offset( - const int mask, const hrt::sycl::md_t &mem, dims_t &dims) { + const int mask, const xpu::sycl::md_t &mem, dims_t &dims) { dims_t dims_w {}; std::copy(dims, dims + max_supported_ndims, dims_w); utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); @@ -544,12 +544,12 @@ struct prelu_bwd_kernel_vec_t { } sycl_prelu_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t diff_data_; - hrt::sycl::in_memory_arg_t weights_; - hrt::sycl::out_memory_arg_t diff_weights_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t scratchpad_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t diff_data_; + xpu::sycl::in_memory_arg_t weights_; + xpu::sycl::out_memory_arg_t diff_weights_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t scratchpad_; }; } // namespace sycl diff --git a/src/gpu/sycl/ref_batch_normalization.cpp b/src/gpu/sycl/ref_batch_normalization.cpp index 46b89b9fcae..3a73f511c77 100644 --- a/src/gpu/sycl/ref_batch_normalization.cpp +++ b/src/gpu/sycl/ref_batch_normalization.cpp @@ -16,7 +16,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" #include "gpu/sycl/batch_normalizations_kernels.hpp" #include "gpu/sycl/ref_batch_normalization.hpp" @@ -33,21 +33,21 @@ status_t ref_batch_normalization_fwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems(); - conf_.src1_md = hrt::sycl::md_t(dst_md(3)); - conf_.dst1_md = 
hrt::sycl::md_t(dst_md(0)); + conf_.src1_md = xpu::sycl::md_t(dst_md(3)); + conf_.dst1_md = xpu::sycl::md_t(dst_md(0)); conf_.block_size = 16; conf_.wg_size = 32; conf_.dir = !is_fwd(); conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); - conf_.data_md = hrt::sycl::md_t(src_md(0)); - conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); - conf_.stat_md = stats_is_src() ? hrt::sycl::md_t(src_md(1)) - : hrt::sycl::md_t(dst_md(1)); - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); - conf_.var_md = stats_is_src() ? hrt::sycl::md_t(src_md(2)) - : hrt::sycl::md_t(dst_md(2)); - conf_.ws_md = hrt::sycl::md_t(workspace_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.data_scaleshift_md = xpu::sycl::md_t(weights_md(0)); + conf_.stat_md = stats_is_src() ? xpu::sycl::md_t(src_md(1)) + : xpu::sycl::md_t(dst_md(1)); + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); + conf_.var_md = stats_is_src() ? xpu::sycl::md_t(src_md(2)) + : xpu::sycl::md_t(dst_md(2)); + conf_.ws_md = xpu::sycl::md_t(workspace_md(0)); int work_per_wg = conf_.wg_size * conf_.block_size; int n_wgs = (C() + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * conf_.wg_size; @@ -137,17 +137,17 @@ status_t ref_batch_normalization_bwd_t::pd_t::init_conf() { conf_.prop_kind = desc_.prop_kind; conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); - conf_.data_md = hrt::sycl::md_t(src_md(0)); - conf_.dst1_md = hrt::sycl::md_t(dst_md(0)); - conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); - conf_.diff_src1_md = hrt::sycl::md_t(diff_dst_md(1)); - conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); - conf_.diff_data_scaleshift_md = hrt::sycl::md_t(diff_weights_md(0)); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); - conf_.stat_md = hrt::sycl::md_t(stat_md()); - conf_.var_md = hrt::sycl::md_t(src_md(2)); - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); - conf_.ws_md = hrt::sycl::md_t(workspace_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.dst1_md = 
xpu::sycl::md_t(dst_md(0)); + conf_.diff_data_md = xpu::sycl::md_t(diff_src_md(0)); + conf_.diff_src1_md = xpu::sycl::md_t(diff_dst_md(1)); + conf_.data_scaleshift_md = xpu::sycl::md_t(weights_md(0)); + conf_.diff_data_scaleshift_md = xpu::sycl::md_t(diff_weights_md(0)); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md(0)); + conf_.stat_md = xpu::sycl::md_t(stat_md()); + conf_.var_md = xpu::sycl::md_t(src_md(2)); + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); + conf_.ws_md = xpu::sycl::md_t(workspace_md(0)); int work_per_wg = conf_.wg_size * conf_.block_size; int n_wgs = (C() + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * conf_.wg_size; diff --git a/src/gpu/sycl/ref_batch_normalization.hpp b/src/gpu/sycl/ref_batch_normalization.hpp index e0bf21f315b..41914aee756 100644 --- a/src/gpu/sycl/ref_batch_normalization.hpp +++ b/src/gpu/sycl/ref_batch_normalization.hpp @@ -25,8 +25,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_binary.cpp b/src/gpu/sycl/ref_binary.cpp index 58f4c73778d..5a89b009afd 100644 --- a/src/gpu/sycl/ref_binary.cpp +++ b/src/gpu/sycl/ref_binary.cpp @@ -27,9 +27,9 @@ using namespace impl::sycl; status_t ref_binary_t::pd_t::init_conf() { conf_ = sycl_binary_conf_t(); - conf_.src0_md = hrt::sycl::md_t(src_md(0)); - conf_.src1_md = hrt::sycl::md_t(src_md(1)); - conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.src0_md = xpu::sycl::md_t(src_md(0)); + conf_.src1_md = xpu::sycl::md_t(src_md(1)); + conf_.dst_md = xpu::sycl::md_t(dst_md()); conf_.ndims = ndims(); // XXX: should probably be tuned. 
@@ -46,7 +46,7 @@ status_t ref_binary_t::pd_t::init_conf() { conf_.do_scale_src1 = !attr()->scales_.get(DNNL_ARG_SRC_1).has_default_values(); conf_.is_tensor_op = is_tensor_op(); - for (size_t i = 0; i < hrt::sycl::md_t::max_dims; i++) { + for (size_t i = 0; i < xpu::sycl::md_t::max_dims; i++) { conf_.broadcast_dims[i] = broadcast_dims()[i]; } diff --git a/src/gpu/sycl/ref_binary.hpp b/src/gpu/sycl/ref_binary.hpp index 9b47a047e27..c7c4f90fe52 100644 --- a/src/gpu/sycl/ref_binary.hpp +++ b/src/gpu/sycl/ref_binary.hpp @@ -23,8 +23,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_eltwise.cpp b/src/gpu/sycl/ref_eltwise.cpp index 35b2f4d7be5..362ab8b6fc8 100644 --- a/src/gpu/sycl/ref_eltwise.cpp +++ b/src/gpu/sycl/ref_eltwise.cpp @@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_eltwise_fwd_t::pd_t::init_conf() { conf_ = sycl_eltwise_conf_t(); - conf_.src_md = hrt::sycl::md_t(src_md()); - conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.src_md = xpu::sycl::md_t(src_md()); + conf_.dst_md = xpu::sycl::md_t(dst_md()); conf_.wk_size = memory_desc_wrapper(src_md()).nelems(); conf_.alg_kind = desc()->alg_kind; conf_.alpha = desc()->alpha; @@ -43,7 +43,7 @@ status_t ref_sycl_eltwise_fwd_t::pd_t::init_conf() { conf_.post_ops = sycl_post_ops_t(attr()); for (auto i = 0; i < conf_.post_po_len; ++i) - conf_.binary_src_arr[i] = hrt::sycl::md_t( + conf_.binary_src_arr[i] = xpu::sycl::md_t( arg_md(DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1)); const int block_size = conf_.block_size; @@ -89,9 +89,9 @@ status_t ref_sycl_eltwise_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_eltwise_bwd_t::pd_t::init_conf() { conf_ = sycl_eltwise_conf_t(); - conf_.src_md = hrt::sycl::md_t(data_md(0)); - 
conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); + conf_.src_md = xpu::sycl::md_t(data_md(0)); + conf_.diff_src_md = xpu::sycl::md_t(diff_src_md()); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md()); conf_.block_size = 16; conf_.wg_size = 32; conf_.wk_size = memory_desc_wrapper(data_md(0)).nelems(); diff --git a/src/gpu/sycl/ref_eltwise.hpp b/src/gpu/sycl/ref_eltwise.hpp index 705ab57cf3e..30d11f1d980 100644 --- a/src/gpu/sycl/ref_eltwise.hpp +++ b/src/gpu/sycl/ref_eltwise.hpp @@ -21,7 +21,7 @@ #include "gpu/sycl/sycl_gpu_primitive.hpp" #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_layer_normalizations.cpp b/src/gpu/sycl/ref_layer_normalizations.cpp index 67bd51a9c96..94a2188c160 100644 --- a/src/gpu/sycl/ref_layer_normalizations.cpp +++ b/src/gpu/sycl/ref_layer_normalizations.cpp @@ -17,7 +17,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" #include "gpu/sycl/layer_normalizations_kernels.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -29,13 +29,13 @@ using namespace impl::sycl; status_t ref_layer_normalization_fwd_t::pd_t::init_conf() { conf_ = sycl_layer_normalization_conf_t(); - conf_.var_md = stats_are_src() ? hrt::sycl::md_t(src_md(2)) - : is_training() ? hrt::sycl::md_t(dst_md(2)) - : hrt::sycl::md_t {}; + conf_.var_md = stats_are_src() ? xpu::sycl::md_t(src_md(2)) + : is_training() ? 
xpu::sycl::md_t(dst_md(2)) + : xpu::sycl::md_t {}; conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems(); - conf_.stat_d = hrt::sycl::md_t(stat_md()); + conf_.stat_d = xpu::sycl::md_t(stat_md()); conf_.block_size = 16; conf_.wg_size = 32; @@ -46,13 +46,13 @@ status_t ref_layer_normalization_fwd_t::pd_t::init_conf() { conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); conf_.use_ss = conf_.use_scale || conf_.use_shift; - conf_.data_md = hrt::sycl::md_t(src_md(0)); - conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.data_scaleshift_md = xpu::sycl::md_t(weights_md(0)); - conf_.stat_md = stats_are_src() ? hrt::sycl::md_t(src_md(1)) - : is_training() ? hrt::sycl::md_t(dst_md(2)) - : hrt::sycl::md_t {}; - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.stat_md = stats_are_src() ? xpu::sycl::md_t(src_md(1)) + : is_training() ? xpu::sycl::md_t(dst_md(2)) + : xpu::sycl::md_t {}; + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); conf_.shift_off = conf_.use_ss && !has_zero_dim_memory() ? 
conf_.data_scaleshift_md.off(1, 0) : 0; @@ -138,7 +138,7 @@ status_t ref_layer_normalization_fwd_t::execute_forward( status_t ref_layer_normalization_bwd_t::pd_t::init_conf() { conf_ = sycl_layer_normalization_conf_t(); - conf_.var_md = hrt::sycl::md_t(src_md(2)); + conf_.var_md = xpu::sycl::md_t(src_md(2)); conf_.ndims = ndims(); conf_.flags = desc()->flags; conf_.block_size = (16); @@ -147,15 +147,15 @@ status_t ref_layer_normalization_bwd_t::pd_t::init_conf() { conf_.use_scale = use_scale(); conf_.use_shift = use_shift(); conf_.use_ss = conf_.use_scale || conf_.use_shift; - conf_.data_md = hrt::sycl::md_t(src_md(0)); - conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); - conf_.data_scaleshift_md = hrt::sycl::md_t(weights_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.diff_data_md = xpu::sycl::md_t(diff_src_md(0)); + conf_.data_scaleshift_md = xpu::sycl::md_t(weights_md(0)); conf_.diff_data_scaleshift_md = conf_.use_ss - ? hrt::sycl::md_t(diff_weights_md(0)) - : hrt::sycl::md_t {}; - conf_.stat_md = hrt::sycl::md_t(src_md(1)); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); - conf_.stat_d = hrt::sycl::md_t(stat_md()); + ? 
xpu::sycl::md_t(diff_weights_md(0)) + : xpu::sycl::md_t {}; + conf_.stat_md = xpu::sycl::md_t(src_md(1)); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md(0)); + conf_.stat_d = xpu::sycl::md_t(stat_md()); conf_.zero_dims = has_zero_dim_memory(); auto nelems_A = memory_desc_wrapper(src_md(0)).nelems(); conf_.diff_shift_off = conf_.use_ss && !conf_.zero_dims diff --git a/src/gpu/sycl/ref_layer_normalizations.hpp b/src/gpu/sycl/ref_layer_normalizations.hpp index a8fccd89405..d91384cf451 100644 --- a/src/gpu/sycl/ref_layer_normalizations.hpp +++ b/src/gpu/sycl/ref_layer_normalizations.hpp @@ -24,8 +24,8 @@ #include "gpu/sycl/sycl_io_helper.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_lrn.cpp b/src/gpu/sycl/ref_lrn.cpp index 19199e31792..4b283f78f2e 100644 --- a/src/gpu/sycl/ref_lrn.cpp +++ b/src/gpu/sycl/ref_lrn.cpp @@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_lrn_fwd_t::pd_t::init_conf() { conf_ = sycl_lrn_conf_t(); - conf_.src_md = hrt::sycl::md_t(src_md()); - conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.src_md = xpu::sycl::md_t(src_md()); + conf_.dst_md = xpu::sycl::md_t(dst_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; @@ -87,9 +87,9 @@ status_t ref_sycl_lrn_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_lrn_bwd_t::pd_t::init_conf() { conf_ = sycl_lrn_conf_t(); - conf_.src_md = hrt::sycl::md_t(src_md()); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); - conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); + conf_.src_md = xpu::sycl::md_t(src_md()); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md()); + conf_.diff_src_md = xpu::sycl::md_t(diff_src_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; diff --git 
a/src/gpu/sycl/ref_lrn.hpp b/src/gpu/sycl/ref_lrn.hpp index 03ae0735d76..071ddac7cef 100644 --- a/src/gpu/sycl/ref_lrn.hpp +++ b/src/gpu/sycl/ref_lrn.hpp @@ -19,7 +19,7 @@ #include "gpu/gpu_lrn_pd.hpp" #include "gpu/sycl/sycl_gpu_primitive.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_pooling.cpp b/src/gpu/sycl/ref_pooling.cpp index a48fd9272c3..d53d332ce52 100644 --- a/src/gpu/sycl/ref_pooling.cpp +++ b/src/gpu/sycl/ref_pooling.cpp @@ -18,7 +18,7 @@ #include "common/c_types_map.hpp" #include "common/dnnl_traits.hpp" #include "gpu/sycl/pooling_kernels.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,11 +33,11 @@ status_t ref_pooling_fwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.block_size = 16; conf_.wg_size = 32; - conf_.src_md = hrt::sycl::md_t(src_md(0)); - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.src_md = xpu::sycl::md_t(src_md(0)); + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); conf_.ws_md = !types::is_zero_md(workspace_md()) - ? hrt::sycl::md_t(workspace_md(0)) - : hrt::sycl::md_t {}; + ? 
xpu::sycl::md_t(workspace_md(0)) + : xpu::sycl::md_t {}; conf_.zero_dims = has_zero_dim_memory(); for (int i = 0; i < DNNL_MAX_NDIMS; i++) { conf_.dst_dims[i] = dst_md()->dims[i]; @@ -75,7 +75,7 @@ status_t ref_pooling_fwd_t::pd_t::init_conf() { for (auto i = 0; i < attr_po.len(); ++i) { if (attr_po.contain(binary, i)) { dnnl::impl::memory_desc_t mem = attr_po.entry_[i].binary.src1_desc; - conf_.src1_md[i] = hrt::sycl::md_t(&mem); + conf_.src1_md[i] = xpu::sycl::md_t(&mem); } } conf_.post_ops = sycl_post_ops_t(attr()); @@ -122,11 +122,11 @@ status_t ref_pooling_bwd_t::pd_t::init_conf() { conf_.ndims = ndims(); conf_.block_size = 16; conf_.wg_size = 32; - conf_.diff_src_md = hrt::sycl::md_t(diff_src_md(0)); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); + conf_.diff_src_md = xpu::sycl::md_t(diff_src_md(0)); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md(0)); conf_.ws_md = !types::is_zero_md(workspace_md()) - ? hrt::sycl::md_t(workspace_md(0)) - : hrt::sycl::md_t {}; + ? xpu::sycl::md_t(workspace_md(0)) + : xpu::sycl::md_t {}; conf_.zero_dims = has_zero_dim_memory(); auto nelems_A = memory_desc_wrapper(diff_src_md(0)).nelems(); int work_per_wg = conf_.wg_size * conf_.block_size; diff --git a/src/gpu/sycl/ref_pooling.hpp b/src/gpu/sycl/ref_pooling.hpp index 5f23ea9ed94..593699513cc 100644 --- a/src/gpu/sycl/ref_pooling.hpp +++ b/src/gpu/sycl/ref_pooling.hpp @@ -26,8 +26,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_prelu.cpp b/src/gpu/sycl/ref_prelu.cpp index 0d506159189..0c2af104d13 100644 --- a/src/gpu/sycl/ref_prelu.cpp +++ b/src/gpu/sycl/ref_prelu.cpp @@ -32,9 +32,9 @@ status_t ref_prelu_fwd_t::pd_t::init_conf() { const memory_desc_wrapper data_d(src_md(0)); const memory_desc_wrapper weights_d(weights_md(0)); - conf_.data_md 
= hrt::sycl::md_t(src_md(0)); - conf_.weights_md = hrt::sycl::md_t(weights_md(0)); - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.weights_md = xpu::sycl::md_t(weights_md(0)); + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); conf_.ndims = ndims(); conf_.mask = utils::get_dims_mask(data_d.dims(), weights_d.dims(), ndims()); @@ -77,11 +77,11 @@ status_t ref_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_prelu_bwd_t::pd_t::init_conf() { if (has_zero_dim_memory()) return status::success; conf_ = sycl_prelu_conf_t(); - conf_.data_md = hrt::sycl::md_t(src_md(0)); - conf_.weights_md = hrt::sycl::md_t(weights_md(0)); - conf_.diff_data_md = hrt::sycl::md_t(diff_src_md(0)); - conf_.diff_weights_md = hrt::sycl::md_t(diff_weights_md(0)); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md(0)); + conf_.data_md = xpu::sycl::md_t(src_md(0)); + conf_.weights_md = xpu::sycl::md_t(weights_md(0)); + conf_.diff_data_md = xpu::sycl::md_t(diff_src_md(0)); + conf_.diff_weights_md = xpu::sycl::md_t(diff_weights_md(0)); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md(0)); conf_.ndims = ndims(); const memory_desc_wrapper weights_d(weights_md(0)); diff --git a/src/gpu/sycl/ref_prelu.hpp b/src/gpu/sycl/ref_prelu.hpp index 046d91438c6..95da50788bd 100644 --- a/src/gpu/sycl/ref_prelu.hpp +++ b/src/gpu/sycl/ref_prelu.hpp @@ -25,8 +25,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_resampling.cpp b/src/gpu/sycl/ref_resampling.cpp index f1149f361da..01d7aa02a14 100644 --- a/src/gpu/sycl/ref_resampling.cpp +++ b/src/gpu/sycl/ref_resampling.cpp @@ -50,8 +50,8 @@ status_t ref_resampling_fwd_t::pd_t::init_conf() { int n_wgs = (nelems_A + work_per_wg - 1) / work_per_wg; conf_.n_thr = n_wgs * 
conf_.wg_size; - conf_.src_md = hrt::sycl::md_t(src_md(0)); - conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.src_md = xpu::sycl::md_t(src_md(0)); + conf_.dst_md = xpu::sycl::md_t(dst_md()); conf_.alg = desc()->alg_kind; const auto *att = attr(); @@ -61,7 +61,7 @@ status_t ref_resampling_fwd_t::pd_t::init_conf() { for (auto i = 0; i < attr_po.len(); ++i) { if (attr_po.contain(primitive_kind::binary, i)) { dnnl::impl::memory_desc_t mem = attr_po.entry_[i].binary.src1_desc; - conf_.src1_md[i] = hrt::sycl::md_t(&mem); + conf_.src1_md[i] = xpu::sycl::md_t(&mem); } } conf_.post_ops = sycl_post_ops_t(attr()); @@ -110,8 +110,8 @@ status_t ref_resampling_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_resampling_bwd_t::pd_t::init_conf() { conf_ = sycl_resampling_conf_t(); - conf_.diff_src_md = hrt::sycl::md_t(diff_src_md(0)); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); + conf_.diff_src_md = xpu::sycl::md_t(diff_src_md(0)); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md()); conf_.src_dt = src_md(0)->data_type; conf_.dst_dt = dst_md()->data_type; diff --git a/src/gpu/sycl/ref_resampling.hpp b/src/gpu/sycl/ref_resampling.hpp index 1a346048d43..5848903892d 100644 --- a/src/gpu/sycl/ref_resampling.hpp +++ b/src/gpu/sycl/ref_resampling.hpp @@ -23,8 +23,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_shuffle.cpp b/src/gpu/sycl/ref_shuffle.cpp index 651f3d34b0c..89036a47386 100644 --- a/src/gpu/sycl/ref_shuffle.cpp +++ b/src/gpu/sycl/ref_shuffle.cpp @@ -51,10 +51,10 @@ status_t ref_shuffle_t::pd_t::init_conf() { conf_.HW = conf_.H * conf_.W; conf_.SP = conf_.D * conf_.HW; } - conf_.stat_md = hrt::sycl::md_t(src_md(0)); + conf_.stat_md = xpu::sycl::md_t(src_md(0)); conf_.work_amount = 
memory_desc_wrapper(src_md()).nelems(); - conf_.src_md = hrt::sycl::md_t(src_md(0)); - conf_.dst_md = hrt::sycl::md_t(dst_md(0)); + conf_.src_md = xpu::sycl::md_t(src_md(0)); + conf_.dst_md = xpu::sycl::md_t(dst_md(0)); if (ndims() == 5) { const auto tag diff --git a/src/gpu/sycl/ref_shuffle.hpp b/src/gpu/sycl/ref_shuffle.hpp index eefc8c3a4a5..bc0169fb8f0 100644 --- a/src/gpu/sycl/ref_shuffle.hpp +++ b/src/gpu/sycl/ref_shuffle.hpp @@ -23,8 +23,8 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/ref_softmax.cpp b/src/gpu/sycl/ref_softmax.cpp index 4d4c76b902e..3f86ec361eb 100644 --- a/src/gpu/sycl/ref_softmax.cpp +++ b/src/gpu/sycl/ref_softmax.cpp @@ -25,8 +25,8 @@ namespace sycl { using namespace impl::sycl; status_t ref_sycl_softmax_fwd_t::pd_t::init_conf() { conf_ = sycl_softmax_conf_t(); - conf_.src_md = hrt::sycl::md_t(src_md()); - conf_.dst_md = hrt::sycl::md_t(dst_md()); + conf_.src_md = xpu::sycl::md_t(src_md()); + conf_.dst_md = xpu::sycl::md_t(dst_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; @@ -76,9 +76,9 @@ status_t ref_sycl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const { status_t ref_sycl_softmax_bwd_t::pd_t::init_conf() { conf_ = sycl_softmax_conf_t(); - conf_.dst_md = hrt::sycl::md_t(dst_md()); - conf_.diff_dst_md = hrt::sycl::md_t(diff_dst_md()); - conf_.diff_src_md = hrt::sycl::md_t(diff_src_md()); + conf_.dst_md = xpu::sycl::md_t(dst_md()); + conf_.diff_dst_md = xpu::sycl::md_t(diff_dst_md()); + conf_.diff_src_md = xpu::sycl::md_t(diff_src_md()); conf_.alg_kind = desc()->alg_kind; conf_.block_size = 16; conf_.wg_size = 32; diff --git a/src/gpu/sycl/resampling_kernels.hpp b/src/gpu/sycl/resampling_kernels.hpp index fb00a832488..01325318f94 100644 --- 
a/src/gpu/sycl/resampling_kernels.hpp +++ b/src/gpu/sycl/resampling_kernels.hpp @@ -24,7 +24,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -33,12 +33,12 @@ namespace sycl { struct resampling_kernel_fwd_vec_t { resampling_kernel_fwd_vec_t(const sycl_resampling_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, hrt::sycl::out_memory_arg_t &dst, - hrt::sycl::in_memory_arg_t &src_1, - hrt::sycl::in_memory_arg_t &src_2, - hrt::sycl::in_memory_arg_t &src_3, - hrt::sycl::in_memory_arg_t &src_4, - hrt::sycl::in_memory_arg_t &src_5) + xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst, + xpu::sycl::in_memory_arg_t &src_1, + xpu::sycl::in_memory_arg_t &src_2, + xpu::sycl::in_memory_arg_t &src_3, + xpu::sycl::in_memory_arg_t &src_4, + xpu::sycl::in_memory_arg_t &src_5) : conf_(conf) , src_(src) , dst_(dst) @@ -142,8 +142,8 @@ struct resampling_kernel_fwd_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *src_1_ptr() const { return src_1_.get_pointer(); } @@ -153,11 +153,11 @@ struct resampling_kernel_fwd_vec_t { void *src_5_ptr() const { return src_5_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } - void *gen_ptr(hrt::sycl::in_memory_arg_t gen_) const { + void *gen_ptr(xpu::sycl::in_memory_arg_t gen_) const { return gen_.get_pointer(); } - static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + static dim_t get_offset(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); @@ -168,7 +168,7 @@ struct 
resampling_kernel_fwd_vec_t { return 0; } - float dst_value(hrt::sycl::in_memory_arg_t arr, int idx, int offset) const { + float dst_value(xpu::sycl::in_memory_arg_t arr, int idx, int offset) const { auto src1_desc = conf_.src1_md[idx]; dim_t src_dim[DNNL_MAX_NDIMS]; auto src_dim_ = src1_desc.dims(); @@ -182,7 +182,7 @@ struct resampling_kernel_fwd_vec_t { return dst; } - dim_t get_binary_src1_off(const hrt::sycl::md_t &src1_md, + dim_t get_binary_src1_off(const xpu::sycl::md_t &src1_md, const dim_t *src_dim, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims) const { @@ -193,7 +193,7 @@ struct resampling_kernel_fwd_vec_t { src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); } - dim_t get_po_tensor_off(const hrt::sycl::md_t &tensor_md, + dim_t get_po_tensor_off(const xpu::sycl::md_t &tensor_md, const dim_t l_offset, const dim_t *dst_dims, const int dst_ndims, int mask) const { @@ -211,19 +211,19 @@ struct resampling_kernel_fwd_vec_t { sycl_resampling_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::out_memory_arg_t dst_; - hrt::sycl::in_memory_arg_t src_1_; - hrt::sycl::in_memory_arg_t src_2_; - hrt::sycl::in_memory_arg_t src_3_; - hrt::sycl::in_memory_arg_t src_4_; - hrt::sycl::in_memory_arg_t src_5_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t src_1_; + xpu::sycl::in_memory_arg_t src_2_; + xpu::sycl::in_memory_arg_t src_3_; + xpu::sycl::in_memory_arg_t src_4_; + xpu::sycl::in_memory_arg_t src_5_; }; struct resampling_kernel_bwd_vec_t { resampling_kernel_bwd_vec_t(const sycl_resampling_conf_t &conf, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &diff_src) + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &diff_src) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src) {} void operator()(::sycl::nd_item<1> item) const { @@ -278,13 +278,13 @@ struct resampling_kernel_bwd_vec_t { } private: - const hrt::sycl::md_t &diff_src_md() const 
{ return conf_.diff_src_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } - static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + static dim_t get_offset(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); @@ -296,14 +296,14 @@ struct resampling_kernel_bwd_vec_t { } sycl_resampling_conf_t conf_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t diff_src_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t diff_src_; }; struct resampling_kernel_bwd_vec1_t { resampling_kernel_bwd_vec1_t(const sycl_resampling_conf_t &conf, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &diff_src) + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &diff_src) : conf_(conf), diff_dst_(diff_dst), diff_src_(diff_src) {} void operator()(::sycl::nd_item<1> item) const { @@ -356,13 +356,13 @@ struct resampling_kernel_bwd_vec1_t { } private: - const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } void *diff_src_ptr() const { return diff_src_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } - static dim_t get_offset(const hrt::sycl::md_t &mdw, dim_t n, dim_t c, + static dim_t get_offset(const xpu::sycl::md_t &mdw, dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { switch (mdw.ndims()) { case 3: return mdw.off(n, c, w); @@ -374,8 +374,8 @@ struct 
resampling_kernel_bwd_vec1_t { } sycl_resampling_conf_t conf_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t diff_src_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t diff_src_; }; } // namespace sycl diff --git a/src/gpu/sycl/shuffle_kernels.hpp b/src/gpu/sycl/shuffle_kernels.hpp index e4bf3c70330..ef5a8057634 100644 --- a/src/gpu/sycl/shuffle_kernels.hpp +++ b/src/gpu/sycl/shuffle_kernels.hpp @@ -23,7 +23,7 @@ #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_primitive_conf.hpp" #include "gpu/sycl/sycl_q10n.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -32,7 +32,7 @@ namespace sycl { struct shuffle_kernel_vec1_t { shuffle_kernel_vec1_t(const sycl_shuffle_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -63,21 +63,21 @@ struct shuffle_kernel_vec1_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } void *data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t dst_; }; struct shuffle_kernel_vec2_t { shuffle_kernel_vec2_t(const sycl_shuffle_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::out_memory_arg_t &dst) : 
conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -103,21 +103,21 @@ struct shuffle_kernel_vec2_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } void *data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t dst_; }; struct shuffle_kernel_vec3_t { shuffle_kernel_vec3_t(const sycl_shuffle_conf_t &conf, - hrt::sycl::in_memory_arg_t &data, hrt::sycl::out_memory_arg_t &dst) + xpu::sycl::in_memory_arg_t &data, xpu::sycl::out_memory_arg_t &dst) : conf_(conf), data_(data), dst_(dst) {} void operator()(::sycl::nd_item<1> item) const { @@ -149,16 +149,16 @@ struct shuffle_kernel_vec3_t { } private: - const hrt::sycl::md_t &data_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } - const hrt::sycl::md_t &stat_md() const { return conf_.stat_md; } + const xpu::sycl::md_t &data_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &stat_md() const { return conf_.stat_md; } void *data_ptr() const { return data_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } sycl_shuffle_conf_t conf_; - hrt::sycl::in_memory_arg_t data_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t data_; + xpu::sycl::out_memory_arg_t dst_; }; } // namespace sycl diff --git a/src/gpu/sycl/softmax_kernels.hpp b/src/gpu/sycl/softmax_kernels.hpp index 
dd70308b919..64fb4066cde 100644 --- a/src/gpu/sycl/softmax_kernels.hpp +++ b/src/gpu/sycl/softmax_kernels.hpp @@ -29,10 +29,10 @@ namespace sycl { struct softmax_fwd_kernel_vec_t { softmax_fwd_kernel_vec_t(const sycl_softmax_conf_t &conf, - hrt::sycl::in_memory_arg_t &src, - hrt::sycl::in_memory_arg_t &scale_src, - hrt::sycl::in_memory_arg_t &scale_dst, - hrt::sycl::out_memory_arg_t &dst) + xpu::sycl::in_memory_arg_t &src, + xpu::sycl::in_memory_arg_t &scale_src, + xpu::sycl::in_memory_arg_t &scale_dst, + xpu::sycl::out_memory_arg_t &dst) : conf_(conf) , src_(src) , scale_src_(scale_src) @@ -113,8 +113,8 @@ struct softmax_fwd_kernel_vec_t { } private: - const hrt::sycl::md_t &src_md() const { return conf_.src_md; } - const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &src_md() const { return conf_.src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } void *src_ptr() const { return src_.get_pointer(); } void *dst_ptr() const { return dst_.get_pointer(); } @@ -122,17 +122,17 @@ struct softmax_fwd_kernel_vec_t { void *scale_dst_ptr() const { return scale_dst_.get_pointer(); } sycl_softmax_conf_t conf_; - hrt::sycl::in_memory_arg_t src_; - hrt::sycl::in_memory_arg_t scale_src_; - hrt::sycl::in_memory_arg_t scale_dst_; - hrt::sycl::out_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t src_; + xpu::sycl::in_memory_arg_t scale_src_; + xpu::sycl::in_memory_arg_t scale_dst_; + xpu::sycl::out_memory_arg_t dst_; }; struct softmax_bwd_kernel_vec_t { softmax_bwd_kernel_vec_t(const sycl_softmax_conf_t &conf, - hrt::sycl::in_memory_arg_t &dst, - hrt::sycl::in_memory_arg_t &diff_dst, - hrt::sycl::out_memory_arg_t &diff_src) + xpu::sycl::in_memory_arg_t &dst, + xpu::sycl::in_memory_arg_t &diff_dst, + xpu::sycl::out_memory_arg_t &diff_src) : conf_(conf), dst_(dst), diff_dst_(diff_dst), diff_src_(diff_src) {} void operator()(::sycl::nd_item<1> item) const { @@ -198,18 +198,18 @@ struct softmax_bwd_kernel_vec_t { } private: - 
const hrt::sycl::md_t &dst_md() const { return conf_.dst_md; } - const hrt::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } - const hrt::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } + const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; } + const xpu::sycl::md_t &diff_dst_md() const { return conf_.diff_dst_md; } + const xpu::sycl::md_t &diff_src_md() const { return conf_.diff_src_md; } void *dst_ptr() const { return dst_.get_pointer(); } void *diff_dst_ptr() const { return diff_dst_.get_pointer(); } void *diff_src_ptr() const { return diff_src_.get_pointer(); } sycl_softmax_conf_t conf_; - hrt::sycl::in_memory_arg_t dst_; - hrt::sycl::in_memory_arg_t diff_dst_; - hrt::sycl::out_memory_arg_t diff_src_; + xpu::sycl::in_memory_arg_t dst_; + xpu::sycl::in_memory_arg_t diff_dst_; + xpu::sycl::out_memory_arg_t diff_src_; }; } // namespace sycl diff --git a/src/gpu/sycl/sycl_gpu_kernel.hpp b/src/gpu/sycl/sycl_gpu_kernel.hpp index fd0bc0d0e95..750f6e7329d 100644 --- a/src/gpu/sycl/sycl_gpu_kernel.hpp +++ b/src/gpu/sycl/sycl_gpu_kernel.hpp @@ -18,7 +18,7 @@ #define GPU_SYCL_SYCL_GPU_KERNEL_HPP #include "common/utils.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" #include "gpu/intel/compute/kernel.hpp" diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index 684dba1b064..9c39b08e7ae 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp @@ -24,9 +24,9 @@ #include "gpu/intel/sycl/l0/utils.hpp" #include "gpu/intel/sycl/utils.hpp" #include "gpu/intel/utils.hpp" -#include "hrt/sycl/c_types_map.hpp" -#include "hrt/utils.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/c_types_map.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { @@ -100,7 +100,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, // XXX: DPCPP/L0 does not support non-uniform work-groups and does not // provide any 
diagnostics. This is to catch potential issues on oneDNN // side. - if (sycl_engine->backend() == hrt::sycl::backend_t::level0 + if (sycl_engine->backend() == xpu::sycl::backend_t::level0 && range.local_range()) { for (size_t i = 0; i < range.ndims(); i++) { size_t gws = range.global_range()[i]; @@ -123,12 +123,12 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, = static_cast(arg.value()); if (*mem_storage) { auto *sycl_mem_storage = utils::downcast< - const hrt::sycl::memory_storage_base_t *>( + const xpu::sycl::memory_storage_base_t *>( mem_storage); switch (sycl_mem_storage->memory_kind()) { - case hrt::sycl::memory_kind::buffer: { + case xpu::sycl::memory_kind::buffer: { auto *m = utils::downcast< - const hrt::sycl::buffer_memory_storage_t *>( + const xpu::sycl::buffer_memory_storage_t *>( mem_storage); auto &sycl_buf = m->buffer(); cgh.set_arg((int)i, @@ -137,9 +137,9 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, cgh)); break; } - case hrt::sycl::memory_kind::usm: { + case xpu::sycl::memory_kind::usm: { auto *m = utils::downcast< - const hrt::sycl::usm_memory_storage_t *>( + const xpu::sycl::usm_memory_storage_t *>( mem_storage); cgh.set_arg((int)i, m->usm_ptr()); break; @@ -150,7 +150,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, cgh.set_arg((int)i, nullptr); } } else if (arg.is_local()) { - auto acc = hrt::sycl::compat::local_accessor( + auto acc = xpu::sycl::compat::local_accessor( ::sycl::range<1>(arg.size()), cgh); cgh.set_arg((int)i, acc); } else { @@ -181,7 +181,7 @@ status_t sycl_interop_gpu_kernel_t::parallel_for(stream_t &stream, } status_t sycl_interop_gpu_kernel_t::dump() const { - hrt::binary_t binary; + xpu::binary_t binary; CHECK(gpu::intel::sycl::get_kernel_binary(sycl_kernel(), binary)); return gpu::intel::gpu_utils::dump_kernel_binary(binary, name()); } diff --git a/src/gpu/sycl/sycl_io_helper.hpp b/src/gpu/sycl/sycl_io_helper.hpp index 5aa7a223b1e..5bf0e225667 100644 --- 
a/src/gpu/sycl/sycl_io_helper.hpp +++ b/src/gpu/sycl/sycl_io_helper.hpp @@ -31,7 +31,7 @@ inline int load_int_value(data_type_t dt, const void *ptr, dim_t idx) { #define CASE(dt) \ case dt: \ return static_cast(reinterpret_cast< \ - const typename hrt::sycl::prec_traits
::type *>(ptr)[idx]); + const typename xpu::sycl::prec_traits
::type *>(ptr)[idx]); using namespace data_type; switch (dt) { CASE(s32); @@ -48,7 +48,7 @@ inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { #define CASE(dt) \ case dt: \ return static_cast(reinterpret_cast< \ - const typename hrt::sycl::prec_traits
::type *>(ptr)[idx]); + const typename xpu::sycl::prec_traits
::type *>(ptr)[idx]); using namespace data_type; switch (dt) { @@ -68,7 +68,7 @@ inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { inline void store_float_value(data_type_t dt, float val, void *ptr, dim_t idx) { #define CASE(dt) \ case dt: { \ - using type_ = typename hrt::sycl::prec_traits
::type; \ + using type_ = typename xpu::sycl::prec_traits
::type; \ *(reinterpret_cast(ptr) + idx) \ = impl::sycl::saturate_and_round(val); \ } break; @@ -103,8 +103,8 @@ inline ::sycl::vec handle_bf16_load(void *ptr, dim_t offset) { ::sycl::vec vec_f32; for (int i = 0; i < width; i++) { // Convert u16 value to bfloat16_t. - const hrt::sycl::bfloat16_t bf16_val - = static_cast(vec_u16[i]); + const xpu::sycl::bfloat16_t bf16_val + = static_cast(vec_u16[i]); // Convert bfloat16_t to float. const float f32_val = static_cast(bf16_val); // Write result to vector. @@ -121,8 +121,8 @@ inline void handle_bf16_store( for (int i = 0; i < width; i++) { // Convert float value to bfloat16_t. - const hrt::sycl::bfloat16_t bf16_val - = static_cast(vec_f32[i]); + const xpu::sycl::bfloat16_t bf16_val + = static_cast(vec_f32[i]); // Convert bfloat16_t to uint16_t. const uint16_t u16_val = bf16_val.raw_bits_; // Write result to vector. @@ -137,7 +137,7 @@ inline ::sycl::vec load_float_vec( data_type_t dt, void *ptr, dim_t offset) { #define CASE(dt) \ case dt: { \ - using type = typename hrt::sycl::prec_traits
::type; \ + using type = typename xpu::sycl::prec_traits
::type; \ global_ptr gptr_dt(reinterpret_cast(ptr)); \ ::sycl::vec vec_dt; \ vec_dt.load(offset, gptr_dt); \ @@ -163,7 +163,7 @@ inline void store_float_vec(data_type_t dt, ::sycl::vec vec_f32, void *ptr, dim_t offset) { #define CASE(dt) \ case dt: { \ - using type = typename hrt::sycl::prec_traits
::type; \ + using type = typename xpu::sycl::prec_traits
::type; \ global_ptr gptr_dt(reinterpret_cast(ptr)); \ auto vec_dt = impl::sycl::saturate_and_round_vec(vec_f32); \ vec_dt.store(offset, gptr_dt); \ diff --git a/src/gpu/sycl/sycl_math_utils.hpp b/src/gpu/sycl/sycl_math_utils.hpp index 234212b0153..67a11ff51db 100644 --- a/src/gpu/sycl/sycl_math_utils.hpp +++ b/src/gpu/sycl/sycl_math_utils.hpp @@ -20,7 +20,7 @@ #include "common/c_types_map.hpp" #include "common/math_utils.hpp" #include "common/utils.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/gpu/sycl/sycl_post_ops.hpp b/src/gpu/sycl/sycl_post_ops.hpp index b88a79492d1..641cfcbe6c9 100644 --- a/src/gpu/sycl/sycl_post_ops.hpp +++ b/src/gpu/sycl/sycl_post_ops.hpp @@ -20,7 +20,7 @@ #include "common/c_types_map.hpp" #include "common/primitive_attr.hpp" #include "gpu/sycl/sycl_math_utils.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -133,7 +133,7 @@ struct ref_binary_op_t { struct sycl_post_ops_t { // SYCL has a limitation on total size of kernel arguments. // This affects number of post ops, e.g. binary post op (which is not yet - // implemented) contains hrt::sycl::md_t which is large enough to limit + // implemented) contains xpu::sycl::md_t which is large enough to limit // the number of post ops. 
static constexpr int max_post_ops = 5; diff --git a/src/gpu/sycl/sycl_primitive_conf.hpp b/src/gpu/sycl/sycl_primitive_conf.hpp index f5aee8cbc62..2adcfb2034c 100644 --- a/src/gpu/sycl/sycl_primitive_conf.hpp +++ b/src/gpu/sycl/sycl_primitive_conf.hpp @@ -19,7 +19,7 @@ #include "common/broadcast_strategy.hpp" #include "gpu/sycl/sycl_post_ops.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { @@ -27,16 +27,16 @@ namespace gpu { namespace sycl { struct sycl_binary_conf_t { - hrt::sycl::md_t src0_md; - hrt::sycl::md_t src1_md; - hrt::sycl::md_t dst_md; + xpu::sycl::md_t src0_md; + xpu::sycl::md_t src1_md; + xpu::sycl::md_t dst_md; alg_kind_t alg_kind; bool do_scale_src0; bool do_scale_src1; - int broadcast_dims[hrt::sycl::md_t::max_dims]; + int broadcast_dims[xpu::sycl::md_t::max_dims]; int ndims; bool is_tensor_op; @@ -49,10 +49,10 @@ struct sycl_binary_conf_t { struct sycl_eltwise_conf_t { prop_kind_t prop_kind; - hrt::sycl::md_t src_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t diff_src_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t diff_src_md; + xpu::sycl::md_t diff_dst_md; alg_kind_t alg_kind; float alpha; float beta; @@ -65,18 +65,18 @@ struct sycl_eltwise_conf_t { dim_t wg_size; dim_t wk_size; dim_t post_po_len; - hrt::sycl::md_t binary_src_arr[8]; + xpu::sycl::md_t binary_src_arr[8]; sycl_post_ops_t post_ops; }; struct sycl_prelu_conf_t { prop_kind_t prop_kind; - hrt::sycl::md_t data_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t weights_md; - hrt::sycl::md_t diff_data_md; - hrt::sycl::md_t diff_dst_md; - hrt::sycl::md_t diff_weights_md; + xpu::sycl::md_t data_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t weights_md; + xpu::sycl::md_t diff_data_md; + xpu::sycl::md_t diff_dst_md; + xpu::sycl::md_t diff_weights_md; dim_t work_amount; dim_t work_amount_wei; dim_t work_amount_src; @@ -93,10 +93,10 @@ struct sycl_prelu_conf_t { }; struct 
sycl_shuffle_conf_t { - hrt::sycl::md_t src_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t stat_md; - hrt::sycl::md_t axis_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t stat_md; + xpu::sycl::md_t axis_md; dim_t transpose_col; dim_t transpose_row; dim_t group_size; @@ -142,16 +142,16 @@ struct sycl_resampling_conf_t { data_type_t src_dt; data_type_t dst_dt; - hrt::sycl::md_t src_md; - hrt::sycl::md_t src1_md[8]; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t diff_src_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t src1_md[8]; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t diff_src_md; + xpu::sycl::md_t diff_dst_md; alg_kind_t alg; float src_scale; bool do_scale_src; - int broadcast_dims[hrt::sycl::md_t::max_dims]; + int broadcast_dims[xpu::sycl::md_t::max_dims]; int ndims; bool is_tensor_op; @@ -164,17 +164,17 @@ struct sycl_resampling_conf_t { struct sycl_layer_normalization_conf_t { prop_kind_t prop_kind; - hrt::sycl::md_t data_md; - hrt::sycl::md_t diff_data_md; - hrt::sycl::md_t data_scaleshift_md; - hrt::sycl::md_t diff_data_scaleshift_md; - hrt::sycl::md_t scale; - hrt::sycl::md_t shift; - hrt::sycl::md_t stat_md; - hrt::sycl::md_t stat_d; - hrt::sycl::md_t var_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t data_md; + xpu::sycl::md_t diff_data_md; + xpu::sycl::md_t data_scaleshift_md; + xpu::sycl::md_t diff_data_scaleshift_md; + xpu::sycl::md_t scale; + xpu::sycl::md_t shift; + xpu::sycl::md_t stat_md; + xpu::sycl::md_t stat_d; + xpu::sycl::md_t var_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t diff_dst_md; dim_t wk_size; bool is_fwd; bool src_def; @@ -216,18 +216,18 @@ struct sycl_batch_normalization_conf_t { bool use_shift; float alpha; bool dir; - hrt::sycl::md_t data_md; - hrt::sycl::md_t src1_md; - hrt::sycl::md_t dst1_md; - hrt::sycl::md_t diff_data_md; - hrt::sycl::md_t diff_src1_md; - hrt::sycl::md_t data_scaleshift_md; - hrt::sycl::md_t diff_data_scaleshift_md; - 
hrt::sycl::md_t stat_md; - hrt::sycl::md_t var_md; - hrt::sycl::md_t ws_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t data_md; + xpu::sycl::md_t src1_md; + xpu::sycl::md_t dst1_md; + xpu::sycl::md_t diff_data_md; + xpu::sycl::md_t diff_src1_md; + xpu::sycl::md_t data_scaleshift_md; + xpu::sycl::md_t diff_data_scaleshift_md; + xpu::sycl::md_t stat_md; + xpu::sycl::md_t var_md; + xpu::sycl::md_t ws_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t diff_dst_md; dim_t N; dim_t C; dim_t D; @@ -246,12 +246,12 @@ struct sycl_batch_normalization_conf_t { struct sycl_softmax_conf_t { prop_kind_t prop_kind; - hrt::sycl::md_t src_md; - hrt::sycl::md_t dst_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t dst_md; - hrt::sycl::md_t diff_md; - hrt::sycl::md_t diff_src_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t diff_md; + xpu::sycl::md_t diff_src_md; + xpu::sycl::md_t diff_dst_md; alg_kind_t alg_kind; dim_t block_size; dim_t wg_size; @@ -267,10 +267,10 @@ struct sycl_softmax_conf_t { }; struct sycl_lrn_conf_t { - hrt::sycl::md_t src_md; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t diff_dst_md; - hrt::sycl::md_t diff_src_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t diff_dst_md; + xpu::sycl::md_t diff_src_md; alg_kind_t alg_kind; dim_t mb; @@ -293,12 +293,12 @@ struct sycl_lrn_conf_t { }; struct sycl_pooling_conf_t { - hrt::sycl::md_t src_md; - hrt::sycl::md_t src1_md[8]; - hrt::sycl::md_t dst_md; - hrt::sycl::md_t ws_md; - hrt::sycl::md_t diff_src_md; - hrt::sycl::md_t diff_dst_md; + xpu::sycl::md_t src_md; + xpu::sycl::md_t src1_md[8]; + xpu::sycl::md_t dst_md; + xpu::sycl::md_t ws_md; + xpu::sycl::md_t diff_src_md; + xpu::sycl::md_t diff_dst_md; int ndims; int po_len; bool zero_dims; diff --git a/src/gpu/sycl/sycl_q10n.hpp b/src/gpu/sycl/sycl_q10n.hpp index 5d3f2950fec..4c0ed6df3fe 100644 --- a/src/gpu/sycl/sycl_q10n.hpp +++ b/src/gpu/sycl/sycl_q10n.hpp @@ -20,8 +20,8 @@ #include "common/c_types_map.hpp" #include 
"common/math_utils.hpp" #include "common/utils.hpp" -#include "hrt/sycl/types.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/types.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/sycl/stream_profiler.cpp b/src/sycl/stream_profiler.cpp index df754a27f36..4fd78c979c4 100644 --- a/src/sycl/stream_profiler.cpp +++ b/src/sycl/stream_profiler.cpp @@ -21,8 +21,8 @@ #include "common/c_types_map.hpp" #include "common/utils.hpp" -#include "hrt/sycl/utils.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { diff --git a/src/sycl/sycl_cpu_engine.hpp b/src/sycl/sycl_cpu_engine.hpp index 1283916668b..8ad46f519ea 100644 --- a/src/sycl/sycl_cpu_engine.hpp +++ b/src/sycl/sycl_cpu_engine.hpp @@ -36,7 +36,7 @@ class sycl_cpu_engine_t : public sycl_engine_base_t { sycl_cpu_engine_t( const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) : sycl_engine_base_t(engine_kind::cpu, dev, ctx, index) { - assert(dev.is_cpu() || hrt::sycl::is_host(dev)); + assert(dev.is_cpu() || xpu::sycl::is_host(dev)); } status_t create_memory_storage(memory_storage_t **storage, unsigned flags, diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index e426c36dbf7..d71b0f58b78 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -37,23 +37,23 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { if (!device.is_gpu()) return status::success; // skip other vendors - if (!hrt::sycl::is_intel_device(device)) return status::success; + if (!xpu::sycl::is_intel_device(device)) return status::success; - auto be = hrt::sycl::get_backend(device); - if (be == hrt::sycl::backend_t::opencl) { + auto be = xpu::sycl::get_backend(device); + if (be == xpu::sycl::backend_t::opencl) { cl_int err = CL_SUCCESS; - auto ocl_dev = hrt::sycl::compat::get_native(device); - auto ocl_dev_wrapper = hrt::ocl::make_wrapper(ocl_dev); + auto ocl_dev = 
xpu::sycl::compat::get_native(device); + auto ocl_dev_wrapper = xpu::ocl::make_wrapper(ocl_dev); - auto ocl_ctx_wrapper = hrt::ocl::make_wrapper( + auto ocl_ctx_wrapper = xpu::ocl::make_wrapper( clCreateContext(nullptr, 1, &ocl_dev, nullptr, nullptr, &err)); OCL_CHECK(err); gpu::intel::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper, ocl_ctx_wrapper, gpu_arch_, stepping_id_, native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); - } else if (be == hrt::sycl::backend_t::level0) { + } else if (be == xpu::sycl::backend_t::level0) { // TODO: add support for L0 binary ngen check // XXX: query from ocl_engine for now gpu::intel::ocl::ocl_engine_factory_t f(engine_kind::gpu); @@ -117,20 +117,20 @@ status_t sycl_device_info_t::init_extensions(engine_t *engine) { status_t sycl_device_info_t::init_attributes(engine_t *engine) { auto &device = utils::downcast(engine)->device(); - if (device.is_gpu() && hrt::sycl::is_intel_device(device)) { - hrt::sycl::backend_t be = hrt::sycl::get_backend(device); - if (be == hrt::sycl::backend_t::opencl) { + if (device.is_gpu() && xpu::sycl::is_intel_device(device)) { + xpu::sycl::backend_t be = xpu::sycl::get_backend(device); + if (be == xpu::sycl::backend_t::opencl) { // XXX: OpenCL backend get_info() queries below are not yet // supported so query OpenCL directly. 
cl_device_id ocl_dev - = hrt::sycl::compat::get_native(device); + = xpu::sycl::compat::get_native(device); CHECK(gpu::intel::ocl::get_ocl_device_eu_count( ocl_dev, gpu_arch_, &eu_count_)); } else { auto slices = device.get_info< - hrt::sycl::compat::ext_intel_gpu_slices>(); + xpu::sycl::compat::ext_intel_gpu_slices>(); auto sub_slices = device.get_info< - hrt::sycl::compat::ext_intel_gpu_subslices_per_slice>(); + xpu::sycl::compat::ext_intel_gpu_subslices_per_slice>(); auto eus_per_subslice = device.get_info<::sycl::info::device:: ext_intel_gpu_eu_count_per_subslice>(); if (gpu_arch_ == gpu::intel::compute::gpu_arch_t::xe2) diff --git a/src/sycl/sycl_engine.cpp b/src/sycl/sycl_engine.cpp index 1c73b597c03..85b2f2ab271 100644 --- a/src/sycl/sycl_engine.cpp +++ b/src/sycl/sycl_engine.cpp @@ -31,7 +31,7 @@ status_t sycl_engine_factory_t::engine_create( auto dev_type = (engine_kind_ == engine_kind::cpu) ? ::sycl::info::device_type::cpu : ::sycl::info::device_type::gpu; - auto devices = hrt::sycl::get_devices(dev_type); + auto devices = xpu::sycl::get_devices(dev_type); auto &dev = devices[index]; auto exception_handler = [](const ::sycl::exception_list &eptr_list) { @@ -61,7 +61,7 @@ status_t sycl_engine_factory_t::engine_create(engine_t **engine, const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) const { // Validate device and context. 
- VERROR_ENGINE(hrt::sycl::dev_ctx_consistency_check(dev, ctx), + VERROR_ENGINE(xpu::sycl::dev_ctx_consistency_check(dev, ctx), status::invalid_arguments, VERBOSE_DEVICE_CTX_MISMATCH); #ifdef DNNL_SYCL_CUDA @@ -76,7 +76,7 @@ status_t sycl_engine_factory_t::engine_create(engine_t **engine, engine, engine_kind_, dev, ctx, index); #endif VERROR_ENGINE(!(engine_kind_ == engine_kind::cpu && !dev.is_cpu() - && !hrt::sycl::is_host(dev)), + && !xpu::sycl::is_host(dev)), status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); VERROR_ENGINE(!(engine_kind_ == engine_kind::gpu && !dev.is_gpu()), status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); diff --git a/src/sycl/sycl_engine.hpp b/src/sycl/sycl_engine.hpp index 135964afa26..37fe88c947c 100644 --- a/src/sycl/sycl_engine.hpp +++ b/src/sycl/sycl_engine.hpp @@ -81,7 +81,7 @@ class sycl_engine_factory_t : public engine_factory_t { auto dev_type = (engine_kind_ == engine_kind::cpu) ? ::sycl::info::device_type::cpu : ::sycl::info::device_type::gpu; - return hrt::sycl::get_devices(dev_type).size(); + return xpu::sycl::get_devices(dev_type).size(); } status_t engine_create(engine_t **engine, size_t index) const override; diff --git a/src/sycl/sycl_engine_base.cpp b/src/sycl/sycl_engine_base.cpp index 7d37972283c..5b1388448d7 100644 --- a/src/sycl/sycl_engine_base.cpp +++ b/src/sycl/sycl_engine_base.cpp @@ -19,9 +19,9 @@ #include "common/memory.hpp" #include "common/memory_storage.hpp" #include "gpu/intel/sycl/compat.hpp" -#include "hrt/sycl/memory_storage.hpp" #include "sycl/sycl_device_info.hpp" #include "sycl/sycl_stream.hpp" +#include "xpu/sycl/memory_storage.hpp" namespace dnnl { namespace impl { @@ -32,10 +32,10 @@ status_t sycl_engine_base_t::create_memory_storage( std::unique_ptr _storage; if (flags & memory_flags_t::prefer_device_usm) { - _storage.reset(new hrt::sycl::usm_memory_storage_t( + _storage.reset(new xpu::sycl::usm_memory_storage_t( this, ::sycl::usm::alloc::device)); } else - _storage.reset(new 
hrt::sycl::buffer_memory_storage_t(this)); + _storage.reset(new xpu::sycl::buffer_memory_storage_t(this)); if (!_storage) return status::out_of_memory; diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index 609c7de87c2..ff0f10867db 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -28,7 +28,7 @@ #include "gpu/intel/sycl/compat.hpp" #include "gpu/intel/sycl/utils.hpp" #include "gpu/sycl/sycl_interop_gpu_kernel.hpp" -#include "hrt/sycl/engine_id.hpp" +#include "xpu/sycl/engine_id.hpp" namespace dnnl { namespace impl { @@ -41,18 +41,18 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { : gpu::intel::compute::compute_engine_t(kind, runtime_kind::sycl, index) , device_(dev) , context_(ctx) - , backend_(hrt::sycl::backend_t::unknown) {} + , backend_(xpu::sycl::backend_t::unknown) {} status_t init() override { - backend_ = hrt::sycl::get_backend(device_); - VERROR_ENGINE(utils::one_of(backend_, hrt::sycl::backend_t::host, - hrt::sycl::backend_t::opencl, - hrt::sycl::backend_t::level0, - hrt::sycl::backend_t::nvidia, - hrt::sycl::backend_t::amd), + backend_ = xpu::sycl::get_backend(device_); + VERROR_ENGINE(utils::one_of(backend_, xpu::sycl::backend_t::host, + xpu::sycl::backend_t::opencl, + xpu::sycl::backend_t::level0, + xpu::sycl::backend_t::nvidia, + xpu::sycl::backend_t::amd), status::invalid_arguments, VERBOSE_UNSUPPORTED_BACKEND, "sycl"); - CHECK(hrt::sycl::check_device(kind(), device_, context_)); + CHECK(xpu::sycl::check_device(kind(), device_, context_)); CHECK(gpu::intel::compute::compute_engine_t::init()); return status::success; @@ -75,7 +75,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { if (!ocl_kernels[i]) continue; auto *k = utils::downcast( ocl_kernels[i].impl()); - hrt::binary_t binary; + xpu::binary_t binary; CHECK(k->get_binary(ocl_engine, binary)); CHECK(create_kernel_from_binary( kernels[i], binary, kernel_names[i])); @@ -84,7 +84,7 @@ 
class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { } status_t create_kernel_from_binary(gpu::intel::compute::kernel_t &kernel, - const hrt::binary_t &binary, + const xpu::binary_t &binary, const char *kernel_name) const override { std::vector arg_types; @@ -136,7 +136,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { auto kernel_name = jitter->kernel_name(); - hrt::binary_t binary = jitter->get_binary( + xpu::binary_t binary = jitter->get_binary( ocl_engine->context(), ocl_engine->device()); return create_kernel_from_binary(*kernel, binary, kernel_name); } @@ -166,33 +166,33 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { const ::sycl::device &device() const { return device_; } const ::sycl::context &context() const { return context_; } - hrt::sycl::backend_t backend() const { return backend_; } + xpu::sycl::backend_t backend() const { return backend_; } cl_device_id ocl_device() const { - if (backend() != hrt::sycl::backend_t::opencl) { + if (backend() != xpu::sycl::backend_t::opencl) { assert(!"not expected"); return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); - return hrt::ocl::make_wrapper( - hrt::sycl::compat::get_native(device())); + return xpu::ocl::make_wrapper( + xpu::sycl::compat::get_native(device())); } cl_context ocl_context() const { - if (backend() != hrt::sycl::backend_t::opencl) { + if (backend() != xpu::sycl::backend_t::opencl) { assert(!"not expected"); return nullptr; } assert(device_.is_cpu() || device_.is_gpu()); - return hrt::ocl::make_wrapper( - hrt::sycl::compat::get_native(context())); + return xpu::ocl::make_wrapper( + xpu::sycl::compat::get_native(context())); } device_id_t device_id() const override { - return hrt::sycl::device_id(device_); + return xpu::sycl::device_id(device_); } engine_id_t engine_id() const override { - return engine_id_t(new hrt::sycl::engine_id_impl_t( + return engine_id_t(new xpu::sycl::engine_id_impl_t( device(), 
context(), kind(), runtime_kind(), index())); } @@ -204,7 +204,7 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { ::sycl::device device_; ::sycl::context context_; - hrt::sycl::backend_t backend_; + xpu::sycl::backend_t backend_; }; } // namespace sycl diff --git a/src/sycl/sycl_stream.cpp b/src/sycl/sycl_stream.cpp index d8ae46ce5b3..3b876191e7f 100644 --- a/src/sycl/sycl_stream.cpp +++ b/src/sycl/sycl_stream.cpp @@ -68,7 +68,7 @@ status_t sycl_stream_t::init() { && IMPLICATION( engine()->kind() == engine_kind::gpu, sycl_dev.is_gpu()) && IMPLICATION(engine()->kind() == engine_kind::cpu, - (sycl_dev.is_cpu() || hrt::sycl::is_host(sycl_dev))); + (sycl_dev.is_cpu() || xpu::sycl::is_host(sycl_dev))); if (!args_ok) return status::invalid_arguments; } diff --git a/src/sycl/sycl_stream.hpp b/src/sycl/sycl_stream.hpp index 4fdec09e4eb..62e4575c3cf 100644 --- a/src/sycl/sycl_stream.hpp +++ b/src/sycl/sycl_stream.hpp @@ -26,9 +26,9 @@ #include "gpu/intel/compute/compute_stream.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" #include "gpu/sycl/sycl_gpu_engine.hpp" -#include "hrt/sycl/memory_storage.hpp" #include "sycl/stream_profiler.hpp" #include "sycl/sycl_context.hpp" +#include "xpu/sycl/memory_storage.hpp" #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL #include "sycl/sycl_stream_cpu_thunk.hpp" @@ -162,21 +162,21 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { // Handle all other cases. 
auto *sycl_src - = utils::downcast( + = utils::downcast( &src); auto *sycl_dst - = utils::downcast( + = utils::downcast( &dst); - bool usm_src = sycl_src->memory_kind() == hrt::sycl::memory_kind::usm; - bool usm_dst = sycl_dst->memory_kind() == hrt::sycl::memory_kind::usm; + bool usm_src = sycl_src->memory_kind() == xpu::sycl::memory_kind::usm; + bool usm_dst = sycl_dst->memory_kind() == xpu::sycl::memory_kind::usm; ::sycl::event e; if (usm_src && usm_dst) { auto *usm_src - = utils::downcast( + = utils::downcast( &src); auto *usm_dst - = utils::downcast( + = utils::downcast( &dst); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); @@ -184,10 +184,10 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { }); } else if (usm_src && !usm_dst) { auto *usm_src - = utils::downcast( + = utils::downcast( &src); auto *buffer_dst = utils::downcast< - const hrt::sycl::buffer_memory_storage_t *>(&dst); + const xpu::sycl::buffer_memory_storage_t *>(&dst); auto &b_dst = buffer_dst->buffer(); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); @@ -197,10 +197,10 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { }); } else if (!usm_src && usm_dst) { auto *buffer_src = utils::downcast< - const hrt::sycl::buffer_memory_storage_t *>(&src); + const xpu::sycl::buffer_memory_storage_t *>(&src); auto &b_src = buffer_src->buffer(); auto *usm_dst - = utils::downcast( + = utils::downcast( &dst); e = queue_->submit([&](::sycl::handler &cgh) { cgh.depends_on(sycl_event_t::from(deps).events); @@ -211,9 +211,9 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { } else { // if (!usm_src && !usm_dst) assert(!usm_src && !usm_dst && "USM is not supported yet"); auto *buffer_src = utils::downcast< - const hrt::sycl::buffer_memory_storage_t *>(&src); + const xpu::sycl::buffer_memory_storage_t *>(&src); auto *buffer_dst = utils::downcast< - const 
hrt::sycl::buffer_memory_storage_t *>(&dst); + const xpu::sycl::buffer_memory_storage_t *>(&dst); auto &b_src = buffer_src->buffer(); auto &b_dst = buffer_dst->buffer(); e = queue_->submit([&](::sycl::handler &cgh) { @@ -241,15 +241,15 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { const gpu::intel::compute::event_t &deps, gpu::intel::compute::event_t &out_dep) override { auto *sycl_dst - = utils::downcast( + = utils::downcast( &dst); - bool usm = sycl_dst->memory_kind() == hrt::sycl::memory_kind::usm; + bool usm = sycl_dst->memory_kind() == xpu::sycl::memory_kind::usm; ::sycl::event out_event; if (usm) { auto *usm_dst - = utils::downcast( + = utils::downcast( &dst); auto dst_ptr = static_cast(usm_dst->usm_ptr()); // Note: we cannot use queue_.fill since it cannot handle @@ -260,11 +260,11 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { }); } else { auto *buffer_dst = utils::downcast< - const hrt::sycl::buffer_memory_storage_t *>(&dst); + const xpu::sycl::buffer_memory_storage_t *>(&dst); out_event = queue_->submit([&](::sycl::handler &cgh) { // need a u8 accessor to get the proper range ::sycl::accessor + xpu::sycl::compat::target_device> acc_dst(buffer_dst->buffer(), cgh, ::sycl::range<1>(size), ::sycl::id<1>(0)); cgh.depends_on(sycl_event_t::from(deps).events); @@ -344,7 +344,7 @@ struct sycl_stream_t : public gpu::intel::compute::compute_stream_t { // XXX: this is a temporary solution to make sycl_memory_arg_t // default constructible. 
- hrt::sycl::buffer_u8_t dummy_buffer_ = hrt::sycl::buffer_u8_t(1); + xpu::sycl::buffer_u8_t dummy_buffer_ = xpu::sycl::buffer_u8_t(1); private: status_t init(); diff --git a/src/sycl/sycl_stream_submit_cpu_primitive.cpp b/src/sycl/sycl_stream_submit_cpu_primitive.cpp index 1eac0e84e79..c3319822cd3 100644 --- a/src/sycl/sycl_stream_submit_cpu_primitive.cpp +++ b/src/sycl/sycl_stream_submit_cpu_primitive.cpp @@ -26,8 +26,8 @@ #include "common/stream.hpp" #include "common/utils.hpp" #include "gpu/intel/sycl/compat.hpp" -#include "hrt/sycl/c_types_map.hpp" -#include "hrt/sycl/memory_storage.hpp" +#include "xpu/sycl/c_types_map.hpp" +#include "xpu/sycl/memory_storage.hpp" #include #include @@ -57,7 +57,7 @@ template status_t submit_cpu_primitive_with_params_impl( submit_ctx_t *submit_ctx, ::sycl::handler &cgh, param_types... params) { - hrt::sycl::compat::host_task(cgh, [=]() { + xpu::sycl::compat::host_task(cgh, [=]() { thunk_params_t thunk_params; thunk_params.submit_ctx_ptr = submit_ctx; @@ -87,7 +87,7 @@ void fast_dispatch_by_size(submit_ctx_t *submit_ctx, ::sycl::handler &cgh, constexpr size_t nparams = sizeof...(storage_types); auto params_tp = std::make_tuple( - utils::downcast( + utils::downcast( storages) ->buffer()...); submit_cpu_primitive_with_params_impl( @@ -122,9 +122,9 @@ void submit_cpu_primitive(stream_t *stream, const primitive_iface_t *prim_iface, // Skip USM memory storages as they do not require special // handling and can be accessed directly auto mem_api_kind = utils::downcast< - const hrt::sycl::memory_storage_base_t *>(mem_storage) + const xpu::sycl::memory_storage_base_t *>(mem_storage) ->memory_kind(); - if (mem_api_kind == hrt::sycl::memory_kind::usm) continue; + if (mem_api_kind == xpu::sycl::memory_kind::usm) continue; sycl_mem_storages.push_back(mem_storage); } } diff --git a/src/sycl/sycl_stream_submit_cpu_primitive.hpp b/src/sycl/sycl_stream_submit_cpu_primitive.hpp index 400b0a948bd..d2a47ba9ea4 100644 --- 
a/src/sycl/sycl_stream_submit_cpu_primitive.hpp +++ b/src/sycl/sycl_stream_submit_cpu_primitive.hpp @@ -18,8 +18,8 @@ #define SYCL_STREAM_SUBMIT_CPU_DISPATCH_HPP #include "common/c_types_map.hpp" -#include "hrt/sycl/utils.hpp" #include "sycl/sycl_stream_cpu_thunk.hpp" +#include "xpu/sycl/utils.hpp" #include diff --git a/src/hrt/CMakeLists.txt b/src/xpu/CMakeLists.txt similarity index 97% rename from src/hrt/CMakeLists.txt rename to src/xpu/CMakeLists.txt index 0b470c61010..0845c630fd4 100644 --- a/src/hrt/CMakeLists.txt +++ b/src/xpu/CMakeLists.txt @@ -29,7 +29,7 @@ endif() # `ocl` directory will have to be added only for the intel vendor. add_subdirectory(ocl) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_xpu) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) diff --git a/src/hrt/ocl/CMakeLists.txt b/src/xpu/ocl/CMakeLists.txt similarity index 96% rename from src/hrt/ocl/CMakeLists.txt rename to src/xpu/ocl/CMakeLists.txt index f5f62bcec33..364605b6d54 100644 --- a/src/hrt/ocl/CMakeLists.txt +++ b/src/xpu/ocl/CMakeLists.txt @@ -21,7 +21,7 @@ file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt_ocl) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_xpu_ocl) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) diff --git a/src/hrt/ocl/capi/engine.cpp b/src/xpu/ocl/capi/engine.cpp similarity index 97% rename from src/hrt/ocl/capi/engine.cpp rename to src/xpu/ocl/capi/engine.cpp index 4f50f70bc77..eaf2f727c10 100644 --- a/src/hrt/ocl/capi/engine.cpp +++ b/src/xpu/ocl/capi/engine.cpp @@ -21,7 +21,7 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" -#include "hrt/ocl/utils.hpp" +#include "xpu/ocl/utils.hpp" using namespace dnnl::impl; using namespace dnnl::impl::gpu::intel::ocl; @@ -34,7 +34,7 @@ status_t dnnl_ocl_interop_engine_create( ocl_engine_factory_t 
f(engine_kind::gpu); size_t index; - CHECK(hrt::ocl::get_device_index(&index, device)); + CHECK(xpu::ocl::get_device_index(&index, device)); return f.engine_create(engine, device, context, index); } @@ -72,7 +72,7 @@ status_t dnnl_ocl_interop_engine_create_from_cache_blob(engine_t **engine, ocl_engine_factory_t f(engine_kind::gpu); size_t index; - CHECK(hrt::ocl::get_device_index(&index, device)); + CHECK(xpu::ocl::get_device_index(&index, device)); const std::vector cb(cache_blob, cache_blob + size); return f.engine_create(engine, device, context, index, cb); diff --git a/src/hrt/ocl/capi/memory.cpp b/src/xpu/ocl/capi/memory.cpp similarity index 100% rename from src/hrt/ocl/capi/memory.cpp rename to src/xpu/ocl/capi/memory.cpp diff --git a/src/hrt/ocl/capi/primitive.cpp b/src/xpu/ocl/capi/primitive.cpp similarity index 94% rename from src/hrt/ocl/capi/primitive.cpp rename to src/xpu/ocl/capi/primitive.cpp index 2c67c5a8c06..943189be264 100644 --- a/src/hrt/ocl/capi/primitive.cpp +++ b/src/xpu/ocl/capi/primitive.cpp @@ -24,7 +24,7 @@ #include "common/primitive_desc_iface.hpp" #include "common/primitive_iface.hpp" #include "common/utils.hpp" -#include "hrt/ocl/utils.hpp" +#include "xpu/ocl/utils.hpp" #include "gpu/intel/ocl/ocl_c_types_map.hpp" #include "gpu/intel/ocl/ocl_engine.hpp" @@ -50,9 +50,9 @@ status_t dnnl_ocl_interop_primitive_execute( ocl_stream->before_exec_hook(); if (deps != nullptr) { - std::vector> events(ndeps); + std::vector> events(ndeps); for (int i = 0; i < ndeps; i++) { - events[i] = hrt::ocl::wrapper_t(deps[i], true); + events[i] = xpu::ocl::wrapper_t(deps[i], true); } ocl_stream->ocl_ctx().set_deps(events); } diff --git a/src/hrt/ocl/capi/stream.cpp b/src/xpu/ocl/capi/stream.cpp similarity index 100% rename from src/hrt/ocl/capi/stream.cpp rename to src/xpu/ocl/capi/stream.cpp diff --git a/src/hrt/ocl/utils.cpp b/src/xpu/ocl/utils.cpp similarity index 98% rename from src/hrt/ocl/utils.cpp rename to src/xpu/ocl/utils.cpp index 
da956a42399..f3fe2e100a4 100644 --- a/src/hrt/ocl/utils.cpp +++ b/src/xpu/ocl/utils.cpp @@ -20,7 +20,7 @@ // - CL_DEVICE_UUID_KHR #include -#include "hrt/ocl/utils.hpp" +#include "xpu/ocl/utils.hpp" // XXX: Include this header for VERROR_ENGINE. // TODO: Move VERROR_ENGINE and other similar macros to a separate file. @@ -31,7 +31,7 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace ocl { status_t convert_to_dnnl(cl_int cl_status) { @@ -304,7 +304,7 @@ cl_platform_id get_platform(engine_t *engine) { } status_t create_program(ocl::wrapper_t &ocl_program, - cl_device_id dev, cl_context ctx, const hrt::binary_t &binary) { + cl_device_id dev, cl_context ctx, const xpu::binary_t &binary) { cl_int err; const unsigned char *binary_buffer = binary.data(); size_t binary_size = binary.size(); @@ -319,7 +319,7 @@ status_t create_program(ocl::wrapper_t &ocl_program, return status::success; } -status_t get_device_uuid(hrt::device_uuid_t &uuid, cl_device_id ocl_dev) { +status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev) { // This function is used only with SYCL that works with OpenCL 3.0 // that supports `cl_khr_device_uuid` extension. 
#if defined(cl_khr_device_uuid) @@ -336,7 +336,7 @@ status_t get_device_uuid(hrt::device_uuid_t &uuid, cl_device_id ocl_dev) { uuid_packed[i / sizeof(uint64_t)] |= (((uint64_t)ocl_dev_uuid[i]) << shift); } - uuid = hrt::device_uuid_t(uuid_packed[0], uuid_packed[1]); + uuid = xpu::device_uuid_t(uuid_packed[0], uuid_packed[1]); return status::success; #endif return status::runtime_error; @@ -411,6 +411,6 @@ status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) { } } // namespace ocl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/ocl/utils.hpp b/src/xpu/ocl/utils.hpp similarity index 95% rename from src/hrt/ocl/utils.hpp rename to src/xpu/ocl/utils.hpp index bca644ee0e2..185c5de3a28 100644 --- a/src/hrt/ocl/utils.hpp +++ b/src/xpu/ocl/utils.hpp @@ -14,8 +14,8 @@ * limitations under the License. *******************************************************************************/ -#ifndef COMMON_HRT_OCL_UTILS_HPP -#define COMMON_HRT_OCL_UTILS_HPP +#ifndef COMMON_XPU_OCL_UTILS_HPP +#define COMMON_XPU_OCL_UTILS_HPP #include @@ -26,11 +26,11 @@ #include "common/utils.hpp" #include "common/verbose.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace ocl { status_t convert_to_dnnl(cl_int cl_status); @@ -44,7 +44,7 @@ const char *convert_cl_int_to_str(cl_int cl_status); #define MAYBE_REPORT_OCL_ERROR(s) \ do { \ VERROR(primitive, ocl, "errcode %d,%s,%s:%d", int(s), \ - dnnl::impl::hrt::ocl::convert_cl_int_to_str(s), __FILENAME__, \ + dnnl::impl::xpu::ocl::convert_cl_int_to_str(s), __FILENAME__, \ __LINE__); \ } while (0) @@ -62,7 +62,7 @@ const char *convert_cl_int_to_str(cl_int cl_status); cl_int s = x; \ if (s != CL_SUCCESS) { \ MAYBE_REPORT_OCL_ERROR(s); \ - return dnnl::impl::hrt::ocl::convert_to_dnnl(s); \ + return dnnl::impl::xpu::ocl::convert_to_dnnl(s); \ } \ } while (0) @@ -281,9 +281,9 @@ cl_platform_id 
get_platform(cl_device_id device); cl_platform_id get_platform(engine_t *engine); status_t create_program(ocl::wrapper_t &ocl_program, - cl_device_id dev, cl_context ctx, const hrt::binary_t &binary); + cl_device_id dev, cl_context ctx, const xpu::binary_t &binary); -status_t get_device_uuid(hrt::device_uuid_t &uuid, cl_device_id ocl_dev); +status_t get_device_uuid(xpu::device_uuid_t &uuid, cl_device_id ocl_dev); // Check for three conditions: // 1. Device and context are compatible, i.e. the device belongs to @@ -295,7 +295,7 @@ status_t check_device(engine_kind_t eng_kind, cl_device_id dev, cl_context ctx); status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel); } // namespace ocl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/CMakeLists.txt b/src/xpu/sycl/CMakeLists.txt similarity index 96% rename from src/hrt/sycl/CMakeLists.txt rename to src/xpu/sycl/CMakeLists.txt index 4c4bdb95e2c..5e78fb93078 100644 --- a/src/hrt/sycl/CMakeLists.txt +++ b/src/xpu/sycl/CMakeLists.txt @@ -21,7 +21,7 @@ file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_hrt_sycl) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_xpu_sycl) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) diff --git a/src/hrt/sycl/buffer_memory_storage.cpp b/src/xpu/sycl/buffer_memory_storage.cpp similarity index 93% rename from src/hrt/sycl/buffer_memory_storage.cpp rename to src/xpu/sycl/buffer_memory_storage.cpp index 28de215a229..8832980e82c 100644 --- a/src/hrt/sycl/buffer_memory_storage.cpp +++ b/src/xpu/sycl/buffer_memory_storage.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*******************************************************************************/ -#include "hrt/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" #include "sycl/sycl_engine_base.hpp" #include "common/memory.hpp" @@ -24,7 +24,7 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { namespace { @@ -93,13 +93,13 @@ std::unique_ptr buffer_memory_storage_t::get_sub_storage( storage->buffer_ = buffer_; } else { gpu_assert(IMPLICATION( - hrt::sycl::is_intel_device( + xpu::sycl::is_intel_device( utils::downcast( engine()) ->device()), offset % gpu::intel::ocl::OCL_BUFFER_ALIGNMENT == 0)); - hrt::sycl::buffer_u8_t *sub_buffer = buffer_ - ? new hrt::sycl::buffer_u8_t( + xpu::sycl::buffer_u8_t *sub_buffer = buffer_ + ? new xpu::sycl::buffer_u8_t( parent_buffer(), base_offset_ + offset, size) : nullptr; storage->buffer_.reset(sub_buffer); @@ -130,12 +130,12 @@ status_t buffer_memory_storage_t::init_allocate(size_t size) { return status::out_of_memory; } - buffer_ = std::make_shared(::sycl::range<1>(size)); + buffer_ = std::make_shared(::sycl::range<1>(size)); if (!buffer_) return status::out_of_memory; return status::success; } -hrt::sycl::buffer_u8_t &buffer_memory_storage_t::parent_buffer() const { +xpu::sycl::buffer_u8_t &buffer_memory_storage_t::parent_buffer() const { return utils::downcast(parent_storage()) ->buffer(); } @@ -155,6 +155,6 @@ inout_memory_arg_t buffer_memory_storage_t::get_inout_memory_arg( return get_memory_arg<::sycl::access::mode::read_write>(this, stream, cgh); } } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/buffer_memory_storage.hpp b/src/xpu/sycl/buffer_memory_storage.hpp similarity index 86% rename from src/hrt/sycl/buffer_memory_storage.hpp rename to src/xpu/sycl/buffer_memory_storage.hpp index b359bbf7031..2af7d8111ee 100644 --- a/src/hrt/sycl/buffer_memory_storage.hpp +++ 
b/src/xpu/sycl/buffer_memory_storage.hpp @@ -23,12 +23,12 @@ #include "common/memory_storage.hpp" #include "common/utils.hpp" #include "gpu/intel/sycl/utils.hpp" -#include "hrt/sycl/c_types_map.hpp" -#include "hrt/sycl/memory_storage_base.hpp" +#include "xpu/sycl/c_types_map.hpp" +#include "xpu/sycl/memory_storage_base.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { class buffer_memory_storage_t : public memory_storage_base_t { @@ -38,7 +38,7 @@ class buffer_memory_storage_t : public memory_storage_base_t { buffer_memory_storage_t( engine_t *engine, const memory_storage_t *parent_storage); - hrt::sycl::buffer_u8_t &buffer() const { return *buffer_; } + xpu::sycl::buffer_u8_t &buffer() const { return *buffer_; } memory_kind_t memory_kind() const override { return memory_kind::buffer; } @@ -50,8 +50,8 @@ class buffer_memory_storage_t : public memory_storage_base_t { status_t set_data_handle(void *handle) override { if (!handle) return status::success; - auto *buf_u8_ptr = static_cast(handle); - buffer_.reset(new hrt::sycl::buffer_u8_t(*buf_u8_ptr)); + auto *buf_u8_ptr = static_cast(handle); + buffer_.reset(new xpu::sycl::buffer_u8_t(*buf_u8_ptr)); return status::success; } @@ -79,14 +79,14 @@ class buffer_memory_storage_t : public memory_storage_base_t { status_t init_allocate(size_t size) override; private: - hrt::sycl::buffer_u8_t &parent_buffer() const; + xpu::sycl::buffer_u8_t &parent_buffer() const; - std::shared_ptr buffer_; + std::shared_ptr buffer_; size_t base_offset_ = 0; }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/c_types_map.hpp b/src/xpu/sycl/c_types_map.hpp similarity index 91% rename from src/hrt/sycl/c_types_map.hpp rename to src/xpu/sycl/c_types_map.hpp index a5dad3d5438..90ebc28ea68 100644 --- a/src/hrt/sycl/c_types_map.hpp +++ b/src/xpu/sycl/c_types_map.hpp @@ -14,14 +14,14 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_C_TYPES_MAP_HPP -#define HRT_SYCL_C_TYPES_MAP_HPP +#ifndef XPU_SYCL_C_TYPES_MAP_HPP +#define XPU_SYCL_C_TYPES_MAP_HPP #include "oneapi/dnnl/dnnl_sycl_types.h" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { using memory_kind_t = dnnl_sycl_interop_memory_kind_t; @@ -31,7 +31,7 @@ const memory_kind_t buffer = dnnl_sycl_interop_buffer; } // namespace memory_kind } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/capi/capi_engine.cpp b/src/xpu/sycl/capi/capi_engine.cpp similarity index 95% rename from src/hrt/sycl/capi/capi_engine.cpp rename to src/xpu/sycl/capi/capi_engine.cpp index 15cb9eb0a27..28bef6297ee 100644 --- a/src/hrt/sycl/capi/capi_engine.cpp +++ b/src/xpu/sycl/capi/capi_engine.cpp @@ -19,8 +19,8 @@ #include "common/c_types_map.hpp" #include "common/engine.hpp" #include "common/utils.hpp" -#include "hrt/sycl/utils.hpp" #include "sycl/sycl_engine.hpp" +#include "xpu/sycl/utils.hpp" using dnnl::impl::engine_t; using dnnl::impl::status_t; @@ -37,7 +37,7 @@ status_t dnnl_sycl_interop_engine_create( engine_kind_t kind; if (sycl_dev.is_gpu()) kind = engine_kind::gpu; - else if (sycl_dev.is_cpu() || dnnl::impl::hrt::sycl::is_host(sycl_dev)) + else if (sycl_dev.is_cpu() || dnnl::impl::xpu::sycl::is_host(sycl_dev)) kind = engine_kind::cpu; else VERROR_ENGINE( @@ -52,7 +52,7 @@ status_t dnnl_sycl_interop_engine_create( VERROR_ENGINE(ef, status::invalid_arguments, VERBOSE_BAD_ENGINE_KIND); size_t index; - CHECK(dnnl::impl::hrt::sycl::get_device_index(&index, sycl_dev)); + CHECK(dnnl::impl::xpu::sycl::get_device_index(&index, sycl_dev)); return ef->engine_create(engine, sycl_dev, sycl_ctx, index); } diff --git a/src/hrt/sycl/capi/capi_memory.cpp b/src/xpu/sycl/capi/capi_memory.cpp similarity index 97% rename from src/hrt/sycl/capi/capi_memory.cpp rename to 
src/xpu/sycl/capi/capi_memory.cpp index 33b4bdc9121..aa53d6090f7 100644 --- a/src/hrt/sycl/capi/capi_memory.cpp +++ b/src/xpu/sycl/capi/capi_memory.cpp @@ -21,12 +21,12 @@ #include "common/memory.hpp" #include "common/utils.hpp" -#include "hrt/sycl/c_types_map.hpp" -#include "hrt/sycl/memory_storage.hpp" #include "sycl/sycl_engine.hpp" +#include "xpu/sycl/c_types_map.hpp" +#include "xpu/sycl/memory_storage.hpp" using namespace dnnl::impl::sycl; -using namespace dnnl::impl::hrt::sycl; +using namespace dnnl::impl::xpu::sycl; using dnnl::impl::engine_t; using dnnl::impl::memory_desc_t; diff --git a/src/hrt/sycl/capi/capi_primitive.cpp b/src/xpu/sycl/capi/capi_primitive.cpp similarity index 100% rename from src/hrt/sycl/capi/capi_primitive.cpp rename to src/xpu/sycl/capi/capi_primitive.cpp diff --git a/src/hrt/sycl/capi/capi_stream.cpp b/src/xpu/sycl/capi/capi_stream.cpp similarity index 100% rename from src/hrt/sycl/capi/capi_stream.cpp rename to src/xpu/sycl/capi/capi_stream.cpp diff --git a/src/hrt/sycl/compat.cpp b/src/xpu/sycl/compat.cpp similarity index 95% rename from src/hrt/sycl/compat.cpp rename to src/xpu/sycl/compat.cpp index 8aada3971a8..38a06c2bbb5 100644 --- a/src/hrt/sycl/compat.cpp +++ b/src/xpu/sycl/compat.cpp @@ -25,12 +25,12 @@ #include -#include "hrt/sycl/compat.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/compat.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { namespace { @@ -68,6 +68,6 @@ void *get_native(const ::sycl::context &ctx) { } // namespace compat } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/compat.hpp b/src/xpu/sycl/compat.hpp similarity index 95% rename from src/hrt/sycl/compat.hpp rename to src/xpu/sycl/compat.hpp index bdebed7656b..fa5ebf680f8 100644 --- a/src/hrt/sycl/compat.hpp +++ b/src/xpu/sycl/compat.hpp @@ -14,17 +14,17 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef COMMON_HRT_SYCL_COMPAT_HPP -#define COMMON_HRT_SYCL_COMPAT_HPP +#ifndef COMMON_XPU_SYCL_COMPAT_HPP +#define COMMON_XPU_SYCL_COMPAT_HPP // This file contains a common SYCL compatibility layer. All vendor specific // SYCL code that requires compatbility must reside in the vendor directories. -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { namespace compat { @@ -90,7 +90,7 @@ inline const auto &gpu_selector_v = ::sycl::gpu_selector_v; } // namespace compat } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/engine_id.hpp b/src/xpu/sycl/engine_id.hpp similarity index 94% rename from src/hrt/sycl/engine_id.hpp rename to src/xpu/sycl/engine_id.hpp index cea6a7d7db5..a9422d17623 100644 --- a/src/hrt/sycl/engine_id.hpp +++ b/src/xpu/sycl/engine_id.hpp @@ -14,14 +14,14 @@ * limitations under the License. *******************************************************************************/ -#ifndef HRT_SYCL_ENGINE_ID_HPP -#define HRT_SYCL_ENGINE_ID_HPP +#ifndef XPU_SYCL_ENGINE_ID_HPP +#define XPU_SYCL_ENGINE_ID_HPP #include "common/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { struct engine_id_impl_t : public impl::engine_id_impl_t { @@ -55,7 +55,7 @@ struct engine_id_impl_t : public impl::engine_id_impl_t { }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/memory_storage.hpp b/src/xpu/sycl/memory_storage.hpp similarity index 82% rename from src/hrt/sycl/memory_storage.hpp rename to src/xpu/sycl/memory_storage.hpp index 0a914ab1302..387eab84f64 100644 --- a/src/hrt/sycl/memory_storage.hpp +++ b/src/xpu/sycl/memory_storage.hpp @@ -14,10 +14,10 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_MEMORY_STORAGE_HPP -#define HRT_SYCL_MEMORY_STORAGE_HPP +#ifndef XPU_SYCL_MEMORY_STORAGE_HPP +#define XPU_SYCL_MEMORY_STORAGE_HPP -#include "hrt/sycl/buffer_memory_storage.hpp" -#include "hrt/sycl/usm_memory_storage.hpp" +#include "xpu/sycl/buffer_memory_storage.hpp" +#include "xpu/sycl/usm_memory_storage.hpp" #endif diff --git a/src/hrt/sycl/memory_storage_base.cpp b/src/xpu/sycl/memory_storage_base.cpp similarity index 95% rename from src/hrt/sycl/memory_storage_base.cpp rename to src/xpu/sycl/memory_storage_base.cpp index f7b409bb230..5348b27abf0 100644 --- a/src/hrt/sycl/memory_storage_base.cpp +++ b/src/xpu/sycl/memory_storage_base.cpp @@ -14,12 +14,12 @@ * limitations under the License. *******************************************************************************/ -#include "hrt/sycl/memory_storage_base.hpp" +#include "xpu/sycl/memory_storage_base.hpp" #include "sycl/sycl_stream.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { namespace { @@ -47,6 +47,6 @@ inout_memory_arg_t memory_storage_base_t::empty_inout_memory_arg( return get_empty_memory_arg<::sycl::access::mode::read_write>(stream, cgh); } } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/memory_storage_base.hpp b/src/xpu/sycl/memory_storage_base.hpp similarity index 90% rename from src/hrt/sycl/memory_storage_base.hpp rename to src/xpu/sycl/memory_storage_base.hpp index 0f0aab5e29c..bda7cac2f34 100644 --- a/src/hrt/sycl/memory_storage_base.hpp +++ b/src/xpu/sycl/memory_storage_base.hpp @@ -14,16 +14,16 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_MEMORY_STORAGE_BASE_HPP -#define HRT_SYCL_MEMORY_STORAGE_BASE_HPP +#ifndef XPU_SYCL_MEMORY_STORAGE_BASE_HPP +#define XPU_SYCL_MEMORY_STORAGE_BASE_HPP #include "common/memory_storage.hpp" -#include "hrt/sycl/c_types_map.hpp" -#include "hrt/sycl/types.hpp" +#include "xpu/sycl/c_types_map.hpp" +#include "xpu/sycl/types.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { class memory_storage_base_t : public memory_storage_t { @@ -48,7 +48,7 @@ class memory_storage_base_t : public memory_storage_t { }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/memory_storage_helper.hpp b/src/xpu/sycl/memory_storage_helper.hpp similarity index 91% rename from src/hrt/sycl/memory_storage_helper.hpp rename to src/xpu/sycl/memory_storage_helper.hpp index f6b29d131fa..841de606256 100644 --- a/src/hrt/sycl/memory_storage_helper.hpp +++ b/src/xpu/sycl/memory_storage_helper.hpp @@ -15,11 +15,11 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_MEMORY_STORAGE_HELPER_HPP -#define HRT_SYCL_MEMORY_STORAGE_HELPER_HPP +#ifndef XPU_SYCL_MEMORY_STORAGE_HELPER_HPP +#define XPU_SYCL_MEMORY_STORAGE_HELPER_HPP #include -#include "hrt/sycl/memory_storage.hpp" +#include "xpu/sycl/memory_storage.hpp" #ifdef DNNL_SYCL_CUDA #include "gpu/nvidia/sycl_cuda_compat.hpp" @@ -31,19 +31,19 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { #define CTX_IN_SYCL_MEMORY(arg) \ - dnnl::impl::hrt::sycl::interop_memory_arg_t<::sycl::access::mode::read>( \ + dnnl::impl::xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read>( \ &CTX_IN_STORAGE(arg), cgh) #define CTX_OUT_SYCL_MEMORY(arg) \ - dnnl::impl::hrt::sycl::interop_memory_arg_t<::sycl::access::mode::write>( \ + dnnl::impl::xpu::sycl::interop_memory_arg_t<::sycl::access::mode::write>( \ &CTX_OUT_STORAGE(arg), cgh) #define CTX_SCRATCH_SYCL_MEMORY(arg) \ - dnnl::impl::hrt::sycl::interop_memory_arg_t< \ + dnnl::impl::xpu::sycl::interop_memory_arg_t< \ ::sycl::access::mode::read_write>( \ ctx.get_scratchpad_grantor().get_memory_storage(arg).get(), cgh) @@ -116,7 +116,7 @@ class interop_memory_arg_t { }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/types.hpp b/src/xpu/sycl/types.hpp similarity index 93% rename from src/hrt/sycl/types.hpp rename to src/xpu/sycl/types.hpp index b2a8fca17c9..d2e37a7eda2 100644 --- a/src/hrt/sycl/types.hpp +++ b/src/xpu/sycl/types.hpp @@ -14,37 +14,37 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_TYPES_HPP -#define HRT_SYCL_TYPES_HPP +#ifndef XPU_SYCL_TYPES_HPP +#define XPU_SYCL_TYPES_HPP #include #include "common/c_types_map.hpp" #include "common/memory_desc_wrapper.hpp" #include "common/utils.hpp" -#include "hrt/sycl/compat.hpp" -#include "hrt/sycl/utils.hpp" +#include "xpu/sycl/compat.hpp" +#include "xpu/sycl/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { // The macros are expected to be called within a command group function object // that is passed to `parallel_for`. #define CTX_IN_SYCL_KERNEL_MEMORY(arg) \ CTX_IN_STORAGE(arg).is_null() \ - ? hrt::sycl::memory_storage_base_t::empty_in_memory_arg( \ + ? xpu::sycl::memory_storage_base_t::empty_in_memory_arg( \ ctx.stream(), cgh) \ - : utils::downcast( \ + : utils::downcast( \ &CTX_IN_STORAGE(arg)) \ ->get_in_memory_arg(ctx.stream(), cgh) #define CTX_OUT_SYCL_KERNEL_MEMORY(arg) \ CTX_OUT_STORAGE(arg).is_null() \ - ? hrt::sycl::memory_storage_base_t::empty_out_memory_arg( \ + ? 
xpu::sycl::memory_storage_base_t::empty_out_memory_arg( \ ctx.stream(), cgh) \ - : utils::downcast( \ + : utils::downcast( \ &CTX_OUT_STORAGE(arg)) \ ->get_out_memory_arg(ctx.stream(), cgh) @@ -287,23 +287,23 @@ struct prec_traits { }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl namespace std { template <> -class numeric_limits { +class numeric_limits { public: - static constexpr dnnl::impl::hrt::sycl::bfloat16_t lowest() { + static constexpr dnnl::impl::xpu::sycl::bfloat16_t lowest() { return {uint16_t(0xff7f)}; } - static constexpr dnnl::impl::hrt::sycl::bfloat16_t max() { + static constexpr dnnl::impl::xpu::sycl::bfloat16_t max() { return {uint16_t(0x7f7f)}; } static constexpr int digits = 8; - static constexpr dnnl::impl::hrt::sycl::bfloat16_t epsilon() { + static constexpr dnnl::impl::xpu::sycl::bfloat16_t epsilon() { return {uint16_t((0x7f - (digits - 1)) << (digits - 1))}; } }; diff --git a/src/hrt/sycl/usm_memory_storage.cpp b/src/xpu/sycl/usm_memory_storage.cpp similarity index 97% rename from src/hrt/sycl/usm_memory_storage.cpp rename to src/xpu/sycl/usm_memory_storage.cpp index 7068e9af1a4..d2eac9220e5 100644 --- a/src/hrt/sycl/usm_memory_storage.cpp +++ b/src/xpu/sycl/usm_memory_storage.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*******************************************************************************/ -#include "hrt/sycl/usm_memory_storage.hpp" +#include "xpu/sycl/usm_memory_storage.hpp" #include "common/memory.hpp" #include "common/memory_map_manager.hpp" @@ -25,7 +25,7 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { namespace { @@ -103,6 +103,6 @@ inout_memory_arg_t usm_memory_storage_t::get_inout_memory_arg( return get_memory_arg<::sycl::access::mode::read_write>(this, stream, cgh); } } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/usm_memory_storage.hpp b/src/xpu/sycl/usm_memory_storage.hpp similarity index 98% rename from src/hrt/sycl/usm_memory_storage.hpp rename to src/xpu/sycl/usm_memory_storage.hpp index 6b66adabbe5..7c4ecf83c38 100644 --- a/src/hrt/sycl/usm_memory_storage.hpp +++ b/src/xpu/sycl/usm_memory_storage.hpp @@ -21,14 +21,14 @@ #include "common/memory_storage.hpp" #include "common/utils.hpp" -#include "hrt/sycl/memory_storage_base.hpp" #include "sycl/sycl_engine_base.hpp" +#include "xpu/sycl/memory_storage_base.hpp" #include namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { class usm_memory_storage_t : public memory_storage_base_t { @@ -156,7 +156,7 @@ class usm_memory_storage_t : public memory_storage_base_t { }; } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/utils.cpp b/src/xpu/sycl/utils.cpp similarity index 98% rename from src/hrt/sycl/utils.cpp rename to src/xpu/sycl/utils.cpp index a9800bc9944..dd410df8916 100644 --- a/src/hrt/sycl/utils.cpp +++ b/src/xpu/sycl/utils.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*******************************************************************************/ -#include "hrt/sycl/utils.hpp" -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/utils.hpp" +#include "xpu/sycl/compat.hpp" // TODO: Include only for GPU vendor intel. #include "gpu/intel/sycl/l0/utils.hpp" @@ -53,7 +53,7 @@ bool compare_hip_devices(const ::sycl::device &lhs, const ::sycl::device &rhs); namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { std::string to_string(backend_t backend) { @@ -191,7 +191,7 @@ device_id_t device_id(const ::sycl::device &dev) { = device_id_t {static_cast(backend_t::unknown), 0, 0}; switch (get_backend(dev)) { case backend_t::opencl: { - auto ocl_device = hrt::ocl::make_wrapper( + auto ocl_device = xpu::ocl::make_wrapper( compat::get_native(dev)); device_id = std::make_tuple(static_cast(backend_t::opencl), reinterpret_cast(ocl_device.get()), 0); @@ -343,6 +343,6 @@ status_t get_device_index(size_t *index, const ::sycl::device &dev) { } } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/sycl/utils.hpp b/src/xpu/sycl/utils.hpp similarity index 95% rename from src/hrt/sycl/utils.hpp rename to src/xpu/sycl/utils.hpp index af2b6934acd..8342c0be5cf 100644 --- a/src/hrt/sycl/utils.hpp +++ b/src/xpu/sycl/utils.hpp @@ -14,13 +14,13 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef HRT_SYCL_UTILS_HPP -#define HRT_SYCL_UTILS_HPP +#ifndef XPU_SYCL_UTILS_HPP +#define XPU_SYCL_UTILS_HPP #include "common/c_types_map.hpp" #include "common/utils.hpp" -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" #if __has_include() #include @@ -48,7 +48,7 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { namespace sycl { using buffer_u8_t = ::sycl::buffer; @@ -85,7 +85,7 @@ std::vector<::sycl::device> get_devices(::sycl::info::device_type dev_type, status_t get_device_index(size_t *index, const ::sycl::device &dev); } // namespace sycl -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/utils.cpp b/src/xpu/utils.cpp similarity index 94% rename from src/hrt/utils.cpp rename to src/xpu/utils.cpp index 682967272fa..8a82242c9ed 100644 --- a/src/hrt/utils.cpp +++ b/src/xpu/utils.cpp @@ -17,17 +17,17 @@ #include #include -#include "hrt/utils.hpp" +#include "xpu/utils.hpp" namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { size_t device_uuid_hasher_t::operator()(const device_uuid_t &uuid) const { const size_t seed = hash_combine(0, std::get<0>(uuid)); return hash_combine(seed, std::get<1>(uuid)); } -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/src/hrt/utils.hpp b/src/xpu/utils.hpp similarity index 92% rename from src/hrt/utils.hpp rename to src/xpu/utils.hpp index 3564bf3a879..87dfe27fe4f 100644 --- a/src/hrt/utils.hpp +++ b/src/xpu/utils.hpp @@ -14,8 +14,8 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef COMMON_HRT_UTILS_HPP -#define COMMON_HRT_UTILS_HPP +#ifndef COMMON_XPU_UTILS_HPP +#define COMMON_XPU_UTILS_HPP #include #include @@ -27,7 +27,7 @@ namespace dnnl { namespace impl { -namespace hrt { +namespace xpu { using binary_t = std::vector; using device_uuid_t = std::tuple; @@ -36,7 +36,7 @@ struct device_uuid_hasher_t { size_t operator()(const device_uuid_t &uuid) const; }; -} // namespace hrt +} // namespace xpu } // namespace impl } // namespace dnnl diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp index 12adf04ad7a..6ccbb3b7f18 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_compiled_partition.cpp @@ -74,9 +74,9 @@ TEST(SYCLApi, CompiledPartitionExecute) { dnnl::graph::testing::sycl_free_wrapper); sycl::queue q = (ekind == dnnl::engine::kind::gpu) - ? sycl::queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, + ? sycl::queue(dnnl::impl::xpu::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}) - : sycl::queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, + : sycl::queue(dnnl::impl::xpu::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}); dnnl::engine eng = sycl_interop::make_engine_with_allocator( @@ -137,9 +137,9 @@ TEST(SYCLApi, CompiledPartitionInteropExecute) { dnnl::graph::testing::sycl_free_wrapper); sycl::queue q = (ekind == dnnl::engine::kind::gpu) - ? sycl::queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, + ? 
sycl::queue(dnnl::impl::xpu::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}) - : sycl::queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, + : sycl::queue(dnnl::impl::xpu::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}); dnnl::engine eng = sycl_interop::make_engine_with_allocator( diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp index 13db74c14f8..0c1521b1000 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_engine.cpp @@ -36,9 +36,9 @@ TEST(SYCLApi, Engine) { = static_cast(api_test_engine_kind); queue q = (ekind == dnnl::engine::kind::gpu) - ? queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v, + ? queue(dnnl::impl::xpu::sycl::compat::gpu_selector_v, property::queue::in_order {}) - : queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v, + : queue(dnnl::impl::xpu::sycl::compat::cpu_selector_v, property::queue::in_order {}); allocator alloc = dnnl::graph::sycl_interop::make_allocator( diff --git a/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp b/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp index ba5c0143ba3..1b09c374852 100644 --- a/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp +++ b/tests/gtests/graph/api/sycl/test_cpp_api_tensor.cpp @@ -29,7 +29,7 @@ using namespace dnnl::graph; #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL TEST(SYCLApi, Tensor) { SKIP_IF(api_test_engine_kind == dnnl_cpu, "skip sycl test for cpu engine."); - sycl::device dev {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; + sycl::device dev {dnnl::impl::xpu::sycl::compat::gpu_selector_v}; sycl::context ctx {dev}; dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx); diff --git a/tests/gtests/graph/api/test_api_common.cpp b/tests/gtests/graph/api/test_api_common.cpp index f1535844e31..a4a4c63662a 100644 --- a/tests/gtests/graph/api/test_api_common.cpp +++ b/tests/gtests/graph/api/test_api_common.cpp @@ -27,7 
+27,7 @@ void api_test_dnnl_engine_create( if (engine_kind == dnnl_cpu) { #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL static ::sycl::device dev { - dnnl::impl::hrt::sycl::compat::cpu_selector_v}; + dnnl::impl::xpu::sycl::compat::cpu_selector_v}; static ::sycl::context ctx {dev}; if (!allocator_handle) { ASSERT_EQ(dnnl_graph_sycl_interop_allocator_create( @@ -60,7 +60,7 @@ void api_test_dnnl_engine_create( } else { #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL static ::sycl::device dev { - dnnl::impl::hrt::sycl::compat::gpu_selector_v}; + dnnl::impl::xpu::sycl::compat::gpu_selector_v}; static ::sycl::context ctx {dev}; if (!allocator_handle) { ASSERT_EQ(dnnl_graph_sycl_interop_allocator_create( @@ -92,7 +92,7 @@ dnnl::engine &cpp_api_test_dnnl_engine_create(dnnl::engine::kind engine_kind) { if (engine_kind == dnnl::engine::kind::cpu) { #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL static ::sycl::device dev { - dnnl::impl::hrt::sycl::compat::cpu_selector_v}; + dnnl::impl::xpu::sycl::compat::cpu_selector_v}; static ::sycl::context ctx {dev}; static dnnl::graph::allocator alloc = dnnl::graph::sycl_interop::make_allocator( @@ -110,7 +110,7 @@ dnnl::engine &cpp_api_test_dnnl_engine_create(dnnl::engine::kind engine_kind) { } #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL - static ::sycl::device dev {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; + static ::sycl::device dev {dnnl::impl::xpu::sycl::compat::gpu_selector_v}; static ::sycl::context ctx {dev}; static dnnl::graph::allocator alloc = dnnl::graph::sycl_interop::make_allocator( diff --git a/tests/gtests/graph/api/test_api_common.hpp b/tests/gtests/graph/api/test_api_common.hpp index 347a6487074..c0ad8c42fc7 100644 --- a/tests/gtests/graph/api/test_api_common.hpp +++ b/tests/gtests/graph/api/test_api_common.hpp @@ -35,7 +35,7 @@ #include "tests/gtests/dnnl_test_macros.hpp" #ifdef DNNL_WITH_SYCL -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" #if __has_include() #include #elif __has_include() diff --git 
a/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp b/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp index 5967bf16bda..c5f5a5d5095 100644 --- a/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp +++ b/tests/gtests/graph/unit/interface/sycl/test_allocator.cpp @@ -33,9 +33,9 @@ TEST(test_interface_test_allocator, DefaultSyclAllocator) { #endif graph::allocator_t *alloc = new graph::allocator_t(); sycl::queue q = kind == graph::engine_kind::gpu - ? sycl::queue {dnnl::impl::hrt::sycl::compat::gpu_selector_v, + ? sycl::queue {dnnl::impl::xpu::sycl::compat::gpu_selector_v, sycl::property::queue::in_order {}} - : sycl::queue {dnnl::impl::hrt::sycl::compat::cpu_selector_v, + : sycl::queue {dnnl::impl::xpu::sycl::compat::cpu_selector_v, sycl::property::queue::in_order {}}; graph::allocator_t::mem_attr_t attr { @@ -69,8 +69,8 @@ TEST(test_interface_test_allocator, SyclAllocator) { = new graph::allocator_t(dnnl::graph::testing::sycl_malloc_wrapper, dnnl::graph::testing::sycl_free_wrapper); sycl::device sycl_dev = (kind == graph::engine_kind::gpu) - ? sycl::device {dnnl::impl::hrt::sycl::compat::gpu_selector_v} - : sycl::device {dnnl::impl::hrt::sycl::compat::cpu_selector_v}; + ? sycl::device {dnnl::impl::xpu::sycl::compat::gpu_selector_v} + : sycl::device {dnnl::impl::xpu::sycl::compat::cpu_selector_v}; sycl::context sycl_ctx {sycl_dev}; auto *mem_ptr = sycl_alloc->allocate( diff --git a/tests/gtests/graph/unit/unit_test_common.cpp b/tests/gtests/graph/unit/unit_test_common.cpp index ae993d79611..6681bba92f8 100644 --- a/tests/gtests/graph/unit/unit_test_common.cpp +++ b/tests/gtests/graph/unit/unit_test_common.cpp @@ -37,8 +37,8 @@ namespace graph = dnnl::impl::graph; ::sycl::device &get_device() { static ::sycl::device dev = get_test_engine_kind() == graph::engine_kind::cpu - ? ::sycl::device {dnnl::impl::hrt::sycl::compat::cpu_selector_v} - : ::sycl::device {dnnl::impl::hrt::sycl::compat::gpu_selector_v}; + ? 
::sycl::device {dnnl::impl::xpu::sycl::compat::cpu_selector_v} + : ::sycl::device {dnnl::impl::xpu::sycl::compat::gpu_selector_v}; return dev; } diff --git a/tests/gtests/sycl/api/test_engine.cpp b/tests/gtests/sycl/api/test_engine.cpp index e6844a17d25..3d21749e483 100644 --- a/tests/gtests/sycl/api/test_engine.cpp +++ b/tests/gtests/sycl/api/test_engine.cpp @@ -20,7 +20,7 @@ #include "oneapi/dnnl/dnnl.hpp" #include "oneapi/dnnl/dnnl_sycl.hpp" -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" #include @@ -207,7 +207,7 @@ TEST_P(sycl_engine_test, SubDevice) { #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_SYCL TEST_P(sycl_engine_test, non_sycl_cpu_runtime) { try { - device dev(dnnl::impl::hrt::sycl::compat::cpu_selector_v); + device dev(dnnl::impl::xpu::sycl::compat::cpu_selector_v); context ctx(dev); EXPECT_ANY_THROW(sycl_interop::make_engine(dev, ctx)); } catch (::sycl::exception &e) { diff --git a/tests/gtests/sycl/api/test_memory_buffer.cpp b/tests/gtests/sycl/api/test_memory_buffer.cpp index 8bc6ba055a7..c0884af9e89 100644 --- a/tests/gtests/sycl/api/test_memory_buffer.cpp +++ b/tests/gtests/sycl/api/test_memory_buffer.cpp @@ -22,7 +22,7 @@ #include "oneapi/dnnl/dnnl.h" #include "oneapi/dnnl/dnnl_sycl.hpp" -#include "hrt/sycl/compat.hpp" +#include "xpu/sycl/compat.hpp" #include #include @@ -361,9 +361,9 @@ TEST_P(sycl_memory_buffer_test, EltwiseWithUserKernel) { std::unique_ptr q; if (eng_kind == engine::kind::cpu) { - q.reset(new queue(dnnl::impl::hrt::sycl::compat::cpu_selector_v)); + q.reset(new queue(dnnl::impl::xpu::sycl::compat::cpu_selector_v)); } else { - q.reset(new queue(dnnl::impl::hrt::sycl::compat::gpu_selector_v)); + q.reset(new queue(dnnl::impl::xpu::sycl::compat::gpu_selector_v)); } q->submit([&](handler &cgh) { From 0163d0d3adbcb13916b51ea547de447f3a744809 Mon Sep 17 00:00:00 2001 From: Shreyas-fuj <141716478+Shreyas-fuj@users.noreply.github.com> Date: Thu, 9 May 2024 09:27:01 +0530 Subject: [PATCH 019/187] benchdnn: brgemm: enable aarch64 
(#1890) --- src/cpu/aarch64/brgemm/brgemm.hpp | 4 +- src/cpu/aarch64/brgemm/brgemm_types.hpp | 2 + src/cpu/aarch64/cpu_isa_traits.hpp | 1 - .../jit_uni_deconv_zp_pad_str_kernel.cpp | 1 + tests/benchdnn/brgemm/bench_brgemm.cpp | 8 ++- tests/benchdnn/brgemm/brgemm.cpp | 58 +++++++++++++------ tests/benchdnn/brgemm/brgemm.hpp | 3 + tests/benchdnn/brgemm/ref_brgemm.cpp | 15 ++++- 8 files changed, 68 insertions(+), 24 deletions(-) diff --git a/src/cpu/aarch64/brgemm/brgemm.hpp b/src/cpu/aarch64/brgemm/brgemm.hpp index 098454dbd30..f6531f5ff64 100644 --- a/src/cpu/aarch64/brgemm/brgemm.hpp +++ b/src/cpu/aarch64/brgemm/brgemm.hpp @@ -219,8 +219,8 @@ void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, /// phase /// @param scratch Scratchpad memory needed in several scenarios /// -void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, - const void *addr_A, const void *addr_B, +void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, + int bs, const void *addr_A, const void *addr_B, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, const brgemm_post_ops_data_t &post_ops_data, void *scratch = nullptr); diff --git a/src/cpu/aarch64/brgemm/brgemm_types.hpp b/src/cpu/aarch64/brgemm/brgemm_types.hpp index 904367df64a..d6eb16cd6ff 100644 --- a/src/cpu/aarch64/brgemm/brgemm_types.hpp +++ b/src/cpu/aarch64/brgemm/brgemm_types.hpp @@ -28,6 +28,8 @@ namespace aarch64 { // The type defines organization of batch of matrices typedef enum { + // Undefined brgemm batch kind + brgemm_batch_kind_undef = 0, // A and B arrays of pointers brgemm_addr = 1, // Base address and array of offsets from base address. 
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp index 358987c98a7..7b2d4ff9031 100644 --- a/src/cpu/aarch64/cpu_isa_traits.hpp +++ b/src/cpu/aarch64/cpu_isa_traits.hpp @@ -21,7 +21,6 @@ #include -#include "common/dnnl_thread.hpp" #include "common/type_helpers.hpp" #include "common/utils.hpp" #include "dnnl_types.h" diff --git a/src/cpu/aarch64/jit_uni_deconv_zp_pad_str_kernel.cpp b/src/cpu/aarch64/jit_uni_deconv_zp_pad_str_kernel.cpp index 54167218c3f..cd93313ba83 100644 --- a/src/cpu/aarch64/jit_uni_deconv_zp_pad_str_kernel.cpp +++ b/src/cpu/aarch64/jit_uni_deconv_zp_pad_str_kernel.cpp @@ -16,6 +16,7 @@ *******************************************************************************/ #include +#include "common/dnnl_thread.hpp" #include "common/utils.hpp" #include "cpu/aarch64/jit_primitive_conf.hpp" #include diff --git a/tests/benchdnn/brgemm/bench_brgemm.cpp b/tests/benchdnn/brgemm/bench_brgemm.cpp index 81fb6611eab..88bb3c032bc 100644 --- a/tests/benchdnn/brgemm/bench_brgemm.cpp +++ b/tests/benchdnn/brgemm/bench_brgemm.cpp @@ -24,7 +24,9 @@ namespace brgemm { -#if defined(DNNL_X64) && DNNL_X64 == 1 && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +#if ((defined(DNNL_X64) && DNNL_X64 == 1) \ + || (defined(DNNL_AARCH64) && DNNL_AARCH64 == 1)) \ + && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE void check_correctness(const settings_t &s, const settings_t &def) { for_(const auto &i_dt : s.dt) @@ -157,8 +159,8 @@ int bench(int argc, char **argv) { #else int bench(int argc, char **argv) { - BENCHDNN_PRINT( - 0, "%s\n", "INFO: brgemm driver: only x64 backend is supported."); + BENCHDNN_PRINT(0, "%s\n", + "INFO: brgemm driver: only x64, aarch64 backend is supported."); return OK; } diff --git a/tests/benchdnn/brgemm/brgemm.cpp b/tests/benchdnn/brgemm/brgemm.cpp index 503c21a1f29..477cfa25a73 100644 --- a/tests/benchdnn/brgemm/brgemm.cpp +++ b/tests/benchdnn/brgemm/brgemm.cpp @@ -43,11 +43,30 @@ struct dnnl_api_traits { 
DNN_SAFE_V(dnnl::impl::cpu::x64::brgemm_kernel_destroy(t)); } }; + +#elif defined(DNNL_AARCH64) && DNNL_AARCH64 == 1 \ + && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +template <> +struct dnnl_api_traits { + static void destroy(dnnl::impl::cpu::aarch64::brgemm_kernel_t *t) { + DNN_SAFE_V(dnnl::impl::cpu::aarch64::brgemm_kernel_destroy(t)); + } +}; #endif namespace brgemm { -#if defined(DNNL_X64) && DNNL_X64 == 1 && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +#if defined(DNNL_X64) && DNNL_X64 == 1 +#define brg_x64 +#define namespace_impl dnnl::impl::cpu::x64 +#elif defined(DNNL_AARCH64) && DNNL_AARCH64 == 1 +#define brg_aarch64 +#define namespace_impl dnnl::impl::cpu::aarch64 +#endif +#endif + +#if defined(brg_x64) || defined(brg_aarch64) /// Initializes BRGEMM attributes from an input string. /// @@ -58,8 +77,8 @@ namespace brgemm { /// integers. /// dnnl_status_t brgemm_attr_init( - dnnl::impl::cpu::x64::brgemm_attr_t *brgattr, const prb_t *prb) { - using namespace dnnl::impl::cpu::x64; + namespace_impl::brgemm_attr_t *brgattr, const prb_t *prb) { + using namespace namespace_impl; // `max_bs` is handled directly through the driver interface. 
brgattr->max_bs = prb->batch_size; @@ -159,14 +178,13 @@ std::string prepare_wei_format_string( return wtag; } -dnnl::impl::cpu::x64::brgemm_batch_kind_t str2batch_kind( - const std::string &str) { +namespace_impl::brgemm_batch_kind_t str2batch_kind(const std::string &str) { if (str == "addr") - return dnnl::impl::cpu::x64::brgemm_batch_kind_t::brgemm_addr; + return namespace_impl::brgemm_batch_kind_t::brgemm_addr; else if (str == "offs") - return dnnl::impl::cpu::x64::brgemm_batch_kind_t::brgemm_offs; + return namespace_impl::brgemm_batch_kind_t::brgemm_offs; assert(!"Unsupported batch kind value"); - return dnnl::impl::cpu::x64::brgemm_batch_kind_t::brgemm_batch_kind_undef; + return namespace_impl::brgemm_batch_kind_t::brgemm_batch_kind_undef; } int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg, @@ -272,20 +290,19 @@ void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind, // A special wrapper needed to match internal infrastructure. dnnl_status_t brgemm_kernel_execute_postops_wrapper( - const dnnl::impl::cpu::x64::brgemm_kernel_t *brgemm_kernel, - dnnl::impl::cpu::x64::brgemm_batch_kind_t batch_kind, int batch_size, + const namespace_impl::brgemm_kernel_t *brgemm_kernel, + namespace_impl::brgemm_batch_kind_t batch_kind, int batch_size, const void *src_ptr, const void *wei_ptr, - const dnnl::impl::cpu::x64::brgemm_batch_element_t *batch_element, + const namespace_impl::brgemm_batch_element_t *batch_element, void *acc_ptr, void *dst_ptr, - const dnnl::impl::cpu::x64::brgemm_post_ops_data_t &post_ops_data, + const namespace_impl::brgemm_post_ops_data_t &post_ops_data, void *scratchpad_ptr, const dnnl_stream_t &stream, const std::vector &dnnl_args) { - if (batch_kind == dnnl::impl::cpu::x64::brgemm_batch_kind_t::brgemm_addr) { + if (batch_kind == namespace_impl::brgemm_batch_kind_t::brgemm_addr) { brgemm_kernel_execute_postops(brgemm_kernel, batch_size, batch_element, acc_ptr, dst_ptr, post_ops_data, scratchpad_ptr); - } else if 
(batch_kind - == dnnl::impl::cpu::x64::brgemm_batch_kind_t::brgemm_offs) { + } else if (batch_kind == namespace_impl::brgemm_batch_kind_t::brgemm_offs) { brgemm_kernel_execute_postops(brgemm_kernel, batch_size, src_ptr, wei_ptr, batch_element, acc_ptr, dst_ptr, post_ops_data, scratchpad_ptr); @@ -320,9 +337,12 @@ int doit(const prb_t *prb, res_t *res) { auto dst_md = dnn_mem_t::init_md(prb->ndims, prb->dst_dims.data(), prb->dst_dt(), prb->dtag, dst_strides); - using namespace dnnl::impl::cpu::x64; - + using namespace namespace_impl; +#if defined(brg_x64) brgemm_desc_t brgemm_desc; +#elif defined(brg_aarch64) + brgemm_t brgemm_desc; +#endif // Supports only address model for now as only affects the way memory is // passed to `brgemm_batch_element_t` object. brgemm_batch_kind_t batch_kind = str2batch_kind(prb->batch_kind); @@ -376,12 +396,14 @@ int doit(const prb_t *prb, res_t *res) { } auto brgemm_kernel = make_benchdnn_dnnl_wrapper(brgemm_kernel_); +#if defined(brg_x64) const auto is_tmm = brgemm_desc.is_tmm; if (is_tmm) { char palette[AMX_PALETTE_SIZE] = {}; DNN_SAFE(brgemm_init_tiles(brgemm_desc, palette), WARN); DNN_SAFE(amx_tile_configure(palette), WARN); } +#endif auto src_md = dnn_mem_t::init_md( prb->ndims, src_dims, prb->src_dt(), prb->stag, src_strides); @@ -640,7 +662,9 @@ int doit(const prb_t *prb, res_t *res) { scratchpad_ptr, std::placeholders::_1, std::placeholders::_2); measure_perf(prb->ctx_exe, res, perf_func, args); +#if defined(brg_x64) if (is_tmm) DNN_SAFE(amx_tile_release(), WARN); +#endif return OK; } diff --git a/tests/benchdnn/brgemm/brgemm.hpp b/tests/benchdnn/brgemm/brgemm.hpp index 9121d6f5082..b72e4b66a4d 100644 --- a/tests/benchdnn/brgemm/brgemm.hpp +++ b/tests/benchdnn/brgemm/brgemm.hpp @@ -26,6 +26,9 @@ #if defined(DNNL_X64) && DNNL_X64 == 1 \ && (DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE) #include "src/cpu/x64/brgemm/brgemm.hpp" +#elif defined(DNNL_AARCH64) && DNNL_AARCH64 == 1 \ + && (DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE) +#include 
"src/cpu/aarch64/brgemm/brgemm.hpp" #endif #include "common.hpp" diff --git a/tests/benchdnn/brgemm/ref_brgemm.cpp b/tests/benchdnn/brgemm/ref_brgemm.cpp index ecbda8aafce..54923ac5393 100644 --- a/tests/benchdnn/brgemm/ref_brgemm.cpp +++ b/tests/benchdnn/brgemm/ref_brgemm.cpp @@ -20,7 +20,15 @@ namespace brgemm { -#if defined(DNNL_X64) && DNNL_X64 == 1 && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE +#if defined(DNNL_X64) && DNNL_X64 == 1 +#define brg_x64 +#elif defined(DNNL_AARCH64) && DNNL_AARCH64 == 1 +#define brg_aarch64 +#endif +#endif + +#if defined(brg_x64) || defined(brg_aarch64) int64_t src_off_f(const prb_t *prb, int64_t bs, int64_t m, int64_t k) { return (m * prb->batch_size + bs) * prb->k + k; @@ -47,7 +55,12 @@ void compute_ref_brgemm(const prb_t *prb, const args_t &args) { const int64_t K = prb->k; // Using workspace memory as a method to get brgemm attributes. +#if defined(brg_x64) using brgemm_attr_t = dnnl::impl::cpu::x64::brgemm_attr_t; +#elif defined(brg_aarch64) + using brgemm_attr_t = dnnl::impl::cpu::aarch64::brgemm_attr_t; +#endif + brgemm_attr_t *brgemm_attr = (brgemm_attr_t *)ws_m; const int wei_zero_point = prb->attr.zero_points[DNNL_ARG_WEIGHTS]; From 7bdb0e1afc5e726ed297c897681bdf9674c0fa34 Mon Sep 17 00:00:00 2001 From: Shreyas-fuj <141716478+Shreyas-fuj@users.noreply.github.com> Date: Thu, 9 May 2024 09:33:03 +0530 Subject: [PATCH 020/187] cpu: aarch64: extend matmul support (#1899) - Adding Graviton 3 support for brgemm matmul. - Adding scales support for brgemm matmul. - Implementation of buffer for weight matrix(Normal and transpose). 
--- src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp | 2 +- .../injectors/jit_uni_postops_injector.cpp | 10 +- src/cpu/aarch64/matmul/brgemm_matmul.cpp | 10 +- .../matmul/brgemm_matmul_copy_utils.cpp | 435 ++++++++++++++++-- .../aarch64/matmul/brgemm_matmul_reorders.cpp | 2 +- src/cpu/matmul/cpu_matmul_list.cpp | 5 +- src/cpu/reorder/cpu_reorder.hpp | 1 + .../reorder/cpu_reorder_regular_f32_f32.cpp | 2 + 8 files changed, 420 insertions(+), 47 deletions(-) diff --git a/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp b/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp index 33ae52083d0..bfe797b723d 100644 --- a/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp +++ b/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp @@ -928,7 +928,7 @@ void jit_brgemm_kernel_t::apply_post_ops( } static inline bool isa_has_masks(cpu_isa_t isa) { - return is_superset(isa, sve_512); + return is_superset(isa, sve_256); } void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( diff --git a/src/cpu/aarch64/injectors/jit_uni_postops_injector.cpp b/src/cpu/aarch64/injectors/jit_uni_postops_injector.cpp index 3ce8669d2e7..07ffe1639cb 100644 --- a/src/cpu/aarch64/injectors/jit_uni_postops_injector.cpp +++ b/src/cpu/aarch64/injectors/jit_uni_postops_injector.cpp @@ -198,14 +198,11 @@ bool post_ops_ok(const post_ops_ok_args_t &post_ops_ok_args) { const std::vector &accepted_post_op_types = post_ops_ok_args.accepted_post_op_types; const post_ops_t &post_ops = post_ops_ok_args.post_ops; - const memory_desc_wrapper *dst_d = post_ops_ok_args.dst_d; const bool sum_at_pos_0_only = post_ops_ok_args.sum_at_pos_0_only; const bool sum_requires_scale_one = post_ops_ok_args.sum_requires_scale_one; const bool sum_requires_zp_zero = post_ops_ok_args.sum_requires_zp_zero; const bool sum_requires_same_params = post_ops_ok_args.sum_requires_same_params; - const auto &enabled_bcast_strategy - = post_ops_ok_args.enabled_bcast_strategy; // Save scale and zero point of first sum postop in order to check that any // subsequent sum 
postops have the same values. This check is necessary @@ -243,12 +240,7 @@ bool post_ops_ok(const post_ops_ok_args_t &post_ops_ok_args) { } break; case binary: - if (entry.is_binary()) { - assert(dst_d != nullptr && "dst_d is null"); - return binary_injector::is_supported(isa, - entry.binary.src1_desc, *dst_d, - enabled_bcast_strategy); - } + if (entry.is_binary()) { return false; } break; default: assert(false && "Unhandled post_op type"); } diff --git a/src/cpu/aarch64/matmul/brgemm_matmul.cpp b/src/cpu/aarch64/matmul/brgemm_matmul.cpp index e2f67540b94..bebdae12041 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul.cpp @@ -78,12 +78,6 @@ status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; } - if (!attr()->scales_.get(DNNL_ARG_SRC).has_default_values() - || !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values() - || !attr()->scales_.get(DNNL_ARG_DST).has_default_values()) { - return false; - } - if (!attr()->post_ops_.sum_with_default_dt()) return false; return ok; @@ -180,7 +174,8 @@ status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { abced, abcdfe, abcdegf, abcdefhg, abcdefgih, abcdefghji, abcdefghikj, abcdefghijlk); - if (is_A_transposed || is_B_transposed) return status::unimplemented; + if ((mayiuse(sve_512) && is_B_transposed) || is_A_transposed) + return status::unimplemented; return status::success; } @@ -1433,6 +1428,7 @@ struct brgemm_matmul_t::brg_matmul_exec_ctx_t { }; template struct brgemm_matmul_t; +template struct brgemm_matmul_t; } // namespace matmul } // namespace aarch64 diff --git a/src/cpu/aarch64/matmul/brgemm_matmul_copy_utils.cpp b/src/cpu/aarch64/matmul/brgemm_matmul_copy_utils.cpp index 380e5b49273..3d734f3b536 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul_copy_utils.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul_copy_utils.cpp @@ -32,7 +32,29 @@ using namespace dnnl::impl::format_tag; using namespace dnnl::impl::utils; using 
namespace Xbyak_aarch64; -#define GET_OFF(x) offsetof(ctx_t, x) +#define GET_OFF(x) (uint32_t) offsetof(ctx_t, x) + +#define LDR_IMM(reg, addr, off) \ + { \ + const uint64_t IMM12_MASK = ~uint64_t(0xfff); \ + if ((off & IMM12_MASK) == 0) { \ + ldr(reg, ptr(addr, off)); \ + } else { \ + add_imm(X_DEFAULT_ADDR, addr, off, X_TMP_0); \ + ldr(reg, ptr(X_DEFAULT_ADDR)); \ + } \ + } + +#define STR_IMM(reg, addr, off) \ + { \ + const uint64_t IMM12_MASK = ~uint64_t(0xfff); \ + if ((off & IMM12_MASK) == 0) { \ + str(reg, ptr(addr, off)); \ + } else { \ + add_imm(X_DEFAULT_ADDR, addr, off, X_TMP_0); \ + str(reg, ptr(X_DEFAULT_ADDR)); \ + } \ + } template struct jit_brgemm_matmul_copy_a_impl_t : public jit_brgemm_matmul_copy_a_t, @@ -52,7 +74,7 @@ struct jit_brgemm_matmul_copy_a_impl_t : public jit_brgemm_matmul_copy_a_t, : conf_->LDA) * tr_typesize_) , do_compute_compensation_(conf_->has_zero_point_b) - , k_loop_unroll_(is_ymm_ ? 7 : 16) + , k_loop_unroll_(is_sve256_ ? 7 : 16) , vmm_copy_idx_(29) {} void operator()(ctx_t *ctx) override { jit_generator::operator()(ctx); } @@ -64,8 +86,8 @@ struct jit_brgemm_matmul_copy_a_impl_t : public jit_brgemm_matmul_copy_a_t, using opmask_t = const Xbyak_aarch64::PReg; static constexpr int vlen_ = cpu_isa_traits::vlen; - static constexpr bool is_ymm_ = isa == sve_256; - static constexpr int num_comp_acc_ = is_ymm_ ? 7 : 8; + static constexpr bool is_sve256_ = isa == sve_256; + static constexpr int num_comp_acc_ = is_sve256_ ? 7 : 8; const int typesize_; const int tr_typesize_; @@ -102,8 +124,8 @@ struct jit_brgemm_matmul_copy_a_impl_t : public jit_brgemm_matmul_copy_a_t, ZReg vmm_ones_words = ZReg(28); ZReg vmm_dot_product_temp = ZReg(29); - ZReg vmm_comp_mul = ZReg(is_ymm_ ? 14 : 30); // 1s - ZReg vmm_comp_add = ZReg(is_ymm_ ? 15 : 31); // 128 + ZReg vmm_comp_mul = ZReg(is_sve256_ ? 14 : 30); // 1s + ZReg vmm_comp_add = ZReg(is_sve256_ ? 
15 : 31); // 128 // Allows to shift A data by 128 for s8s8 problem for SVE512 in copy // routine, not in compute kernel. It's disabled for now, as it @@ -355,7 +377,7 @@ struct jit_brgemm_matmul_copy_b_int8_t : public jit_brgemm_matmul_copy_b_t, using reg64_t = const Xbyak_aarch64::XReg; using reg32_t = const Xbyak_aarch64::WReg; - static constexpr bool is_ymm_ = cpu_isa_traits::vlen == 32; + static constexpr bool is_sve256_ = cpu_isa_traits::vlen == 32; static constexpr int k_blk_step_ = 4; static constexpr int n_blk_step_ = 64; static constexpr int blk_sz_ = 6; @@ -402,7 +424,7 @@ struct jit_brgemm_matmul_copy_b_int8_t : public jit_brgemm_matmul_copy_b_t, inline ZReg get_vmm(int blk, int idx) { if (idx < 0 || idx >= 32) assert(!"idx > vregs"); - assert(IMPLICATION(!is_ymm_, idx < blk_sz_ && blk >= 0)); + assert(IMPLICATION(!is_sve256_, idx < blk_sz_ && blk >= 0)); auto reg_idx = blk_sz_ * blk + idx; return ZReg(reg_idx); } @@ -498,11 +520,12 @@ struct jit_brgemm_matmul_copy_b_f32_t : public jit_brgemm_matmul_copy_b_t, using opmask_t = const Xbyak_aarch64::PReg; using zmm = const Xbyak_aarch64::ZReg; - enum { n_blk_step = 16, max_regs_available = 30 }; + enum { n_blk_step = 8, max_regs_available = 30 }; const data_type_t dt_in_; const size_t typesize_in_; const size_t typesize_out_ = sizeof(float); dim_t src_stride_, tr_src_stride_; + const bool is_sve_256 = !mayiuse(sve_512); opmask_t kTail = p7; opmask_t kFFFF = p6; @@ -519,26 +542,100 @@ struct jit_brgemm_matmul_copy_b_f32_t : public jit_brgemm_matmul_copy_b_t, zmm zmm_permw = z30; zmm zmm_zero = z31; - inline void kmovw(Xbyak_aarch64::PReg k, unsigned w) { - assert(!"under construction"); - } - void copy_16_x_n_block(int nrows, int ncolumns); + void copy_16_8_x_n_block(int nrows, int ncolumns); void compute_k_loop(int ncolumns); void generate() override; }; -void jit_brgemm_matmul_copy_b_f32_t::copy_16_x_n_block( +void jit_brgemm_matmul_copy_b_f32_t::copy_16_8_x_n_block( int nrows, int ncolumns) { - 
assert(!"under construction"); + + int n_blk_step = is_sve_256 ? 8 : 16; + + auto get_zmm = [](int reg_idx) { + assert(reg_idx >= 0 && reg_idx < max_regs_available); + return ZRegS(reg_idx); + }; + + auto load = [this, get_zmm](int blk, int k, int n, opmask_t current_mask) { + auto src_zmm = get_zmm(blk); + add_imm(X_DEFAULT_ADDR, reg_src, k * src_stride_ + n * typesize_in_, + X_TMP_0); + ld1w(src_zmm, current_mask / T_z, ptr(X_DEFAULT_ADDR)); + }; + + const int columns_tail = ncolumns % n_blk_step; + + if (columns_tail < n_blk_step) + set_preg(kTail.s, columns_tail, X_TMP_0, X_TMP_1); + + int iter = 0; + for_(int k = 0; k < nrows; k++) //nrows = unroll + for (int n = 0; n < conf_->wei_n_blk; n += n_blk_step) { + + const dim_t tr_src_off = k * tr_src_stride_ + n * typesize_out_; + const int zero_padding = ncolumns - n; + if (zero_padding <= 0) { + add_imm(X_DEFAULT_ADDR, reg_tr_src, tr_src_off, X_TMP_0); + str(zmm_zero, ptr(X_DEFAULT_ADDR)); + continue; + } + + const opmask_t curr_msk = zero_padding < n_blk_step ? kTail : kFFFF; + const int blk_idx = iter % max_regs_available; + load(blk_idx, k, n, curr_msk); + add_imm(X_DEFAULT_ADDR, reg_tr_src, tr_src_off, X_TMP_0); + const auto src_zmm0 = ZReg(blk_idx); + str(src_zmm0, ptr(X_DEFAULT_ADDR)); + iter++; + } } void jit_brgemm_matmul_copy_b_f32_t::compute_k_loop(int ncolumns) { - assert(!"under construction"); + + auto compute_uni_k_loop = [&](int unroll) { + Label K_start_label, K_end_label; + + L(K_start_label); + cmp_imm(reg_K_iters, unroll, X_TMP_0); + b(LT, K_end_label); + + copy_16_8_x_n_block(unroll, ncolumns); + add_imm(reg_src, reg_src, unroll * src_stride_, X_TMP_0); + add_imm(reg_tr_src, reg_tr_src, unroll * tr_src_stride_, X_TMP_0); + + sub_imm(reg_K_iters, reg_K_iters, unroll, X_TMP_0); + bl(K_start_label); + + L(K_end_label); + }; + + int k_unroll = is_sve_256 ? 
8 : 16; + compute_uni_k_loop(k_unroll); + compute_uni_k_loop(1); } void jit_brgemm_matmul_copy_b_f32_t::generate() { preamble(); - assert(!"under construction"); + eor(zmm_zero.d, zmm_zero.d, zmm_zero.d); + LDR_IMM(reg_src, param1, GET_OFF(src)); + LDR_IMM(reg_tr_src, param1, GET_OFF(tr_src)); + LDR_IMM(reg_K_iters, param1, GET_OFF(current_K_iters)); + LDR_IMM(reg_N_blk, param1, GET_OFF(current_N_blk)); + ptrue(kFFFF.s); + + Label done; + if (conf_->N_tail > 0) { + Label not_N_tail; + cmp_imm(reg_N_blk, conf_->N_tail, X_TMP_0); + b(NE, not_N_tail); + compute_k_loop(conf_->N_tail); + bl(done); + + L(not_N_tail); + } + compute_k_loop(conf_->N_blk); + L(done); postamble(); } @@ -576,11 +673,11 @@ struct jit_brgemm_matmul_copy_b_transposed_t using opmask_t = const Xbyak_aarch64::PReg; using ZReg = const Xbyak_aarch64::ZReg; - static constexpr bool is_ymm_ = isa == sve_256; + static constexpr bool is_sve256_ = isa == sve_256; static constexpr cpu_isa_t isa_ = isa; static constexpr int max_vmm_regs_ = cpu_isa_traits::n_vregs; static constexpr int vlen_ = cpu_isa_traits::vlen; - static constexpr int n_blk_step_ = 16; + static constexpr int n_blk_step_ = is_sve256_ ? 8 : 16; static constexpr int bf32_k_blk_step_ = 16; static constexpr size_t comp_shift_ = vlen_; @@ -629,10 +726,24 @@ struct jit_brgemm_matmul_copy_b_transposed_t ZReg vmm_all_bits_1 = Xbyak_aarch64::ZReg(max_vmm_regs_ - 5); ZReg vmm_one_s32 = Xbyak_aarch64::ZReg(max_vmm_regs_ - 6); - // Required in every dot product for INT8 non-VNNI computation. 
ZReg vmm_ones_words = ZReg(max_vmm_regs_ - 7); ZReg vmm_dot_product_temp = ZReg(max_vmm_regs_ - 8); + ZReg z_tmp_0 = ZReg(28); + ZReg z_tmp_1 = ZReg(29); + ZReg z_tmp_3 = ZReg(30); + ZReg z_tmp_2 = ZReg(27); + PReg p_tmp_0 = p7; + PReg p_02 = p8; + PReg p_AA = p9; + PReg p_55 = p10; + PReg p_FF = p5; + PReg p_0F = p4; + PReg p_33 = p3; + PReg p_F0 = p2; + PReg p_CC = p1; + PReg p_E0 = p6; + void kmovw(Xbyak_aarch64::PReg k, unsigned w) { assert(!"under construction"); }; @@ -648,8 +759,8 @@ struct jit_brgemm_matmul_copy_b_transposed_t ZReg tmp_vmm(int i) { // If compensation compute is required - last 6 zregs are reserved for it - assert(i >= 0 && IMPLICATION(!is_ymm_, i < max_tmp_idx) - && IMPLICATION(is_ymm_, i < 2)); + assert(i >= 0 && IMPLICATION(!is_sve256_, i < max_tmp_idx) + && IMPLICATION(is_sve256_, i < 2)); return ZReg(n_blk_step_ + i); } @@ -660,7 +771,7 @@ struct jit_brgemm_matmul_copy_b_transposed_t int curr_K_tail, bool is_first_K_iter, bool is_last_K_iter); inline void dot_product(ZReg v1, ZReg v2, ZReg v3) { - assert(!"under construction"); + fmla(v1.s, P_ALL_ONE / T_m, v2.s, v3.s); } void generate() override; }; @@ -674,27 +785,297 @@ void jit_brgemm_matmul_copy_b_transposed_t::copy_row_x_col( template <> void jit_brgemm_matmul_copy_b_transposed_t::copy_row_x_col( int nrows, int ncolumns) { - assert(!"under construction"); + assert(nrows >= 0 && nrows <= n_blk_step_ && ncolumns >= 0 + && ncolumns <= k_blk_step_); + if (!nrows) return; + + const int columns_tail = ncolumns % k_blk_step_; + auto load = [this, nrows, columns_tail](int i) { + auto vmm_src = src_vmm(i); + if (i >= nrows) { + eor(vmm_src.d, vmm_src.d, vmm_src.d); + return; + } + if (columns_tail > 0) { + add_imm(X_DEFAULT_ADDR, reg_src, i * src_stride_, X_TMP_0); + set_preg(P_TMP.b, columns_tail * typesize_, X_TMP_0, X_TMP_1); + ld1b(vmm_src.b, P_TMP / T_z, ptr(X_DEFAULT_ADDR)); + } else { + add_imm(X_DEFAULT_ADDR, reg_src, i * src_stride_, X_TMP_0); + ldr(vmm_src, ptr(X_DEFAULT_ADDR)); 
+ } + }; + + // swap 1 + for (int i = 0; i < 4; ++i) { + const int src_idx0 = i * 2; + const int src_idx1 = src_idx0 + 1; + + const int next_src_idx0 = src_idx0 + 2; + const int next_src_idx1 = src_idx1 + 2; + const bool load_next = i < 3; + + if (i == 0) { + + load(src_idx0); + load(src_idx1); + } + const auto tmp0 = tmp_vmm(0); + const auto tmp1 = tmp_vmm(1); + const auto src0 = src_vmm(src_idx0); + const auto src1 = src_vmm(src_idx1); + + if (next_src_idx0 < nrows && load_next) { load(next_src_idx0); } + mov(tmp0.d, src0.d); + ext(tmp0.b, src0.b, 16); + splice(tmp0.s, p_E0, tmp0.s); + + if (next_src_idx1 < nrows && load_next) { load(next_src_idx1); } + set_preg(p_tmp_0.s, 1, X_TMP_0, X_TMP_1); + mov(tmp1.d, src1.d); + splice(tmp1.s, p_tmp_0, tmp1.s); + + mov(src0.s, p_AA / T_m, tmp1.s); + mov(src1.s, p_55 / T_m, tmp0.s); + } + // swap 2 + for (int i = 0; i < 4; ++i) { + const int select_half = (i < 2) ? 0 : 2; + const int src_idx0 = i + select_half; + const int src_idx2 = src_idx0 + 2; + + const auto tmp0 = tmp_vmm(0); + const auto tmp1 = tmp_vmm(1); + const auto src0 = src_vmm(src_idx0); + const auto src2 = src_vmm(src_idx2); + + not_(p_tmp_0.b, p_FF, p_02.b); + mov(tmp0.d, src0.d); + splice(tmp0.s, p_tmp_0, tmp0.s); + + rev(p_tmp_0.s, p_02.s); + mov(tmp1.d, src2.d); + splice(tmp1.s, p_tmp_0, tmp1.s); + + mov(src2.s, p_33 / T_m, tmp0.s); + mov(src0.s, p_CC / T_m, tmp1.s); + } + // swap 4 + for (int i = 0; i < 4; ++i) { + const int src_idx0 = i; + const int src_idx4 = src_idx0 + 4; + + const auto tmp0 = tmp_vmm(0); + const auto src0 = src_vmm(src_idx0); + const auto src4 = src_vmm(src_idx4); + + mov(tmp0.d, src0.d); + ext(tmp0.b, src0.b, 16); + + splice(src0.s, p_0F, src4.s); + mov(src4.s, p_0F / T_m, tmp0.s); + } + // swap 8 + for (int i = 0; i < 8; i++) { + const auto src0 = src_vmm(i); + if (do_compute_compensation_) + dot_product(vmm_comp_acc, vmm_comp_mul, src0); + add_imm(X_DEFAULT_ADDR, reg_tr_src, i * tr_src_stride_, X_TMP_0); + str(src0, 
ptr(X_DEFAULT_ADDR)); + } } template void jit_brgemm_matmul_copy_b_transposed_t::compute_K_loop(bool is_N_tail, int curr_K_tail, bool is_first_K_iter, bool is_last_K_iter) { - assert(!"under construction"); + + MAYBE_UNUSED(is_first_K_iter); + MAYBE_UNUSED(is_last_K_iter); + + const int N_chunk_tail = conf_->N % n_blk_step_; + const int nrows = is_N_tail ? N_chunk_tail : n_blk_step_; + if (do_compute_compensation_) + eor(vmm_comp_acc.d, vmm_comp_acc.d, vmm_comp_acc.d); + + Label K_loop, K_loop_tail_or_done; + LDR_IMM(reg_K_iters, param1, GET_OFF(current_K_iters)); + + mov(reg_src, reg_src_base); + mov(reg_tr_src, reg_tr_src_base); + if (curr_K_tail > 0) { + cmp_imm(reg_K_iters, k_blk_step_, X_TMP_0); + b(LT, K_loop_tail_or_done); + } + + L(K_loop); + copy_row_x_col(nrows, k_blk_step_); + add_imm(reg_src, reg_src, k_blk_step_ * typesize_, X_TMP_0); + add_imm(reg_tr_src, reg_tr_src, + k_blk_step_ / vnni_granularity_ * tr_src_stride_, X_TMP_0); + + sub_imm(reg_K_iters, reg_K_iters, k_blk_step_, X_TMP_0); + cmp_imm(reg_K_iters, k_blk_step_, X_TMP_0); + b(GE, K_loop); + + L(K_loop_tail_or_done); + + if (curr_K_tail > 0) copy_row_x_col(nrows, curr_K_tail); + + if (req_zp_comp_) { + const auto addr = ptr(reg_zp_comp_ptr); + if (!is_first_K_iter) ld1rw(vmm_comp_acc.s, P_ALL_ONE / T_z, addr); + if (is_last_K_iter) + mul(vmm_comp_acc.s, P_ALL_ONE / T_m, vmm_zp_a_neg_val.s); + st1w(vmm_comp_acc.s, P_ALL_ONE / T_m, addr); + } } template void jit_brgemm_matmul_copy_b_transposed_t::compute_N_loop( int curr_K_tail, bool is_first_K_iter, bool is_last_K_iter) { - assert(!"under construction"); + + const int N_chunk_tail = conf_->N % n_blk_step_; + + Label N_loop, N_loop_tail_or_done; + if (N_chunk_tail > 0) { + cmp_imm(reg_N_iters, n_blk_step_, X_TMP_0); + b(LT, N_loop_tail_or_done); + } + + L(N_loop); + compute_K_loop(false, curr_K_tail, is_first_K_iter, is_last_K_iter); + add_imm(reg_src_base, reg_src_base, n_blk_step_ * src_stride_, X_TMP_0); + add_imm(reg_tr_src_base, 
reg_tr_src_base, + n_blk_step_ * vnni_granularity_ * tr_typesize_, X_TMP_0); + + if (req_zp_comp_) + add_imm(reg_zp_comp_ptr, reg_zp_comp_ptr, comp_shift_, X_TMP_0); + if (req_s8s8_comp_) + add_imm(reg_comp_ptr, reg_comp_ptr, comp_shift_, X_TMP_0); + + sub_imm(reg_N_iters, reg_N_iters, n_blk_step_, X_TMP_0); + cmp_imm(reg_N_iters, n_blk_step_, X_TMP_0); + b(GE, N_loop); + + L(N_loop_tail_or_done); + if (N_chunk_tail > 0) { + Label N_loop_done; + cmp_imm(reg_N_iters, 0, X_TMP_0); + b(LE, N_loop_done); + + compute_K_loop(true, curr_K_tail, is_first_K_iter, is_last_K_iter); + L(N_loop_done); + } } template void jit_brgemm_matmul_copy_b_transposed_t::generate() { preamble(); - assert(!"under construction"); + ptrue(p_FF.s); + set_preg(p_0F.s, 4, X_TMP_0, X_TMP_1); + rev(p_F0.s, p_0F.s); + set_preg(p_33.s, 2, X_TMP_0, X_TMP_1); + rev(p_tmp_0.s, p_33.s); + orr(p_33.b, p_FF, p_33.b, p_F0.b); + eor(p_33.b, p_FF, p_33.b, p_tmp_0.b); + rev(p_CC.s, p_33.s); + pfalse(p_AA.b); + ptrue(p_tmp_0.s); + trn1(p_AA.s, p_AA.s, p_tmp_0.s); + rev(p_55.s, p_AA.s); + set_preg(p_E0.s, 3, X_TMP_0, X_TMP_1); + rev(p_E0.s, p_E0.s); + set_preg(p_02.s, 2, X_TMP_0, X_TMP_1); + + LDR_IMM(reg_src_base, param1, GET_OFF(src)); + LDR_IMM(reg_tr_src_base, param1, GET_OFF(tr_src)); + LDR_IMM(reg_K_iters, param1, GET_OFF(current_K_iters)); + LDR_IMM(reg_N_iters, param1, GET_OFF(current_N_blk)); + + const dim_t N_chunk_elems = conf_->N_chunk_elems; + assert(N_chunk_elems % n_blk_step_ == 0 || N_chunk_elems == conf_->N); + UNUSED(N_chunk_elems); + + const auto K_blk_tail = nstl::min(conf_->K, conf_->K_blk) % k_blk_step_; + const auto K_tail_tail = (conf_->K % conf_->K_blk) % k_blk_step_; + + auto compute_body = [&](bool is_first_K_iter, bool is_last_K_iter) { + if (is_last_K_iter) { + if (req_s8s8_comp_) { + mov_imm(imm_addr64, 0xffffffff); + auto wreg_tmp_1 = WReg(imm_addr64.getIdx()); + dup(vmm_all_bits_1.s, wreg_tmp_1); + mov_imm(imm_addr64, 0x1); + dup(vmm_one_s32.s, wreg_tmp_1); + } + if 
(req_zp_comp_) { + LDR_IMM(reg_zp_a_neg_val_ptr, param1, + GET_OFF(zp_a_neg_value_ptr)); + ldr(W_TMP_0, ptr(reg_zp_a_neg_val_ptr)); + dup(vmm_zp_a_neg_val.s, W_TMP_0); + } + } + + Label compute_body_done; + if (conf_->K_tail > 0 && K_blk_tail != K_tail_tail) { + Label not_K_tail; + cmp_imm(reg_K_iters, conf_->K_blk, X_TMP_0); + b(EQ, not_K_tail); + compute_N_loop(K_tail_tail, is_first_K_iter, is_last_K_iter); + bl(compute_body_done); + + L(not_K_tail); + } + + compute_N_loop(K_blk_tail, is_first_K_iter, is_last_K_iter); + L(compute_body_done); + }; + + Label done; + if (do_compute_compensation_) { + assert(IMPLICATION(req_zp_comp_, + conf_->src_zp_type == brgemm_broadcast_t::per_tensor)); + + LDR_IMM(reg_K_start, param1, GET_OFF(current_K_start)); + if (req_s8s8_comp_) + LDR_IMM(reg_comp_ptr, param1, GET_OFF(compensation_ptr)); + if (req_zp_comp_) + LDR_IMM(reg_zp_comp_ptr, param1, GET_OFF(zp_a_compensation_ptr)); + mov_imm(regq_tmp, 1); + auto wreg_tmp_2 = WReg(regq_tmp.getIdx()); + dup(vmm_comp_mul.s, wreg_tmp_2); + + const auto last_K_threshold + = rnd_up(conf_->K, conf_->K_blk) - conf_->K_blk; + Label not_first, not_first_not_last; + cmp_imm(reg_K_start, 0, X_TMP_0); + b(NE, not_first); + { + // first K iteration + Label first_not_last; + cmp_imm(reg_K_start, last_K_threshold, X_TMP_0); + b(LT, first_not_last); + compute_body(true, true); + bl(done); + + L(first_not_last); + compute_body(true, false); + bl(done); + } + + L(not_first); + cmp_imm(reg_K_start, last_K_threshold, X_TMP_0); + b(LT, not_first_not_last); + + compute_body(false, true); + bl(done); + L(not_first_not_last); + } + + compute_body(false, false); + L(done); postamble(); } diff --git a/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp b/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp index 9e496e1939c..0aa98fe9c2b 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp @@ -109,7 +109,7 @@ status_t 
brgemm_matmul_matrix_B_reorder_t::pd_t::init( : brgemm_broadcast_t::none; matmul_conf_for_reorder_.has_zero_point_a = matmul_conf_for_reorder_.src_zp_type != brgemm_broadcast_t::none; - matmul_conf_for_reorder_.isa = sve_512; + matmul_conf_for_reorder_.isa = (!mayiuse(sve_512)) ? sve_256 : sve_512; auto mask_ok = [&](bool check, int mask) { return IMPLICATION( diff --git a/src/cpu/matmul/cpu_matmul_list.cpp b/src/cpu/matmul/cpu_matmul_list.cpp index dd7d8284fe8..10329f9232f 100644 --- a/src/cpu/matmul/cpu_matmul_list.cpp +++ b/src/cpu/matmul/cpu_matmul_list.cpp @@ -72,8 +72,9 @@ using namespace dnnl::impl::cpu::matmul; // clang-format off constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({ - CPU_INSTANCE_AARCH64(brgemm_matmul_t) - CPU_INSTANCE_AARCH64_ACL(acl_matmul_t) + CPU_INSTANCE_AARCH64(brgemm_matmul_t) + CPU_INSTANCE_AARCH64_ACL(acl_matmul_t) + CPU_INSTANCE_AARCH64(brgemm_matmul_t) CPU_INSTANCE_AMX(brgemm_matmul_t) CPU_INSTANCE_AMX(brgemm_matmul_t) CPU_INSTANCE_AVX512(brgemm_matmul_t) diff --git a/src/cpu/reorder/cpu_reorder.hpp b/src/cpu/reorder/cpu_reorder.hpp index 5c265f8df03..dc0105966b1 100644 --- a/src/cpu/reorder/cpu_reorder.hpp +++ b/src/cpu/reorder/cpu_reorder.hpp @@ -36,6 +36,7 @@ #include "cpu/x64/matmul/brgemm_matmul_reorders.hpp" #elif DNNL_AARCH64 #include "cpu/aarch64/jit_uni_reorder.hpp" +#include "cpu/aarch64/matmul/brgemm_matmul_reorders.hpp" #endif #if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp index 9a108905b6a..7d9a77a47ef 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp @@ -35,6 +35,7 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::brgemm_matmul_matrix_B_reorder_t)) 
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) @@ -48,6 +49,7 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::brgemm_matmul_matrix_B_reorder_t)) DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw16c)) From 0124f1e287bbc9840b1faa3966bbdbff49b4b0aa Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 9 May 2024 08:20:39 -0700 Subject: [PATCH 021/187] {xehpc, xe2}: jit: gemm: large-tile TN int4 compressed weights kernel --- src/gpu/intel/jit/gemm/kernel.db | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index 8fe264a92e6..a6c26bce413 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -15,8 +15,8 @@ *******************************************************************************/ /*@kcatalog@*/ -kcatalog::FlatCatalog<1022> _CATALOG_ -{1, 8378, 1022, { +kcatalog::FlatCatalog<1023> _CATALOG_ +{1, 8379, 1023, { {{'9', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 as8x2 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab l4 acb nmk", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 
255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 as16 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, @@ -602,6 +602,7 @@ kcatalog::FlatCatalog<1022> _CATALOG_ {{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys 
rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, +{{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "BIp"}, "aB16+m64@48 am32+m32@64 aB wg 8x4 xaf ca4x2 vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb dm", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 32768, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {986359, 1.14807e+06, 0, 0, 5.29203e+06, 1.04448e+07, 0.958606, 0.919165, 0.917099, 1.4058, 0.00482064, 0.00482064, 0, 0.992145, 1.42432, 1.05623, 3.49665e-12}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, 
{1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, From f5503b8244ad1f1f6ca0caca335118790c81f02b Mon Sep 17 00:00:00 2001 From: Sergey Kazakov Date: Mon, 6 May 2024 12:10:33 -0700 Subject: [PATCH 022/187] gpu: bnorm: nhwc-reusable: make subgroup size tunable --- src/gpu/intel/ocl/bnorm/nhwc_reusable.cl | 18 +++++++++--------- src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp | 6 ++++-- src/gpu/intel/ocl/bnorm/nhwc_reusable.h | 1 - src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp | 1 + 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl index 4030f6e3167..27dd29b1ed7 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl @@ -89,7 +89,7 @@ void nhwc_reusable_calc_fused_reduction(volatile __global atomic_float *dst, } // Calculate mean, regular algorithm -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, off_t reduce_stat_nblocks, @@ -161,7 +161,7 @@ nhwc_reusable_calc_mean(__global 
DATA_T *src, __global float *reduce_temp, } // Calculate variance, regular algorithm -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, __global float *reduce_temp, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, @@ -246,7 +246,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, } // Calculate mean and variance at once, 1pass algorithm -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, @@ -335,7 +335,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, } // Main FWD kernel, common for regular and 1pass algorithms -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *dst, __global float *scaleshift, __global float *shift, __global char *ws, @@ -502,7 +502,7 @@ void nhwc_reusable_bwd_calc_fused_reduction( } // Calculate stats for BWD pass -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, __global DATA_T *diff_dst, __global char *ws, __global float *temp_reduce, __global float *temp_reduce_shift, @@ -631,7 +631,7 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, } // Main BWD pass kernel -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) 
__kernel void nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *diff_dst, __global float *scaleshift, __global char *ws, @@ -778,7 +778,7 @@ __kernel void nhwc_reusable_reduce_aux(__global float *ptr1, } // Reduction thru scratchpad, FWD pass, regular algorithm -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_reduce_fwd_reg(__global float *reduce_scratchpad, off_t scratchpad_off, __global float *dst, off_t ic_size, off_t reduce_ic_sub_groups, off_t reduce_stat_nblocks, off_t sp_size, @@ -809,7 +809,7 @@ nhwc_reusable_reduce_fwd_reg(__global float *reduce_scratchpad, } // Reduction thru scratchpad, FWD pass, 1pass algorithm -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_reduce_fwd_1pass(__global float *reduce_temp, __global float *mean, __global float *variance, off_t ic_size, off_t reduce_ic_sub_groups, off_t reduce_stat_nblocks, off_t sp_size, @@ -860,7 +860,7 @@ nhwc_reusable_reduce_fwd_1pass(__global float *reduce_temp, } // Reduction thru scratchpad, BWD pass -__attribute__((intel_reqd_sub_group_size(16))) __kernel void +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_reduce_stat(__global float *temp_reduce, __global float *temp_reduce_shift, __global float *diff_scale, __global float *diff_shift, __global float *variance, float eps, diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp index 754615dccb2..71e3963419a 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp @@ -140,8 +140,8 @@ static status_t init_conf_common(nhwc_bnorm_params_t &bn_conf, // TODO: implement it, possible perf boost could be ~ 2x if (bn_conf.ic % 8 == 0 && bn_conf.ic % 16 && cmpl_conf.use_stats_one_pass) 
cmpl_conf.use_stats_one_pass = false; - - bn_conf.sub_group_size = 16; + // Temporary for performance tuning. TODO: add subgroup size to perf model + bn_conf.sub_group_size = dev_getenv("SG", 16); // reshape to xc bn_conf.sp = bn_conf.mb * bn_conf.id * bn_conf.ih * bn_conf.iw; @@ -168,6 +168,7 @@ static status_t init_conf_common(nhwc_bnorm_params_t &bn_conf, CHECK(get_params_by_model(bn_conf, pd, hw_params, true)); cmpl_conf.vect_size = bn_conf.vect_size; + cmpl_conf.sub_group_size = bn_conf.sub_group_size; // For performance debuging and analisys std::string prb_str = get_prb_desc_str(pd); @@ -211,6 +212,7 @@ static void init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.define_int("USE_SCALE", cmpl_conf.use_scale); kernel_ctx.define_int("USE_SHIFT", cmpl_conf.use_shift); kernel_ctx.define_int("VECT_SIZE", cmpl_conf.vect_size); + kernel_ctx.define_int("SG_SIZE", cmpl_conf.sub_group_size); kernel_ctx.add_option("-cl-std=CL2.0"); if (cmpl_conf.data_type == data_type::s8) kernel_ctx.add_option("-Dcl_intel_subgroups_char"); diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h index d7b2b50ead2..d7d9cf088ae 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h @@ -16,7 +16,6 @@ #ifndef GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H #define GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H -#define SG_SIZE 16 #define VECT_DT_N VECT_SIZE #include "gpu/intel/ocl/dispatch.h" #include "gpu/intel/ocl/ocl_types.h" diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp index b730187e605..fbbc7422349 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp @@ -78,6 +78,7 @@ struct nhwc_reusable_bnorm_compile_params_t { data_type_t data_type; int vect_size; + int sub_group_size; bool use_scale; bool use_shift; bool is_training; From 6a3ae0d8ea9468218cfae97eec96d1b5de5eb181 Mon Sep 17 00:00:00 2001 From: Sergey 
Kazakov Date: Mon, 6 May 2024 15:50:27 -0700 Subject: [PATCH 023/187] gpu: bnorm: nhwc-reusable: introduce upper limit of ic block size --- src/gpu/intel/ocl/bnorm/bnorm_model.cpp | 4 +++- src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp | 5 ++++- src/gpu/intel/primitive_conf.hpp | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/bnorm_model.cpp b/src/gpu/intel/ocl/bnorm/bnorm_model.cpp index 09c2fd81155..724fea092c3 100644 --- a/src/gpu/intel/ocl/bnorm/bnorm_model.cpp +++ b/src/gpu/intel/ocl/bnorm/bnorm_model.cpp @@ -641,7 +641,9 @@ status_t get_params_by_model(nhwc_bnorm_params_t &conf, model_params_t p; p.ic_block = conf.sub_group_size; assert(conf.ic % conf.sub_group_size == 0); - while (p.ic_block <= conf.ic) { + + while (p.ic_block <= conf.ic + && (reusable_version ? p.ic_block <= conf.max_ic_block : true)) { if (conf.ic % p.ic_block == 0) { const int calc_stat_ic = get_nhwc_calc_stat_ic( conf.ic, p.ic_block, conf.sub_group_size); diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp index 71e3963419a..db73acf33dd 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp @@ -140,8 +140,11 @@ static status_t init_conf_common(nhwc_bnorm_params_t &bn_conf, // TODO: implement it, possible perf boost could be ~ 2x if (bn_conf.ic % 8 == 0 && bn_conf.ic % 16 && cmpl_conf.use_stats_one_pass) cmpl_conf.use_stats_one_pass = false; - // Temporary for performance tuning. TODO: add subgroup size to perf model + + // Temporary for performance tuning. 
TODO: consider adding it to a perf model bn_conf.sub_group_size = dev_getenv("SG", 16); + bn_conf.max_ic_block = dev_getenv("MAX_IC_BLOCK", 128); + // reshape to xc bn_conf.sp = bn_conf.mb * bn_conf.id * bn_conf.ih * bn_conf.iw; diff --git a/src/gpu/intel/primitive_conf.hpp b/src/gpu/intel/primitive_conf.hpp index 0d34650075c..90402d4719e 100644 --- a/src/gpu/intel/primitive_conf.hpp +++ b/src/gpu/intel/primitive_conf.hpp @@ -485,6 +485,7 @@ struct bnorm_conf_t { bool skip_reduce_stat; bool use_stats_one_pass; int calc_stat_ic; + int max_ic_block; }; // Layer Normalization From 40b7cf4ea4c1637db466f7575e80d17e04065138 Mon Sep 17 00:00:00 2001 From: Sergey Kazakov Date: Mon, 6 May 2024 16:02:51 -0700 Subject: [PATCH 024/187] gpu: bnorm: nhwc-reusable: rename reduction functions --- src/gpu/intel/ocl/bnorm/nhwc_reusable.cl | 35 ++++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl index 27dd29b1ed7..0cbc3f9787a 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl @@ -16,8 +16,7 @@ #include "gpu/intel/ocl/bnorm/nhwc_reusable.h" // Atomic-based reduction for 1pass algorithm -void nhwc_reusable_mean_var_fused_reduction( - volatile __global atomic_float *mean, +void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, volatile __global atomic_float *variance, off_t dst_offset, SUM_DATA_T *sum, SUM_DATA_T *sum_sq, __local SUM_DATA_T *local_sum, __local SUM_DATA_T *local_sum_sq, off_t vect_size) { @@ -57,7 +56,7 @@ void nhwc_reusable_mean_var_fused_reduction( } // Atomic-based reduction for regular algorithm -void nhwc_reusable_calc_fused_reduction(volatile __global atomic_float *dst, +void nhwc_reusable_reg_fused_reduction(volatile __global atomic_float *dst, off_t dst_offset, float *sum, __local float *local_sum, off_t vect_size) { const int local_id = get_local_id(1); @@ -125,7 +124,7 
@@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, // store res if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; - nhwc_reusable_calc_fused_reduction( + nhwc_reusable_reg_fused_reduction( mean, dst_off, (float *)(&v_mean), local_sum, VECT_SIZE); } else { const int sg_off = sg * VECT_SIZE * SG_SIZE; @@ -151,7 +150,7 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; - nhwc_reusable_calc_fused_reduction( + nhwc_reusable_reg_fused_reduction( mean, dst_off, &v_mean, local_sum, 1); } else { const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; @@ -175,7 +174,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; - // exp reduce_temp layout: reduce_stat_nblocks rows x ic columns + // reduce_temp layout: reduce_stat_nblocks rows x ic columns const int reduce_off = ic_block_offset + sp_block_idx * ic_size; src += src_off; @@ -205,7 +204,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, // store res if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; - nhwc_reusable_calc_fused_reduction( + nhwc_reusable_reg_fused_reduction( variance, dst_off, (float *)(&v_var), local_sum, VECT_SIZE); } else { const int sg_off = sg * VECT_SIZE * SG_SIZE; @@ -236,7 +235,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; - nhwc_reusable_calc_fused_reduction( + nhwc_reusable_reg_fused_reduction( variance, dst_off, &v_var, local_sum, 1); } else { const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; @@ -297,7 +296,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, // 
store res if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; - nhwc_reusable_mean_var_fused_reduction(mean, variance, dst_off, sum, + nhwc_reusable_1pass_fused_reduction(mean, variance, dst_off, sum, sum_sq, local_sum, local_sum_sq, VECT_SIZE); } else { @@ -324,8 +323,8 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; - nhwc_reusable_mean_var_fused_reduction(mean, variance, dst_off, - &sum, &sum_sq, local_sum, local_sum_sq, 1); + nhwc_reusable_1pass_fused_reduction(mean, variance, dst_off, &sum, + &sum_sq, local_sum, local_sum_sq, 1); } else { const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], sum.s0); @@ -461,7 +460,7 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, } // Atomic-based reduction, BWD pass -void nhwc_reusable_bwd_calc_fused_reduction( +void nhwc_reusable_bwd_fused_reduction( volatile __global atomic_float *diff_scale, volatile __global atomic_float *diff_shift, off_t dst_offset, float *diff_gamma, float *diff_beta, __local float *local_sums, @@ -563,9 +562,9 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, // store results if (use_fused_atomics_reduction) { const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; - nhwc_reusable_bwd_calc_fused_reduction(diff_scale, diff_shift, - dst_off, (float *)(&diff_gamma), (float *)(&diff_beta), - local_sums, VECT_SIZE, calc_slm_size); + nhwc_reusable_bwd_fused_reduction(diff_scale, diff_shift, dst_off, + (float *)(&diff_gamma), (float *)(&diff_beta), local_sums, + VECT_SIZE, calc_slm_size); } else { // Two different scratchpads: for diff_gamma and diff_beta // scratchpad layout (elements): @@ -619,9 +618,9 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, if (use_fused_atomics_reduction) { const int dst_off = 
ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; - nhwc_reusable_bwd_calc_fused_reduction(diff_scale, diff_shift, - dst_off, (float *)(&diff_gamma), (float *)(&diff_beta), - local_sums, 1, calc_slm_size); + nhwc_reusable_bwd_fused_reduction(diff_scale, diff_shift, dst_off, + (float *)(&diff_gamma), (float *)(&diff_beta), local_sums, + 1, calc_slm_size); } else { const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; STORE_FLOAT_1x16(&temp_reduce[sg_off], diff_gamma); From 29e4536c780f4ffa20f11179c85fc49ebb6e06db Mon Sep 17 00:00:00 2001 From: Sergey Kazakov Date: Tue, 7 May 2024 09:15:38 -0700 Subject: [PATCH 025/187] gpu: bnorm: nhwc-reusable: add kernels with private memory buffers usage --- src/gpu/intel/ocl/bnorm/bnorm_utils.hpp | 6 + src/gpu/intel/ocl/bnorm/nhwc_reusable.cl | 741 +++++++++++++++++++++- src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp | 34 +- src/gpu/intel/ocl/bnorm/nhwc_reusable.h | 6 + src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp | 9 +- 5 files changed, 782 insertions(+), 14 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/bnorm_utils.hpp b/src/gpu/intel/ocl/bnorm/bnorm_utils.hpp index d9dded5d44a..0f9bc8bd5c3 100644 --- a/src/gpu/intel/ocl/bnorm/bnorm_utils.hpp +++ b/src/gpu/intel/ocl/bnorm/bnorm_utils.hpp @@ -85,6 +85,12 @@ constexpr size_t reduce_aux = 6; constexpr size_t norm_bwd = 7; constexpr size_t calc_stat = 8; constexpr size_t reduce_stat = 9; +constexpr size_t norm_fwd_buff = 10; +constexpr size_t norm_bwd_buff = 11; +constexpr size_t calc_mean_buff = 12; +constexpr size_t calc_var_buff = 13; +constexpr size_t calc_mean_var_buff = 14; +constexpr size_t calc_stat_buff = 15; } // namespace kernel_id float get_ss_utilization( diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl index 0cbc3f9787a..ba93f06cfa9 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl @@ -15,7 +15,12 @@ 
*******************************************************************************/ #include "gpu/intel/ocl/bnorm/nhwc_reusable.h" -// Atomic-based reduction for 1pass algorithm +// Two sets of nhwc-optimized reusable kernels which are implemented with and +// without use of private memory buffers. +// These two ways require different layouts of the scratchpad and/or SLM buffers. +// The names of the kernels and related functions are distinguished by the suffix "buff". + +// Atomic-based reduction for 1pass algorithm, for no private buffers kernels. void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, volatile __global atomic_float *variance, off_t dst_offset, SUM_DATA_T *sum, SUM_DATA_T *sum_sq, __local SUM_DATA_T *local_sum, @@ -55,7 +60,48 @@ void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, return; } -// Atomic-based reduction for regular algorithm +// Atomic-based reduction for 1pass algorithm, for kernels with private buffers. +void nhwc_reusable_1pass_fused_reduction_buff( + volatile __global atomic_float *mean, + volatile __global atomic_float *variance, off_t dst_offset, + SUM_DATA_T *sum, SUM_DATA_T *sum_sq, __local SUM_DATA_T *local_sum, + __local SUM_DATA_T *local_sum_sq, off_t ic_block) { + const int local_id = get_local_id(1); + const int simd_id = get_sub_group_local_id(); + const int row_size = ic_block; + const int group_size = get_local_size(1); + const int ic_block_sgroups = ic_block / SG_SIZE; + + if (local_id > 0) { + unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + local_sum[slm_offset] = sum[sg]; + local_sum_sq[slm_offset] = sum_sq[sg]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (local_id == 0) { + unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = l_id * row_size + sg * SG_SIZE + simd_id; + SUM_DATA_T tmp = local_sum[off]; + SUM_DATA_T tmp_sq 
= local_sum_sq[off]; + sum[sg] = summation(tmp.s1, sum[sg]); + sum_sq[sg] = summation(tmp_sq.s1, sum_sq[sg]); + sum[sg] = summation(tmp.s0, sum[sg]); + sum_sq[sg] = summation(tmp_sq.s0, sum_sq[sg]); + } + } + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = sg * SG_SIZE + simd_id; + atomic_add_global(&mean[dst_offset + off], sum[sg].s0); + atomic_add_global(&variance[dst_offset + off], sum_sq[sg].s0); + } + } + return; +} + +// Atomic-based reduction for regular algorithm, for no private buffers kernels. void nhwc_reusable_reg_fused_reduction(volatile __global atomic_float *dst, off_t dst_offset, float *sum, __local float *local_sum, off_t vect_size) { @@ -87,7 +133,41 @@ void nhwc_reusable_reg_fused_reduction(volatile __global atomic_float *dst, return; } -// Calculate mean, regular algorithm +// Atomic-based reduction for regular algorithm, +// for kernels with private buffers. +void nhwc_reusable_reg_fused_reduction_buff(volatile __global atomic_float *dst, + off_t dst_offset, float *sum, __local float *local_sum, + off_t ic_block) { + + const int local_id = get_local_id(1); + const int simd_id = get_sub_group_local_id(); + const int group_size = get_local_size(1); + const int row_size = ic_block; + const int ic_block_sgroups = ic_block / SG_SIZE; + + if (local_id > 0) { + unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + local_sum[slm_offset] = sum[sg]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (local_id == 0) { + unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = l_id * row_size + sg * SG_SIZE + simd_id; + sum[sg] += local_sum[off]; + } + } + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = sg * SG_SIZE + simd_id; + atomic_add_global(&dst[dst_offset + off], sum[sg]); + } + } + return; +} + +// Calculate mean, regular algorithm, no private memory 
buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, off_t ic_size, off_t ic_block, @@ -159,7 +239,68 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, } } -// Calculate variance, regular algorithm +// Calculate mean, regular algorithm, private memory buffers used. +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, + volatile __global atomic_float *mean, off_t ic_size, off_t ic_block, + off_t sp_size, off_t stat_sp_block, off_t reduce_stat_nblocks, + int use_fused_atomics_reduction, __local float *local_sum) { + + const int c = get_global_id(0); + const int sp_block_idx = get_global_id(1); + + const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int src_off + = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; + + // reduce_temp layout: reduce_stat_nblocks rows x ic columns + const int reduce_off = ic_block_offset + sp_block_idx * ic_size; + + src += src_off; + reduce_temp += reduce_off; + + const int sp_idx_bnd = sp_size % stat_sp_block + ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) + : stat_sp_block; + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + + float v_mean[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + for (int sp = 0; sp < sp_idx_bnd; ++sp) { + // vectorized part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + float s_vect[VECT_SIZE]; + AS_VECT_FLOAT(s_vect) + = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + for (int vect = 0; vect < VECT_SIZE; ++vect) { + v_mean[sg * VECT_SIZE + vect] += s_vect[vect]; + } + } +#if MAY_HAVE_IC_TAIL + // tails + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; + v_mean[sg_idx] += LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + } +#endif // HAS_IC_VECT_TAIL + src += ic_size; + } // sp_loop + + // store res + if (use_fused_atomics_reduction) { + nhwc_reusable_reg_fused_reduction_buff( + mean, ic_block_offset, (float *)(&v_mean), local_sum, ic_block); + } else { + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + const int sg_off = sg * SG_SIZE; + STORE_FLOAT_1x16(&reduce_temp[sg_off], v_mean[sg]); + } + } +} + +// Calculate variance, regular algorithm, no private memory buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, __global float *reduce_temp, volatile __global atomic_float *variance, @@ -244,7 +385,84 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, } } +// Calculate variance, regular algorithm, private memory buffers used. 
+__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, + __global float *reduce_temp, volatile __global atomic_float *variance, + off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, + off_t reduce_stat_nblocks, int use_fused_atomics_reduction, + __local float *local_sum) { + + const int c = get_global_id(0); + const int sp_block_idx = get_global_id(1); + + const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int src_off + = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; + + // reduce_temp layout: reduce_stat_nblocks rows x ic columns + const int reduce_off = ic_block_offset + sp_block_idx * ic_size; + + src += src_off; + reduce_temp += reduce_off + reduce_stat_nblocks * ic_size; + mean += ic_block_offset; + + const int sp_idx_bnd = sp_size % stat_sp_block + ? min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) + : stat_sp_block; + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + + float v_mean[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + v_mean[sg] = as_float(intel_sub_group_block_read( + (const __global uint *)(&mean[(sg * SG_SIZE)]))); + } + + float v_var[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + float v0[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + + for (int sp = 0; sp < sp_idx_bnd; ++sp) { + // vectorized part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + float s_vect[VECT_SIZE]; + AS_VECT_FLOAT(s_vect) + = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + for (int vect = 0; vect < VECT_SIZE; ++vect) { + int sg_idx = sg * VECT_SIZE + vect; + v0[sg_idx] = s_vect[vect] - v_mean[sg_idx]; + v_var[sg_idx] = fma(v0[sg_idx], v0[sg_idx], v_var[sg_idx]); + } + } + +#if MAY_HAVE_IC_TAIL + // tails + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sg_idx = 
ic_vect_sgroups * VECT_SIZE + sg; + float s_tail = LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + v0[sg_idx] = s_tail - v_mean[sg_idx]; + v_var[sg_idx] = fma(v0[sg_idx], v0[sg_idx], v_var[sg_idx]); + } +#endif // HAS_IC_VECT_TAIL + src += ic_size; + } // sp_loop + + // store res + if (use_fused_atomics_reduction) { + nhwc_reusable_reg_fused_reduction_buff(variance, ic_block_offset, + (float *)(&v_var), local_sum, ic_block); + } else { + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + const int sg_off = sg * SG_SIZE; + STORE_FLOAT_1x16(&reduce_temp[sg_off], v_var[sg]); + } + } +} + // Calculate mean and variance at once, 1pass algorithm +// no private memory buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, @@ -333,7 +551,84 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, } } +// Calculate mean and variance at once, 1pass algorithm, +// private memory buffers used. 
+__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, + __global float *reduce_temp, volatile __global atomic_float *mean, + volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, + off_t sp_size, off_t stat_sp_block, off_t reduce_stat_nblocks, + int use_fused_atomics_reduction, __local SUM_DATA_T *local_sum, + __local SUM_DATA_T *local_sum_sq) { + + const int c = get_global_id(0); + const int sp_block_idx = get_global_id(1); + const int simd_id = get_sub_group_local_id(); + + const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int src_off + = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; + + // reduce_temp layout: reduce_stat_nblocks rows x ic columns + const int reduce_off = ic_block_offset + sp_block_idx * ic_size; + + const int variance_off = reduce_stat_nblocks * ic_size; + + src += src_off; + reduce_temp += reduce_off; + + const int sp_idx_bnd = sp_size % stat_sp_block + ? min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) + : stat_sp_block; + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + + SUM_DATA_T sum[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + SUM_DATA_T sum_sq[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + + for (int sp = 0; sp < sp_idx_bnd; ++sp) { + // vectorized part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + float s_vect[VECT_SIZE]; + AS_VECT_FLOAT(s_vect) + = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + for (int vect = 0; vect < VECT_SIZE; ++vect) { + const int sum_idx = sg * VECT_SIZE + vect; + sum[sum_idx] = summation(s_vect[vect], sum[sum_idx]); + sum_sq[sum_idx] = summation( + s_vect[vect] * s_vect[vect], sum_sq[sum_idx]); + } + } +#if MAY_HAVE_IC_TAIL + // tails + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; + float s_tail = 
LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + sum[sg_idx] = summation(s_tail, sum[sg_idx]); + sum_sq[sg_idx] = summation(s_tail * s_tail, sum_sq[sg_idx]); + } +#endif + src += ic_size; + } + // store res + if (use_fused_atomics_reduction) { + nhwc_reusable_1pass_fused_reduction_buff(mean, variance, + ic_block_offset, sum, sum_sq, local_sum, local_sum_sq, + ic_block); + } else { + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + const int reduce_off = sg * SG_SIZE; + STORE_FLOAT_1x16(&reduce_temp[reduce_off], sum[sg].s0); + STORE_FLOAT_1x16( + &reduce_temp[variance_off + reduce_off], sum_sq[sg].s0); + } + } +} + // Main FWD kernel, common for regular and 1pass algorithms +// no private memory buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *dst, @@ -459,7 +754,152 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, } // sp loop } -// Atomic-based reduction, BWD pass +// Main FWD kernel, common for regular and 1pass algorithms, +// private memory buffers used. 
+__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, + __global float *variance, __global DATA_T *dst, + __global float *scaleshift, __global float *shift, __global char *ws, + float eps, __global DATA_T *src_add, float relu_alpha, off_t ic_size, + off_t ic_block, off_t sp_size, off_t update_sp_block) { + + const int c = get_global_id(0); + const int sp = get_global_id(1) * update_sp_block; + + const int ic_block_offset = (c / SG_SIZE) * ic_block; + + mean += ic_block_offset; + variance += ic_block_offset; + shift += ic_block_offset; + scaleshift += ic_block_offset; + const uint d_off = sp * ic_size + ic_block_offset; + + src += d_off; +#if FUSE_BN_ADD_RELU + src_add += d_off; +#endif + dst += d_off; +#if FUSE_BN_RELU && IS_TRAINING + ws += d_off; +#endif + + float sm[MAX_IC_BLOCK_SGROUPS], sv[MAX_IC_BLOCK_SGROUPS], + v_mean[MAX_IC_BLOCK_SGROUPS], v_variance[MAX_IC_BLOCK_SGROUPS], + sqrt_variance[MAX_IC_BLOCK_SGROUPS]; + + const bool has_sp_block_tail = sp_size % update_sp_block; + const int sp_idx_bnd = has_sp_block_tail + ? min(update_sp_block, sp_size - sp) + : update_sp_block; + + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + const bool has_ic_vect_tail = ic_tail_sgroups > 0; + + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sgv = sg * VECT_SIZE; + + AS_VECT_FLOAT(&sm[sgv]) = USE_SCALE + ? LOAD_VECT_FLOAT(&scaleshift[sg_idx]) + : (VECT_FLOAT_T)1.0f; + AS_VECT_FLOAT(&sv[sgv]) = USE_SHIFT ? 
LOAD_VECT_FLOAT(&shift[sg_idx]) + : (VECT_FLOAT_T)0.0f; + AS_VECT_FLOAT(&v_mean[sgv]) = LOAD_VECT_FLOAT(&mean[sg_idx]); + AS_VECT_FLOAT(&v_variance[sgv]) = LOAD_VECT_FLOAT(&variance[sg_idx]); + AS_VECT_FLOAT(&sqrt_variance[sgv]) = AS_VECT_FLOAT(&sm[sgv]) + / sqrt(AS_VECT_FLOAT(&v_variance[sgv]) + (VECT_FLOAT_T)eps); + } + +#if MAY_HAVE_IC_TAIL + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sgv = ic_vect_sgroups * VECT_SIZE + sg; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + sm[sgv] = USE_SCALE ? LOAD_FLOAT_1x16(&scaleshift[sg_idx]) : 1.0f; + sv[sgv] = USE_SHIFT ? LOAD_FLOAT_1x16(&shift[sg_idx]) : 0.0f; + v_mean[sgv] = LOAD_FLOAT_1x16(&mean[sg_idx]); + v_variance[sgv] = LOAD_FLOAT_1x16(&variance[sg_idx]); + sqrt_variance[sgv] = sm[sgv] / sqrt(v_variance[sgv] + eps); + } +#endif //MAY_HAVE_IC_TAIL + + for (int sp_idx = 0; sp_idx < sp_idx_bnd; sp_idx++) { + // vectorized part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sgv = sg * VECT_SIZE; + + VECT_FLOAT_T d_vect; + const VECT_FLOAT_T s_vect = LOAD_VECT_DATA(&src[sg_idx]); + d_vect = fma(s_vect - AS_VECT_FLOAT(&v_mean[sgv]), + AS_VECT_FLOAT(&sqrt_variance[sgv]), + AS_VECT_FLOAT(&sv[sgv])); + +#if FUSE_BN_RELU +#if FUSE_BN_ADD_RELU + d_vect += LOAD_VECT_DATA(&src_add[sg_idx]); +#endif + const VECT_INT_T ws_vect = ISGREATER(d_vect, (VECT_FLOAT_T)0.0f); + d_vect = select((VECT_FLOAT_T)0.0f, d_vect, ws_vect); +#if IS_TRAINING + STORE_VECT_CHAR(&ws[sg_idx], ws_vect); +#endif // IS_TRAINING +#endif // FUSE_BN_RELU + +#if WITH_RELU && WITH_LEAKY_RELU + VECT_INT_T l_vect; +#endif //WITH_RELU && WITH_LEAKY_RELU +#if WITH_RELU +#if WITH_LEAKY_RELU + l_vect = isless(d_vect, 0.0f); + d_vect = select(d_vect, d_vect * relu_alpha, l_vect); +#else + d_vect = max(d_vect, (VECT_FLOAT_T)0.0f); +#endif //WITH_LEAKY_RELU +#endif //WITH_RELU + STORE_VECT_DATA(&dst[sg_idx], d_vect); + } // sg loop + +#if MAY_HAVE_IC_TAIL + // tails + for 
(int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sgv = ic_vect_sgroups * VECT_SIZE + sg; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + float d_tail; + const float s_tail = LOAD_DATA_1x16(&src[sg_idx]); + d_tail = fma(s_tail - v_mean[sgv], sqrt_variance[sgv], sv[sgv]); + if (FUSE_BN_ADD_RELU) d_tail += LOAD_DATA_1x16(&src_add[sg_idx]); +#if FUSE_BN_RELU + if (d_tail <= 0) d_tail = 0.0f; +#if IS_TRAINING + const int ws_tail = d_tail > 0.0f ? -1 : 0; + STORE_CHAR_1x16(&ws[sg_idx], convert_char(ws_tail)); +#endif // IS_TRAINING +#endif // FUSE_BN_RELU +#if WITH_RELU +#if WITH_LEAKY_RELU + if (d_tail < 0) d_tail *= relu_alpha; +#else + d_tail = max(d_tail, 0.0f); +#endif //WITH_LEAKY_RELU +#endif //WITH_RELU + STORE_DATA_1x16(&dst[sg_idx], d_tail); + } +#endif //MAY_HAVE_IC_TAIL + src += ic_size; +#if FUSE_BN_ADD_RELU + src_add += ic_size; +#endif + dst += ic_size; +#if FUSE_BN_RELU && IS_TRAINING + ws += ic_size; +#endif + } // sp loop +} + +// Atomic-based reduction, BWD pass, for no private buffers kernels. void nhwc_reusable_bwd_fused_reduction( volatile __global atomic_float *diff_scale, volatile __global atomic_float *diff_shift, off_t dst_offset, @@ -500,7 +940,48 @@ void nhwc_reusable_bwd_fused_reduction( return; } +// Atomic-based reduction, BWD pass, for kernel with private buffers. 
+void nhwc_reusable_bwd_fused_reduction_buff( + volatile __global atomic_float *diff_scale, + volatile __global atomic_float *diff_shift, off_t dst_offset, + float *diff_gamma, float *diff_beta, __local float *local_sums, + off_t ic_block, off_t calc_slm_size) { + const int local_id = get_local_id(1); + const int simd_id = get_sub_group_local_id(); + const int row_size = ic_block; + const int group_size = get_local_size(1); + + const int ic_block_sgroups = ic_block / SG_SIZE; + __local float *local_gamma = local_sums; + __local float *local_beta = local_sums + calc_slm_size / sizeof(float); + + if (local_id > 0) { + unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + local_gamma[slm_offset] = diff_gamma[sg]; + local_beta[slm_offset] = diff_beta[sg]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (local_id == 0) { + unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = l_id * row_size + sg * SG_SIZE + simd_id; + diff_gamma[sg] += local_gamma[off]; + diff_beta[sg] += local_beta[off]; + } + } + unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { + const int off = sg * SG_SIZE + simd_id; + atomic_add_global(&diff_scale[dst_offset + off], diff_gamma[sg]); + atomic_add_global(&diff_shift[dst_offset + off], diff_beta[sg]); + } + } + return; +} + // Calculate stats for BWD pass +// no private memory buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, __global DATA_T *diff_dst, __global char *ws, @@ -629,7 +1110,119 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, } // sg loop } +// Calculate stats for BWD pass, private memory buffers used. 
+__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, + __global DATA_T *diff_dst, __global char *ws, + __global float *temp_reduce, __global float *temp_reduce_shift, + volatile __global atomic_float *diff_scale, + volatile __global atomic_float *diff_shift, off_t ic_size, + off_t ic_block, off_t sp_size, off_t stat_sp_block, + off_t reduce_stat_nblocks, int use_fused_atomics_reduction, + __local float *local_sums, off_t calc_slm_size) { + + const int c = get_global_id(0); + const int sp_block_idx = get_global_id(1); + const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int offset = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; + + mean += ic_block_offset; + src += offset; + diff_dst += offset; + ws += offset; + + // scratchpad layout: (reduce_stat_nblocks + 1) rows x ic columns + const int reduce_off = ic_block_offset + (sp_block_idx + 1) * ic_size; + + temp_reduce += reduce_off; + temp_reduce_shift += reduce_off; + + const bool has_sp_block_tail = sp_size % stat_sp_block; + const int sp_idx_bnd = has_sp_block_tail + ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) + : stat_sp_block; + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + + float v_mean[MAX_IC_BLOCK_SGROUPS]; + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + v_mean[sg] = as_float(intel_sub_group_block_read( + (const __global uint *)(&mean[(sg * SG_SIZE)]))); + } + + float diff_gamma[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + float diff_beta[MAX_IC_BLOCK_SGROUPS] = {0.0f}; + + for (int sp = 0; sp < sp_idx_bnd; ++sp) { + // vector part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sgv = sg * VECT_SIZE; +#if FUSE_BN_RELU + const VECT_CHAR_T ws_vect = LOAD_VECT_CHAR(&ws[sg_idx]); +#endif + + float src_vect[VECT_SIZE]; + AS_VECT_FLOAT(src_vect) = LOAD_VECT_DATA(&src[sg_idx]); + VECT_FLOAT_T dd_vect = LOAD_VECT_DATA(&diff_dst[sg_idx]); + float v0[VECT_SIZE]; + for (int vect = 0; vect < VECT_SIZE; ++vect) { + int sg_idx = sg * VECT_SIZE + vect; + v0[vect] = src_vect[vect] - v_mean[sg_idx]; + } +#if FUSE_BN_RELU + dd_vect = select( + (VECT_FLOAT_T)0.0f, dd_vect, CONVERT_VECT_INT_T(ws_vect)); +#endif + AS_VECT_FLOAT(&diff_gamma[sgv]) = fma(AS_VECT_FLOAT(v0), dd_vect, + AS_VECT_FLOAT(&diff_gamma[sgv])); + AS_VECT_FLOAT(&diff_beta[sgv]) += dd_vect; + } + +#if MAY_HAVE_IC_TAIL + // tails + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; +#if FUSE_BN_RELU + char ws_tail = LOAD_CHAR_1x16(&ws[sg_idx * SG_SIZE]); +#endif + float src_tail = LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + float dd_tail = LOAD_DATA_1x16(&diff_dst[sg_idx * SG_SIZE]); + float v0 = src_tail - v_mean[sg_idx]; +#if FUSE_BN_RELU + dd_tail = select(0.0f, dd_tail, convert_int(ws_tail)); +#endif + + diff_gamma[sg_idx] = fma(v0, dd_tail, diff_gamma[sg_idx]); + diff_beta[sg_idx] += dd_tail; + } 
+#endif + src += ic_size; + diff_dst += ic_size; +#if FUSE_BN_RELU + ws += ic_size; +#endif + } // sp loop + + // store results + if (use_fused_atomics_reduction) { + nhwc_reusable_bwd_fused_reduction_buff(diff_scale, diff_shift, + ic_block_offset, (float *)(&diff_gamma), (float *)(&diff_beta), + local_sums, ic_block, calc_slm_size); + + } else { + for (int sg = 0; sg < ic_block_sgroups; ++sg) { + const int sg_off = sg * SG_SIZE; + STORE_FLOAT_1x16(&temp_reduce[sg_off], diff_gamma[sg]); + STORE_FLOAT_1x16(&temp_reduce_shift[sg_off], diff_beta[sg]); + } + } +} + // Main BWD pass kernel +// no private memory buffers used. __attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *diff_dst, @@ -749,6 +1342,144 @@ nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, } // sp loop } +// Main BWD pass kernel, private memory buffers used. +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, + __global float *variance, __global DATA_T *diff_dst, + __global float *scaleshift, __global char *ws, + __global DATA_T *diff_src, __global float *diff_scale, + __global float *diff_shift, float eps, __global DATA_T *diff_src_add, + off_t ic_size, off_t ic_block, off_t sp_size, off_t update_sp_block) { + + const int c = get_global_id(0); + const int ic_block_offset = (c / SG_SIZE) * ic_block; + + variance += ic_block_offset; + mean += ic_block_offset; + diff_scale += ic_block_offset; + diff_shift += ic_block_offset; + scaleshift += ic_block_offset; + + const int sp_block_idx = get_global_id(1); + const int offset + = ic_block_offset + sp_block_idx * update_sp_block * ic_size; + + src += offset; + diff_dst += offset; + ws += offset; + diff_src += offset; +#if FUSE_BN_ADD_RELU + diff_src_add += offset; +#endif + + const bool has_sp_block_tail = sp_size % update_sp_block; + 
const int sp_idx_bnd = has_sp_block_tail + ? min(update_sp_block, sp_size - sp_block_idx * update_sp_block) + : update_sp_block; + const int ic_block_sgroups + = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; + const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; + + float v_variance[MAX_IC_BLOCK_SGROUPS], v_mean[MAX_IC_BLOCK_SGROUPS], + diff_gamma[MAX_IC_BLOCK_SGROUPS], diff_beta[MAX_IC_BLOCK_SGROUPS], + sqrt_variance[MAX_IC_BLOCK_SGROUPS], gamma[MAX_IC_BLOCK_SGROUPS]; + + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + const int sgv = sg * VECT_SIZE; + const int sg_idx = sg * SG_SIZE * VECT_SIZE; + + AS_VECT_FLOAT(&v_variance[sgv]) = LOAD_VECT_FLOAT(&variance[sg_idx]); +#if CALCULATE_STATS == 1 + AS_VECT_FLOAT(&v_mean[sgv]) = LOAD_VECT_FLOAT(&mean[sg_idx]); + AS_VECT_FLOAT(&diff_gamma[sgv]) = LOAD_VECT_FLOAT(&diff_scale[sg_idx]); + AS_VECT_FLOAT(&diff_beta[sgv]) = LOAD_VECT_FLOAT(&diff_shift[sg_idx]); +#endif // CALCULATE_STATS == 1 + AS_VECT_FLOAT(&gamma[sgv]) = USE_SCALE + ? LOAD_VECT_FLOAT(&scaleshift[sg_idx]) + : (VECT_FLOAT_T)1.0f; + AS_VECT_FLOAT(&sqrt_variance[sgv]) = (VECT_FLOAT_T)1.0f + / sqrt(AS_VECT_FLOAT(&v_variance[sgv]) + (VECT_FLOAT_T)eps); + } + +#if MAY_HAVE_IC_TAIL + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sgv = ic_vect_sgroups * VECT_SIZE + sg; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + v_variance[sgv] = LOAD_FLOAT_1x16(&variance[sg_idx]); +#if CALCULATE_STATS == 1 + v_mean[sgv] = LOAD_FLOAT_1x16(&mean[sg_idx]); + diff_gamma[sgv] = LOAD_FLOAT_1x16(&diff_scale[sg_idx]); + diff_beta[sgv] = LOAD_FLOAT_1x16(&diff_shift[sg_idx]); +#endif // CALCULATE_STATS == 1 + gamma[sgv] = USE_SCALE ?
LOAD_FLOAT_1x16(&scaleshift[sg_idx]) : 1.0f; + sqrt_variance[sgv] = 1.0f / sqrt(v_variance[sgv] + eps); + } +#endif + for (int sp = 0; sp < sp_idx_bnd; ++sp) { + // vector part + for (int sg = 0; sg < ic_vect_sgroups; ++sg) { + const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sgv = sg * VECT_SIZE; + + const VECT_FLOAT_T src_vect = LOAD_VECT_DATA(&src[sg_idx]); + VECT_FLOAT_T dd_vect = LOAD_VECT_DATA(&diff_dst[sg_idx]); +#if FUSE_BN_RELU + const VECT_CHAR_T ws_vect = LOAD_VECT_CHAR(&ws[sg_idx]); + dd_vect = select( + (VECT_FLOAT_T)0.0f, dd_vect, CONVERT_VECT_INT_T(ws_vect)); +#if FUSE_BN_ADD_RELU + STORE_VECT_DATA(&diff_src_add[sg_idx], dd_vect); +#endif +#endif +#if CALCULATE_STATS == 1 + dd_vect -= (AS_VECT_FLOAT(&diff_beta[sgv]) + + (src_vect - AS_VECT_FLOAT(&v_mean[sgv])) + * AS_VECT_FLOAT(&diff_gamma[sgv]) + * AS_VECT_FLOAT(&sqrt_variance[sgv])) + / sp_size; +#endif + dd_vect *= AS_VECT_FLOAT(&gamma[sgv]) + * AS_VECT_FLOAT(&sqrt_variance[sgv]); + STORE_VECT_DATA(&diff_src[sg_idx], dd_vect); + } // vector sg loop + +#if MAY_HAVE_IC_TAIL + // tails + for (int sg = 0; sg < ic_tail_sgroups; ++sg) { + const int sgv = ic_vect_sgroups * VECT_SIZE + sg; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + const float src_tail = LOAD_DATA_1x16(&src[sg_idx]); + float dd_tail = LOAD_DATA_1x16(&diff_dst[sg_idx]); +#if FUSE_BN_RELU + const char ws_tail = LOAD_CHAR_1x16(&ws[sg_idx]); + dd_tail = select(0.0f, dd_tail, convert_int(ws_tail)); +#if FUSE_BN_ADD_RELU + STORE_DATA_1x16(&diff_src_add[sg_idx], dd_tail); +#endif +#endif +#if CALCULATE_STATS == 1 + dd_tail -= (diff_beta[sgv] + + (src_tail - v_mean[sgv]) * diff_gamma[sgv] + * sqrt_variance[sgv]) + / sp_size; +#endif + dd_tail *= gamma[sgv] * sqrt_variance[sgv]; + STORE_DATA_1x16(&diff_src[sg_idx], dd_tail); + } // tail sg loop +#endif + src += ic_size; + diff_dst += ic_size; + diff_src += ic_size; +#if FUSE_BN_RELU +#if FUSE_BN_ADD_RELU + diff_src_add += ic_size; +#endif + ws += ic_size; +#endif 
+ } // sp loop +} + // Aux kernel performs initial zero-padding or finalization of stat vectors // if atomic-based reduction is used __kernel void nhwc_reusable_reduce_aux(__global float *ptr1, diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp index db73acf33dd..36ca1b15080 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp @@ -80,6 +80,12 @@ static status_t final_set_rt_params(nhwc_bnorm_params_t &bn_conf, rt_conf.use_fused_atomics_reduction = bn_conf.use_fused_atomics_reduction(); rt_conf.calc_adj_lws = bn_conf.calc_adj_lws; + // Switchers between kernels with or without private buffers. + // Currently used for performance experiments/tuning + // TODO: make it as part of perf model + rt_conf.use_buffers_calc = dev_getenv("USE_BUFFERS_CALC", 0); + rt_conf.use_buffers_norm = dev_getenv("USE_BUFFERS_NORM", 0); + return status::success; } @@ -172,6 +178,7 @@ static status_t init_conf_common(nhwc_bnorm_params_t &bn_conf, cmpl_conf.vect_size = bn_conf.vect_size; cmpl_conf.sub_group_size = bn_conf.sub_group_size; + cmpl_conf.max_ic_block = bn_conf.max_ic_block; // For performance debuging and analisys std::string prb_str = get_prb_desc_str(pd); @@ -219,6 +226,7 @@ static void init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.add_option("-cl-std=CL2.0"); if (cmpl_conf.data_type == data_type::s8) kernel_ctx.add_option("-Dcl_intel_subgroups_char"); + kernel_ctx.define_int("MAX_IC_BLOCK", cmpl_conf.max_ic_block); } status_t nhwc_reusable_batch_normalization_fwd_t::pd_t::init_conf( @@ -340,7 +348,8 @@ status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( auto nd_range_calc_mean = pd()->dispatch_calc_stat.nd_range(); - status = parallel_for(ctx, nd_range_calc_mean, kernels_[calc_mean], + status = parallel_for(ctx, nd_range_calc_mean, + kernels_[rt_conf.use_buffers_calc ? 
calc_mean_buff : calc_mean], calc_mean_arg_list); if (status != status::success) return status; @@ -392,8 +401,9 @@ status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( auto nd_range_calc_var = pd()->dispatch_calc_stat.nd_range(); - status = parallel_for( - ctx, nd_range_calc_var, kernels_[calc_var], calc_var_arg_list); + status = parallel_for(ctx, nd_range_calc_var, + kernels_[rt_conf.use_buffers_calc ? calc_var_buff : calc_var], + calc_var_arg_list); if (status != status::success) return status; if (rt_conf.use_fused_atomics_reduction) { @@ -446,7 +456,10 @@ status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( arg_list.append(2 * calc_slm_size, nullptr); auto nd_range = pd()->dispatch_calc_stat.nd_range(); - status = parallel_for(ctx, nd_range, kernels_[calc_mean_var], arg_list); + status = parallel_for(ctx, nd_range, + kernels_[rt_conf.use_buffers_calc ? calc_mean_var_buff + : calc_mean_var], + arg_list); if (status != status::success) return status; if (rt_conf.use_fused_atomics_reduction) { @@ -501,7 +514,9 @@ status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( arg_list.append(rt_conf.update_sp_block); auto nd_range = pd()->dispatch.nd_range(); - return parallel_for(ctx, nd_range, kernels_[norm_fwd], arg_list); + return parallel_for(ctx, nd_range, + kernels_[rt_conf.use_buffers_norm ? norm_fwd_buff : norm_fwd], + arg_list); } status_t nhwc_reusable_batch_normalization_bwd_t::pd_t::init_conf( @@ -588,8 +603,9 @@ status_t nhwc_reusable_batch_normalization_bwd_t::execute_backward( calc_stats_arg_list.append(calc_slm_size); auto calc_stats_nd_range = pd()->dispatch_calc_stat.nd_range(); - status = parallel_for( - ctx, calc_stats_nd_range, kernels_[calc_stat], calc_stats_arg_list); + status = parallel_for(ctx, calc_stats_nd_range, + kernels_[rt_conf.use_buffers_calc ? 
calc_stat_buff : calc_stat], + calc_stats_arg_list); if (status != status::success) return status; if (rt_conf.use_fused_atomics_reduction) { @@ -645,7 +661,9 @@ status_t nhwc_reusable_batch_normalization_bwd_t::execute_backward( arg_list.append(rt_conf.update_sp_block); auto nd_range = pd()->dispatch.nd_range(); - return parallel_for(ctx, nd_range, kernels_[norm_bwd], arg_list); + return parallel_for(ctx, nd_range, + kernels_[rt_conf.use_buffers_norm ? norm_bwd_buff : norm_bwd], + arg_list); } } // namespace ocl diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h index d7d9cf088ae..1808805ffcd 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h @@ -16,6 +16,10 @@ #ifndef GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H #define GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H +#define MAX_IC_BLOCK_SGROUPS (MAX_IC_BLOCK / SG_SIZE) +#define MAX_IC_TAIL_SGROUPS (VECT_SIZE - 1) +#define MAY_HAVE_IC_TAIL (MAX_IC_TAIL_SGROUPS > 0) + #define VECT_DT_N VECT_SIZE #include "gpu/intel/ocl/dispatch.h" #include "gpu/intel/ocl/ocl_types.h" @@ -95,6 +99,8 @@ #define ACCUM_DATA2_T float2 #define SUM_DATA_T ACCUM_DATA2_T +#define AS_VECT_FLOAT(a) *(VECT_FLOAT_T *)(a) + // Kahan summation algorithm. It's much more precise than simple sum and works // just as fast, since kernel is still memory-bound. 
SUM_DATA_T summation(ACCUM_DATA_T input, SUM_DATA_T state) { diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp index fbbc7422349..a278818e98a 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.hpp @@ -55,7 +55,11 @@ struct nhwc_reusable_bnorm_compile_params_t { "nhwc_reusable_calc_var", "nhwc_reusable_reduce_fwd_reg", "nhwc_reusable_calc_mean_var", "nhwc_reusable_reduce_fwd_1pass", "nhwc_reusable_reduce_aux", "nhwc_reusable_norm_bwd", - "nhwc_reusable_calc_stat", "nhwc_reusable_reduce_stat"}; + "nhwc_reusable_calc_stat", "nhwc_reusable_reduce_stat", + "nhwc_reusable_norm_fwd_buff", "nhwc_reusable_norm_bwd_buff", + "nhwc_reusable_calc_mean_buff", "nhwc_reusable_calc_var_buff", + "nhwc_reusable_calc_mean_var_buff", + "nhwc_reusable_calc_stat_buff"}; return kernel_names; } @@ -79,6 +83,7 @@ struct nhwc_reusable_bnorm_compile_params_t { data_type_t data_type; int vect_size; int sub_group_size; + int max_ic_block; bool use_scale; bool use_shift; bool is_training; @@ -100,6 +105,8 @@ struct nhwc_reusable_bnorm_runtime_params_t { float relu_negative_slope; float eps; bool use_fused_atomics_reduction; + bool use_buffers_calc; + bool use_buffers_norm; compute::range_t calc_adj_lws; }; From c3e7e4759e7219d4d451a5aedfecb84ae9dd369f Mon Sep 17 00:00:00 2001 From: Sergey Kazakov Date: Tue, 7 May 2024 09:35:46 -0700 Subject: [PATCH 026/187] gpu: bnorm: nhwc-reusable: add function for getting slm buffer size --- src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp index 36ca1b15080..12d2288593c 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp @@ -262,6 +262,17 @@ void nhwc_reusable_batch_normalization_fwd_t::pd_t::init_scratchpad() { } } +static dim_t get_calc_slm_size(
+ const nhwc_reusable_bnorm_compile_params_t &cmpl_conf, + const nhwc_reusable_bnorm_runtime_params_t &rt_conf) { + return rt_conf.use_fused_atomics_reduction + ? (rt_conf.use_buffers_calc ? sizeof(float) * rt_conf.ic_block + * rt_conf.calc_adj_lws[1] + : sizeof(float) * cmpl_conf.vect_size + * rt_conf.sg_size * rt_conf.calc_adj_lws[1]) + : 0; +} + status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( const exec_ctx_t &ctx) const { @@ -322,10 +333,7 @@ status_t nhwc_reusable_batch_normalization_fwd_t::execute_forward( if (status != status::success) return status; } - const dim_t calc_slm_size = rt_conf.use_fused_atomics_reduction - ? sizeof(float) * cmpl_conf.vect_size * rt_conf.sg_size - * rt_conf.calc_adj_lws[1] - : 0; + const dim_t calc_slm_size = get_calc_slm_size(cmpl_conf, rt_conf); if (cmpl_conf.calculate_stats && !cmpl_conf.use_stats_one_pass) { const dim_t local_sum_size = sizeof(float) * rt_conf.sg_size @@ -579,10 +587,7 @@ status_t nhwc_reusable_batch_normalization_bwd_t::execute_backward( if (status != status::success) return status; } - const dim_t calc_slm_size = rt_conf.use_fused_atomics_reduction - ? 
sizeof(float) * cmpl_conf.vect_size * rt_conf.sg_size - * rt_conf.calc_adj_lws[1] - : 0; + const dim_t calc_slm_size = get_calc_slm_size(cmpl_conf, rt_conf); compute::kernel_arg_list_t calc_stats_arg_list; calc_stats_arg_list.append(src); From 6594aee8755377e6945b405900be1a7ac58120be Mon Sep 17 00:00:00 2001 From: Sergey Kazakov Date: Tue, 7 May 2024 14:35:57 -0700 Subject: [PATCH 027/187] gpu: bnorm: nhwc-reusable: rename sub group size --- src/gpu/intel/ocl/bnorm/nhwc_reusable.cl | 304 ++++++++++++---------- src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp | 2 +- src/gpu/intel/ocl/bnorm/nhwc_reusable.h | 2 +- 3 files changed, 163 insertions(+), 145 deletions(-) diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl index ba93f06cfa9..fab295fe52c 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cl @@ -27,12 +27,12 @@ void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, __local SUM_DATA_T *local_sum_sq, off_t vect_size) { const int local_id = get_local_id(1); const int simd_id = get_sub_group_local_id(); - const int row_size = vect_size * SG_SIZE; + const int row_size = vect_size * SUB_GROUP_SIZE; const int group_size = get_local_size(1); if (local_id > 0) { unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { const int slm_offset - = local_id * row_size + v_idx * SG_SIZE + simd_id; + = local_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; local_sum[slm_offset] = sum[v_idx]; local_sum_sq[slm_offset] = sum_sq[v_idx]; } @@ -41,7 +41,8 @@ void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { for (int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = l_id * row_size + v_idx * SG_SIZE + simd_id; + const int off + = l_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; SUM_DATA_T tmp = local_sum[off]; SUM_DATA_T tmp_sq = local_sum_sq[off]; 
sum[v_idx] = summation(tmp.s1, sum[v_idx]); @@ -51,7 +52,7 @@ void nhwc_reusable_1pass_fused_reduction(volatile __global atomic_float *mean, } } unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = v_idx * SG_SIZE + simd_id; + const int off = v_idx * SUB_GROUP_SIZE + simd_id; atomic_add_global(&mean[dst_offset + off], sum[v_idx].s0); atomic_add_global(&variance[dst_offset + off], sum_sq[v_idx].s0); } @@ -70,11 +71,12 @@ void nhwc_reusable_1pass_fused_reduction_buff( const int simd_id = get_sub_group_local_id(); const int row_size = ic_block; const int group_size = get_local_size(1); - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; if (local_id > 0) { unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + const int slm_offset + = local_id * row_size + sg * SUB_GROUP_SIZE + simd_id; local_sum[slm_offset] = sum[sg]; local_sum_sq[slm_offset] = sum_sq[sg]; } @@ -83,7 +85,7 @@ void nhwc_reusable_1pass_fused_reduction_buff( if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = l_id * row_size + sg * SG_SIZE + simd_id; + const int off = l_id * row_size + sg * SUB_GROUP_SIZE + simd_id; SUM_DATA_T tmp = local_sum[off]; SUM_DATA_T tmp_sq = local_sum_sq[off]; sum[sg] = summation(tmp.s1, sum[sg]); @@ -93,7 +95,7 @@ void nhwc_reusable_1pass_fused_reduction_buff( } } unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = sg * SG_SIZE + simd_id; + const int off = sg * SUB_GROUP_SIZE + simd_id; atomic_add_global(&mean[dst_offset + off], sum[sg].s0); atomic_add_global(&variance[dst_offset + off], sum_sq[sg].s0); } @@ -107,12 +109,12 @@ void nhwc_reusable_reg_fused_reduction(volatile __global atomic_float *dst, off_t vect_size) { const int local_id = get_local_id(1); const int simd_id = get_sub_group_local_id(); - 
const int row_size = vect_size * SG_SIZE; + const int row_size = vect_size * SUB_GROUP_SIZE; const int group_size = get_local_size(1); if (local_id > 0) { unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { const int slm_offset - = local_id * row_size + v_idx * SG_SIZE + simd_id; + = local_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; local_sum[slm_offset] = sum[v_idx]; } } @@ -120,12 +122,13 @@ void nhwc_reusable_reg_fused_reduction(volatile __global atomic_float *dst, if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { for (int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = l_id * row_size + v_idx * SG_SIZE + simd_id; + const int off + = l_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; sum[v_idx] += local_sum[off]; } } unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = v_idx * SG_SIZE + simd_id; + const int off = v_idx * SUB_GROUP_SIZE + simd_id; atomic_add_global(&dst[dst_offset + off], sum[v_idx]); } } @@ -143,11 +146,12 @@ void nhwc_reusable_reg_fused_reduction_buff(volatile __global atomic_float *dst, const int simd_id = get_sub_group_local_id(); const int group_size = get_local_size(1); const int row_size = ic_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; if (local_id > 0) { unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + const int slm_offset + = local_id * row_size + sg * SUB_GROUP_SIZE + simd_id; local_sum[slm_offset] = sum[sg]; } } @@ -155,12 +159,12 @@ void nhwc_reusable_reg_fused_reduction_buff(volatile __global atomic_float *dst, if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = l_id * row_size + sg * SG_SIZE + simd_id; + const int off = l_id * row_size + sg * SUB_GROUP_SIZE + simd_id; sum[sg] += local_sum[off]; } } 
unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = sg * SG_SIZE + simd_id; + const int off = sg * SUB_GROUP_SIZE + simd_id; atomic_add_global(&dst[dst_offset + off], sum[sg]); } } @@ -168,7 +172,7 @@ void nhwc_reusable_reg_fused_reduction_buff(volatile __global atomic_float *dst, } // Calculate mean, regular algorithm, no private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, off_t reduce_stat_nblocks, @@ -176,7 +180,7 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -189,7 +193,7 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, const int sp_idx_bnd = sp_size % stat_sp_block ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; @@ -199,17 +203,18 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, // reduce for (int sp = 0; sp < sp_idx_bnd; ++sp) { v_mean += LOAD_VECT_DATA( - &src[sg * SG_SIZE * VECT_SIZE + sp * ic_size]); + &src[sg * SUB_GROUP_SIZE * VECT_SIZE + sp * ic_size]); } // store res if (use_fused_atomics_reduction) { - const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; + const int dst_off + = ic_block_offset + sg * VECT_SIZE * SUB_GROUP_SIZE; nhwc_reusable_reg_fused_reduction( mean, dst_off, (float *)(&v_mean), local_sum, VECT_SIZE); } else { - const int sg_off = sg * VECT_SIZE * SG_SIZE; + const int sg_off = sg * VECT_SIZE * SUB_GROUP_SIZE; for (int v_idx = 0; v_idx < VECT_SIZE; v_idx++) { - STORE_FLOAT_1x16(&reduce_temp[sg_off + v_idx * SG_SIZE], + STORE_FLOAT_1x16(&reduce_temp[sg_off + v_idx * SUB_GROUP_SIZE], #if VECT_SIZE > 1 v_mean[v_idx]); #else @@ -224,23 +229,24 @@ nhwc_reusable_calc_mean(__global DATA_T *src, __global float *reduce_temp, // reduce for (int sp = 0; sp < sp_idx_bnd; ++sp) { v_mean += LOAD_DATA_1x16( - &src[(ic_vect_sgroups + sg) * SG_SIZE + sp * ic_size]); + &src[(ic_vect_sgroups + sg) * SUB_GROUP_SIZE + + sp * ic_size]); } // store res if (use_fused_atomics_reduction) { const int dst_off - = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; + = ic_block_offset + (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; nhwc_reusable_reg_fused_reduction( mean, dst_off, &v_mean, local_sum, 1); } else { - const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_off = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], v_mean); } } } // Calculate mean, regular algorithm, private memory buffers 
used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, off_t reduce_stat_nblocks, @@ -249,7 +255,7 @@ nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -263,7 +269,7 @@ nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, ? min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; @@ -273,7 +279,7 @@ nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, for (int sg = 0; sg < ic_vect_sgroups; ++sg) { float s_vect[VECT_SIZE]; AS_VECT_FLOAT(s_vect) - = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + = LOAD_VECT_DATA(&src[sg * SUB_GROUP_SIZE * VECT_SIZE]); for (int vect = 0; vect < VECT_SIZE; ++vect) { v_mean[sg * VECT_SIZE + vect] += s_vect[vect]; } @@ -282,7 +288,7 @@ nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; - v_mean[sg_idx] += LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + v_mean[sg_idx] += LOAD_DATA_1x16(&src[sg_idx * SUB_GROUP_SIZE]); } #endif // HAS_IC_VECT_TAIL src += ic_size; @@ -294,14 +300,14 @@ 
nhwc_reusable_calc_mean_buff(__global DATA_T *src, __global float *reduce_temp, mean, ic_block_offset, (float *)(&v_mean), local_sum, ic_block); } else { for (int sg = 0; sg < ic_block_sgroups; ++sg) { - const int sg_off = sg * SG_SIZE; + const int sg_off = sg * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], v_mean[sg]); } } } // Calculate variance, regular algorithm, no private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, __global float *reduce_temp, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, @@ -311,7 +317,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -325,7 +331,7 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, const int sp_idx_bnd = sp_size % stat_sp_block ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; @@ -333,24 +339,25 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, for (int sg = 0; sg < ic_block_sgroups / VECT_SIZE; ++sg) { VECT_FLOAT_T v_var = 0.0f; const VECT_FLOAT_T v_mean - = LOAD_VECT_FLOAT(&mean[sg * SG_SIZE * VECT_SIZE]); + = LOAD_VECT_FLOAT(&mean[sg * SUB_GROUP_SIZE * VECT_SIZE]); // reduce for (int sp = 0; sp < sp_idx_bnd; ++sp) { const VECT_FLOAT_T v0 - = LOAD_VECT_DATA( - &src[sg * SG_SIZE * VECT_SIZE + sp * ic_size]) + = LOAD_VECT_DATA(&src[sg * SUB_GROUP_SIZE * VECT_SIZE + + sp * ic_size]) - v_mean; v_var = fma(v0, v0, v_var); } // store res if (use_fused_atomics_reduction) { - const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; + const int dst_off + = ic_block_offset + sg * VECT_SIZE * SUB_GROUP_SIZE; nhwc_reusable_reg_fused_reduction( variance, dst_off, (float *)(&v_var), local_sum, VECT_SIZE); } else { - const int sg_off = sg * VECT_SIZE * SG_SIZE; + const int sg_off = sg * VECT_SIZE * SUB_GROUP_SIZE; for (int v_idx = 0; v_idx < VECT_SIZE; v_idx++) { - STORE_FLOAT_1x16(&reduce_temp[sg_off + v_idx * SG_SIZE], + STORE_FLOAT_1x16(&reduce_temp[sg_off + v_idx * SUB_GROUP_SIZE], #if VECT_SIZE > 1 v_var[v_idx]); #else @@ -362,31 +369,32 @@ nhwc_reusable_calc_var(__global DATA_T *src, __global float *mean, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { float v_var = 0.0f; - const float v_mean - = LOAD_FLOAT_1x16(&mean[(ic_vect_sgroups + sg) * SG_SIZE]); + const float v_mean = LOAD_FLOAT_1x16( + &mean[(ic_vect_sgroups + sg) * SUB_GROUP_SIZE]); // reduce for (int sp = 0; sp < sp_idx_bnd; ++sp) { const float v0 - = LOAD_DATA_1x16(&src[(ic_vect_sgroups + sg) * SG_SIZE - + sp * ic_size]) + = LOAD_DATA_1x16( + 
&src[(ic_vect_sgroups + sg) * SUB_GROUP_SIZE + + sp * ic_size]) - v_mean; v_var = fma(v0, v0, v_var); } // store res if (use_fused_atomics_reduction) { const int dst_off - = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; + = ic_block_offset + (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; nhwc_reusable_reg_fused_reduction( variance, dst_off, &v_var, local_sum, 1); } else { - const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_off = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], v_var); } } } // Calculate variance, regular algorithm, private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, __global float *reduce_temp, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, off_t sp_size, off_t stat_sp_block, @@ -396,7 +404,7 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -411,14 +419,14 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; float v_mean[MAX_IC_BLOCK_SGROUPS] = {0.0f}; for (int sg = 0; sg < ic_block_sgroups; ++sg) { v_mean[sg] = as_float(intel_sub_group_block_read( - (const __global uint *)(&mean[(sg * SG_SIZE)]))); + (const __global uint *)(&mean[(sg * SUB_GROUP_SIZE)]))); } float v_var[MAX_IC_BLOCK_SGROUPS] = {0.0f}; @@ -429,7 +437,7 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, for (int sg = 0; sg < ic_vect_sgroups; ++sg) { float s_vect[VECT_SIZE]; AS_VECT_FLOAT(s_vect) - = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + = LOAD_VECT_DATA(&src[sg * SUB_GROUP_SIZE * VECT_SIZE]); for (int vect = 0; vect < VECT_SIZE; ++vect) { int sg_idx = sg * VECT_SIZE + vect; v0[sg_idx] = s_vect[vect] - v_mean[sg_idx]; @@ -441,7 +449,7 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; - float s_tail = LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + float s_tail = LOAD_DATA_1x16(&src[sg_idx * SUB_GROUP_SIZE]); v0[sg_idx] = s_tail - v_mean[sg_idx]; v_var[sg_idx] = fma(v0[sg_idx], v0[sg_idx], v_var[sg_idx]); } @@ -455,7 +463,7 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, (float *)(&v_var), local_sum, ic_block); } else { for (int sg = 0; sg < ic_block_sgroups; ++sg) { - const int sg_off = sg * SG_SIZE; + const int sg_off = sg * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], v_var[sg]); } } @@ -463,7 +471,7 @@ nhwc_reusable_calc_var_buff(__global DATA_T *src, __global float *mean, // Calculate mean and variance at once, 1pass algorithm // no private memory buffers used. 
-__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, @@ -474,7 +482,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, const int sp_block_idx = get_global_id(1); const int simd_id = get_sub_group_local_id(); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -489,7 +497,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, const int sp_idx_bnd = sp_size % stat_sp_block ? min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; @@ -500,7 +508,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, // reduce for (int sp = 0; sp < sp_idx_bnd; ++sp) { const VECT_FLOAT_T s_vect = LOAD_VECT_DATA( - &src[sg * SG_SIZE * VECT_SIZE + sp * ic_size]); + &src[sg * SUB_GROUP_SIZE * VECT_SIZE + sp * ic_size]); for (int v_idx = 0; v_idx < VECT_SIZE; ++v_idx) { #if VECT_SIZE > 1 #define S_VECT s_vect[v_idx] @@ -513,14 +521,15 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, } // store res if (use_fused_atomics_reduction) { - const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; + const int dst_off + = ic_block_offset + sg * VECT_SIZE * SUB_GROUP_SIZE; nhwc_reusable_1pass_fused_reduction(mean, variance, dst_off, sum, sum_sq, local_sum, local_sum_sq, VECT_SIZE); } else { - const int 
sg_off = sg * VECT_SIZE * SG_SIZE; + const int sg_off = sg * VECT_SIZE * SUB_GROUP_SIZE; for (int v_idx = 0; v_idx < VECT_SIZE; v_idx++) { - const int reduce_off = sg_off + v_idx * SG_SIZE; + const int reduce_off = sg_off + v_idx * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[reduce_off], sum[v_idx].s0); STORE_FLOAT_1x16(&reduce_temp[variance_off + reduce_off], sum_sq[v_idx].s0); @@ -533,18 +542,19 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, SUM_DATA_T sum_sq = 0.0f; for (int sp = 0; sp < sp_idx_bnd; ++sp) { const float src_v = LOAD_DATA_1x16( - &src[(ic_vect_sgroups + sg) * SG_SIZE + sp * ic_size]); + &src[(ic_vect_sgroups + sg) * SUB_GROUP_SIZE + + sp * ic_size]); sum = summation(src_v, sum); sum_sq = summation(src_v * src_v, sum_sq); } // store res if (use_fused_atomics_reduction) { const int dst_off - = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; + = ic_block_offset + (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; nhwc_reusable_1pass_fused_reduction(mean, variance, dst_off, &sum, &sum_sq, local_sum, local_sum_sq, 1); } else { - const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_off = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[sg_off], sum.s0); STORE_FLOAT_1x16(&reduce_temp[variance_off + sg_off], sum_sq.s0); } @@ -553,7 +563,7 @@ nhwc_reusable_calc_mean_var(__global DATA_T *src, __global float *reduce_temp, // Calculate mean and variance at once, 1pass algorithm, // private memory buffers used. 
-__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, __global float *reduce_temp, volatile __global atomic_float *mean, volatile __global atomic_float *variance, off_t ic_size, off_t ic_block, @@ -565,7 +575,7 @@ nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, const int sp_block_idx = get_global_id(1); const int simd_id = get_sub_group_local_id(); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int src_off = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; @@ -581,7 +591,7 @@ nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, ? min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; @@ -593,7 +603,7 @@ nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, for (int sg = 0; sg < ic_vect_sgroups; ++sg) { float s_vect[VECT_SIZE]; AS_VECT_FLOAT(s_vect) - = LOAD_VECT_DATA(&src[sg * SG_SIZE * VECT_SIZE]); + = LOAD_VECT_DATA(&src[sg * SUB_GROUP_SIZE * VECT_SIZE]); for (int vect = 0; vect < VECT_SIZE; ++vect) { const int sum_idx = sg * VECT_SIZE + vect; sum[sum_idx] = summation(s_vect[vect], sum[sum_idx]); @@ -605,7 +615,7 @@ nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; - float s_tail = LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); + float s_tail = LOAD_DATA_1x16(&src[sg_idx * SUB_GROUP_SIZE]); sum[sg_idx] = summation(s_tail, sum[sg_idx]); sum_sq[sg_idx] = summation(s_tail * s_tail, sum_sq[sg_idx]); } @@ -619,7 +629,7 @@ 
nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, ic_block); } else { for (int sg = 0; sg < ic_block_sgroups; ++sg) { - const int reduce_off = sg * SG_SIZE; + const int reduce_off = sg * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&reduce_temp[reduce_off], sum[sg].s0); STORE_FLOAT_1x16( &reduce_temp[variance_off + reduce_off], sum_sq[sg].s0); @@ -629,7 +639,7 @@ nhwc_reusable_calc_mean_var_buff(__global DATA_T *src, // Main FWD kernel, common for regular and 1pass algorithms // no private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *dst, __global float *scaleshift, __global float *shift, __global char *ws, @@ -638,7 +648,7 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, const int c = get_global_id(0); const int sp = get_global_id(1) * update_sp_block; - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; mean += ic_block_offset; variance += ic_block_offset; shift += ic_block_offset; @@ -658,11 +668,11 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, const int sp_idx_bnd = has_sp_block_tail ? min(update_sp_block, sp_size - sp) : update_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; for (int sp_idx = 0; sp_idx < sp_idx_bnd; sp_idx++) { // vectorized part for (int sg = 0; sg < ic_block_sgroups / VECT_SIZE; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const VECT_FLOAT_T sm = USE_SCALE ? 
LOAD_VECT_FLOAT(&scaleshift[sg_idx]) : (VECT_FLOAT_T)1.0f; @@ -701,13 +711,13 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, STORE_VECT_DATA(&dst[sg_idx], d_vect); } // sg loop - const int ic_tail_sgroups = (ic_block / SG_SIZE) % VECT_SIZE; + const int ic_tail_sgroups = (ic_block / SUB_GROUP_SIZE) % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; const bool has_ic_vect_tail = ic_tail_sgroups > 0; if (has_ic_vect_tail) { // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { - const int sg_idx = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_idx = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; const float sm_tail = USE_SCALE ? LOAD_FLOAT_1x16(&scaleshift[sg_idx]) @@ -756,7 +766,7 @@ nhwc_reusable_norm_fwd(__global DATA_T *src, __global float *mean, // Main FWD kernel, common for regular and 1pass algorithms, // private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *dst, __global float *scaleshift, __global float *shift, __global char *ws, @@ -766,7 +776,7 @@ nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, const int c = get_global_id(0); const int sp = get_global_id(1) * update_sp_block; - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; mean += ic_block_offset; variance += ic_block_offset; @@ -793,13 +803,13 @@ nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, : update_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; const bool has_ic_vect_tail = ic_tail_sgroups > 
0; for (int sg = 0; sg < ic_vect_sgroups; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const int sgv = sg * VECT_SIZE; AS_VECT_FLOAT(&sm[sgv]) = USE_SCALE @@ -816,7 +826,7 @@ nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, #if MAY_HAVE_IC_TAIL for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sgv = ic_vect_sgroups * VECT_SIZE + sg; - const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SUB_GROUP_SIZE; sm[sgv] = USE_SCALE ? LOAD_FLOAT_1x16(&scaleshift[sg_idx]) : 1.0f; sv[sgv] = USE_SHIFT ? LOAD_FLOAT_1x16(&shift[sg_idx]) : 0.0f; v_mean[sgv] = LOAD_FLOAT_1x16(&mean[sg_idx]); @@ -828,7 +838,7 @@ nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, for (int sp_idx = 0; sp_idx < sp_idx_bnd; sp_idx++) { // vectorized part for (int sg = 0; sg < ic_vect_sgroups; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const int sgv = sg * VECT_SIZE; VECT_FLOAT_T d_vect; @@ -866,7 +876,8 @@ nhwc_reusable_norm_fwd_buff(__global DATA_T *src, __global float *mean, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sgv = ic_vect_sgroups * VECT_SIZE + sg; - const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + const int sg_idx + = (ic_vect_sgroups * VECT_SIZE + sg) * SUB_GROUP_SIZE; float d_tail; const float s_tail = LOAD_DATA_1x16(&src[sg_idx]); d_tail = fma(s_tail - v_mean[sgv], sqrt_variance[sgv], sv[sgv]); @@ -907,7 +918,7 @@ void nhwc_reusable_bwd_fused_reduction( off_t vect_size, off_t calc_slm_size) { const int local_id = get_local_id(1); const int simd_id = get_sub_group_local_id(); - const int row_size = vect_size * SG_SIZE; + const int row_size = vect_size * SUB_GROUP_SIZE; const int group_size = get_local_size(1); __local float *local_gamma = local_sums; @@ -916,7 +927,7 @@ void 
nhwc_reusable_bwd_fused_reduction( if (local_id > 0) { unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { const int slm_offset - = local_id * row_size + v_idx * SG_SIZE + simd_id; + = local_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; local_gamma[slm_offset] = diff_gamma[v_idx]; local_beta[slm_offset] = diff_beta[v_idx]; } @@ -925,13 +936,14 @@ void nhwc_reusable_bwd_fused_reduction( if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { for (int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = l_id * row_size + v_idx * SG_SIZE + simd_id; + const int off + = l_id * row_size + v_idx * SUB_GROUP_SIZE + simd_id; diff_gamma[v_idx] += local_gamma[off]; diff_beta[v_idx] += local_beta[off]; } } unroll_4_for(int v_idx = 0; v_idx < vect_size; v_idx++) { - const int off = v_idx * SG_SIZE + simd_id; + const int off = v_idx * SUB_GROUP_SIZE + simd_id; atomic_add_global(&diff_scale[dst_offset + off], diff_gamma[v_idx]); atomic_add_global(&diff_shift[dst_offset + off], diff_beta[v_idx]); } @@ -951,13 +963,14 @@ void nhwc_reusable_bwd_fused_reduction_buff( const int row_size = ic_block; const int group_size = get_local_size(1); - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; __local float *local_gamma = local_sums; __local float *local_beta = local_sums + calc_slm_size / sizeof(float); if (local_id > 0) { unroll_16_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int slm_offset = local_id * row_size + sg * SG_SIZE + simd_id; + const int slm_offset + = local_id * row_size + sg * SUB_GROUP_SIZE + simd_id; local_gamma[slm_offset] = diff_gamma[sg]; local_beta[slm_offset] = diff_beta[sg]; } @@ -966,13 +979,13 @@ void nhwc_reusable_bwd_fused_reduction_buff( if (local_id == 0) { unroll_16_for(int l_id = 1; l_id < group_size; l_id++) { unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = l_id * row_size + sg * SG_SIZE + simd_id; + const int off = l_id * 
row_size + sg * SUB_GROUP_SIZE + simd_id; diff_gamma[sg] += local_gamma[off]; diff_beta[sg] += local_beta[off]; } } unroll_4_for(int sg = 0; sg < ic_block_sgroups; sg++) { - const int off = sg * SG_SIZE + simd_id; + const int off = sg * SUB_GROUP_SIZE + simd_id; atomic_add_global(&diff_scale[dst_offset + off], diff_gamma[sg]); atomic_add_global(&diff_shift[dst_offset + off], diff_beta[sg]); } @@ -982,7 +995,7 @@ void nhwc_reusable_bwd_fused_reduction_buff( // Calculate stats for BWD pass // no private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, __global DATA_T *diff_dst, __global char *ws, __global float *temp_reduce, __global float *temp_reduce_shift, @@ -993,7 +1006,7 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, __local float *local_sums, off_t calc_slm_size) { const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int offset = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; mean += ic_block_offset; @@ -1011,14 +1024,14 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, const int sp_idx_bnd = has_sp_block_tail ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; // vectorized part for (int sg = 0; sg < ic_block_sgroups / VECT_SIZE; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; VECT_FLOAT_T diff_gamma = 0.0f; VECT_FLOAT_T diff_beta = 0.0f; const VECT_FLOAT_T v_mean = LOAD_VECT_FLOAT(&mean[(sg_idx)]); @@ -1042,7 +1055,8 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, // store results if (use_fused_atomics_reduction) { - const int dst_off = ic_block_offset + sg * VECT_SIZE * SG_SIZE; + const int dst_off + = ic_block_offset + sg * VECT_SIZE * SUB_GROUP_SIZE; nhwc_reusable_bwd_fused_reduction(diff_scale, diff_shift, dst_off, (float *)(&diff_gamma), (float *)(&diff_beta), local_sums, VECT_SIZE, calc_slm_size); @@ -1054,15 +1068,16 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, // reduce_stat_nblocks * ic_size - initialy reduced data, // calculated by this kernel - const int sg_off = sg * VECT_SIZE * SG_SIZE; + const int sg_off = sg * VECT_SIZE * SUB_GROUP_SIZE; for (int v_idx = 0; v_idx < VECT_SIZE; v_idx++) { - STORE_FLOAT_1x16(&temp_reduce[sg_off + v_idx * SG_SIZE], + STORE_FLOAT_1x16(&temp_reduce[sg_off + v_idx * SUB_GROUP_SIZE], #if VECT_SIZE > 1 diff_gamma[v_idx]); #else diff_gamma); #endif - STORE_FLOAT_1x16(&temp_reduce_shift[sg_off + v_idx * SG_SIZE], + STORE_FLOAT_1x16( + &temp_reduce_shift[sg_off + v_idx * SUB_GROUP_SIZE], #if VECT_SIZE > 1 diff_beta[v_idx]); #else @@ -1074,7 +1089,7 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { - const int sg_idx = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_idx = (ic_vect_sgroups + sg) * 
SUB_GROUP_SIZE; float diff_gamma = 0.0f; float diff_beta = 0.0f; const float v_mean = LOAD_FLOAT_1x16(&mean[(sg_idx)]); @@ -1098,12 +1113,12 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, // store results if (use_fused_atomics_reduction) { const int dst_off - = ic_block_offset + (ic_vect_sgroups + sg) * SG_SIZE; + = ic_block_offset + (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; nhwc_reusable_bwd_fused_reduction(diff_scale, diff_shift, dst_off, (float *)(&diff_gamma), (float *)(&diff_beta), local_sums, 1, calc_slm_size); } else { - const int sg_off = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_off = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; STORE_FLOAT_1x16(&temp_reduce[sg_off], diff_gamma); STORE_FLOAT_1x16(&temp_reduce_shift[sg_off], diff_beta); } @@ -1111,7 +1126,7 @@ nhwc_reusable_calc_stat(__global DATA_T *src, __global float *mean, } // Calculate stats for BWD pass, private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, __global DATA_T *diff_dst, __global char *ws, __global float *temp_reduce, __global float *temp_reduce_shift, @@ -1123,7 +1138,7 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, const int c = get_global_id(0); const int sp_block_idx = get_global_id(1); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; const int offset = ic_block_offset + sp_block_idx * stat_sp_block * ic_size; mean += ic_block_offset; @@ -1142,14 +1157,14 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, ? 
min(stat_sp_block, sp_size - sp_block_idx * stat_sp_block) : stat_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; float v_mean[MAX_IC_BLOCK_SGROUPS]; for (int sg = 0; sg < ic_block_sgroups; ++sg) { v_mean[sg] = as_float(intel_sub_group_block_read( - (const __global uint *)(&mean[(sg * SG_SIZE)]))); + (const __global uint *)(&mean[(sg * SUB_GROUP_SIZE)]))); } float diff_gamma[MAX_IC_BLOCK_SGROUPS] = {0.0f}; @@ -1158,7 +1173,7 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, for (int sp = 0; sp < sp_idx_bnd; ++sp) { // vector part for (int sg = 0; sg < ic_vect_sgroups; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const int sgv = sg * VECT_SIZE; #if FUSE_BN_RELU const VECT_CHAR_T ws_vect = LOAD_VECT_CHAR(&ws[sg_idx]); @@ -1186,10 +1201,10 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sg_idx = ic_vect_sgroups * VECT_SIZE + sg; #if FUSE_BN_RELU - char ws_tail = LOAD_CHAR_1x16(&ws[sg_idx * SG_SIZE]); + char ws_tail = LOAD_CHAR_1x16(&ws[sg_idx * SUB_GROUP_SIZE]); #endif - float src_tail = LOAD_DATA_1x16(&src[sg_idx * SG_SIZE]); - float dd_tail = LOAD_DATA_1x16(&diff_dst[sg_idx * SG_SIZE]); + float src_tail = LOAD_DATA_1x16(&src[sg_idx * SUB_GROUP_SIZE]); + float dd_tail = LOAD_DATA_1x16(&diff_dst[sg_idx * SUB_GROUP_SIZE]); float v0 = src_tail - v_mean[sg_idx]; #if FUSE_BN_RELU dd_tail = select(0.0f, dd_tail, convert_int(ws_tail)); @@ -1214,7 +1229,7 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, } else { for (int sg = 0; sg < ic_block_sgroups; ++sg) { - const int sg_off = sg * SG_SIZE; + const int sg_off = sg * SUB_GROUP_SIZE; 
STORE_FLOAT_1x16(&temp_reduce[sg_off], diff_gamma[sg]); STORE_FLOAT_1x16(&temp_reduce_shift[sg_off], diff_beta[sg]); } @@ -1223,7 +1238,7 @@ nhwc_reusable_calc_stat_buff(__global DATA_T *src, __global float *mean, // Main BWD pass kernel // no private memory buffers used. -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *diff_dst, __global float *scaleshift, __global char *ws, @@ -1231,7 +1246,7 @@ nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, __global float *diff_shift, float eps, __global DATA_T *diff_src_add, off_t ic_size, off_t ic_block, off_t sp_size, off_t update_sp_block) { const int c = get_global_id(0); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; variance += ic_block_offset; mean += ic_block_offset; @@ -1255,12 +1270,12 @@ nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, const int sp_idx_bnd = has_sp_block_tail ? 
min(update_sp_block, sp_size - sp_block_idx * update_sp_block) : update_sp_block; - const int ic_block_sgroups = ic_block / SG_SIZE; + const int ic_block_sgroups = ic_block / SUB_GROUP_SIZE; for (int sp = 0; sp < sp_idx_bnd; ++sp) { // vectorized part for (int sg = 0; sg < ic_block_sgroups / VECT_SIZE; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const VECT_FLOAT_T v_variance = LOAD_VECT_FLOAT(&variance[sg_idx]); const VECT_FLOAT_T sqrt_variance @@ -1294,12 +1309,12 @@ nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, } // sg loop - const int ic_tail_sgroups = (ic_block / SG_SIZE) % VECT_SIZE; + const int ic_tail_sgroups = (ic_block / SUB_GROUP_SIZE) % VECT_SIZE; const int ic_vect_sgroups = ic_block_sgroups - ic_tail_sgroups; // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { - const int sg_idx = (ic_vect_sgroups + sg) * SG_SIZE; + const int sg_idx = (ic_vect_sgroups + sg) * SUB_GROUP_SIZE; const float v_variance = LOAD_FLOAT_1x16(&variance[sg_idx]); const float sqrt_variance @@ -1343,7 +1358,7 @@ nhwc_reusable_norm_bwd(__global DATA_T *src, __global float *mean, } // Main BWD pass kernel, private memory buffers used. 
-__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, __global float *variance, __global DATA_T *diff_dst, __global float *scaleshift, __global char *ws, @@ -1352,7 +1367,7 @@ nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, off_t ic_size, off_t ic_block, off_t sp_size, off_t update_sp_block) { const int c = get_global_id(0); - const int ic_block_offset = (c / SG_SIZE) * ic_block; + const int ic_block_offset = (c / SUB_GROUP_SIZE) * ic_block; variance += ic_block_offset; mean += ic_block_offset; @@ -1377,7 +1392,7 @@ nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, ? min(update_sp_block, sp_size - sp_block_idx * update_sp_block) : update_sp_block; const int ic_block_sgroups - = min(ic_size - ic_block_offset, ic_block) / SG_SIZE; + = min(ic_size - ic_block_offset, ic_block) / SUB_GROUP_SIZE; const int ic_vect_sgroups = ic_block_sgroups / VECT_SIZE; const int ic_tail_sgroups = ic_block_sgroups % VECT_SIZE; @@ -1387,7 +1402,7 @@ nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, for (int sg = 0; sg < ic_vect_sgroups; ++sg) { const int sgv = sg * VECT_SIZE; - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; AS_VECT_FLOAT(&v_variance[sgv]) = LOAD_VECT_FLOAT(&variance[sg_idx]); #if CALCULATE_STATS == 1 @@ -1405,7 +1420,7 @@ nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, #if MAY_HAVE_IC_TAIL for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sgv = ic_vect_sgroups * VECT_SIZE + sg; - const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SUB_GROUP_SIZE; v_variance[sgv] = LOAD_FLOAT_1x16(&variance[sg_idx]); #if CALCULATE_STATS == 1 v_mean[sgv] = LOAD_FLOAT_1x16(&mean[sg_idx]); @@ -1419,7 +1434,7 @@ 
nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, for (int sp = 0; sp < sp_idx_bnd; ++sp) { // vector part for (int sg = 0; sg < ic_vect_sgroups; ++sg) { - const int sg_idx = sg * SG_SIZE * VECT_SIZE; + const int sg_idx = sg * SUB_GROUP_SIZE * VECT_SIZE; const int sgv = sg * VECT_SIZE; const VECT_FLOAT_T src_vect = LOAD_VECT_DATA(&src[sg_idx]); @@ -1448,7 +1463,8 @@ nhwc_reusable_norm_bwd_buff(__global DATA_T *src, __global float *mean, // tails for (int sg = 0; sg < ic_tail_sgroups; ++sg) { const int sgv = ic_vect_sgroups * VECT_SIZE + sg; - const int sg_idx = (ic_vect_sgroups * VECT_SIZE + sg) * SG_SIZE; + const int sg_idx + = (ic_vect_sgroups * VECT_SIZE + sg) * SUB_GROUP_SIZE; const float src_tail = LOAD_DATA_1x16(&src[sg_idx]); float dd_tail = LOAD_DATA_1x16(&diff_dst[sg_idx]); #if FUSE_BN_RELU @@ -1508,15 +1524,15 @@ __kernel void nhwc_reusable_reduce_aux(__global float *ptr1, } // Reduction thru scratchpad, FWD pass, regular algorithm -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_reduce_fwd_reg(__global float *reduce_scratchpad, off_t scratchpad_off, __global float *dst, off_t ic_size, off_t reduce_ic_sub_groups, off_t reduce_stat_nblocks, off_t sp_size, __local float *local_sum) { - const int ic_sub_group = get_global_id(0) / SG_SIZE; + const int ic_sub_group = get_global_id(0) / SUB_GROUP_SIZE; const int group_c = get_global_id(1); const int simd_id = get_sub_group_local_id(); - const int c = group_c * SG_SIZE + simd_id; + const int c = group_c * SUB_GROUP_SIZE + simd_id; float sum = 0.0f; const int reduce_chunk = reduce_stat_nblocks / reduce_ic_sub_groups; @@ -1528,26 +1544,28 @@ nhwc_reusable_reduce_fwd_reg(__global float *reduce_scratchpad, sum += reduce_scratchpad[i * ic_size]; } - if (ic_sub_group > 0) { local_sum[ic_sub_group * SG_SIZE + simd_id] = sum; } + if (ic_sub_group > 0) { + local_sum[ic_sub_group * SUB_GROUP_SIZE 
+ simd_id] = sum; + } barrier(CLK_LOCAL_MEM_FENCE); if (ic_sub_group == 0) { unroll_16_for(int i = 1; i < reduce_ic_sub_groups; i++) { - sum += local_sum[i * SG_SIZE + simd_id]; + sum += local_sum[i * SUB_GROUP_SIZE + simd_id]; } dst[c] = sum / sp_size; } } // Reduction thru scratchpad, FWD pass, 1pass algorithm -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_reduce_fwd_1pass(__global float *reduce_temp, __global float *mean, __global float *variance, off_t ic_size, off_t reduce_ic_sub_groups, off_t reduce_stat_nblocks, off_t sp_size, __local SUM_DATA_T *local_sum, __local SUM_DATA_T *local_sum_sq) { - const int ic_sub_group = get_global_id(0) / SG_SIZE; + const int ic_sub_group = get_global_id(0) / SUB_GROUP_SIZE; const int group_c = get_global_id(1); const int simd_id = get_sub_group_local_id(); - const int c = group_c * SG_SIZE + simd_id; + const int c = group_c * SUB_GROUP_SIZE + simd_id; SUM_DATA_T sum; SUM_DATA_T sum_sq; sum.s0 = 0; @@ -1568,14 +1586,14 @@ nhwc_reusable_reduce_fwd_1pass(__global float *reduce_temp, sum_sq = summation(tmp, sum_sq); } if (ic_sub_group > 0) { - local_sum[ic_sub_group * SG_SIZE + simd_id] = sum; - local_sum_sq[ic_sub_group * SG_SIZE + simd_id] = sum_sq; + local_sum[ic_sub_group * SUB_GROUP_SIZE + simd_id] = sum; + local_sum_sq[ic_sub_group * SUB_GROUP_SIZE + simd_id] = sum_sq; } barrier(CLK_LOCAL_MEM_FENCE); if (ic_sub_group == 0) { unroll_16_for(int i = 1; i < reduce_ic_sub_groups; i++) { - SUM_DATA_T tmp = local_sum[i * SG_SIZE + simd_id]; - SUM_DATA_T tmp_sq = local_sum_sq[i * SG_SIZE + simd_id]; + SUM_DATA_T tmp = local_sum[i * SUB_GROUP_SIZE + simd_id]; + SUM_DATA_T tmp_sq = local_sum_sq[i * SUB_GROUP_SIZE + simd_id]; sum = summation(tmp.s1, sum); sum_sq = summation(tmp_sq.s1, sum_sq); sum = summation(tmp.s0, sum); @@ -1590,16 +1608,16 @@ nhwc_reusable_reduce_fwd_1pass(__global float *reduce_temp, } // Reduction thru 
scratchpad, BWD pass -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) __kernel void +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __kernel void nhwc_reusable_reduce_stat(__global float *temp_reduce, __global float *temp_reduce_shift, __global float *diff_scale, __global float *diff_shift, __global float *variance, float eps, off_t ic_size, off_t reduce_ic_sub_groups, off_t reduce_stat_nblocks, __local float *local_gamma, __local float *local_beta) { - const int ic_sub_group = get_global_id(0) / SG_SIZE; + const int ic_sub_group = get_global_id(0) / SUB_GROUP_SIZE; const int group_c = get_global_id(1); const int simd_id = get_sub_group_local_id(); - const int c = group_c * SG_SIZE + simd_id; + const int c = group_c * SUB_GROUP_SIZE + simd_id; float diff_gamma = 0.0f; float diff_beta = 0.0f; @@ -1616,14 +1634,14 @@ nhwc_reusable_reduce_stat(__global float *temp_reduce, diff_beta += temp_reduce_shift[i * ic_size]; } if (ic_sub_group > 0) { - local_gamma[ic_sub_group * SG_SIZE + simd_id] = diff_gamma; - local_beta[ic_sub_group * SG_SIZE + simd_id] = diff_beta; + local_gamma[ic_sub_group * SUB_GROUP_SIZE + simd_id] = diff_gamma; + local_beta[ic_sub_group * SUB_GROUP_SIZE + simd_id] = diff_beta; } barrier(CLK_LOCAL_MEM_FENCE); if (ic_sub_group == 0) { unroll_16_for(int i = 1; i < reduce_ic_sub_groups; i++) { - diff_gamma += local_gamma[i * SG_SIZE + simd_id]; - diff_beta += local_beta[i * SG_SIZE + simd_id]; + diff_gamma += local_gamma[i * SUB_GROUP_SIZE + simd_id]; + diff_beta += local_beta[i * SUB_GROUP_SIZE + simd_id]; } float sqrt_variance = 1.0f / sqrt(variance[c] + eps); diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp index 12d2288593c..80c33726dae 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.cpp @@ -222,7 +222,7 @@ static void init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.define_int("USE_SCALE", cmpl_conf.use_scale); 
kernel_ctx.define_int("USE_SHIFT", cmpl_conf.use_shift); kernel_ctx.define_int("VECT_SIZE", cmpl_conf.vect_size); - kernel_ctx.define_int("SG_SIZE", cmpl_conf.sub_group_size); + kernel_ctx.define_int("SUB_GROUP_SIZE", cmpl_conf.sub_group_size); kernel_ctx.add_option("-cl-std=CL2.0"); if (cmpl_conf.data_type == data_type::s8) kernel_ctx.add_option("-Dcl_intel_subgroups_char"); diff --git a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h index 1808805ffcd..c634935ea06 100644 --- a/src/gpu/intel/ocl/bnorm/nhwc_reusable.h +++ b/src/gpu/intel/ocl/bnorm/nhwc_reusable.h @@ -16,7 +16,7 @@ #ifndef GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H #define GPU_INTEL_OCL_BNORM_NHWC_REUSABLE_H -#define MAX_IC_BLOCK_SGROUPS (MAX_IC_BLOCK / SG_SIZE) +#define MAX_IC_BLOCK_SGROUPS (MAX_IC_BLOCK / SUB_GROUP_SIZE) #define MAX_IC_TAIL_SGROUPS (VECT_SIZE - 1) #define MAY_HAVE_IC_TAIL (MAX_IC_TAIL_SGROUPS > 0) From 1ca4a1694a5244b87d988713cb565ab2b9c2a622 Mon Sep 17 00:00:00 2001 From: David Svantesson-Yeung Date: Wed, 8 May 2024 10:23:45 +0000 Subject: [PATCH 028/187] src: cpu: aarch64: Fix deconv shape Fix an issue in deconv where ACL reduces the dimensions in TensorShape if the last dimension is of size 1. Also add unit test to catch this issue. --- src/cpu/aarch64/acl_deconvolution.hpp | 14 +++++++++----- tests/benchdnn/inputs/deconv/shapes_1d | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cpu/aarch64/acl_deconvolution.hpp b/src/cpu/aarch64/acl_deconvolution.hpp index 2bd6bbfb802..4b646148b1d 100644 --- a/src/cpu/aarch64/acl_deconvolution.hpp +++ b/src/cpu/aarch64/acl_deconvolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Arm Ltd. and affiliates +* Copyright 2022-2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -201,10 +201,14 @@ struct acl_deconvolution_fwd_t : public primitive_t { : arm_compute::TensorShape(iw, ih, ic, mb), 1, acl_src_data_t, acl_layout); - acl_pd_conf.wei_info = arm_compute::TensorInfo(is_nspc - ? arm_compute::TensorShape(ic, kw, kh, oc) - : arm_compute::TensorShape(kw, kh, ic, oc), - 1, acl_wei_data_t, acl_layout); + auto wei_info_tensor_shape = is_nspc + ? arm_compute::TensorShape(ic, kw, kh, oc) + : arm_compute::TensorShape(kw, kh, ic, oc); + // ACL removes last dimension if dim is 1. + // Below fix ensures the tensor shape is correct when queried. + wei_info_tensor_shape.set_num_dimensions(4); + acl_pd_conf.wei_info = arm_compute::TensorInfo( + wei_info_tensor_shape, 1, acl_wei_data_t, acl_layout); acl_pd_conf.dst_info = arm_compute::TensorInfo(is_nspc ? arm_compute::TensorShape(oc, ow, oh, mb) diff --git a/tests/benchdnn/inputs/deconv/shapes_1d b/tests/benchdnn/inputs/deconv/shapes_1d index ab517bb79b9..da7b830b1df 100644 --- a/tests/benchdnn/inputs/deconv/shapes_1d +++ b/tests/benchdnn/inputs/deconv/shapes_1d @@ -6,3 +6,4 @@ mb256oc256ow13ic384iw13kw3pw1n"alexnet:deconv3" g1mb96ic64iw112oc3ow224kw7sw2pw3n"googlenet_v1:conv1/7x7_s2" mb1_g1oc3ic64_ow1030iw512kw7sw2dw0pw0_n"masknet_p1:deconv1" g1mb50ic256iw28oc512ow56kw1sw2pw0n"resnet_50:res3a_branch1" +mb9_ic1oc1_ih1oh1kh1sh1dh0ph0_iw55ow55kw3sw1dw0pw1n"pytorch_unittest" From b3d7ca9322632d5c8dc74707c3f9507d5d1a9981 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 6 May 2024 16:11:19 -0700 Subject: [PATCH 029/187] benchdnn: convert skip_reason_t enum into skip_reason namespace with strings This is the first step to provide a random string with a meaningful message to clarify the outcome. 
--- tests/benchdnn/binary/binary.cpp | 12 +++-- tests/benchdnn/bnorm/bnorm.cpp | 6 +-- tests/benchdnn/brgemm/brgemm.cpp | 16 ++++-- tests/benchdnn/common.cpp | 24 ++++----- tests/benchdnn/common.hpp | 20 ++++--- tests/benchdnn/concat/concat.cpp | 3 +- tests/benchdnn/conv/conv.cpp | 9 ++-- tests/benchdnn/deconv/deconv.cpp | 3 +- tests/benchdnn/dnnl_common.cpp | 38 ++++++++----- tests/benchdnn/dnnl_common.hpp | 6 +-- tests/benchdnn/doc/benchdnn_general_info.md | 6 +-- tests/benchdnn/eltwise/eltwise.cpp | 6 ++- tests/benchdnn/graph/graph.cpp | 6 +-- tests/benchdnn/graph/utils.cpp | 10 ++-- tests/benchdnn/graph/utils.hpp | 2 +- tests/benchdnn/ip/ip.cpp | 3 +- tests/benchdnn/lnorm/lnorm.cpp | 3 +- tests/benchdnn/matmul/matmul.cpp | 48 +++++++++++------ tests/benchdnn/pool/pool.cpp | 6 ++- tests/benchdnn/reduction/reduction.cpp | 3 +- tests/benchdnn/reorder/reorder.cpp | 46 ++++++++++------ tests/benchdnn/rnn/rnn.cpp | 60 ++++++++++++++------- tests/benchdnn/softmax/softmax.cpp | 3 +- tests/benchdnn/zeropad/zeropad.cpp | 4 +- 24 files changed, 212 insertions(+), 131 deletions(-) diff --git a/tests/benchdnn/binary/binary.cpp b/tests/benchdnn/binary/binary.cpp index 1bc055218be..c059bf01aa0 100644 --- a/tests/benchdnn/binary/binary.cpp +++ b/tests/benchdnn/binary/binary.cpp @@ -109,14 +109,16 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { bool is_bf16u8 = (dts[0] == dnnl_bf16 && dts[1] == dnnl_bf16 && dts[2] == dnnl_u8); if (is_bf16u8 && have_post_ops) { - res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::data_type_not_supported; return; } // gpu does not support s32 for (const auto &dt : dts) if (dt == dnnl_s32) { - res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::data_type_not_supported; return; } } @@ -134,14 +136,16 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { // In case src0 is broadcasted into src1, it means that 
src0 has smaller // memory footprint and doing sum post-op or in-place will cause a crash. if (bcast_src0 && (prb->inplace || is_sum)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } // See `skip_invalid_inplace` for details. if (prb->inplace) { if (is_sum) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } diff --git a/tests/benchdnn/bnorm/bnorm.cpp b/tests/benchdnn/bnorm/bnorm.cpp index 93131c62a75..b730340418d 100644 --- a/tests/benchdnn/bnorm/bnorm.cpp +++ b/tests/benchdnn/bnorm/bnorm.cpp @@ -438,13 +438,13 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { bool alpha_ok = IMPLICATION(alpha != 0.f, (prb->dir & FLAG_INF)); if (!alpha_ok) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; } } // BN+Add+ReLU fusion is not supported on CPU if (is_cpu() && prb->fuse_add_relu()) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; } // int8 only supports forward s8 w/ global stats const bool u8_not_ok = prb->dt == dnnl_u8; @@ -452,7 +452,7 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { && ((prb->dir & FLAG_BWD) || (prb->flags & GLOB_STATS) == 0); if (s8_not_ok || u8_not_ok) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; } } diff --git a/tests/benchdnn/brgemm/brgemm.cpp b/tests/benchdnn/brgemm/brgemm.cpp index 477cfa25a73..229a5035a92 100644 --- a/tests/benchdnn/brgemm/brgemm.cpp +++ b/tests/benchdnn/brgemm/brgemm.cpp @@ -249,7 +249,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { }; if (!IMPLICATION(is_xf16(prb->bia_dt) || is_xf16(prb->dst_dt()), is_xf16(prb->wei_dt()))) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = 
skip_reason::case_not_supported; return; } skip_unimplemented_data_type( @@ -262,7 +263,7 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { // Unconditionally skip remaining unimplemented cases. // TODO: stop doing it. res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; } void skip_invalid_prb(const prb_t *prb, res_t *res) { @@ -275,7 +276,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { const bool req_s8_comp = prb->src_dt() == dnnl_s8; const bool req_zp_comp = !prb->attr.zero_points.is_def(DNNL_ARG_SRC); if (is_bad_ldb && (req_s8_comp || req_zp_comp)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -592,12 +594,16 @@ int doit(const prb_t *prb, res_t *res) { // It requires enabling f32 -> u8 reorder with compensation on the // library side. When enabled, it produces incorrect results for cases // with K=1. Likely there's a bug inside. Postpone supporting it. - return res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED, OK; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return OK; } if (prb->attr.post_ops.binary_index() >= 0) { // TODO: binary post-op is not supported yet. 
- return res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED, OK; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return OK; } brgemm_post_ops_data_t post_ops_data( diff --git a/tests/benchdnn/common.cpp b/tests/benchdnn/common.cpp index e4eebe7e493..8fa70f1b0ba 100644 --- a/tests/benchdnn/common.cpp +++ b/tests/benchdnn/common.cpp @@ -67,18 +67,14 @@ const char *state2str(res_state_t state) { return "STATE_UNDEF"; } -const char *skip_reason2str(skip_reason_t skip_reason) { -#define CASE(x) \ - if (skip_reason == (x)) return STRINGIFY(x) - CASE(CASE_NOT_SUPPORTED); - CASE(DATA_TYPE_NOT_SUPPORTED); - CASE(INVALID_CASE); - CASE(NOT_ENOUGH_RAM); - CASE(SKIP_IMPL_HIT); - CASE(SKIP_START); -#undef CASE - return "SKIP_UNKNOWN"; -} +namespace skip_reason { +std::string case_not_supported("Case not supported"); +std::string data_type_not_supported("Data type not supported"); +std::string invalid_case("Invalid case"); +std::string not_enough_ram("Not enough RAM"); +std::string skip_impl_hit("Skip-impl option hit"); +std::string skip_start("Skip-start option hit"); +} // namespace skip_reason dir_t str2dir(const char *str) { #define CASE(x) \ @@ -117,7 +113,7 @@ void parse_result(res_t &res, const char *pstr) { break; case SKIPPED: BENCHDNN_PRINT(0, "%d:%s (%s) __REPRO: %s\n", bs.tests, state, - skip_reason2str(res.reason), pstr); + res.reason.c_str(), pstr); bs.skipped++; break; case UNIMPLEMENTED: @@ -337,7 +333,7 @@ bool maybe_skip(const std::string &impl_str) { bool skip_start(res_t *res, int idx) { if (idx < test_start) { res->state = SKIPPED; - res->reason = SKIP_START; + res->reason = skip_reason::skip_start; return true; } return false; diff --git a/tests/benchdnn/common.hpp b/tests/benchdnn/common.hpp index 1e6fa076419..b29f75d60c3 100644 --- a/tests/benchdnn/common.hpp +++ b/tests/benchdnn/common.hpp @@ -152,16 +152,14 @@ enum res_state_t { }; const char *state2str(res_state_t state); -enum skip_reason_t { - SKIP_UNKNOWN = 0, - 
CASE_NOT_SUPPORTED, - DATA_TYPE_NOT_SUPPORTED, - INVALID_CASE, - NOT_ENOUGH_RAM, - SKIP_IMPL_HIT, - SKIP_START, -}; -const char *skip_reason2str(skip_reason_t skip_reason); +namespace skip_reason { +extern std::string case_not_supported; +extern std::string data_type_not_supported; +extern std::string invalid_case; +extern std::string not_enough_ram; +extern std::string skip_impl_hit; +extern std::string skip_start; +} // namespace skip_reason enum dir_t { DIR_UNDEF = 0, @@ -187,7 +185,7 @@ struct res_t { timer::timer_map_t timer_map; std::string impl_name; std::string prim_ref_repro; - skip_reason_t reason; + std::string reason; size_t ibytes, obytes; dir_t mem_check_dir = DIR_UNDEF; }; diff --git a/tests/benchdnn/concat/concat.cpp b/tests/benchdnn/concat/concat.cpp index 0db18622228..e39a0dd5c03 100644 --- a/tests/benchdnn/concat/concat.cpp +++ b/tests/benchdnn/concat/concat.cpp @@ -125,7 +125,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { (prb->sdt == dnnl_f32 || prb->sdt == prb->ddt)); if (is_cpu() && (!valid_xf16_input || !valid_xf16_output)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/conv/conv.cpp b/tests/benchdnn/conv/conv.cpp index 7a05f100227..8d7cc9bdff5 100644 --- a/tests/benchdnn/conv/conv.cpp +++ b/tests/benchdnn/conv/conv.cpp @@ -377,7 +377,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const bool is_x8x8f16 = is_int8_src && is_int8_wei && is_f16_dst; if (is_f32f32x8 || is_bf16bf16x8 || is_x8x8f16 || !is_valid_f16) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -386,13 +387,15 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { // make sense to list all of them, just convert all unimplemented Winograd // problems into not supported. 
if (prb->alg == WINO) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } // GPU does not support depthwise fusion if (is_gpu() && prb->attr.post_ops.convolution_index() != -1) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/deconv/deconv.cpp b/tests/benchdnn/deconv/deconv.cpp index 12aaf2cf578..2fb143e0560 100644 --- a/tests/benchdnn/deconv/deconv.cpp +++ b/tests/benchdnn/deconv/deconv.cpp @@ -359,7 +359,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { = prb->get_dt(WEI) == dnnl_s8 && prb->get_dt(DST) == dnnl_bf16; const bool fwd_ok = !is_x8s8bf16_cfg; if (!fwd_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/dnnl_common.cpp b/tests/benchdnn/dnnl_common.cpp index c534a9ce23b..c86df55d650 100644 --- a/tests/benchdnn/dnnl_common.cpp +++ b/tests/benchdnn/dnnl_common.cpp @@ -660,7 +660,8 @@ void skip_unimplemented_data_type( default: break; } if (need_skip) { - res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::data_type_not_supported; return; } } @@ -684,7 +685,8 @@ void skip_unimplemented_sum_po(const attr_t &attr, res_t *res, if (e.sum.zero_point != 0) { // Sum with zero-point is only supported for int8 if (!is_integral_dt(src_dt)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } else { // Only quantized sum operand can have zero point @@ -692,7 +694,8 @@ void skip_unimplemented_sum_po(const attr_t &attr, res_t *res, = e.sum.dt == dnnl_data_type_undef ? 
dst_dt : e.sum.dt; if (!is_integral_dt(e_sum_dt)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -700,19 +703,22 @@ void skip_unimplemented_sum_po(const attr_t &attr, res_t *res, // Sum with zero-point is not supported on GPU if (is_gpu() && e.sum.zero_point != 0) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; break; } // Each sum must have same data on CPU if (is_cpu() && e.sum.dt != sum_dt) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; break; } // Sum must have data type with the same size like dst on both if (dst_dt != dnnl_data_type_undef && sum_dt != dnnl_data_type_undef && dnnl_data_type_size(dst_dt) != dnnl_data_type_size(e.sum.dt)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -732,14 +738,18 @@ void skip_unimplemented_prelu_po( case dnnl_deconvolution: case dnnl_inner_product: case dnnl_matmul: return; break; - default: res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; break; + default: + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + break; } } void skip_unimplemented_arg_scale(const attr_t &attr, res_t *res) { for (const auto &arg_s : attr.scales.scales) { if (arg_s.second.policy != policy_t::COMMON) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -756,13 +766,15 @@ void skip_invalid_inplace(res_t *res, dnnl_data_type_t sdt, // General limitation of in-place mode is having same amount of memory on // input and output. 
if (sdt != ddt) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } if (dtag == tag::any) return; if (stag != dtag) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } @@ -1036,7 +1048,7 @@ static int check_total_size( "[CHECK_MEM][%s]: Not enough device RAM for a problem.\n", dir_c_str()); res->state = SKIPPED; - res->reason = NOT_ENOUGH_RAM; + res->reason = skip_reason::not_enough_ram; } const bool all_allocation_fit_limit = std::all_of( @@ -1053,7 +1065,7 @@ static int check_total_size( }); if (!all_allocation_fit_limit) { res->state = SKIPPED; - res->reason = NOT_ENOUGH_RAM; + res->reason = skip_reason::not_enough_ram; } BENCHDNN_PRINT((!fits_device_ram ? 2 : 6), @@ -1091,7 +1103,7 @@ static int check_total_size( } else { res->state = SKIPPED; } - res->reason = NOT_ENOUGH_RAM; + res->reason = skip_reason::not_enough_ram; } BENCHDNN_PRINT((!fits_cpu_ram ? 2 : 6), diff --git a/tests/benchdnn/dnnl_common.hpp b/tests/benchdnn/dnnl_common.hpp index 9e8fd9c2106..c780853dd55 100644 --- a/tests/benchdnn/dnnl_common.hpp +++ b/tests/benchdnn/dnnl_common.hpp @@ -307,7 +307,7 @@ int check_dnnl_status(dnnl_status_t status, const prb_t *prb, res_t *res) { // not supported. if (is_nvidia_gpu() || is_amd_gpu()) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; return OK; } @@ -353,7 +353,7 @@ int fetch_impl(benchdnn_dnnl_wrapper_t &pdw, // Iterator is not supported, further logic is not applicable. 
if (!init_pd_args.is_iterator_supported) { res->state = SKIPPED; - res->reason = SKIP_IMPL_HIT; + res->reason = skip_reason::skip_impl_hit; return OK; } @@ -361,7 +361,7 @@ int fetch_impl(benchdnn_dnnl_wrapper_t &pdw, if (status == dnnl_last_impl_reached) { BENCHDNN_PRINT(2, "%s\n", "All implementations were skipped!"); res->state = SKIPPED; - res->reason = SKIP_IMPL_HIT; + res->reason = skip_reason::skip_impl_hit; pdw.reset(nullptr); return OK; } else if (status == dnnl_success) { diff --git a/tests/benchdnn/doc/benchdnn_general_info.md b/tests/benchdnn/doc/benchdnn_general_info.md index 90401b3c1b9..78ac514211d 100644 --- a/tests/benchdnn/doc/benchdnn_general_info.md +++ b/tests/benchdnn/doc/benchdnn_general_info.md @@ -129,15 +129,15 @@ problem): reproducer line might be reported. The execution was stopped before creating any library objects. * `SKIPPED`. Same as `LISTED` but the execution was stopped intentionally for - the reason given in the short description, e.g. `CASE_NOT_SUPPORTED` or - `SKIP_IMPL_HIT`. + the reason given in the short description, e.g. "Case not supported" or + "Skip-impl option hit". Note: Nvidia backend is treated specially. See a note below. * `INVALID_ARGUMENTS`. It means that the library API returned an error due to incorrect argument values. It is treated as a failure. * `UNIMPLEMENTED`. It means that the library does not have an implementation for a requested problem. It is treated as a failure. Note: All Nvidia backend `unimplemented` status errors are always treated as - `SKIPPED (CASE_NOT_SUPPORTED)` to simplify validation. + `SKIPPED "(Case not supported)"` to simplify validation. * `INITIALIZED`. It means that a problem was initialized, and the primitive creation was successful, but there was no execution call or validation. * `EXECUTED`. 
It means that a problem was run, and the library execution call diff --git a/tests/benchdnn/eltwise/eltwise.cpp b/tests/benchdnn/eltwise/eltwise.cpp index 29bfa595fb6..b2ed404a77f 100644 --- a/tests/benchdnn/eltwise/eltwise.cpp +++ b/tests/benchdnn/eltwise/eltwise.cpp @@ -300,7 +300,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { default: break; }; if (is_invalid) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -308,7 +309,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { // let forward path overwrite it. is_invalid = (prb->dir & FLAG_BWD) && !prb->use_dst() && prb->inplace; if (is_invalid) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } diff --git a/tests/benchdnn/graph/graph.cpp b/tests/benchdnn/graph/graph.cpp index b55c7bc618e..f71adc169a2 100644 --- a/tests/benchdnn/graph/graph.cpp +++ b/tests/benchdnn/graph/graph.cpp @@ -385,7 +385,7 @@ void skip_unimplemented_ops(const dnnl::graph::partition &partition, // TODO: extend with `getenv` call if limits too much. 
if (is_gc_backend()) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; return; } @@ -408,7 +408,7 @@ void skip_unimplemented_ops(const dnnl::graph::partition &partition, BENCHDNN_PRINT( 2, "[INFO]: Unimplemented op: %s.\n", dg_op_kind.c_str()); res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; return; } } @@ -422,7 +422,7 @@ void skip_unimplemented_graph_attribute( if (fpmath_mode != dnnl::fpmath_mode::strict && fpmath_mode != dnnl::fpmath_mode::bf16) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/graph/utils.cpp b/tests/benchdnn/graph/utils.cpp index 6cbbb215744..eff88da4f78 100644 --- a/tests/benchdnn/graph/utils.cpp +++ b/tests/benchdnn/graph/utils.cpp @@ -40,14 +40,14 @@ bdnn_state_t convert_state(const dnnl_status_t &s) { return bdnn_state_t {res_state_t::PASSED}; case dnnl_status_t::dnnl_out_of_memory: return bdnn_state_t { - res_state_t::SKIPPED, skip_reason_t::NOT_ENOUGH_RAM}; + res_state_t::SKIPPED, skip_reason::not_enough_ram}; case dnnl_status_t::dnnl_invalid_arguments: return bdnn_state_t {res_state_t::INVALID_ARGUMENTS}; case dnnl_status_t::dnnl_unimplemented: return bdnn_state_t {res_state_t::UNIMPLEMENTED}; case dnnl_status_t::dnnl_last_impl_reached: return bdnn_state_t { - res_state_t::SKIPPED, skip_reason_t::SKIP_IMPL_HIT}; + res_state_t::SKIPPED, skip_reason::skip_impl_hit}; case dnnl_status_t::dnnl_runtime_error: return bdnn_state_t {res_state_t::FAILED}; case dnnl_status_t::dnnl_not_required: @@ -56,10 +56,10 @@ bdnn_state_t convert_state(const dnnl_status_t &s) { case dnnl_status_t::dnnl_invalid_graph_op: case dnnl_status_t::dnnl_invalid_shape: return bdnn_state_t { - res_state_t::SKIPPED, skip_reason_t::INVALID_CASE}; + res_state_t::SKIPPED, skip_reason::invalid_case}; case dnnl_status_t::dnnl_invalid_data_type: - return 
bdnn_state_t {res_state_t::SKIPPED, - skip_reason_t::DATA_TYPE_NOT_SUPPORTED}; + return bdnn_state_t { + res_state_t::SKIPPED, skip_reason::data_type_not_supported}; default: assert(!"dnnl state is not found!"); return bdnn_state_t {}; } } diff --git a/tests/benchdnn/graph/utils.hpp b/tests/benchdnn/graph/utils.hpp index 4afcd7c56b7..ef590a5aade 100644 --- a/tests/benchdnn/graph/utils.hpp +++ b/tests/benchdnn/graph/utils.hpp @@ -48,7 +48,7 @@ struct deserialized_lt; struct bdnn_state_t { res_state_t state; - skip_reason_t reason; + std::string reason; }; extern bdnn_state_t convert_state(const dnnl_status_t &s); diff --git a/tests/benchdnn/ip/ip.cpp b/tests/benchdnn/ip/ip.cpp index 105bbfb504a..7c8417fa873 100644 --- a/tests/benchdnn/ip/ip.cpp +++ b/tests/benchdnn/ip/ip.cpp @@ -259,7 +259,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { is_dt_f16_or_f32(prb->get_dt(SRC)) && is_dt_f16_or_f32(prb->get_dt(WEI)) && is_dt_f16_or_f32(prb->get_dt(DST)))) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; } } } diff --git a/tests/benchdnn/lnorm/lnorm.cpp b/tests/benchdnn/lnorm/lnorm.cpp index 72913cd950b..f1df3252cb9 100644 --- a/tests/benchdnn/lnorm/lnorm.cpp +++ b/tests/benchdnn/lnorm/lnorm.cpp @@ -425,7 +425,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { if (is_gpu() && prb->attr.post_ops.len() != 0) { // GPU does not support post-ops - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index 1f6ca1c5305..4f97203010c 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -461,7 +461,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { = prb->sparse_options.get_encoding(DNNL_ARG_WEIGHTS); if ((is_gpu() && !prb->sparse_options.is_def()) || 
wei_encoding == dnnl_packed) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } #endif @@ -471,7 +472,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const bool is_x8s8f16 = prb->wei_dt() == dnnl_s8 && prb->dst_dt() == dnnl_f16; if (is_x8s8f16) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -481,7 +483,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { if (prb->attr.zero_points.get(DNNL_ARG_SRC).policy != policy_t::COMMON || prb->attr.zero_points.get(DNNL_ARG_DST).policy != policy_t::COMMON) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -490,7 +493,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { == policy_t::PER_OCIC || prb->attr.scales.get(DNNL_ARG_WEIGHTS).policy == policy_t::PER_OCIC) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -498,7 +502,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const auto &po = prb->attr.post_ops; const int sum_idx = po.find(attr_t::post_ops_t::kind_t::SUM); if (sum_idx != -1 && po.entry[sum_idx].sum.dt != dnnl_data_type_undef) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -515,7 +520,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { prb->attr.zero_points.get(DNNL_ARG_DST).is_def() && rt_dims_are_none && prb->ndims <= 2); if (!x8s8bf16_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -526,7 +532,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const bool 
bf16_bias_ok = IMPLICATION( prb->bia_dt == dnnl_bf16, prb->ndims <= 2 + is_bf16); if (!bf16_bias_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -535,7 +542,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { if (prb->weights_decompression() && (!prb->attr.zero_points.is_def() || !prb->attr.scales.is_def())) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -545,7 +553,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { || (prb->src_dt() == dnnl_f8_e5m2 || prb->dst_dt() == dnnl_f8_e5m2)) && (!po.is_def() || !prb->attr.scales.is_def())) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -556,7 +565,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { // memory therefore all SYCL cases must be skipped. 
#ifdef DNNL_EXPERIMENTAL_SPARSE if (is_sycl_engine(get_test_engine()) && !prb->sparse_options.is_def()) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } #endif @@ -565,7 +575,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { if (!prb->attr.zero_points.is_def() && (prb->wei_dt() != dnnl_s8 && prb->wei_dt() != dnnl_u8 && prb->wei_dt() != dnnl_s4 && prb->wei_dt() != dnnl_u4)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -574,14 +585,16 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { if (!prb->attr.scales.get(DNNL_ARG_WEIGHTS).is_def()) { const auto &groups = prb->attr.scales.get(DNNL_ARG_WEIGHTS).groups; if (!groups.empty() && (prb->k % groups[0] || groups.size() > 2)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } if (!prb->attr.zero_points.get(DNNL_ARG_WEIGHTS).is_def()) { const auto &groups = prb->attr.zero_points.get(DNNL_ARG_WEIGHTS).groups; if (!groups.empty() && (prb->k % groups[0] || groups.size() > 2)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } @@ -599,7 +612,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { "WARNING: runtime dimensions require user to specify a memory " "format for affected arguments. 
Consider specifying `--stag`, " "`--wtag`, and/or `--dtag`."); - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -611,7 +625,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { if (src_rt_mask[m_idx] != dst_rt_mask[m_idx] || src_rt_mask[k_idx_src] != wei_rt_mask[k_idx_wei] || wei_rt_mask[n_idx] != dst_rt_mask[n_idx]) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -624,7 +639,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { wei_rt_mask &= batch_rt_mask; dst_rt_mask &= batch_rt_mask; if (src_rt_mask != wei_rt_mask || src_rt_mask != dst_rt_mask) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } diff --git a/tests/benchdnn/pool/pool.cpp b/tests/benchdnn/pool/pool.cpp index 9c36ff1c347..8b2324224fb 100644 --- a/tests/benchdnn/pool/pool.cpp +++ b/tests/benchdnn/pool/pool.cpp @@ -143,7 +143,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { skip_unimplemented_prelu_po(prb->attr, res, dnnl_pooling); if (is_cpu() && prb->src_dt() != prb->dst_dt()) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -152,7 +153,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { // Average pooling without padding can't handle cases when kernel window is // applied to padded area only. 
if (prb->alg == avg_np && prb->has_ker_in_pad()) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } diff --git a/tests/benchdnn/reduction/reduction.cpp b/tests/benchdnn/reduction/reduction.cpp index 81cc7f4c4b8..8bb2466bce5 100644 --- a/tests/benchdnn/reduction/reduction.cpp +++ b/tests/benchdnn/reduction/reduction.cpp @@ -209,7 +209,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { && (is_integral_dt(prb->sdt) || prb->p < 1.f); if (is_invalid) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } diff --git a/tests/benchdnn/reorder/reorder.cpp b/tests/benchdnn/reorder/reorder.cpp index 00b96bacbdf..1437f7bd305 100644 --- a/tests/benchdnn/reorder/reorder.cpp +++ b/tests/benchdnn/reorder/reorder.cpp @@ -220,7 +220,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { } #endif if (!scales_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -245,14 +246,16 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { } if (!dt_ok || !attr_ok || !rt_ok || !masks_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } #if !defined(DNNL_X64) || DNNL_X64 == 0 // Simple reorder doesn't provide decent coverage for compensated cases. // Shut them down unconditionally by default. 
- res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; #endif } @@ -263,7 +266,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST); if (!dst_scales.is_def() && attr_t::get_default_mask(dst_scales.policy) > 0 && prb->runtime_dim_mask != 0) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -274,7 +278,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { if (attr_t::get_default_mask(src_scales.policy) != attr_t::get_default_mask(dst_scales.policy) && prb->is_reorder_with_compensation(FLAG_ANY)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -283,7 +288,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { // Int4 reorder support is limited on CPU. 
if (sdt == dnnl_s4 || ddt == dnnl_s4 || sdt == dnnl_u4 || ddt == dnnl_u4) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -295,7 +301,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { sdt != dnnl_f8_e5m2 && sdt != dnnl_f8_e4m3 && sdt != dnnl_bf16 && sdt != dnnl_f16); if (!s32_src_ok || !s32_dst_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -305,7 +312,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const bool f16_dst_ok = IMPLICATION( ddt == dnnl_f16, sdt == dnnl_f16 || sdt == dnnl_f32); if (!f16_src_ok || !f16_dst_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -317,7 +325,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { = IMPLICATION(sdt == dnnl_f8_e5m2 || sdt == dnnl_f8_e4m3, ddt == dnnl_f16 || ddt == dnnl_f32); if (!xf8_src_ok || !xf8_dst_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -328,7 +337,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { // in kernels directly, but s8s8 instructions are available in HW. 
if (prb->runtime_dim_mask != 0 || prb->is_reorder_with_compensation(FLAG_ANY)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -336,7 +346,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { const bool is_xf8 = prb->sdt == dnnl_f8_e5m2 || prb->sdt == dnnl_f8_e4m3 || prb->ddt == dnnl_f8_e5m2 || prb->ddt == dnnl_f8_e4m3; if (is_xf8) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -347,14 +358,18 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \ || DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE auto cross_engine = prb->cross_engine; - if (cross_engine == CPU2GPU || cross_engine == GPU2CPU) - res->state = SKIPPED, res->reason = INVALID_CASE; + if (cross_engine == CPU2GPU || cross_engine == GPU2CPU) { + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; + return; + } #endif // Zero-points can't be used with sum post-op. 
if (!prb->attr.zero_points.is_def(DNNL_ARG_DST) && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -364,7 +379,8 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { const bool is_dst_zp_ok = is_integral_dt(prb->ddt) || prb->attr.zero_points.is_def(DNNL_ARG_DST); if (!(is_src_zp_ok && is_dst_zp_ok)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } diff --git a/tests/benchdnn/rnn/rnn.cpp b/tests/benchdnn/rnn/rnn.cpp index bd073bfa19a..af69230ac02 100644 --- a/tests/benchdnn/rnn/rnn.cpp +++ b/tests/benchdnn/rnn/rnn.cpp @@ -774,13 +774,15 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { // FIXME: this will disable int8 RNN testing if the library is built with // Intel MKL that does have packed IGEMM if (prb.is_int8()) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } #endif // cpu backward only supports `any` or `abx` layouts for weights if (IMPLICATION(prb.prop == dnnl_backward, prb.tag[1] != tag::abx)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -788,7 +790,8 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { const bool is_f16_not_ok = prb.cfg[SRC_LAYER].dt == dnnl_f16 && !(dir & FLAG_INF); if (is_f16_not_ok) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -796,7 +799,8 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { const bool is_acl_f16_not_ok = prb.cfg[SRC_LAYER].dt == dnnl_f16 && dnnl::impl::cpu::platform::has_data_type_support(dnnl_f16); if (is_acl_f16_not_ok) { - res->state = SKIPPED, res->reason 
= CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } #endif @@ -806,25 +810,30 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { // only LSTM and GRU cell kinds support int8 so far; if (prb.is_int8()) { if (!prb.trivial_strides) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.alg != VANILLA_LSTM && prb.alg != VANILLA_GRU) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.prop != dnnl_forward_inference) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (is_cpu() && (prb.tag[0] != tag::abx || prb.tag[1] != tag::any || prb.tag[2] != tag::abx)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (is_gpu() && prb.tag[1] != tag::any) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -833,7 +842,8 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { if (prb.is_lstm_projection() && (prb.cfg[SRC_LAYER].dt == dnnl_bf16 || prb.cfg[SRC_LAYER].dt == dnnl_f16)) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } @@ -841,29 +851,35 @@ void skip_unimplemented_prb(const prb_t *prb_, res_t *res) { if (is_gpu()) { bool is_AUGRU = prb.alg == VANILLA_AUGRU || prb.alg == LBR_AUGRU; if (is_AUGRU) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.is_lstm_projection() || prb.is_lstm_peephole()) { - res->state = SKIPPED, res->reason = 
CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.is_int8() && prb.alg != VANILLA_LSTM) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.is_s8() && prb.alg == VANILLA_LSTM) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } // Implemented only for CPU if (prb.cfg[BIAS].dt == dnnl_bf16 || prb.cfg[SRC_ITER_C].dt == dnnl_bf16 || prb.cfg[DST_ITER_C].dt == dnnl_bf16) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } if (prb.flags != NONE) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } @@ -885,7 +901,8 @@ void skip_invalid_prb(const prb_t *prb_, res_t *res) { && prb.direction == dnnl_unidirectional_left2right); if (!consistent_proj || !consistent_L || !consistent_T || !consistent_GRU || !consistent_AUGRU) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -895,7 +912,8 @@ void skip_invalid_prb(const prb_t *prb_, res_t *res) { bool is_lstm_projection = IMPLICATION(prb.with_projection, prb.alg == VANILLA_LSTM); if (!is_lstm_peephole || !is_lstm_projection) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -903,7 +921,8 @@ void skip_invalid_prb(const prb_t *prb_, res_t *res) { // the output, which doesn't allow to validate numerical stability. 
if (has_bench_mode_bit(mode_bit_t::bitwise) && (prb.prop == dnnl_backward) && prb.flags != DIFF_WEIGHTS_OVERWRITE) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } @@ -911,7 +930,8 @@ void skip_invalid_prb(const prb_t *prb_, res_t *res) { // With tag::any, strides are not defined. if (!prb.trivial_strides && (prb.tag[0] == tag::any || prb.tag[2] == tag::any)) { - res->state = SKIPPED, res->reason = INVALID_CASE; + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; return; } } diff --git a/tests/benchdnn/softmax/softmax.cpp b/tests/benchdnn/softmax/softmax.cpp index af95485b27c..3a16c52a648 100644 --- a/tests/benchdnn/softmax/softmax.cpp +++ b/tests/benchdnn/softmax/softmax.cpp @@ -209,7 +209,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { skip_unimplemented_prelu_po(prb->attr, res, dnnl_softmax); if (prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) { - res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; return; } } diff --git a/tests/benchdnn/zeropad/zeropad.cpp b/tests/benchdnn/zeropad/zeropad.cpp index 2a7f153772a..c9845f74455 100644 --- a/tests/benchdnn/zeropad/zeropad.cpp +++ b/tests/benchdnn/zeropad/zeropad.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -118,7 +118,7 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { if (is_nvidia_gpu() || is_amd_gpu()) { res->state = SKIPPED; - res->reason = CASE_NOT_SUPPORTED; + res->reason = skip_reason::case_not_supported; } } From aa490b17be6c092cc9306d239e73f81f73c5c0ea Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 6 May 2024 16:49:08 -0700 Subject: [PATCH 030/187] benchdnn: common: add reason for failed state --- tests/benchdnn/common.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/benchdnn/common.cpp b/tests/benchdnn/common.cpp index 8fa70f1b0ba..010d212a052 100644 --- a/tests/benchdnn/common.cpp +++ b/tests/benchdnn/common.cpp @@ -105,12 +105,19 @@ void parse_result(res_t &res, const char *pstr) { BENCHDNN_PRINT(0, "%d:%s __REPRO: %s\n", bs.tests, state, pstr); bs.passed++; break; - case FAILED: + case FAILED: { bs.failed++; - BENCHDNN_PRINT(0, "%d:%s (errors:%lu total:%lu) __REPRO: %s\n", - bs.tests, state, (unsigned long)res.errors, - (unsigned long)res.total, pstr); - break; + std::string error_stat; + if (res.errors > 0) { + error_stat = " (errors:" + std::to_string(res.errors) + + " total:" + std::to_string(res.total) + ")"; + } + + std::string reason; + if (!res.reason.empty()) { reason = " (" + res.reason + ")"; } + BENCHDNN_PRINT(0, "%d:%s%s%s __REPRO: %s\n", bs.tests, state, + reason.c_str(), error_stat.c_str(), pstr); + } break; case SKIPPED: BENCHDNN_PRINT(0, "%d:%s (%s) __REPRO: %s\n", bs.tests, state, res.reason.c_str(), pstr); From 8fdbf3ad34362afaf1a0ec227163f99c6c11cb67 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 6 May 2024 16:49:35 -0700 Subject: [PATCH 031/187] benchdnn: introduce a ref-impl check knob --- tests/benchdnn/benchdnn.cpp | 3 ++- tests/benchdnn/common.hpp | 1 + tests/benchdnn/dnn_types.cpp | 2 ++ tests/benchdnn/dnnl_common.cpp | 13 +++++++++++++ tests/benchdnn/dnnl_common.hpp | 4 ++++ tests/benchdnn/doc/knobs_common.md | 7 +++++++ 
tests/benchdnn/utils/parser.cpp | 23 +++++++++++++++++------ 7 files changed, 46 insertions(+), 7 deletions(-) diff --git a/tests/benchdnn/benchdnn.cpp b/tests/benchdnn/benchdnn.cpp index e82bf5c2620..6c32443d597 100644 --- a/tests/benchdnn/benchdnn.cpp +++ b/tests/benchdnn/benchdnn.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2023 Intel Corporation +* Copyright 2017-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,6 +74,7 @@ bool fast_ref {default_fast_ref}; bool allow_enum_tags_only {true}; int test_start {0}; bool attr_same_pd_check {false}; +bool check_ref_impl {false}; int main(int argc, char **argv) { using namespace parser; diff --git a/tests/benchdnn/common.hpp b/tests/benchdnn/common.hpp index b29f75d60c3..858d00c6bef 100644 --- a/tests/benchdnn/common.hpp +++ b/tests/benchdnn/common.hpp @@ -93,6 +93,7 @@ extern int verbose; extern bool canonical; extern bool mem_check; extern bool attr_same_pd_check; +extern bool check_ref_impl; extern std::string skip_impl; /* empty or "" means skip nothing */ extern std::string driver_name; diff --git a/tests/benchdnn/dnn_types.cpp b/tests/benchdnn/dnn_types.cpp index 40fb3b848e7..f5903a71084 100644 --- a/tests/benchdnn/dnn_types.cpp +++ b/tests/benchdnn/dnn_types.cpp @@ -860,6 +860,8 @@ std::ostream &dump_global_params(std::ostream &s) { s << "--cpu-isa-hints=" << isa_hints_t::hints2str(hints) << " "; if (canonical || attr_same_pd_check != false) s << "--attr-same-pd-check=" << bool2str(attr_same_pd_check) << " "; + if (canonical || check_ref_impl != false) + s << "--check-ref-impl=" << bool2str(check_ref_impl) << " "; #if defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL if (canonical || memory_kind != default_memory_kind) s << "--memory-kind=" << memory_kind << " "; diff --git a/tests/benchdnn/dnnl_common.cpp 
b/tests/benchdnn/dnnl_common.cpp index c86df55d650..d8fba8af444 100644 --- a/tests/benchdnn/dnnl_common.cpp +++ b/tests/benchdnn/dnnl_common.cpp @@ -791,6 +791,19 @@ int check_same_pd(const dnnl_primitive_desc_t &pd_no_attr, res_t *res) { return FAIL; } +// Checks if unexpected reference implementation was hit. +int check_ref_impl_hit(res_t *res) { + if (!check_ref_impl) return OK; + + const auto &impl_name = res->impl_name; + if (impl_name.find("ref") != std::string::npos) { + res->state = FAILED; + res->reason = "Ref Impl Not Expected"; + return FAIL; + } + return OK; +} + bool is_cpu(const dnnl_engine_t &engine) { return query_engine_kind(engine) == dnnl_cpu; } diff --git a/tests/benchdnn/dnnl_common.hpp b/tests/benchdnn/dnnl_common.hpp index c780853dd55..64ac86f2a6d 100644 --- a/tests/benchdnn/dnnl_common.hpp +++ b/tests/benchdnn/dnnl_common.hpp @@ -434,6 +434,8 @@ int check_pd_w_and_wo_attr(dnnl_engine_t engine, const func_t &init_pd_func, return OK; } +int check_ref_impl_hit(res_t *res); + template int init_prim(benchdnn_dnnl_wrapper_t &user_prim, const func_t &init_pd_func, const prb_t *prb, res_t *res, @@ -489,6 +491,8 @@ int init_prim(benchdnn_dnnl_wrapper_t &user_prim, SAFE(check_pd_w_and_wo_attr( get_test_engine(), init_pd_func, prb, res, dir, hint), WARN); + // Check if unexpected ref impl was hit. + SAFE(check_ref_impl_hit(res), WARN); } user_prim.reset(primw.release()); diff --git a/tests/benchdnn/doc/knobs_common.md b/tests/benchdnn/doc/knobs_common.md index 730ec8ba562..9d01c0ce9cd 100644 --- a/tests/benchdnn/doc/knobs_common.md +++ b/tests/benchdnn/doc/knobs_common.md @@ -157,6 +157,13 @@ names mismatch for two descriptors. It indicates that appending an attribute changes the implementation dispatching which is an undesired behavior. When `BOOL` is `false` (the default), the check is disabled. +### --check-ref-impl +`--check-ref-impl=BOOL` instructs the driver to compare the implementation name +string against the `ref` string pattern. 
When `BOOL` is set to `true`, the check +returns an error if the name matches the reference pattern. By default, the +check is disabled. It's useful to catch unexpected fallbacks to slow reference +implementations from a big batch of problems. + ### --fast-ref `--fast-ref=BOOL` instructs the driver to use an optimized implementation from the library as a reference path for correctness comparison when `BOOL` is diff --git a/tests/benchdnn/utils/parser.cpp b/tests/benchdnn/utils/parser.cpp index 2d1c5ba8c2e..444c5023eb0 100644 --- a/tests/benchdnn/utils/parser.cpp +++ b/tests/benchdnn/utils/parser.cpp @@ -762,6 +762,17 @@ static bool parse_canonical( canonical, false, str2bool, str, option_name, help); } +static bool parse_check_ref_impl( + const char *str, const std::string &option_name = "check-ref-impl") { + static const std::string help + = "BOOL (Default: `false`)\n Instructs the driver to compare " + "an implementation name against the \'ref\' string pattern.\n " + "When set to `true`, the check would return an error if the " + "implementation name contains such pattern.\n"; + return parse_single_value_option( + check_ref_impl, false, str2bool, str, option_name, help); +} + static bool parse_cold_cache( const char *str, const std::string &option_name = "cold-cache") { static const std::string help @@ -1236,12 +1247,12 @@ bool parse_bench_settings(const char *str) { bool parsed = parse_allow_enum_tags_only(str) || parse_attr_same_pd_check(str) || parse_canonical(str) - || parse_cold_cache(str) || parse_cpu_isa_hints(str) - || parse_engine(str) || parse_fast_ref(str) - || parse_fast_ref_gpu(str) || parse_fix_times_per_prb(str) - || parse_max_ms_per_prb(str) || parse_num_streams(str) - || parse_repeats_per_prb(str) || parse_mem_check(str) - || parse_memory_kind(str) || parse_mode(str) + || parse_check_ref_impl(str) || parse_cold_cache(str) + || parse_cpu_isa_hints(str) || parse_engine(str) + || parse_fast_ref(str) || parse_fast_ref_gpu(str) + || 
parse_fix_times_per_prb(str) || parse_max_ms_per_prb(str) + || parse_num_streams(str) || parse_repeats_per_prb(str) + || parse_mem_check(str) || parse_memory_kind(str) || parse_mode(str) || parse_mode_modifier(str) || parse_skip_impl(str) || parse_start(str) || parse_stream_kind(str) || parse_verbose(str); From df3022638aaab0d1fdf62bc6ab16d9031739a0fc Mon Sep 17 00:00:00 2001 From: "Pirogov, Vadim" Date: Thu, 9 May 2024 15:38:42 -0700 Subject: [PATCH 032/187] github: workflows: limited GITHUB_TOKEN permissions for Slack action --- .github/workflows/slack-pr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/slack-pr.yaml b/.github/workflows/slack-pr.yaml index 963b14d9f2f..72734c51cce 100644 --- a/.github/workflows/slack-pr.yaml +++ b/.github/workflows/slack-pr.yaml @@ -24,6 +24,9 @@ env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} channel: "onednn" +permissions: + pull-requests: read + jobs: rfc: name: RFC Notification From eb8773580e3e19081a3f2cbaa34dc709dc1a2199 Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Thu, 9 May 2024 12:48:24 -0700 Subject: [PATCH 033/187] x64: brgconv: optimize dispatching for strided cases --- src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp index d33d579b553..364f11f010e 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp @@ -1565,14 +1565,19 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, && everyone_is(128, jcp.oh, jcp.ow) && everyone_is(3, jcp.kh, jcp.kw) && everyone_is(2, jcp.stride_h, jcp.stride_w)) - || (jcp.ic == 256 && jcp.oc == 512 - && everyone_is(129, jcp.ih, jcp.iw) - && everyone_is(64, jcp.oh, jcp.ow) - && everyone_is(3, jcp.kh, jcp.kw) - && everyone_is(2, jcp.stride_h, jcp.stride_w)) || (jcp.ic == 256 && jcp.oc == 512 && jcp.ih == 49 && jcp.iw == 41 && jcp.oh == 
23 && jcp.ow == 19 && everyone_is(5, jcp.kh, jcp.kw) + && everyone_is(2, jcp.stride_h, jcp.stride_w)) + || (jcp.ic == 64 && jcp.oc == 128 + && everyone_is(14, jcp.ih, jcp.iw) + && everyone_is(7, jcp.oh, jcp.ow) + && everyone_is(4, jcp.kh, jcp.kw) + && everyone_is(2, jcp.stride_h, jcp.stride_w)) + || (jcp.ic == 1 && jcp.oc == 64 + && everyone_is(28, jcp.ih, jcp.iw) + && everyone_is(14, jcp.oh, jcp.ow) + && everyone_is(4, jcp.kh, jcp.kw) && everyone_is(2, jcp.stride_h, jcp.stride_w))); VDISPATCH_CONV_IC(!(is_f32 && is_regression_shape), "implementation skipped due to low performance"); From 220bc4bc3f778fb8b2c0f946972030dd8a5673f8 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 13 Mar 2024 10:34:49 -0700 Subject: [PATCH 034/187] x64: brgemm conv: update data trans and copy utils to support fp8 for forward and backward by data convolutions --- src/cpu/x64/jit_brgemm_conv_bwd_copy_kernel.cpp | 10 +++++++--- src/cpu/x64/jit_brgemm_conv_bwd_trans_kernel.cpp | 10 +++++++--- src/cpu/x64/jit_brgemm_conv_trans_kernel.cpp | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_copy_kernel.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_copy_kernel.cpp index cb12fa100f6..25be385360e 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_copy_kernel.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_copy_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,7 +47,9 @@ void jit_avx512_core_brgemm_conv_bwd_copy_kernel_t::load( case bf16: case f16: vmovdqu16(x, addr); break; case s8: - case u8: vmovdqu8(x, addr); break; + case u8: + case f8_e5m2: + case f8_e4m3: vmovdqu8(x, addr); break; default: assert(!"Unknown type!"); } } @@ -62,7 +64,9 @@ void jit_avx512_core_brgemm_conv_bwd_copy_kernel_t::store( case bf16: case f16: vmovdqu16(addr, x); break; case s8: - case u8: vmovdqu8(addr, x); break; + case u8: + case f8_e5m2: + case f8_e4m3: vmovdqu8(addr, x); break; default: assert(!"Unknown type!"); } } diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_trans_kernel.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_trans_kernel.cpp index 3c32dfb6be5..95ceaba44eb 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_trans_kernel.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_trans_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -77,7 +77,9 @@ void jit_avx512_core_brgemm_conv_bwd_trans_kernel_t::load( case bf16: case f16: vmovdqu16(x, addr); break; case s8: - case u8: vmovdqu8(x, addr); break; + case u8: + case f8_e5m2: + case f8_e4m3: vmovdqu8(x, addr); break; default: assert(!"Unknown type!"); } } @@ -92,7 +94,9 @@ void jit_avx512_core_brgemm_conv_bwd_trans_kernel_t::store( case bf16: case f16: vmovdqu16(addr, x); break; case s8: - case u8: vmovdqu8(addr, x); break; + case u8: + case f8_e5m2: + case f8_e4m3: vmovdqu8(addr, x); break; default: assert(!"Unknown type!"); } } diff --git a/src/cpu/x64/jit_brgemm_conv_trans_kernel.cpp b/src/cpu/x64/jit_brgemm_conv_trans_kernel.cpp index e86cde69806..7e043de1137 100644 --- a/src/cpu/x64/jit_brgemm_conv_trans_kernel.cpp +++ b/src/cpu/x64/jit_brgemm_conv_trans_kernel.cpp @@ -88,7 +88,7 @@ void jit_avx512_core_brgemm_conv_trans_kernel_t::load( vmovdqu32(x, addr); else if (one_of(jcp.src_dt, bf16, f16)) vmovdqu16(x, addr); - else if (one_of(jcp.src_dt, s8, u8)) + else if (one_of(jcp.src_dt, s8, u8, f8_e5m2, f8_e4m3)) vmovdqu8(x, addr); else assert(!"Unknown type!"); @@ -100,7 +100,7 @@ void jit_avx512_core_brgemm_conv_trans_kernel_t::store( vmovdqu32(addr, x); else if (one_of(jcp.src_dt, bf16, f16)) vmovdqu16(addr, x); - else if (one_of(jcp.src_dt, s8, u8)) + else if (one_of(jcp.src_dt, s8, u8, f8_e5m2, f8_e4m3)) vmovdqu8(addr, x); else assert(!"Unknown type!"); From f081e10becfb2d54f71184d100b9969b3387e607 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 13 Mar 2024 10:36:44 -0700 Subject: [PATCH 035/187] x64: brgemm fwd conv: update post_ops kernel to support fp8 data --- src/cpu/x64/jit_brgemm_post_ops.hpp | 90 ++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_post_ops.hpp b/src/cpu/x64/jit_brgemm_post_ops.hpp index 11ae8f4c337..3e7a8a0e9b5 100644 --- a/src/cpu/x64/jit_brgemm_post_ops.hpp +++ b/src/cpu/x64/jit_brgemm_post_ops.hpp @@ -26,6 +26,7 @@ #include 
"cpu/x64/injectors/jit_uni_postops_injector.hpp" #include "cpu/x64/jit_avx512_core_bf16cvt.hpp" +#include "cpu/x64/jit_avx512_core_fp8cvt.hpp" #include "cpu/x64/jit_brgemm_primitive_conf.hpp" #include "cpu/x64/jit_generator.hpp" @@ -402,9 +403,21 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { this, attr.post_ops_, bsp, esp); } if (brg.is_bf16_emu) - bf16_emu_ = utils::make_unique(this, - bf16_emu_reserv_1, bf16_emu_reserv_2, bf16_emu_reserv_3, - bf16_emu_scratch, bf16_emu_reserv_4, bf16_emu_reserv_4); + bf16_emu_ = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_scratch, emu_reserv_4, + emu_reserv_4); + if (brg.is_fp8_via_convert() + && utils::one_of( + data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d)) + f8_e5m2_emulator_ = utils::make_unique(this, + emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_mask, + emu_scratch); + if (brg.is_fp8_via_convert() + && utils::one_of( + data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d)) + f8_e4m3_emulator_ = utils::make_unique(this, + emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_reserv_4, + emu_reserv_5, emu_scratch); const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); // per_oc: conv: 1 << 0, (1 << 1) + (1 << 0) (with groups) @@ -441,6 +454,8 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { std::unique_ptr> postops_injector_; std::unique_ptr bf16_emu_; + std::unique_ptr f8_e5m2_emulator_; + std::unique_ptr f8_e4m3_emulator_; const bool with_binary_non_scalar_bcast_; @@ -456,6 +471,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { avx2_vnni_2), Xbyak::Ymm, Xbyak::Zmm>::type; using Vmm_lower_t = typename vreg_traits::Vmm_lower_t; + using Vmm_lower2_t = typename vreg_traits::Vmm_lower_t; // Register decomposition const reg64_t reg_reserved_eltwise = rax; @@ -498,11 +514,13 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { constexpr static int stack_space_needed_ = 72; /* bf16 emulation */ - Xbyak::Zmm bf16_emu_reserv_1 = Xbyak::Zmm(27); - 
Xbyak::Zmm bf16_emu_reserv_2 = Xbyak::Zmm(24); - Xbyak::Zmm bf16_emu_reserv_3 = Xbyak::Zmm(25); - Xbyak::Zmm bf16_emu_reserv_4 = Xbyak::Zmm(26); - reg64_t bf16_emu_scratch = reg_tmp; + Xbyak::Zmm emu_reserv_1 = Xbyak::Zmm(27); + Xbyak::Zmm emu_reserv_2 = Xbyak::Zmm(26); + Xbyak::Zmm emu_reserv_3 = Xbyak::Zmm(25); + Xbyak::Zmm emu_reserv_4 = Xbyak::Zmm(24); + Xbyak::Zmm emu_reserv_5 = Xbyak::Zmm(23); + reg64_t emu_scratch = reg_tmp; + Xbyak::Opmask emu_mask = Xbyak::Opmask(4); Xbyak::Opmask k_full_mask = Xbyak::Opmask(2); Xbyak::Opmask k_tail_mask = Xbyak::Opmask(3); @@ -566,6 +584,18 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { vpslld(vmm, vmm, 16); break; case data_type::f16: vcvtph2ps(vmm, op); break; + case data_type::f8_e5m2: + if (brg.is_fp8_via_convert()) + f8_e5m2_emulator_->vcvt_f8_to_f32(vmm, op); + else + assert(!"Not supported yet"); + break; + case data_type::f8_e4m3: + if (brg.is_fp8_via_convert()) + f8_e4m3_emulator_->vcvt_f8_to_f32(vmm, op); + else + assert(!"Not supported yet"); + break; default: assert(!"unsupported data type"); } } else { @@ -731,10 +761,11 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { // compensation to avoid the loss of accuracy when converting s32 to f32 for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { - if (brg.alpha == 0 && brg.beta != 0) { - // if postwork then have to init vmm each time + if (brg.alpha == 0) { + // have to init vmm each time because vectors may have been + // changed in the previous iterations uni_vpxor(vector(m, n), vector(m, n), vector(m, n)); - } else if (brg.alpha != 0) { + } else { auto inp_addr = ptr[aux_reg_in + inp_typesize_ * (m * brg.LDC + n * brg.ld_block)]; cvt2ps(inp_dt_, vector(m, n), inp_addr, tail, false, k_mask, @@ -849,8 +880,11 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (is_superset(isa, avx512_core)) { auto vmm_masked = maybe_mask(vmm, tail > 0, true, k_mask); Vmm_lower_t vmm_low = Vmm_lower_t(vmm.getIdx()); + 
Vmm_lower2_t vmm_low2 = Vmm_lower2_t(vmm_low.getIdx()); auto vmm_low_masked = maybe_mask(vmm_low, tail > 0, true, k_mask); + auto vmm_low2_masked + = maybe_mask(vmm_low2, tail > 0, true, k_mask); switch (out_dt_) { case data_type::f32: case data_type::s32: uni_vmovups(addr, vmm_masked); break; @@ -867,6 +901,20 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { vcvtps2ph(vmm_low, vmm, _op_mxcsr); vmovdqu16(addr, vmm_low_masked); break; + case data_type::f8_e5m2: + if (brg.is_fp8_via_convert()) { + f8_e5m2_emulator_->vcvt_f32_to_f8(vmm_low2, vmm); + vmovdqu8(addr, vmm_low2_masked); + } else + assert(!"Not supported yet"); + break; + case data_type::f8_e4m3: + if (brg.is_fp8_via_convert()) { + f8_e4m3_emulator_->vcvt_f32_to_f8(vmm_low2, vmm); + vmovdqu8(addr, vmm_low2_masked); + } else + assert(!"Not supported yet"); + break; case data_type::s8: vpmovsdb(addr, vmm_masked); break; case data_type::u8: vpmovusdb(addr, vmm_masked); break; default: assert(!"unknown dst_dt"); @@ -1006,7 +1054,11 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { int nb2_tail = nb % n_block2_; int n_block = (nb2 == 0) ? nstl::max(1, nb2_tail) : n_block2_; - int m_max_regs = (brg.is_bf16_emu ? 24 : max_vregs_ - 4) / n_block; + int m_max_regs = (brg.is_bf16_emu + ? 24 + : (brg.is_fp8_via_convert() ? 
23 : max_vregs_ - 4)); + m_max_regs /= n_block; + int m_block = nstl::min(brg.bcast_dim, m_max_regs); int mb = brg.bcast_dim / m_block; @@ -1052,16 +1104,6 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } mov(reg_out, ptr[param1 + GET_OFF(ptr_out)]); - // brg.alpha == 0 means initialize registers, 1 means read from input - // brg.beta == 0 means skip postwork, 1 means do postwork - if (brg.alpha == 0 && brg.beta == 0) { - for_(int m = 0; m < m_block; m++) - for (int n = 0; n < n_block; n++) { - auto vmm = Vmm(m * n_block + n); - uni_vpxor(vmm, vmm, vmm); - } - } - for (int mb_ = 0; mb_ < mb; mb_++) { loop_by_N(m_block, nb2, nb2_tail, nb_tail); @@ -1089,6 +1131,10 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (postops_injector_) postops_injector_->prepare_table(/* generate = */ true); + if (brg.is_fp8_via_convert()) { + if (f8_e5m2_emulator_) f8_e5m2_emulator_->prepare_table(); + if (f8_e4m3_emulator_) f8_e4m3_emulator_->prepare_table(); + } } }; From af1fd9091b9653bb24b023f4a15691d39b9624a5 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 13 Mar 2024 10:33:58 -0700 Subject: [PATCH 036/187] x64: brgemm fwd conv: update init_conf to support fp8 --- src/cpu/x64/jit_brgemm_conv_utils.cpp | 27 ++++++++++++++++++--------- src/cpu/x64/jit_primitive_conf.hpp | 2 ++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_utils.cpp b/src/cpu/x64/jit_brgemm_conv_utils.cpp index 2d8334746ff..20d029812a0 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.cpp @@ -196,7 +196,8 @@ struct brg_blocking_t : public jit_brgemm_conv_conf_t { bool is_any_eligible(const jit_brgemm_conv_conf_t &jcp) { return (jcp.prop_kind == prop_kind::forward_inference || jcp.wei_plain - || one_of(jcp.wei_dt, data_type::s8, data_type::f16) + || one_of(jcp.wei_dt, data_type::s8, data_type::f16, + data_type::f8_e5m2, data_type::f8_e4m3) || one_of(jcp.isa, avx2_vnni_2) || is_amx(jcp.isa)); } 
@@ -1657,11 +1658,14 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, if (one_of(jcp.src_dt, u8, s8)) { jcp.acc_dt = s32; - } else if (one_of(jcp.src_dt, f32, bf16, f16)) { + } else if (one_of(jcp.src_dt, f32, bf16, f16, f8_e5m2, f8_e4m3)) { jcp.acc_dt = f32; } else return status::unimplemented; + jcp.is_fp8 = one_of(jcp.src_dt, f8_e5m2, f8_e4m3) + && one_of(jcp.wei_dt, f8_e5m2, f8_e4m3); + jcp.is_fp8_convert = jcp.is_fp8 && utils::one_of(isa, avx10_1_512_amx_fp16); jcp.src_dsz = types::data_type_size(jcp.src_dt); jcp.wei_dsz = types::data_type_size(jcp.wei_dt); jcp.dst_dsz = types::data_type_size(jcp.dst_dt); @@ -1678,8 +1682,8 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, if (jcp.wei_plain) CHECK(pick_tags(jcp, src_md, weights_md, dst_md, bias_md)); - const data_type_t vnni_block_dt - = get_mac_emu_data_type(jcp.wei_dt, isa, isa == avx10_1_512); + const data_type_t vnni_block_dt = get_mac_emu_data_type( + jcp.wei_dt, isa, isa == avx10_1_512 && !jcp.is_fp8_convert); jcp.vnni_block = data_type_vnni_granularity(vnni_block_dt); if (one_of(jcp.prop_kind, prop_kind::forward_training, @@ -1749,7 +1753,7 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, && jcp.ow >= 128; if (one_of(jcp.prop_kind, prop_kind::forward_training, prop_kind::forward_inference) - && (is_small_shape || is_3d_small_ic)) + && (is_small_shape || is_3d_small_ic) && !jcp.is_fp8) VDISPATCH_CONV_IC(!allow_perf_heuristics(jcp), VERBOSE_IMPL_HEURISTIC_FAIL, "no optimization for fwd-prop and 3d shapes / small ic"); @@ -1836,7 +1840,8 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const auto kw_koef = jcp.is_relo() ? jcp.kw : 1; const auto kh_koef = jcp.is_relo_whi() ? 
jcp.kh : 1; - jcp.is_rd_padded_to_block = !jcp.is_1x1 && one_of(jcp.wei_dt, bf16, f16, s8) + jcp.is_rd_padded_to_block = !jcp.is_1x1 + && one_of(jcp.wei_dt, bf16, f16, s8, f8_e5m2, f8_e4m3) && jcp.ic * kw_koef * kh_koef > rd_padded_block && is_amx(isa); jcp.idp = jcp.id + jcp.f_pad + jcp.back_pad; @@ -1888,7 +1893,8 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; - if (is_amx(isa)) { + // TODO: check these restrictions + if (is_amx(isa) && !jcp.is_fp8) { // disabled for two convolutions from ssd_resnet34 if ((jcp.ic == jcp.oc) && (jcp.ic == 128 || jcp.ic == 256) && (jcp.oh == jcp.ow) && (jcp.oh == 150)) @@ -2038,8 +2044,10 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, // try_relo_whi bool try_relo_whi = false; bool relo_conv_weights_whi = true; - if (!jcp.wei_plain && relo_supported_isa && relo_reasonable_isa) { + if (!jcp.wei_plain && relo_supported_isa && relo_reasonable_isa + && !jcp.is_fp8) { const int rd_whi = jcp.kh * jcp.kw * jcp.ic; + //TODO: support fp8 if (jcp.ic % jcp.vnni_block == 0 && IMPLICATION(rd_whi > jcp.simd_w, rd_whi % jcp.simd_w == 0) && one_of(1, jcp.kh, jcp.kw)) @@ -2115,7 +2123,8 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, const auto rd_ksize = (jcp.is_relo() ? jcp.kw : 1) * (jcp.is_relo_whi() ? 
jcp.kh : 1); - jcp.is_rd_padded_to_block = one_of(jcp.wei_dt, bf16, f16, s8) + jcp.is_rd_padded_to_block + = one_of(jcp.wei_dt, bf16, f16, s8, f8_e5m2, f8_e4m3) && jcp.ic * rd_ksize > rd_padded_block; jcp.is_os_blocking = jcp.f_pad < jcp.kd && jcp.back_pad < jcp.kd diff --git a/src/cpu/x64/jit_primitive_conf.hpp b/src/cpu/x64/jit_primitive_conf.hpp index 0361cf9d69b..445a7c8b775 100644 --- a/src/cpu/x64/jit_primitive_conf.hpp +++ b/src/cpu/x64/jit_primitive_conf.hpp @@ -791,6 +791,8 @@ struct jit_brgemm_conv_conf_t { bool req_brg_comp_pad; bool req_cal_comp_pad; bool is_bf32; + bool is_fp8 {false}; + bool is_fp8_convert {false}; bool comp_with_vpads; int nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b, nthr_oh; From d27a86497f8e36cfd4d76b4efcaa295bc68cb7d1 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 2 May 2024 15:42:38 -0700 Subject: [PATCH 037/187] x64: brgemm bwd_d conv: update initialization to support fp8 --- src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp | 40 +++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp index 364f11f010e..8163e5e1c6c 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp @@ -116,7 +116,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo64i : Idhwo64i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIdhwO16o64i4o : IdhwO16o64i4o; else @@ -131,7 +131,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? gIwo64i : Iwo64i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? 
gIwO16o64i4o : IwO16o64i4o; else @@ -148,7 +148,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, UNUSED(is_2d); if (no_vnni_format) wei_tag = with_groups ? gIhwo64i : Ihwo64i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIhwO16o64i4o : IhwO16o64i4o; else @@ -165,7 +165,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo48i : Idhwo48i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIdhwO16o48i4o : IdhwO16o48i4o; else @@ -180,7 +180,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? gIwo48i : Iwo48i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIwO16o48i4o : IwO16o48i4o; else @@ -197,7 +197,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, UNUSED(is_2d); if (no_vnni_format) wei_tag = with_groups ? gIhwo48i : Ihwo48i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIhwO16o48i4o : IhwO16o48i4o; else @@ -214,7 +214,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo32i : Idhwo32i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIdhwO16o32i4o : IdhwO16o32i4o; else @@ -229,7 +229,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? 
gIwo32i : Iwo32i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIwO16o32i4o : IwO16o32i4o; else @@ -246,7 +246,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, UNUSED(is_2d); if (no_vnni_format) wei_tag = with_groups ? gIhwo32i : Ihwo32i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIhwO16o32i4o : IhwO16o32i4o; else @@ -263,7 +263,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo24i : Idhwo24i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIdhwO24i4o : IdhwO24i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? gIdhwO24i2o : IdhwO24i2o; @@ -272,7 +272,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? gIwo24i : Iwo24i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIwO24i4o : IwO24i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? gIwO24i2o : IwO24i2o; @@ -284,7 +284,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (no_vnni_format) wei_tag = with_groups ? gIhwo24i : Ihwo24i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIhwO24i4o : IhwO24i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? gIhwO24i2o : IhwO24i2o; @@ -295,7 +295,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo16i : Idhwo16i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? 
gIdhwO16o16i4o : IdhwO16o16i4o; else @@ -310,7 +310,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? gIwo16i : Iwo16i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIwO16o16i4o : IwO16o16i4o; else @@ -328,7 +328,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (no_vnni_format) wei_tag = with_groups ? gIhwo16i : Ihwo16i; - else if (jcp.wei_dt == s8) { + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) { if (jcp.is_oc_padded) wei_tag = with_groups ? gIhwO16o16i4o : IhwO16o16i4o; else @@ -345,7 +345,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (is_3d) { if (no_vnni_format) wei_tag = with_groups ? gIdhwo8i : Idhwo8i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIdhwO8i4o : IdhwO8i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? gIdhwO8i2o : IdhwO8i2o; @@ -354,7 +354,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, } else if (is_1d) { if (no_vnni_format) wei_tag = with_groups ? gIwo8i : Iwo8i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIwO8i4o : IwO8i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? gIwO8i2o : IwO8i2o; @@ -366,7 +366,7 @@ status_t pick_tags(jit_brgemm_conv_conf_t &jcp, memory_desc_t &diff_dst_md, if (no_vnni_format) wei_tag = with_groups ? gIhwo8i : Ihwo8i; - else if (jcp.wei_dt == s8) + else if (one_of(jcp.wei_dt, s8, f8_e5m2, f8_e4m3)) wei_tag = with_groups ? gIhwO8i4o : IhwO8i4o; else if (one_of(jcp.wei_dt, bf16, f16)) wei_tag = with_groups ? 
gIhwO8i2o : IhwO8i2o; @@ -1500,8 +1500,8 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, VDISPATCH_CONV_IC(!jcp.is_bf32, VERBOSE_UNSUPPORTED_DT); - const data_type_t last_oc_block_dt - = get_mac_emu_data_type(jcp.wei_dt, isa, isa == avx512_core_fp16); + const data_type_t last_oc_block_dt = get_mac_emu_data_type( + jcp.wei_dt, isa, isa == avx512_core_fp16 && !jcp.is_fp8_convert); jcp.vnni_block = data_type_vnni_granularity(last_oc_block_dt); // TODO: optimize grouped convolutions with small oc From 48b44e0402998686eef595e8ae73f8c18af6cd79 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Mon, 1 Apr 2024 10:53:35 -0700 Subject: [PATCH 038/187] x64: brgemm bwd_d strided conv: support fp8 calculation --- src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp index d5ae7618d3e..e76321eb1f0 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp @@ -56,7 +56,9 @@ static bool impl_supports_datatype(data_type_t data_type) { case data_type::f32: case data_type::s32: case data_type::s8: - case data_type::u8: return true; + case data_type::u8: + case data_type::f8_e5m2: + case data_type::f8_e4m3: return true; default: return false; } } @@ -103,6 +105,12 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( with_bias(), one_of(bias_md_.data_type, f32, s32, s8, u8)) && is_deconv /* only deconv uses int8 */; + const bool is_fp8_supported = one_of(wei_type, f8_e5m2, f8_e4m3) + && one_of(diff_dst_type, f8_e5m2, f8_e4m3) + && one_of(diff_src_type, wei_type, f32, f8_e5m2, f8_e4m3) + && IMPLICATION( + with_bias(), one_of(bias_md_.data_type, f32, wei_type)); + VDISPATCH_CONV(is_bwd_d(), VERBOSE_BAD_PROPKIND); VDISPATCH_CONV( impl_supports_datatype(diff_src_type), VERBOSE_UNSUPPORTED_DT); @@ -110,7 +118,7 @@ status_t 
brgemm_convolution_bwd_strided_t::pd_t::init( VDISPATCH_CONV( impl_supports_datatype(diff_dst_type), VERBOSE_UNSUPPORTED_DT); VDISPATCH_CONV(one_of(true, is_f32_supported, is_xf16_supported, - is_int8_supported), + is_int8_supported, is_fp8_supported), VERBOSE_UNSUPPORTED_DT); VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), VERBOSE_BAD_ALGORITHM); From ecda5fa99b6b4b922d16d8377761d3adb50bfa01 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 13 Mar 2024 10:38:05 -0700 Subject: [PATCH 039/187] cpu: conv list: add brgemm implementations for fwd and bwd_d fp8 --- src/cpu/cpu_convolution_list.cpp | 72 ++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index cf8abd17717..bf06adf2b06 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * Copyright 2020-2023 Arm Ltd. 
and affiliates * Copyright 2020-2024 FUJITSU LIMITED * @@ -77,6 +77,36 @@ namespace { using namespace dnnl::impl::data_type; using namespace dnnl::impl::prop_kind; +#define BRGEMM_FP8_FWD_CONVS(dtsrc, dtwei, dtdst) \ + { \ + {forward, dtsrc, dtwei, dtdst}, { \ + CPU_INSTANCE_AMX( \ + brgemm_1x1_convolution_fwd_t) \ + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) \ + CPU_INSTANCE_AMX( \ + brgemm_convolution_fwd_t) \ + CPU_INSTANCE(ref_convolution_fwd_t) nullptr, \ + } \ + } + +#define BRGEMM_FP8_BWD_D_CONVS(dtsrc, dtwei, dtdst) \ + { \ + {backward_data, dtsrc, dtwei, dtdst}, \ + REG_BWD_D_PK({ \ + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t< \ + avx10_1_512_amx_fp16>) \ + CPU_INSTANCE_AMX( \ + brgemm_convolution_bwd_strided_t< \ + avx10_1_512_amx_fp16>) \ + CPU_INSTANCE_AMX( \ + brgemm_convolution_bwd_strided_t< \ + avx10_1_512_amx_fp16, \ + true>) \ + CPU_INSTANCE( \ + ref_convolution_bwd_data_t) nullptr, \ + }) \ + } + // clang-format off const std::map> &impl_list_map() { static const std::map> the_map = REG_CONV_P({ @@ -192,22 +222,22 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, - {{forward, f8_e5m2, f8_e5m2, f32}, { - CPU_INSTANCE(ref_convolution_fwd_t) - nullptr, - }}, - {{forward, f8_e5m2, f8_e5m2, f8_e5m2}, { - CPU_INSTANCE(ref_convolution_fwd_t) - nullptr, - }}, - {{forward, f8_e4m3, f8_e4m3, f32}, { - CPU_INSTANCE(ref_convolution_fwd_t) - nullptr, - }}, - {{forward, f8_e4m3, f8_e4m3, f8_e4m3}, { - CPU_INSTANCE(ref_convolution_fwd_t) - nullptr, - }}, + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e5m2, f16), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e5m2, f32), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e5m2, f8_e5m2), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e5m2, f8_e4m3), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e4m3, f16), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e4m3, f32), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e4m3, f8_e5m2), + BRGEMM_FP8_FWD_CONVS(f8_e5m2, f8_e4m3, f8_e4m3), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e5m2, f16), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, 
f8_e5m2, f32), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e5m2, f8_e5m2), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e5m2, f8_e4m3), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e4m3, f16), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e4m3, f32), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e4m3, f8_e5m2), + BRGEMM_FP8_FWD_CONVS(f8_e4m3, f8_e4m3, f8_e4m3), // BWD_D fp {{backward_data, f32, f32, f32}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) @@ -300,6 +330,14 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, + BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e4m3, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e4m3, f8_e4m3), // BWD_W fp {{backward_weights, f32, f32, f32}, REG_BWD_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) From 3d54500c4c54064113e80aabe17ccfe1608b326d Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Mon, 6 May 2024 14:46:36 -0700 Subject: [PATCH 040/187] gpu: intel: update bool dev_getenv to allow any nonzero value --- src/gpu/intel/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/intel/utils.hpp b/src/gpu/intel/utils.hpp index 14af7c383fa..fc42b752fa9 100644 --- a/src/gpu/intel/utils.hpp +++ b/src/gpu/intel/utils.hpp @@ -144,7 +144,7 @@ inline int dev_getenv(const char *name, int default_value) { } inline bool dev_getenv(const char *s, bool def) { - return dev_getenv(s, def ? 1 : 0) == 1; + return dev_getenv(s, def ? 
1 : 0) != 0; } inline std::string dev_getenv(const char *s, const std::string &def) { From 351bf11c2f5f4da71e3c82bce9682ff6e1d838a8 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Tue, 7 May 2024 10:22:03 -0700 Subject: [PATCH 041/187] gpu: intel: ocl: reduction: Use subgroup shuffles instead of SLM --- .../intel/ocl/reduction/combined_reduction.cl | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 63b0f51b7f3..07a0a24f7ae 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -180,17 +180,16 @@ combined_reduce( // Potentially accumulate within the subgroup too // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases - local VECT_DEF_ACC_DATA_T local_acc[LWS_SIZE]; - const int local_idx = (sgid * SUBGROUP_SIZE + sglid) % LWS_SIZE; - local_acc[local_idx] = acc; - if (sglid < INNER_DIM_SIZE) { - unroll_for(int i = 1; i < inner_dims_per_sg; i++) { - unroll_for(int v = 0; v < VECT_DT_N; v++) { - GET_ELEM(acc, v) = reduce(SECONDARY_REDUCTION_ALG, - GET_ELEM(acc, v), - GET_ELEM(local_acc[local_idx + i * INNER_DIM_SIZE], v), - POWER); - } + VECT_DEF_ACC_DATA_T acc_sg; + init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); + unroll_for(int i = 0; i < inner_dims_per_sg; i++) { + VECT_DEF_ACC_DATA_T zero + = AS_VECT_DEF_ACC_DATA_T(SPECIAL(DEF_ACC_DATA_T, zero)); + VECT_DEF_ACC_DATA_T next = intel_sub_group_shuffle_down( + GET_ELEM(acc, i), zero, i * INNER_DIM_SIZE); + unroll_for(int v = 0; v < VECT_DT_N; v++) { + GET_ELEM(acc_sg, v) = reduce(SECONDARY_REDUCTION_ALG, + GET_ELEM(acc_sg, v), GET_ELEM(next, v), POWER); } } @@ -199,12 +198,12 @@ combined_reduce( DEF_ACC_DATA_T final_acc; init_acc(SECONDARY_REDUCTION_ALG, &final_acc); for (int i = 0; i < VECT_DT_N; i++) { - final_acc - = reduce(SECONDARY_REDUCTION_ALG, acc[i], final_acc, POWER); + final_acc = 
reduce( + SECONDARY_REDUCTION_ALG, acc_sg[i], final_acc, POWER); } #else // Just rename the variable to match the REDUCE_VECTOR case - const VECT_DEF_ACC_DATA_T final_acc = acc; + const VECT_DEF_ACC_DATA_T final_acc = acc_sg; #endif // REDUCE_VECTOR // For each result: From b8366f713b170ed70886657685bf6b06aee19f41 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Tue, 7 May 2024 11:10:36 -0700 Subject: [PATCH 042/187] gpu: intel: ocl: reduction: implement persistent threading over outer dimensions --- .../intel/ocl/reduction/combined_reduction.cl | 210 +++++++++--------- .../ocl/reduction/combined_reduction.cpp | 27 +++ .../ocl/reduction/combined_reduction.hpp | 2 +- 3 files changed, 135 insertions(+), 104 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 07a0a24f7ae..6baa7476a67 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -18,9 +18,6 @@ #include "gpu/intel/ocl/ocl_types.h" #include "gpu/intel/ocl/reduction/ocl_reduction.h" -#define BLOCK_READ_DATA_T(data_ptr) \ - AS_VECT_DATA_T(VECT_BLOCK_READ((const __global BLOCK_DATA_T *)data_ptr)) - // Define how to read data #define BLOCK_READ_DATA_T(data_ptr) \ AS_VECT_DATA_T(VECT_BLOCK_READ((const __global BLOCK_DATA_T *)data_ptr)) @@ -139,7 +136,6 @@ combined_reduce( const int sgid = get_global_id(0) / SUBGROUP_SIZE; const int inner_idx_start = (sgid % sg_per_inner_dim) * SUBGROUP_SIZE * VECT_DT_N; - const int outer_idx = sgid / sg_per_inner_dim; // Handle inner vector packing into subgroups const int sglid = get_sub_group_local_id(); @@ -151,135 +147,143 @@ combined_reduce( || sglid >= INNER_DIM_SIZE * inner_dims_per_sg) return; - VECT_DEF_ACC_DATA_T acc; - init_acc(REDUCTION_ALG, &acc); const int loop_stride = _SRC_OFF( 0, inner_dims_per_sg * (REDUCE_VECTOR ? VECT_DT_N : 1), 0); - int src_off = _SRC_OFF(outer_idx, WITH_BLOCK_READ ? 
0 : red_off, - WITH_BLOCK_READ ? inner_idx_start : inner_idx); - __attribute__((opencl_unroll_hint(UNROLL_FACTOR))) // attr:no-format - for (int off = 0; off < num_horiz_reductions; - off++, src_off += loop_stride) { - // Load - const VECT_DATA_T src_val = READ_DATA(src[src_off]); - // Accumulate - unroll_for(int i = 0; i < VECT_DT_N; i++) GET_ELEM(acc, i) - = reduce(REDUCTION_ALG, GET_ELEM(acc, i), + unroll_for(int oid = 0; oid < OUTER_TILE_SIZE; oid++) { + const int outer_idx = sgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; + VECT_DEF_ACC_DATA_T acc; + init_acc(REDUCTION_ALG, &acc); + + int src_off = _SRC_OFF(outer_idx, WITH_BLOCK_READ ? 0 : red_off, + WITH_BLOCK_READ ? inner_idx_start : inner_idx); + __attribute__((opencl_unroll_hint(UNROLL_FACTOR))) // attr:no-format + for (int off = 0; off < num_horiz_reductions; + off++, src_off += loop_stride) { + // Load + const VECT_DATA_T src_val = READ_DATA(src[src_off]); + + // Accumulate + unroll_for(int i = 0; i < VECT_DT_N; i++) { + GET_ELEM(acc, i) = reduce(REDUCTION_ALG, GET_ELEM(acc, i), GET_ELEM(AS_VECT_DEF_ACC_DATA_T(src_val), i), POWER); - } - if (red_off < tail_reductions) { - // Load - const VECT_DATA_T src_val = READ_DATA(src[src_off]); + } + } + if (red_off < tail_reductions) { + // Load + const VECT_DATA_T src_val = READ_DATA(src[src_off]); - // Accumulate - unroll_for(int i = 0; i < VECT_DT_N; i++) GET_ELEM(acc, i) - = reduce(REDUCTION_ALG, GET_ELEM(acc, i), + // Accumulate + unroll_for(int i = 0; i < VECT_DT_N; i++) { + GET_ELEM(acc, i) = reduce(REDUCTION_ALG, GET_ELEM(acc, i), GET_ELEM(AS_VECT_DEF_ACC_DATA_T(src_val), i), POWER); - } + } + } - // Potentially accumulate within the subgroup too - // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases - VECT_DEF_ACC_DATA_T acc_sg; - init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); - unroll_for(int i = 0; i < inner_dims_per_sg; i++) { - VECT_DEF_ACC_DATA_T zero - = AS_VECT_DEF_ACC_DATA_T(SPECIAL(DEF_ACC_DATA_T, zero)); - VECT_DEF_ACC_DATA_T 
next = intel_sub_group_shuffle_down( - GET_ELEM(acc, i), zero, i * INNER_DIM_SIZE); - unroll_for(int v = 0; v < VECT_DT_N; v++) { - GET_ELEM(acc_sg, v) = reduce(SECONDARY_REDUCTION_ALG, - GET_ELEM(acc_sg, v), GET_ELEM(next, v), POWER); + // Potentially accumulate within the subgroup too + // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases + VECT_DEF_ACC_DATA_T acc_sg; + init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); + unroll_for(int i = 0; i < inner_dims_per_sg; i++) { + VECT_DEF_ACC_DATA_T zero + = AS_VECT_DEF_ACC_DATA_T(SPECIAL(DEF_ACC_DATA_T, zero)); + VECT_DEF_ACC_DATA_T next = intel_sub_group_shuffle_down( + GET_ELEM(acc, i), zero, i * INNER_DIM_SIZE); + unroll_for(int v = 0; v < VECT_DT_N; v++) { + GET_ELEM(acc_sg, v) = reduce(SECONDARY_REDUCTION_ALG, + GET_ELEM(acc_sg, v), GET_ELEM(next, v), POWER); + } } - } - if (sglid < INNER_DIM_SIZE) { + if (sglid < INNER_DIM_SIZE) { #if REDUCE_VECTOR - DEF_ACC_DATA_T final_acc; - init_acc(SECONDARY_REDUCTION_ALG, &final_acc); - for (int i = 0; i < VECT_DT_N; i++) { - final_acc = reduce( - SECONDARY_REDUCTION_ALG, acc_sg[i], final_acc, POWER); - } + DEF_ACC_DATA_T final_acc; + init_acc(SECONDARY_REDUCTION_ALG, &final_acc); + unroll_for(int i = 0; i < VECT_DT_N; i++) { + final_acc = reduce( + SECONDARY_REDUCTION_ALG, acc_sg[i], final_acc, POWER); + } #else - // Just rename the variable to match the REDUCE_VECTOR case - const VECT_DEF_ACC_DATA_T final_acc = acc_sg; + // Just rename the variable to match the REDUCE_VECTOR case + const VECT_DEF_ACC_DATA_T final_acc = acc_sg; #endif // REDUCE_VECTOR - // For each result: - // 1. (if IS_FINAL) finalize the result - // 2. (if IS_FINAL) apply post-ops - // 3. write to dst - for (int i = 0; i < FINAL_VEC_SIZE; i++) { - const dim_t dst_off - = _DST_OFF(outer_idx, inner_idx + i * SUBGROUP_SIZE); - // finalize the result + // For each result: + // 1. (if IS_FINAL) finalize the result + // 2. (if IS_FINAL) apply post-ops + // 3. 
write to dst + for (int i = 0; i < FINAL_VEC_SIZE; i++) { + const dim_t dst_off + = _DST_OFF(outer_idx, inner_idx + i * SUBGROUP_SIZE); + // finalize the result #if IS_FINAL - float res = finalize(REDUCTION_ALG, - convert_float(GET_FINAL(final_acc, i)), DIV, POWER, EPS); + float res = finalize(REDUCTION_ALG, + convert_float(GET_FINAL(final_acc, i)), DIV, POWER, + EPS); - // Apply post-ops + // Apply post-ops #if WITH_POST_OP - float dst_val; + float dst_val; #if WITH_SUM - dst_val = DST_TO_REF(dst[dst_off]); + dst_val = DST_TO_REF(dst[dst_off]); #endif // WITH_SUM - // Reconstruct MB/C/D/H/W indices from dst_off - const int mb = (DST_S0 == 0) - ? 0 - : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 - + dst_off / DST_SB0 % DST_B0; - const int c = (DST_S1 == 0) - ? 0 - : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 - + dst_off / DST_SB1 % DST_B1; - const int d = (DST_S2 == 0) - ? 0 - : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 - + dst_off / DST_SB2 % DST_B2; - const int h = (DST_S3 == 0) - ? 0 - : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 - + dst_off / DST_SB3 % DST_B3; - const int w = (DST_S4 == 0) - ? 0 - : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 - + dst_off / DST_SB4 % DST_B4; + // Reconstruct MB/C/D/H/W indices from dst_off + const int mb = (DST_S0 == 0) + ? 0 + : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 + + dst_off / DST_SB0 % DST_B0; + const int c = (DST_S1 == 0) + ? 0 + : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 + + dst_off / DST_SB1 % DST_B1; + const int d = (DST_S2 == 0) + ? 0 + : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 + + dst_off / DST_SB2 % DST_B2; + const int h = (DST_S3 == 0) + ? 0 + : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 + + dst_off / DST_SB3 % DST_B3; + const int w = (DST_S4 == 0) + ? 
0 + : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 + + dst_off / DST_SB4 % DST_B4; - // Only use post-ops on non-zero-padded elements - if (mb < DST_D0 && c < DST_D1 && d < DST_D2 && h < DST_D3 - && w < DST_D4) { - APPLY_POST_OPS_SERIAL(res, float, dst_val, float, mb, 1, c, 1, - d, 1, h, 1, w, 1, 0, 1); - } + // Only use post-ops on non-zero-padded elements + if (mb < DST_D0 && c < DST_D1 && d < DST_D2 && h < DST_D3 + && w < DST_D4) { + APPLY_POST_OPS_SERIAL(res, float, dst_val, float, mb, 1, c, + 1, d, 1, h, 1, w, 1, 0, 1); + } #endif // WITH_POST_OP #else - float res = GET_FINAL(final_acc, i); + float res = GET_FINAL(final_acc, i); #endif // IS_FINAL - // Write to dst - if (is_dst_zero_padded(dst_off)) res = 0.0f; - dst[dst_off] = IS_FINAL ? TO_DST(res) : res; + // Write to dst + if (is_dst_zero_padded(dst_off)) res = 0.0f; + dst[dst_off] = IS_FINAL ? TO_DST(res) : res; - // Reduced + zero-padded dims need extra zeros written + // Reduced + zero-padded dims need extra zeros written #if DST_Z0_IS_REDUCED && DST_Z1_IS_REDUCED - for (int i = 0; i < DST_Z0_SIZE0; i++) { - for (int j = 0; j < DST_Z1_SIZE0; j++) { - if (i == 0 && j == 0) continue; - dst[dst_off + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0] - = TO_DST(0.0f); + for (int i = 0; i < DST_Z0_SIZE0; i++) { + for (int j = 0; j < DST_Z1_SIZE0; j++) { + if (i == 0 && j == 0) continue; + dst[dst_off + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0] + = TO_DST(0.0f); + } } - } #elif DST_Z0_IS_REDUCED - for (int i = 1; i < DST_Z0_SIZE0; i++) { - dst[dst_off + i * DST_Z0_STRIDE0] = TO_DST(0.0f); - } + for (int i = 1; i < DST_Z0_SIZE0; i++) { + dst[dst_off + i * DST_Z0_STRIDE0] = TO_DST(0.0f); + } #elif DST_Z1_IS_REDUCED - for (int j = 1; j < DST_Z1_SIZE0; j++) { - dst[dst_off + j * DST_Z1_STRIDE0] = TO_DST(0.0f); - } + for (int j = 1; j < DST_Z1_SIZE0; j++) { + dst[dst_off + j * DST_Z1_STRIDE0] = TO_DST(0.0f); + } #endif + } } } } diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp 
b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 4ac53887776..27a78520925 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -17,9 +17,11 @@ #include "gpu/intel/ocl/reduction/combined_reduction.hpp" #include "common/c_types_map.hpp" #include "gpu/intel/block_structure.hpp" +#include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" #include "gpu/intel/ocl/reduction/reduction_utils.hpp" +#include "gpu/intel/utils.hpp" namespace dnnl { namespace impl { @@ -135,6 +137,29 @@ reduction_phase_conf_t::reduction_phase_conf_t( if (!reduce_vector) num_subgroups /= vect_size; + // Increase num_outer_idxs to use persistent threading to reduce the number of subgroups + // and avoid overdispatching + outer_tile_size = [this, &compute_engine, &num_EU, &large_grf_mode, + &num_subgroups]() -> int { + compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch(); + int threads_per_eu = large_grf_mode + ? 
4 + : compute::device_info_t::threads_per_eu(arch); + int num_threads = num_EU * threads_per_eu; + + // Enable >1 block sizes only for PVC+, to avoid oldest-first thread arbitration + dim_t block_size = 1; + if (arch >= compute::gpu_arch_t::xe_hpc) { + block_size = num_subgroups / num_threads; + block_size = get_previous_factor(outer_block.block, block_size); + } + return gpu_utils::dev_getenv("combined_reduction_num_outer", + gpu_utils::into(block_size)); + }(); + gpu_assert(outer_block.block % outer_tile_size == 0) + << "Invalid choice of persistent thread outer idxs"; + num_subgroups /= outer_tile_size; + // Compute the number of threads per EU - this has no major impact // on average time, but can improve the best times on // close-to-cache-size problems with high parallelism @@ -397,6 +422,8 @@ static status_t init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.define_int("REDUCTION_SIZE", phase.reduction_block.block); kernel_ctx.define_int("INNER_DIM_SIZE", phase.inner_block.block); + kernel_ctx.define_int("OUTER_TILE_SIZE", phase.outer_tile_size); + kernel_ctx.define_int("IS_FINAL", phase.is_final); kernel_ctx.define_int("IS_FIRST", phase.is_first); diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.hpp b/src/gpu/intel/ocl/reduction/combined_reduction.hpp index a1a2ab9c28b..ce6e6768119 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.hpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.hpp @@ -38,7 +38,7 @@ struct reduction_phase_conf_t : public reduction_subproblem_t { data_type_t src_type, dst_type; compute::nd_range_t nd_range; - int vect_size; + int vect_size, outer_tile_size; bool reduce_vector; bool is_final, is_first; int subgroup_size; From 6d826427dfc8303277f48e4e5d292005ab866267 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Tue, 7 May 2024 13:14:08 -0700 Subject: [PATCH 043/187] gpu: intel: ocl: reduction: generalize vectors to use arrays --- .../intel/ocl/reduction/combined_reduction.cl | 82 
++++++++----------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 6baa7476a67..ecce525f0e3 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -87,12 +87,6 @@ dim_t dst_off_w_zero_padding(dim_t outer, dim_t inner) { #define _DST_OFF(outer, inner) dst_off_w_zero_padding(outer, inner) -#if REDUCE_VECTOR -#define FINAL_VEC_SIZE 1 -#else -#define FINAL_VEC_SIZE VECT_DT_N -#endif - #if NUM_DST_ZPAD == 0 #define PADDED_NELEMS OUTER_SIZE *INNER_DIM_SIZE #elif NUM_DST_ZPAD == 1 @@ -103,19 +97,6 @@ dim_t dst_off_w_zero_padding(dim_t outer, dim_t inner) { *DST_Z1_SIZE1 #endif -#if VECT_DT_N == 1 -#define GET_ELEM(vect, idx) vect -#else -#define GET_ELEM(vect, idx) vect[idx] -#endif - -// If reducing or not using vectorization, we can't access with an index -#if !REDUCE_VECTOR && VECT_DT_N > 1 -#define GET_FINAL(x, idx) x[idx] -#else -#define GET_FINAL(x, idx) x -#endif - // Specifying wg size since larger work groups reduce performance. // TODO: Look into why this is the case __attribute__((reqd_work_group_size(LWS_SIZE, 1, 1))) // attr:no-format @@ -152,8 +133,10 @@ combined_reduce( unroll_for(int oid = 0; oid < OUTER_TILE_SIZE; oid++) { const int outer_idx = sgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; - VECT_DEF_ACC_DATA_T acc; - init_acc(REDUCTION_ALG, &acc); + DEF_ACC_DATA_T acc[VECT_DT_N]; + unroll_for(int v = 0; v < VECT_DT_N; v++) { + init_acc(REDUCTION_ALG, &acc[v]); + } int src_off = _SRC_OFF(outer_idx, WITH_BLOCK_READ ? 0 : red_off, WITH_BLOCK_READ ? 
inner_idx_start : inner_idx); @@ -162,64 +145,69 @@ combined_reduce( off++, src_off += loop_stride) { // Load const VECT_DATA_T src_val = READ_DATA(src[src_off]); + const DATA_T *next_val = (DATA_T *)&src_val; // Accumulate - unroll_for(int i = 0; i < VECT_DT_N; i++) { - GET_ELEM(acc, i) = reduce(REDUCTION_ALG, GET_ELEM(acc, i), - GET_ELEM(AS_VECT_DEF_ACC_DATA_T(src_val), i), POWER); + unroll_for(int v = 0; v < VECT_DT_N; v++) { + acc[v] = reduce(REDUCTION_ALG, acc[v], + TO_DEF_ACC_DATA_T(next_val[v]), POWER); } } if (red_off < tail_reductions) { // Load const VECT_DATA_T src_val = READ_DATA(src[src_off]); + const DATA_T *next_val = (DATA_T *)&src_val; // Accumulate - unroll_for(int i = 0; i < VECT_DT_N; i++) { - GET_ELEM(acc, i) = reduce(REDUCTION_ALG, GET_ELEM(acc, i), - GET_ELEM(AS_VECT_DEF_ACC_DATA_T(src_val), i), POWER); + unroll_for(int v = 0; v < VECT_DT_N; v++) { + acc[v] = reduce(REDUCTION_ALG, acc[v], + TO_DEF_ACC_DATA_T(next_val[v]), POWER); } } // Potentially accumulate within the subgroup too // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases - VECT_DEF_ACC_DATA_T acc_sg; - init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); + DEF_ACC_DATA_T acc_sg[VECT_DT_N]; + for (int v = 0; v < VECT_DT_N; v++) { + init_acc(SECONDARY_REDUCTION_ALG, &acc_sg[v]); + } unroll_for(int i = 0; i < inner_dims_per_sg; i++) { - VECT_DEF_ACC_DATA_T zero - = AS_VECT_DEF_ACC_DATA_T(SPECIAL(DEF_ACC_DATA_T, zero)); - VECT_DEF_ACC_DATA_T next = intel_sub_group_shuffle_down( - GET_ELEM(acc, i), zero, i * INNER_DIM_SIZE); unroll_for(int v = 0; v < VECT_DT_N; v++) { - GET_ELEM(acc_sg, v) = reduce(SECONDARY_REDUCTION_ALG, - GET_ELEM(acc_sg, v), GET_ELEM(next, v), POWER); + DEF_ACC_DATA_T next = intel_sub_group_shuffle_down(acc[v], + SPECIAL(DEF_ACC_DATA_T, zero), i * INNER_DIM_SIZE); + acc_sg[v] = reduce( + SECONDARY_REDUCTION_ALG, acc_sg[v], next, POWER); } } if (sglid < INNER_DIM_SIZE) { + const int final_vec_size = REDUCE_VECTOR ? 
1 : VECT_DT_N; #if REDUCE_VECTOR - DEF_ACC_DATA_T final_acc; - init_acc(SECONDARY_REDUCTION_ALG, &final_acc); - unroll_for(int i = 0; i < VECT_DT_N; i++) { - final_acc = reduce( - SECONDARY_REDUCTION_ALG, acc_sg[i], final_acc, POWER); + DEF_ACC_DATA_T final_acc[1]; + init_acc(SECONDARY_REDUCTION_ALG, final_acc); + unroll_for(int v = 0; v < VECT_DT_N; v++) { + final_acc[0] = reduce(SECONDARY_REDUCTION_ALG, acc_sg[v], + final_acc[0], POWER); } #else // Just rename the variable to match the REDUCE_VECTOR case - const VECT_DEF_ACC_DATA_T final_acc = acc_sg; + DEF_ACC_DATA_T final_acc[VECT_DT_N]; + for (int v = 0; v < VECT_DT_N; v++) { + final_acc[v] = acc_sg[v]; + } #endif // REDUCE_VECTOR // For each result: // 1. (if IS_FINAL) finalize the result // 2. (if IS_FINAL) apply post-ops // 3. write to dst - for (int i = 0; i < FINAL_VEC_SIZE; i++) { + for (int v = 0; v < final_vec_size; v++) { const dim_t dst_off - = _DST_OFF(outer_idx, inner_idx + i * SUBGROUP_SIZE); + = _DST_OFF(outer_idx, inner_idx + v * SUBGROUP_SIZE); // finalize the result #if IS_FINAL - float res = finalize(REDUCTION_ALG, - convert_float(GET_FINAL(final_acc, i)), DIV, POWER, - EPS); + float res = finalize(REDUCTION_ALG, convert_float(final_acc[v]), + DIV, POWER, EPS); // Apply post-ops #if WITH_POST_OP @@ -258,7 +246,7 @@ combined_reduce( } #endif // WITH_POST_OP #else - float res = GET_FINAL(final_acc, i); + float res = final_acc[v]; #endif // IS_FINAL // Write to dst From 75034b9aa7c7e40c979569fde805d009b8d0c0cc Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Fri, 10 May 2024 13:06:03 -0700 Subject: [PATCH 044/187] gpu: jit: gemm: do not add ReqSum{A,B} tags for grouped quantization --- src/gpu/intel/jit/gemm/kernel_selector.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel_selector.cpp b/src/gpu/intel/jit/gemm/kernel_selector.cpp index 094f7b3fd26..e08fbabcdba 100644 --- a/src/gpu/intel/jit/gemm/kernel_selector.cpp +++ 
b/src/gpu/intel/jit/gemm/kernel_selector.cpp @@ -310,8 +310,10 @@ MatchParamsBase::MatchParamsBase(ngen::HW hw, const GEMMProblem &problem) { if (problem.batchDims > 1) *tagPtr++ = ReqBatchMultiDim; } - if (problem.aOffset != ABOffset::None) *tagPtr++ = ReqSumB; - if (problem.bOffset != ABOffset::None) *tagPtr++ = ReqSumA; + if (problem.aOffset != ABOffset::None && problem.aoPtrDims < 2) + *tagPtr++ = ReqSumB; + if (problem.bOffset != ABOffset::None && problem.boPtrDims < 2) + *tagPtr++ = ReqSumA; if (hw == ngen::HW::Xe2) *tagPtr++ = ReqXe2Block2D; From e11d94d2e0d5b10a77d3c3fdf707622e337fe1df Mon Sep 17 00:00:00 2001 From: "Gu, Yonghao" Date: Mon, 29 Apr 2024 14:35:31 +0000 Subject: [PATCH 045/187] graph: backend: dnnl: fix the order of create arg indices --- src/graph/backend/dnnl/dnnl_op_def.hpp | 4 ++-- src/graph/backend/dnnl/op_executable.cpp | 13 ++++++------- .../gtests/graph/unit/backend/dnnl/test_reorder.cpp | 2 -- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/graph/backend/dnnl/dnnl_op_def.hpp b/src/graph/backend/dnnl/dnnl_op_def.hpp index 824c8c4da02..4fb1398becd 100644 --- a/src/graph/backend/dnnl/dnnl_op_def.hpp +++ b/src/graph/backend/dnnl/dnnl_op_def.hpp @@ -945,8 +945,8 @@ DNNL_GRAPH_OP_SCHEMA(dnnl_matmul, 1, DNNL_GRAPH_OP_SCHEMA(dnnl_softmax, 1, op_schema_t() - .set_inputs_option(op_schema_t::param_num_option::optional) - .set_num_inputs(std::set({1, 2})) + .set_inputs_option(op_schema_t::param_num_option::variadic) + .set_num_inputs(std::set({1, 32})) .set_num_outputs(2) .set_input(0, "input") .set_output(0, "output") diff --git a/src/graph/backend/dnnl/op_executable.cpp b/src/graph/backend/dnnl/op_executable.cpp index fd03c26a532..2a21693ff3d 100644 --- a/src/graph/backend/dnnl/op_executable.cpp +++ b/src/graph/backend/dnnl/op_executable.cpp @@ -1836,13 +1836,12 @@ static arg_indices_t get_arg_indices_for_siso_op( ? 
mgr.get_info(op->get_attr(op_attr::fusion_info_key)) : fusion_info_t(); + get_arg_indices_for_post_ops(op, mgr, arg_indices, index); if (fusion_info.with_runtime_scales(false, 0)) { arg_indices.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, indices_t {input, index++}}); } - get_arg_indices_for_post_ops(op, mgr, arg_indices, index); - // add output args arg_indices.insert({DNNL_ARG_TO, indices_t {output, 0}}); arg_indices.insert({DNNL_ARG_SCRATCHPAD, indices_t {output, 1}}); @@ -2173,11 +2172,6 @@ arg_indices_t reorder_executable_t::get_arg_indices( indices_t {input, index++}}); } - if (fusion_info.with_runtime_scales(false, 0)) { - arg_indices.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, - indices_t {input, index++}}); - } - if ((op->has_attr(op_attr::with_runtime_src_zps) && op->get_attr(op_attr::with_runtime_src_zps)) || fusion_info.with_runtime_zero_points(true, 0)) { @@ -2187,6 +2181,11 @@ arg_indices_t reorder_executable_t::get_arg_indices( get_arg_indices_for_post_ops(op, mgr, arg_indices, index); + if (fusion_info.with_runtime_scales(false, 0)) { + arg_indices.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, + indices_t {input, index++}}); + } + if ((op->has_attr(op_attr::with_runtime_dst_zps) && op->get_attr(op_attr::with_runtime_dst_zps)) || fusion_info.with_runtime_zero_points(false, 0)) { diff --git a/tests/gtests/graph/unit/backend/dnnl/test_reorder.cpp b/tests/gtests/graph/unit/backend/dnnl/test_reorder.cpp index 624a65f91bb..c9a6607a644 100644 --- a/tests/gtests/graph/unit/backend/dnnl/test_reorder.cpp +++ b/tests/gtests/graph/unit/backend/dnnl/test_reorder.cpp @@ -352,8 +352,6 @@ TEST(test_reorder_execute, Int8ReorderAdd) { | quant */ - // todo(xinyu): fix the case - GTEST_SKIP(); graph::engine_t *engine = get_engine(); std::vector int8_src {1, 2, 3, 4, 5, 6}; From d0fdaa8d59ff9fe8ce084141efb0ee1e3b001147 Mon Sep 17 00:00:00 2001 From: "Gu, Yonghao" Date: Mon, 29 Apr 2024 14:43:17 +0000 Subject: [PATCH 046/187] benchdnn: inputs: graph: add int8 softmax add 
case --- .../inputs/graph/pattern/harness_int8_all | 2 + .../graph/pattern/int8/int8_softmax_add.json | 228 ++++++++++++++++++ 2 files changed, 230 insertions(+) create mode 100644 tests/benchdnn/inputs/graph/pattern/int8/int8_softmax_add.json diff --git a/tests/benchdnn/inputs/graph/pattern/harness_int8_all b/tests/benchdnn/inputs/graph/pattern/harness_int8_all index cc6baaa35ac..44687a3c79b 100644 --- a/tests/benchdnn/inputs/graph/pattern/harness_int8_all +++ b/tests/benchdnn/inputs/graph/pattern/harness_int8_all @@ -115,3 +115,5 @@ #layernorm --reset --case=pattern/int8/int8_lnorm_gelu_quantize.json --reset --case=pattern/int8/int8_lnorm_multiply_quantize.json +#softmax +--reset --case=pattern/int8/int8_softmax_add.json diff --git a/tests/benchdnn/inputs/graph/pattern/int8/int8_softmax_add.json b/tests/benchdnn/inputs/graph/pattern/int8/int8_softmax_add.json new file mode 100644 index 00000000000..1c08eb37b2a --- /dev/null +++ b/tests/benchdnn/inputs/graph/pattern/int8/int8_softmax_add.json @@ -0,0 +1,228 @@ +{ + "version": "3.5.0", + "engine_kind": "cpu", + "fpmath_mode": "strict", + "input_ports": [ + 0 + ], + "output_ports": [ + 5 + ], + "graph": [ + { + "id": 0, + "name": "softmax", + "kind": "SoftMax", + "attrs": { + "axis": { + "type": "s64", + "value": 2 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "bf16", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 1, + "dtype": "bf16", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 1, + "name": "Add", + "kind": "Add", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 1, + "dtype": "bf16", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 2, + "dtype": "bf16", + 
"shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "constant" + } + ], + "outputs": [ + { + "id": 3, + "dtype": "bf16", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 2, + "name": "typecast", + "kind": "TypeCast", + "attrs": {}, + "inputs": [ + { + "id": 3, + "dtype": "bf16", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 4, + "dtype": "f32", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 3, + "name": "quantize", + "kind": "Quantize", + "attrs": { + "axis": { + "type": "s64", + "value": 1 + }, + "qtype": { + "type": "string", + "value": "per_tensor" + }, + "zps": { + "type": "s64[]", + "value": [ + 0 + ] + }, + "scales": { + "type": "f32[]", + "value": [ + 0.1 + ] + } + }, + "inputs": [ + { + "id": 4, + "dtype": "f32", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 5, + "dtype": "u8", + "shape": [ + 2, + 2, + 2 + ], + "stride": [ + 4, + 2, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + } + ] +} \ No newline at end of file From 8b00b456ad6ca221dc115dc4f8b4d4c94890cbed Mon Sep 17 00:00:00 2001 From: "Taylor, Deb" Date: Wed, 8 May 2024 15:24:09 -0500 Subject: [PATCH 047/187] doc: Added 2 new mathjax3 macros to imgmath_latex_preamble list Signed-off-by: Taylor, Deb --- doc/sphinx/conf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index e6a1e256c3f..42c0801eeaa 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -106,7 +106,10 @@ def whereis(binary): 
\\newcommand{\\diffdstiterc}{\\operatorname{diff\\_dst\\_iter\\_c}} \\newcommand{\\diffgamma}{\\operatorname{diff\\_\\gamma}} \\newcommand{\\diffbeta}{\\operatorname{diff\\_\\beta}} -\\newcommand{\\workspace}{\\operatorname{workspace}}''' +\\newcommand{\\workspace}{\\operatorname{workspace}} +\\newcommand{\\srcshape}{\\operatorname{src\\_\\shape}} +\\newcommand{\\dstshape}{\\operatorname{dst\\_\\shape}}''' + From 89387bad15adc27d1b7fc2f5c32512c889915318 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 08:27:08 +0000 Subject: [PATCH 048/187] github: workflows: bump ossf/scorecard-action from 2.3.1 to 2.3.3 Bumps [ossf/scorecard-action](https://github.com/ossf/scorecard-action) from 2.3.1 to 2.3.3. - [Release notes](https://github.com/ossf/scorecard-action/releases) - [Changelog](https://github.com/ossf/scorecard-action/blob/main/RELEASE.md) - [Commits](https://github.com/ossf/scorecard-action/compare/0864cf19026789058feabb7e87baa5f140aac736...dc50aa9510b46c811795eb24b2f1ba02a914e534) --- updated-dependencies: - dependency-name: ossf/scorecard-action dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/openssf-scorecard.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml index cae71214ecb..38cd2ab2d64 100644 --- a/.github/workflows/openssf-scorecard.yml +++ b/.github/workflows/openssf-scorecard.yml @@ -46,7 +46,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 + uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3 with: results_file: results.sarif results_format: sarif From 1344a14b4160d62a18dcc447a2c0f122f87b5b03 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 08:27:16 +0000 Subject: [PATCH 049/187] github: workflows: bump actions/checkout from 4.1.4 to 4.1.5 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.4 to 4.1.5. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/0ad4b8fadaa221de15dcec353f45205ec38ea70b...44c2b7a8a4ea60a981eaca3cf939b5f4305c123b) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/openssf-scorecard.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml index 38cd2ab2d64..450e69c0fbf 100644 --- a/.github/workflows/openssf-scorecard.yml +++ b/.github/workflows/openssf-scorecard.yml @@ -41,7 +41,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 + uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4.1.5 with: persist-credentials: false From 4f2360e1c8d56ddf38f45daab0e1b74e8ffcb100 Mon Sep 17 00:00:00 2001 From: Jonathan Deakin Date: Wed, 10 Apr 2024 10:24:30 +0000 Subject: [PATCH 050/187] src: cpu: aarch64: add ACL s8:s8:f32 matmul - Add acl_lowp_matmul_t which implements matmul for s8:s8:f32 - Bump minimum ACL version to 24.04 Co-authored-by: Milos Puzovic --- cmake/ACL.cmake | 4 +- src/cpu/aarch64/matmul/acl_lowp_matmul.hpp | 276 +++++++++++++++++++++ src/cpu/matmul/cpu_matmul_list.cpp | 4 +- 3 files changed, 281 insertions(+), 3 deletions(-) create mode 100644 src/cpu/aarch64/matmul/acl_lowp_matmul.hpp diff --git a/cmake/ACL.cmake b/cmake/ACL.cmake index 2fdd645557e..7321e5da863 100644 --- a/cmake/ACL.cmake +++ b/cmake/ACL.cmake @@ -1,5 +1,5 @@ # ****************************************************************************** -# Copyright 2020-2023 Arm Limited and affiliates. +# Copyright 2020-2024 Arm Limited and affiliates. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,7 +31,7 @@ endif() find_package(ACL REQUIRED) -set(ACL_MINIMUM_VERSION "23.11") +set(ACL_MINIMUM_VERSION "24.04") if(ACL_FOUND) file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed) diff --git a/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp b/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp new file mode 100644 index 00000000000..b8f2cf742e1 --- /dev/null +++ b/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp @@ -0,0 +1,276 @@ +/******************************************************************************* +* Copyright 2024 Arm Ltd. and affiliates +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef ACL_LOWP_MATMUL_HPP +#define ACL_LOWP_MATMUL_HPP + +#include "cpu/cpu_primitive.hpp" +#include "cpu/matmul/cpu_matmul_pd.hpp" +#include "cpu/matmul/matmul_utils.hpp" + +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "cpu/aarch64/acl_utils.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { +namespace matmul { + +struct acl_lowp_matmul_obj_t { + arm_compute::NEGEMMLowpMatrixMultiplyCore gemm; + arm_compute::Tensor src_tensor; + arm_compute::Tensor wei_tensor; + arm_compute::Tensor bia_tensor; + arm_compute::Tensor dst_tensor; +}; + +struct acl_lowp_matmul_conf_t { + arm_compute::TensorInfo src_tensor_info; + arm_compute::TensorInfo wei_tensor_info; + bool with_bias; + arm_compute::TensorInfo bia_tensor_info; + arm_compute::TensorInfo dst_tensor_info; +}; + +status_t configure_gemm( + acl_lowp_matmul_obj_t &acl_obj, const acl_lowp_matmul_conf_t &almc) { + + acl_obj.src_tensor.allocator()->init(almc.src_tensor_info); + acl_obj.wei_tensor.allocator()->init(almc.wei_tensor_info); + if (almc.with_bias) { + acl_obj.bia_tensor.allocator()->init(almc.bia_tensor_info); + } + acl_obj.dst_tensor.allocator()->init(almc.dst_tensor_info); + + acl_obj.gemm.configure(&acl_obj.src_tensor, &acl_obj.wei_tensor, + almc.with_bias ? 
&acl_obj.bia_tensor : nullptr, + &acl_obj.dst_tensor); + + return status::success; +} + +struct acl_lowp_matmul_resource_t : public resource_t { + acl_lowp_matmul_resource_t() + : acl_obj_(utils::make_unique()) {} + + status_t configure(const acl_lowp_matmul_conf_t &almc) { + + if (!acl_obj_) return status::out_of_memory; + + acl_obj_->src_tensor.allocator()->init(almc.src_tensor_info); + acl_obj_->wei_tensor.allocator()->init(almc.wei_tensor_info); + if (almc.with_bias) { + acl_obj_->bia_tensor.allocator()->init(almc.bia_tensor_info); + } + acl_obj_->dst_tensor.allocator()->init(almc.dst_tensor_info); + + acl_obj_->gemm.configure(&acl_obj_->src_tensor, &acl_obj_->wei_tensor, + almc.with_bias ? &acl_obj_->bia_tensor : nullptr, + &acl_obj_->dst_tensor); + + return status::success; + } + + acl_lowp_matmul_obj_t &get_acl_obj() const { return *acl_obj_; } + + DNNL_DISALLOW_COPY_AND_ASSIGN(acl_lowp_matmul_resource_t); + +private: + std::unique_ptr acl_obj_; +}; + +struct acl_lowp_matmul_t : public primitive_t { + struct pd_t : public dnnl::impl::cpu::matmul::cpu_matmul_pd_t { + + pd_t(const matmul_desc_t *adesc, const primitive_attr_t *attr, + const cpu_matmul_pd_t *hint_fwd_pd) + : cpu_matmul_pd_t(adesc, attr, hint_fwd_pd), almc_() {} + + using cpu_matmul_pd_t::cpu_matmul_pd_t; + + DECLARE_COMMON_PD_T( + "lowp_gemm:acl", acl_lowp_matmul_t, USE_GLOBAL_SCRATCHPAD); + + status_t init(engine_t *engine) { + + VDISPATCH_MATMUL( + set_default_formats(), "failed to set default formats"); + using smask_t = primitive_attr_t::skip_mask_t; + VDISPATCH_MATMUL(attr()->has_default_values(smask_t::scales_runtime + | smask_t::zero_points_runtime), + "only scale and zero point attrs supported"); + + // Note that has_default_values checks the argument for default zero + // points but skips the argument for scales. 
Hence they are the + // opposite but mean similar things + VDISPATCH_MATMUL(attr()->scales_.has_default_values( + {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}), + "only src and weights scales are supported"); + VDISPATCH_MATMUL( + attr()->zero_points_.has_default_values(DNNL_ARG_DST), + "only src and weights zero points are supported"); + + VDISPATCH_MATMUL(attr()->scales_.get(DNNL_ARG_SRC).mask_ == 0 + && attr()->zero_points_.get(DNNL_ARG_SRC) == 0 + && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0 + && attr()->zero_points_.get(DNNL_ARG_WEIGHTS) == 0, + "common scales and zero points only"); + + VDISPATCH_MATMUL(!has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + const memory_desc_wrapper src_d(src_md_); + const memory_desc_wrapper wei_d(weights_md_); + const memory_desc_wrapper bia_d(bias_md_); + const memory_desc_wrapper dst_d(dst_md_); + + using namespace data_type; + VDISPATCH_MATMUL(src_d.data_type() == s8 && wei_d.data_type() == s8 + && dst_d.data_type() == f32 + && utils::one_of(bia_d.data_type(), f32, undef), + VERBOSE_UNSUPPORTED_DT_CFG); + + VDISPATCH_MATMUL(src_d.matches_tag(format_tag::ab) + && wei_d.matches_tag(format_tag::ab) + && dst_d.matches_tag(format_tag::ab), + VERBOSE_UNSUPPORTED_TAG); + + VDISPATCH_MATMUL_SC( + memory_desc_init_by_tag(bias_md_, bias_md_.ndims, + bias_md_.dims, bias_md_.data_type, format_tag::ab), + VERBOSE_UNSUPPORTED_BIAS_CFG); + + // We set the QuantizationInfo to be dynamic because it is re-set in run() + almc_.src_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(K(), M()), 1, + arm_compute::DataType::QASYMM8_SIGNED, + arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.src_tensor_info.set_are_values_constant(false); + + almc_.wei_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(N(), K()), 1, + arm_compute::DataType::QASYMM8_SIGNED, + arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.wei_tensor_info.set_are_values_constant(false); + + almc_.bia_tensor_info = 
arm_compute::TensorInfo( + arm_compute::TensorShape(), 1, arm_compute::DataType::F32); + almc_.with_bias = bia_d.format_kind() != format_kind::undef; + if (almc_.with_bias) { + // This is not currently guarded in ACL + VDISPATCH_MATMUL(bia_d.ndims() == 2 && bia_d.dims()[0] == 1 + && bia_d.dims()[1] == N(), + "Only 1xN bias is supported"); + almc_.bia_tensor_info.set_tensor_shape(arm_compute::TensorShape( + bia_d.dims()[1], bia_d.dims()[0])); + } + + almc_.dst_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(N(), M()), + arm_compute::Format::F32); + + ACL_CHECK_VALID(arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( + &almc_.src_tensor_info, &almc_.wei_tensor_info, + almc_.with_bias ? &almc_.bia_tensor_info : nullptr, + &almc_.dst_tensor_info, arm_compute::GEMMInfo())); + + return status::success; + } + + acl_lowp_matmul_conf_t almc_; + }; + + acl_lowp_matmul_t(const pd_t *apd) : primitive_t(apd) {} + + status_t create_resource( + engine_t *engine, resource_mapper_t &mapper) const { + + if (mapper.has_resource(this)) return status::success; + + auto r = utils::make_unique(); + if (!r) return status::out_of_memory; + + CHECK(r->configure(pd()->almc_)); + + mapper.add(this, std::move(r)); + + return status::success; + } + + status_t execute(const exec_ctx_t &ctx) const { + std::lock_guard _lock {this->mtx}; + + bool with_bias = pd()->almc_.with_bias; + + acl_lowp_matmul_obj_t &acl_obj + = ctx.get_resource_mapper() + ->get(this) + ->get_acl_obj(); + + auto src = CTX_IN_MEM(const int8_t *, DNNL_ARG_SRC); + auto wei = CTX_IN_MEM(const int8_t *, DNNL_ARG_WEIGHTS); + auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST); + + acl_obj.src_tensor.allocator()->import_memory( + const_cast(src)); + acl_obj.wei_tensor.allocator()->import_memory( + const_cast(wei)); + if (with_bias) { + auto bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS); + acl_obj.bia_tensor.allocator()->import_memory( + const_cast(bias)); + } + acl_obj.dst_tensor.allocator()->import_memory(dst); + + 
DEFINE_ARG_SCALES_BUFFER(src_scale, DNNL_ARG_SRC); + DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC); + DEFINE_ARG_SCALES_BUFFER(wei_scale, DNNL_ARG_WEIGHTS); + DEFINE_ZERO_POINT_VALUE(wei_zero_point, DNNL_ARG_WEIGHTS); + + // Note that we set the offset to be -zero_point, this is a known + // inconsistency with most other operators in the ACL API + acl_obj.src_tensor.info()->set_quantization_info( + arm_compute::QuantizationInfo( + *src_scale, -src_zero_point, true)); + + acl_obj.wei_tensor.info()->set_quantization_info( + arm_compute::QuantizationInfo( + *wei_scale, -wei_zero_point, true)); + + acl_obj.gemm.run(); + + // free() here tells ACL it can no longer use it, it does not deallocate + acl_obj.src_tensor.allocator()->free(); + acl_obj.wei_tensor.allocator()->free(); + if (with_bias) { acl_obj.bia_tensor.allocator()->free(); } + acl_obj.dst_tensor.allocator()->free(); + + return status::success; + }; + +private: + mutable std::mutex mtx; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } +}; + +} // namespace matmul +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif // CPU_AARCH64_ACL_LOWP_MATMUL_HPP \ No newline at end of file diff --git a/src/cpu/matmul/cpu_matmul_list.cpp b/src/cpu/matmul/cpu_matmul_list.cpp index 10329f9232f..6a53d0920c6 100644 --- a/src/cpu/matmul/cpu_matmul_list.cpp +++ b/src/cpu/matmul/cpu_matmul_list.cpp @@ -1,7 +1,7 @@ /******************************************************************************* * Copyright 2019-2024 Intel Corporation * Copyright 2024 FUJITSU LIMITED -* Copyright 2021 Arm Ltd. and affiliates +* Copyright 2021-2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,6 +33,7 @@ using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/matmul/brgemm_matmul.hpp" #ifdef DNNL_AARCH64_USE_ACL +#include "cpu/aarch64/matmul/acl_lowp_matmul.hpp" #include "cpu/aarch64/matmul/acl_matmul.hpp" #endif using namespace dnnl::impl::cpu::aarch64::matmul; @@ -73,6 +74,7 @@ using namespace dnnl::impl::cpu::matmul; constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({ CPU_INSTANCE_AARCH64(brgemm_matmul_t) + CPU_INSTANCE_AARCH64_ACL(acl_lowp_matmul_t) CPU_INSTANCE_AARCH64_ACL(acl_matmul_t) CPU_INSTANCE_AARCH64(brgemm_matmul_t) CPU_INSTANCE_AMX(brgemm_matmul_t) From 0c354c3bf654729056b43f448440d10880bd3033 Mon Sep 17 00:00:00 2001 From: Radu Salavat Date: Wed, 28 Feb 2024 19:35:47 +0000 Subject: [PATCH 051/187] cpu: aarch64: matmul: fuse sum post op in acl matmul Fuse the sum post op in acl matmul by setting the accumulate flag to true in arm_compute::GEMMInfo. This speeds up the post op and saves allocating a temporary dst sized tensor. We also added `_for_sum` to `use_dst_acc` flag to stop it being confused with the `dst_acc` used for transposing. Change the way we deal with fused eltwise (as well as the new sum) to fix segfaults when binary ops followed fused ops. Co-authored-by: Milos Puzovic Co-authored-by: Jonathan Deakin --- src/cpu/aarch64/acl_post_ops.cpp | 4 +-- src/cpu/aarch64/acl_post_ops.hpp | 27 ++++++++--------- src/cpu/aarch64/matmul/acl_matmul.cpp | 12 ++++---- src/cpu/aarch64/matmul/acl_matmul.hpp | 33 ++++++++++++++++----- src/cpu/aarch64/matmul/acl_matmul_utils.cpp | 2 +- src/cpu/aarch64/matmul/acl_matmul_utils.hpp | 3 +- 6 files changed, 48 insertions(+), 33 deletions(-) diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp index ea4bb200ecf..0083bec6c23 100644 --- a/src/cpu/aarch64/acl_post_ops.cpp +++ b/src/cpu/aarch64/acl_post_ops.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Arm Ltd. 
and affiliates +* Copyright 2022-2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ namespace aarch64 { status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const { - int post_op_index = 0; + int post_op_index = post_op_start_index_; // As these are post ops, this src will also be our dst. If we have a sum // post op, the src/dst will start off in a temporary, then change to diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp index a19b8e74a81..727d4f06b07 100644 --- a/src/cpu/aarch64/acl_post_ops.hpp +++ b/src/cpu/aarch64/acl_post_ops.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Arm Ltd. and affiliates +* Copyright 2022-2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,9 @@ struct acl_post_ops_t { // init the acl_post_ops_t. 
Note that this function modifies the passed in // post ops by setting the preferred memory formats status_t init(engine_t *engine, post_ops_t &post_ops, - const memory_desc_t &dst_md) { + const memory_desc_t &dst_md, int post_op_start_index = 0) { + + post_op_start_index_ = post_op_start_index; CHECK(post_ops.set_default_formats(&dst_md)); dst_data_type = dst_md.data_type; @@ -41,7 +43,7 @@ struct acl_post_ops_t { sum_index = -1; post_op_primitives = {}; - for (int i = 0; i < post_ops.len(); i++) { + for (int i = post_op_start_index; i < post_ops.len(); i++) { auto &po = post_ops.entry_[i]; if (po.is_sum()) { @@ -135,7 +137,8 @@ struct acl_post_ops_t { // formats status_t init(engine_t *engine, post_ops_t &base_post_ops, const memory_desc_t &dst_md, - arm_compute::ActivationLayerInfo &act_info_to_fuse) { + arm_compute::ActivationLayerInfo &act_info_to_fuse, + int post_op_start_index = 0) { CHECK(base_post_ops.set_default_formats(&dst_md)); dst_data_type = dst_md.data_type; @@ -149,18 +152,11 @@ struct acl_post_ops_t { "eltwise post op scale must be 1 (no scale)"); CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse)); - // Copy all but the first, because it has been fused - post_ops_t post_ops; - for (int idx = 1; idx < base_post_ops.len(); ++idx) { - // Construct empty entry then copy, so that we can check for failure - post_ops.entry_.emplace_back(); - post_ops.entry_.back() = base_post_ops.entry_[idx]; - } - return init(engine, post_ops, dst_md); - + // post_op_start_index + 1 to skip the fused eltwise + return init(engine, base_post_ops, dst_md, post_op_start_index + 1); } else { // Nothing to fuse, just copy all post ops - return init(engine, base_post_ops, dst_md); + return init(engine, base_post_ops, dst_md, post_op_start_index); } } @@ -179,6 +175,9 @@ struct acl_post_ops_t { private: // Index of the sum post op if there is one, < 0 means no sum int sum_index = -1; + // Index of the first post op this primitive executes. 
This is typically the + // number of post ops which were fused. + int post_op_start_index_ = 0; data_type_t dst_data_type; // Vector of primitives used to execute the post ops. They are constructed // in init to be either acl_binary_t (for sum, add, sub, div, mul, min and diff --git a/src/cpu/aarch64/matmul/acl_matmul.cpp b/src/cpu/aarch64/matmul/acl_matmul.cpp index 4446ee4ae5d..91f342d44e3 100644 --- a/src/cpu/aarch64/matmul/acl_matmul.cpp +++ b/src/cpu/aarch64/matmul/acl_matmul.cpp @@ -34,7 +34,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const { bool is_transA = pd()->amp_.is_transA; bool is_transB = pd()->amp_.is_transB; bool do_transC = pd()->amp_.do_transC; - bool use_dst_acc = pd()->amp_.use_dst_acc; + bool use_dst_acc_for_sum = pd()->amp_.use_dst_acc_for_sum; std::lock_guard _lock {this->mtx}; auto *acl_resource = ctx.get_resource_mapper()->get(this); @@ -74,11 +74,11 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const { if (do_transC) { acl_obj.dst_acc_tensor.allocator()->allocate(); } } - // Put the result in a new tensor, if we have a sum post op. - // Result will be accumulated to the dst during the post ops. - auto dst_base = use_dst_acc ? scratchpad.get( + // If we have an unfused sum post op, put the result in a scratchpad tensor. + // Result will be summed to the dst during acl_post_ops.execute + auto dst_base = use_dst_acc_for_sum ? 
scratchpad.get( memory_tracking::names::key_matmul_dst_in_acc_dt) - : CTX_OUT_MEM(data_t *, DNNL_ARG_DST); + : CTX_OUT_MEM(data_t *, DNNL_ARG_DST); acl_obj.dst_tensor.allocator()->import_memory(dst_base); acl_obj.gemm.run(); @@ -91,7 +91,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const { if (is_transB) acl_obj.wei_acc_tensor.allocator()->free(); void *dst = acl_obj.dst_tensor.buffer(); - pd()->post_ops.execute(ctx, dst); + pd()->acl_post_ops.execute(ctx, dst); acl_obj.dst_tensor.allocator()->free(); diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp index 0f81f81a5a0..1427a5bcb85 100644 --- a/src/cpu/aarch64/matmul/acl_matmul.hpp +++ b/src/cpu/aarch64/matmul/acl_matmul.hpp @@ -82,7 +82,9 @@ struct acl_matmul_t : public primitive_t { pd_t(const matmul_desc_t *adesc, const primitive_attr_t *attr, const cpu_matmul_pd_t *hint_fwd_pd) - : cpu_matmul_pd_t(adesc, attr, hint_fwd_pd), amp_(), post_ops() {} + : cpu_matmul_pd_t(adesc, attr, hint_fwd_pd) + , amp_() + , acl_post_ops() {} using cpu_matmul_pd_t::cpu_matmul_pd_t; @@ -135,21 +137,36 @@ struct acl_matmul_t : public primitive_t { amp_, src_md_, weights_md_, dst_md_, *desc(), *attr())); } + // We can only fuse sum if it is the first post op and we aren't + // transposing dst after + if (attr_.post_ops_.contain(primitive_kind::sum, 0) + && !amp_.do_transC) { + // Check there isn't another sum after the first + VDISPATCH_MATMUL( + attr_.post_ops_.find(primitive_kind::sum, 1, -1) < 0, + "cannot contain multiple sum post-ops"); + VDISPATCH_MATMUL(attr_.post_ops_.entry_[0].sum.scale == 1.0f, + "sum post op scale must be 1 (no scale)"); + VDISPATCH_MATMUL(attr_.post_ops_.entry_[0].sum.zero_point == 0, + "sum post op zero point must be 0 (no shift)"); + amp_.gemm_info.set_accumulate(true); + } + arm_compute::ActivationLayerInfo act_info; - CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info)); + CHECK(acl_post_ops.init(engine, attr_.post_ops_, 
dst_md_, act_info, + amp_.gemm_info.accumulate() ? 1 : 0)); amp_.gemm_info.set_activation_info(act_info); - amp_.use_dst_acc = post_ops.has_sum(); + amp_.use_dst_acc_for_sum = acl_post_ops.has_sum(); // Validate ACL GEMM if (amp_.do_transC) { ACL_CHECK_VALID(arm_compute::NEGEMM::validate( &_.wei_tensor_info, &_.src_tensor_info, nullptr, - &_.dst_acc_info, amp_.alpha, 0.0f, amp_.gemm_info)); + &_.dst_acc_info, 1.0f, 0.0f, amp_.gemm_info)); } else { ACL_CHECK_VALID(arm_compute::NEGEMM::validate( &_.src_tensor_info, &_.wei_tensor_info, nullptr, - &_.dst_tensor_info, amp_.alpha, 0.0f, - amp_.gemm_info)); + &_.dst_tensor_info, 1.0f, 0.0f, amp_.gemm_info)); } auto scratchpad = scratchpad_registry().registrar(); @@ -159,7 +176,7 @@ struct acl_matmul_t : public primitive_t { } acl_matmul_conf_t amp_; - acl_post_ops_t post_ops; + acl_post_ops_t acl_post_ops; dnnl::impl::format_kind_t weights_format_kind_; protected: @@ -181,7 +198,7 @@ struct acl_matmul_t : public primitive_t { CHECK(r->configure(pd()->amp_, pd()->weights_format_kind_)); mapper.add(this, std::move(r)); - CHECK(pd()->post_ops.create_resource(engine, mapper)); + CHECK(pd()->acl_post_ops.create_resource(engine, mapper)); return status::success; } diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp index fcb81b06165..134ce94c905 100644 --- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp +++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp @@ -175,7 +175,7 @@ status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md, status_t init_scratchpad(memory_tracking::registrar_t &scratchpad, acl_matmul_conf_t &, memory_desc_t &dst_md) { - if (amp.use_dst_acc) { + if (amp.use_dst_acc_for_sum) { const memory_desc_wrapper dst_d(&dst_md); scratchpad.book(memory_tracking::names::key_matmul_dst_in_acc_dt, dst_d.nelems(), dst_d.data_type_size()); diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp index 
d8d3846aa76..d3fa65d915a 100644 --- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp +++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp @@ -45,7 +45,7 @@ struct acl_matmul_conf_t { bool do_transC; // If this is true, the result of the matmul goes into a temporarily // allocated ACL tensor to be accumulated into the oneDNN dst during postops - bool use_dst_acc; + bool use_dst_acc_for_sum; arm_compute::TensorInfo src_tensor_info; arm_compute::TensorInfo wei_tensor_info; arm_compute::TensorInfo dst_tensor_info; @@ -53,7 +53,6 @@ struct acl_matmul_conf_t { arm_compute::TensorInfo wei_acc_info; arm_compute::TensorInfo dst_acc_info; arm_compute::GEMMInfo gemm_info; - float alpha; }; namespace acl_matmul_utils { From 64e76e944e5a0d4a449dfd1cbae5d7de202ad53e Mon Sep 17 00:00:00 2001 From: "Chen, Shu1" Date: Mon, 13 May 2024 14:06:37 +0800 Subject: [PATCH 052/187] examples: add vanilla_rnn example --- doc/primitives/rnn.md | 6 +- examples/primitives/vanilla_rnn.cpp | 196 ++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 examples/primitives/vanilla_rnn.cpp diff --git a/doc/primitives/rnn.md b/doc/primitives/rnn.md index 2e124cd483c..75ed8ac0dfd 100644 --- a/doc/primitives/rnn.md +++ b/doc/primitives/rnn.md @@ -477,6 +477,10 @@ details on how to use and set these quantization parameters. ## Example -[LSTM RNN Primitive Example](@ref lstm_example_cpp) +1. [LSTM RNN Primitive Example](@ref lstm_example_cpp) @copydetails lstm_example_cpp_short + +2. 
[Vanilla RNN Primitive Example](@ref vanilla_rnn_example_cpp) + +@copydetails vanilla_rnn_example_cpp_short diff --git a/examples/primitives/vanilla_rnn.cpp b/examples/primitives/vanilla_rnn.cpp new file mode 100644 index 00000000000..a288468fc09 --- /dev/null +++ b/examples/primitives/vanilla_rnn.cpp @@ -0,0 +1,196 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @example vanilla_rnn.cpp +/// > Annotated version: @ref vanilla_rnn_example_cpp +/// +/// @page vanilla_rnn_example_cpp_short +/// +/// This C++ API example demonstrates how to create and execute a +/// [Vanilla RNN](@ref dev_guide_rnn) primitive in forward training propagation +/// mode. +/// +/// Key optimizations included in this example: +/// - Creation of optimized memory format from the primitive descriptor. +/// +/// @page vanilla_rnn_example_cpp Vanilla RNN Primitive Example +/// @copydetails vanilla_rnn_example_cpp_short +/// +/// @include vanilla_rnn.cpp + +#include <algorithm> +#include <cmath> +#include <iostream> +#include <string> +#include <vector> + +#include "dnnl.hpp" +#include "example_utils.hpp" + +using namespace dnnl; + +using tag = memory::format_tag; +using dt = memory::data_type; + +void vanilla_rnn_example(dnnl::engine::kind engine_kind) { + // Create execution dnnl::engine.
+ dnnl::engine engine(engine_kind, 0); + + // Create dnnl::stream. + dnnl::stream engine_stream(engine); + + // Tensor dimensions. + const memory::dim N = 2, // batch size + T = 3, // time steps + C = 4, // channels + G = 1, // gates + L = 1, // layers + D = 1; // directions + + // Source (src), weights, bias, and destination (dst) tensor + // dimensions. + memory::dims src_dims = {T, N, C}; + memory::dims weights_dims = {L, D, C, G, C}; + memory::dims bias_dims = {L, D, G, C}; + memory::dims dst_layer_dims = {T, N, C}; + memory::dims dst_iter_dims = {L, D, N, C}; + + // Allocate buffers. + std::vector<float> src_layer_data(product(src_dims)); + std::vector<float> weights_layer_data(product(weights_dims)); + std::vector<float> weights_iter_data(product(weights_dims)); + std::vector<float> bias_data(product(bias_dims)); + std::vector<float> dst_layer_data(product(dst_layer_dims)); + std::vector<float> dst_iter_data(product(dst_iter_dims)); + + // Initialize src, weights, and bias tensors. + std::generate(src_layer_data.begin(), src_layer_data.end(), []() { + static int i = 0; + return std::cos(i++ / 10.f); + }); + std::generate(weights_layer_data.begin(), weights_layer_data.end(), []() { + static int i = 0; + return std::sin(i++ * 2.f); + }); + std::generate(weights_iter_data.begin(), weights_iter_data.end(), []() { + static int i = 0; + return std::sin(i++ * 2.f); + }); + std::generate(bias_data.begin(), bias_data.end(), []() { + static int i = 0; + return std::tanh(float(i++)); + }); + + // Create memory descriptors and memory objects for src, bias, and dst. + auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); + auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); + auto dst_layer_md = memory::desc(dst_layer_dims, dt::f32, tag::tnc); + + auto src_layer_mem = memory(src_layer_md, engine); + auto bias_mem = memory(bias_md, engine); + auto dst_layer_mem = memory(dst_layer_md, engine); + + // Create memory objects for weights using user's memory layout.
In this + // example, LDIGO (num_layers, num_directions, input_channels, num_gates, + // output_channels) is assumed. + auto user_weights_layer_mem + = memory({weights_dims, dt::f32, tag::ldigo}, engine); + auto user_weights_iter_mem + = memory({weights_dims, dt::f32, tag::ldigo}, engine); + + // Write data to memory object's handle. + write_to_dnnl_memory(src_layer_data.data(), src_layer_mem); + write_to_dnnl_memory(bias_data.data(), bias_mem); + write_to_dnnl_memory(weights_layer_data.data(), user_weights_layer_mem); + write_to_dnnl_memory(weights_iter_data.data(), user_weights_iter_mem); + + // Create memory descriptors for weights with format_tag::any. This enables + // the Vanilla primitive to choose the optimized memory layout. + auto weights_layer_md = memory::desc(weights_dims, dt::f32, tag::any); + auto weights_iter_md = memory::desc(weights_dims, dt::f32, tag::any); + + // Optional memory descriptors for recurrent data. + // Default memory descriptor for initial hidden states of the RNN cells + auto src_iter_md = memory::desc(); + auto dst_iter_md = memory::desc(); + + // Create primitive descriptor. + auto vanilla_rnn_pd = vanilla_rnn_forward::primitive_desc(engine, + prop_kind::forward_training, dnnl::algorithm::eltwise_tanh, + rnn_direction::unidirectional_left2right, src_layer_md, src_iter_md, + weights_layer_md, weights_iter_md, bias_md, dst_layer_md, + dst_iter_md); + + // For now, assume that the weights memory layout generated by the primitive + // and the ones provided by the user are identical. + auto weights_layer_mem = user_weights_layer_mem; + auto weights_iter_mem = user_weights_iter_mem; + + // Reorder the data in case the weights memory layout generated by the + // primitive and the one provided by the user are different. In this case, + // we create additional memory objects with internal buffers that will + // contain the reordered data. 
+ if (vanilla_rnn_pd.weights_desc() != user_weights_layer_mem.get_desc()) { + weights_layer_mem = memory(vanilla_rnn_pd.weights_desc(), engine); + reorder(user_weights_layer_mem, weights_layer_mem) + .execute(engine_stream, user_weights_layer_mem, + weights_layer_mem); + } + + if (vanilla_rnn_pd.weights_iter_desc() + != user_weights_iter_mem.get_desc()) { + weights_iter_mem = memory(vanilla_rnn_pd.weights_iter_desc(), engine); + reorder(user_weights_iter_mem, weights_iter_mem) + .execute( + engine_stream, user_weights_iter_mem, weights_iter_mem); + } + + // Create the memory objects from the primitive descriptor. A workspace is + // also required for Vanilla RNN. + // NOTE: Here, the workspace is required for later usage in backward + // propagation mode. + auto src_iter_mem = memory(vanilla_rnn_pd.src_iter_desc(), engine); + auto dst_iter_mem = memory(vanilla_rnn_pd.dst_iter_desc(), engine); + auto workspace_mem = memory(vanilla_rnn_pd.workspace_desc(), engine); + + // Create the primitive. + auto vanilla_rnn_prim = vanilla_rnn_forward(vanilla_rnn_pd); + + // Primitive arguments + std::unordered_map vanilla_rnn_args; + vanilla_rnn_args.insert({DNNL_ARG_SRC_LAYER, src_layer_mem}); + vanilla_rnn_args.insert({DNNL_ARG_WEIGHTS_LAYER, weights_layer_mem}); + vanilla_rnn_args.insert({DNNL_ARG_WEIGHTS_ITER, weights_iter_mem}); + vanilla_rnn_args.insert({DNNL_ARG_BIAS, bias_mem}); + vanilla_rnn_args.insert({DNNL_ARG_DST_LAYER, dst_layer_mem}); + vanilla_rnn_args.insert({DNNL_ARG_SRC_ITER, src_iter_mem}); + vanilla_rnn_args.insert({DNNL_ARG_DST_ITER, dst_iter_mem}); + vanilla_rnn_args.insert({DNNL_ARG_WORKSPACE, workspace_mem}); + + // Primitive execution: vanilla. + vanilla_rnn_prim.execute(engine_stream, vanilla_rnn_args); + + // Wait for the computation to finalize. + engine_stream.wait(); + + // Read data from memory object's handle. 
+ read_from_dnnl_memory(dst_layer_data.data(), dst_layer_mem); +} + +int main(int argc, char **argv) { + return handle_example_errors( + vanilla_rnn_example, parse_engine_kind(argc, argv)); +} From db32fc4c9c5bfe5c8d5f5b0501873522b2c36616 Mon Sep 17 00:00:00 2001 From: "Chen, Shu1" Date: Mon, 13 May 2024 14:24:06 +0800 Subject: [PATCH 053/187] examples: add lbr_gru example --- doc/primitives/rnn.md | 12 +- examples/primitives/lbr_gru.cpp | 201 ++++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 examples/primitives/lbr_gru.cpp diff --git a/doc/primitives/rnn.md b/doc/primitives/rnn.md index 75ed8ac0dfd..58c7a1ee928 100644 --- a/doc/primitives/rnn.md +++ b/doc/primitives/rnn.md @@ -477,10 +477,18 @@ details on how to use and set these quantization parameters. ## Example -1. [LSTM RNN Primitive Example](@ref lstm_example_cpp) +[LSTM RNN Primitive Example](@ref lstm_example_cpp) @copydetails lstm_example_cpp_short -2. [Vanilla RNN Primitive Example](@ref vanilla_rnn_example_cpp) +[Vanilla RNN Primitive Example](@ref vanilla_rnn_example_cpp) @copydetails vanilla_rnn_example_cpp_short + +[AUGRU RNN Primitive Example](@ref augru_example_cpp) + +@copydetails augru_example_cpp_short + +[Linear-Before-Reset GRU RNN Primitive Example](@ref lbr_gru_example_cpp) + +@copydetails lbr_gru_example_cpp_short diff --git a/examples/primitives/lbr_gru.cpp b/examples/primitives/lbr_gru.cpp new file mode 100644 index 00000000000..aeba8103c88 --- /dev/null +++ b/examples/primitives/lbr_gru.cpp @@ -0,0 +1,201 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @example lbr_gru.cpp +/// > Annotated version: @ref lbr_gru_example_cpp +/// +/// @page lbr_gru_example_cpp_short +/// +/// This C++ API example demonstrates how to create and execute a +/// [Linear-Before-Reset GRU RNN](@ref dev_guide_rnn) primitive in forward +/// training propagation mode. +/// +/// Key optimizations included in this example: +/// - Creation of optimized memory format from the primitive descriptor. +/// +/// @page lbr_gru_example_cpp Linear-Before-Reset GRU RNN Primitive Example +/// @copydetails lbr_gru_example_cpp_short +/// +/// @include lbr_gru.cpp + +#include +#include +#include +#include +#include + +#include "dnnl.hpp" +#include "example_utils.hpp" + +using namespace dnnl; + +using tag = memory::format_tag; +using dt = memory::data_type; + +void lbr_gru_example(dnnl::engine::kind engine_kind) { + // Create execution dnnl::engine. + dnnl::engine engine(engine_kind, 0); + + // Create dnnl::stream. + dnnl::stream engine_stream(engine); + + // Tensor dimensions. + const memory::dim N = 2, // batch size + T = 3, // time steps + IC = 2, // src channels + OC = 3, // dst channels + G = 3, // gates + L = 1, // layers + D = 1, // directions + E = 1; // extra Bias number. Extra Bias for u' gate + + // Source (src), weights, bias, and destination (dst) tensors + // dimensions. 
+ memory::dims src_dims = {T, N, IC}; + memory::dims weights_layer_dims = {L, D, IC, G, OC}; + memory::dims weights_iter_dims = {L, D, OC, G, OC}; + memory::dims bias_dims = {L, D, G + E, OC}; + memory::dims dst_layer_dims = {T, N, OC}; + memory::dims dst_iter_dims = {L, D, N, OC}; + + // Allocate buffers. + std::vector src_layer_data(product(src_dims)); + std::vector weights_layer_data(product(weights_layer_dims)); + std::vector weights_iter_data(product(weights_iter_dims)); + std::vector bias_data(product(bias_dims)); + std::vector dst_layer_data(product(dst_layer_dims)); + std::vector dst_iter_data(product(dst_iter_dims)); + + // Initialize src, weights, and bias tensors. + std::generate(src_layer_data.begin(), src_layer_data.end(), []() { + static int i = 0; + return std::cos(i++ / 10.f); + }); + std::generate(weights_layer_data.begin(), weights_layer_data.end(), []() { + static int i = 0; + return std::sin(i++ * 2.f); + }); + std::generate(weights_iter_data.begin(), weights_iter_data.end(), []() { + static int i = 0; + return std::sin(i++ * 2.f); + }); + std::generate(bias_data.begin(), bias_data.end(), []() { + static int i = 0; + return std::tanh(float(i++)); + }); + + // Create memory descriptors and memory objects for src, bias, and dst. + auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); + auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); + auto dst_layer_md = memory::desc(dst_layer_dims, dt::f32, tag::tnc); + + auto src_layer_mem = memory(src_layer_md, engine); + auto bias_mem = memory(bias_md, engine); + auto dst_layer_mem = memory(dst_layer_md, engine); + + // Create memory objects for weights using user's memory layout. In this + // example, LDIGO (num_layers, num_directions, input_channels, num_gates, + // output_channels) is assumed. 
+ auto user_weights_layer_mem + = memory({weights_layer_dims, dt::f32, tag::ldigo}, engine); + auto user_weights_iter_mem + = memory({weights_iter_dims, dt::f32, tag::ldigo}, engine); + + // Write data to memory object's handle. + // For GRU cells, the gates order is update, reset and output + // gate except the bias. For the bias tensor, the gates order is + // u, r, o and u' gate. + write_to_dnnl_memory(src_layer_data.data(), src_layer_mem); + write_to_dnnl_memory(bias_data.data(), bias_mem); + write_to_dnnl_memory(weights_layer_data.data(), user_weights_layer_mem); + write_to_dnnl_memory(weights_iter_data.data(), user_weights_iter_mem); + + // Create memory descriptors for weights with format_tag::any. This enables + // the lbr_gru primitive to choose the optimized memory layout. + auto weights_layer_md = memory::desc(weights_layer_dims, dt::f32, tag::any); + auto weights_iter_md = memory::desc(weights_iter_dims, dt::f32, tag::any); + + // Optional memory descriptors for recurrent data. + // Default memory descriptor for initial hidden states of the GRU cells + auto src_iter_md = memory::desc(); + auto dst_iter_md = memory::desc(); + + // Create primitive descriptor. + auto lbr_gru_pd = lbr_gru_forward::primitive_desc(engine, + prop_kind::forward_training, + rnn_direction::unidirectional_left2right, src_layer_md, src_iter_md, + weights_layer_md, weights_iter_md, bias_md, dst_layer_md, + dst_iter_md); + + // For now, assume that the weights memory layout generated by the primitive + // and the ones provided by the user are identical. + auto weights_layer_mem = user_weights_layer_mem; + auto weights_iter_mem = user_weights_iter_mem; + + // Reorder the data in case the weights memory layout generated by the + // primitive and the one provided by the user are different. In this case, + // we create additional memory objects with internal buffers that will + // contain the reordered data. 
+ if (lbr_gru_pd.weights_desc() != user_weights_layer_mem.get_desc()) { + weights_layer_mem = memory(lbr_gru_pd.weights_desc(), engine); + reorder(user_weights_layer_mem, weights_layer_mem) + .execute(engine_stream, user_weights_layer_mem, + weights_layer_mem); + } + + if (lbr_gru_pd.weights_iter_desc() != user_weights_iter_mem.get_desc()) { + weights_iter_mem = memory(lbr_gru_pd.weights_iter_desc(), engine); + reorder(user_weights_iter_mem, weights_iter_mem) + .execute( + engine_stream, user_weights_iter_mem, weights_iter_mem); + } + + // Create the memory objects from the primitive descriptor. A workspace is + // also required for Linear-Before-Reset GRU RNN. + // NOTE: Here, the workspace is required for later usage in backward + // propagation mode. + auto src_iter_mem = memory(lbr_gru_pd.src_iter_desc(), engine); + auto dst_iter_mem = memory(lbr_gru_pd.dst_iter_desc(), engine); + auto workspace_mem = memory(lbr_gru_pd.workspace_desc(), engine); + + // Create the primitive. + auto lbr_gru_prim = lbr_gru_forward(lbr_gru_pd); + + // Primitive arguments + std::unordered_map lbr_gru_args; + lbr_gru_args.insert({DNNL_ARG_SRC_LAYER, src_layer_mem}); + lbr_gru_args.insert({DNNL_ARG_WEIGHTS_LAYER, weights_layer_mem}); + lbr_gru_args.insert({DNNL_ARG_WEIGHTS_ITER, weights_iter_mem}); + lbr_gru_args.insert({DNNL_ARG_BIAS, bias_mem}); + lbr_gru_args.insert({DNNL_ARG_DST_LAYER, dst_layer_mem}); + lbr_gru_args.insert({DNNL_ARG_SRC_ITER, src_iter_mem}); + lbr_gru_args.insert({DNNL_ARG_DST_ITER, dst_iter_mem}); + lbr_gru_args.insert({DNNL_ARG_WORKSPACE, workspace_mem}); + + // Primitive execution: lbr_gru. + lbr_gru_prim.execute(engine_stream, lbr_gru_args); + + // Wait for the computation to finalize. + engine_stream.wait(); + + // Read data from memory object's handle. 
+ read_from_dnnl_memory(dst_layer_data.data(), dst_layer_mem); +} + +int main(int argc, char **argv) { + return handle_example_errors( + lbr_gru_example, parse_engine_kind(argc, argv)); +} From f7bbf4b99fa5d723fede371c88a43e433679dcd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 26 Apr 2024 11:30:07 +0200 Subject: [PATCH 054/187] gpu: sycl: binary: add support for remaining post ops --- src/gpu/sycl/binary_kernels.hpp | 111 ++++++++++++++++++++++++++- src/gpu/sycl/ref_binary.cpp | 22 +++++- src/gpu/sycl/ref_binary.hpp | 19 ++--- src/gpu/sycl/sycl_primitive_conf.hpp | 2 + 4 files changed, 137 insertions(+), 17 deletions(-) diff --git a/src/gpu/sycl/binary_kernels.hpp b/src/gpu/sycl/binary_kernels.hpp index 8fb556fab63..49c35a9c575 100644 --- a/src/gpu/sycl/binary_kernels.hpp +++ b/src/gpu/sycl/binary_kernels.hpp @@ -36,14 +36,24 @@ struct binary_kernel_vec_t { xpu::sycl::in_memory_arg_t &src0, xpu::sycl::in_memory_arg_t &src1, xpu::sycl::out_memory_arg_t &dst, xpu::sycl::in_memory_arg_t &src0_scale, - xpu::sycl::in_memory_arg_t &src1_scale, data_type_t scales_dt) + xpu::sycl::in_memory_arg_t &src1_scale, data_type_t scales_dt, + xpu::sycl::in_memory_arg_t &po1_src, + xpu::sycl::in_memory_arg_t &po2_src, + xpu::sycl::in_memory_arg_t &po3_src, + xpu::sycl::in_memory_arg_t &po4_src, + xpu::sycl::in_memory_arg_t &po5_src) : conf_(conf) , src0_(src0) , src1_(src1) , dst_(dst) , src0_scale_(src0_scale) , src1_scale_(src1_scale) - , scales_dt_(scales_dt) {} + , scales_dt_(scales_dt) + , po1_src_(po1_src) + , po2_src_(po2_src) + , po3_src_(po3_src) + , po4_src_(po4_src) + , po5_src_(po5_src) {} void operator()(::sycl::nd_item<1> item) const { auto sg = item.get_sub_group(); @@ -73,7 +83,7 @@ struct binary_kernel_vec_t { any_broadcast |= conf_.broadcast_dims[i]; } } - if (!any_broadcast + if (!any_broadcast && conf_.post_ops.get_post_op() == 0 && sg_base_idx + (sg.get_local_range()[0] * conf_.block_size) < conf_.wk_size) { for (int i = 0; i < 
conf_.block_size / vec_len; i++) { @@ -123,7 +133,8 @@ struct binary_kernel_vec_t { if (conf_.do_scale_src1) src1 *= sm_1; auto acc = compute_alg_n(src0, src1, conf_.alg_kind); - acc = conf_.post_ops.apply(acc, dst); + ::sycl::vec post_po_sr = post_op_src_val(idx); + acc = conf_.post_ops.apply(acc, dst, post_po_sr); store_float_value( dst_md().data_type(), acc, dst_ptr(), idx); } @@ -146,6 +157,93 @@ struct binary_kernel_vec_t { return static_cast(src1_scale_.get_pointer()); } + inline ::sycl::vec post_op_src_val(dim_t data_l_off) const { + ::sycl::vec post_po_sr; + const auto maxPostPo = conf_.post_ops.get_post_op(); + + for (dim_t po_idx = 0; po_idx < maxPostPo; po_idx++) { + float res = 0.0f; + if (po_idx == 0) + res = get_post_op_val(po1_src_, po_idx, data_l_off); + else if (po_idx == 1) + res = get_post_op_val(po2_src_, po_idx, data_l_off); + else if (po_idx == 2) + res = get_post_op_val(po3_src_, po_idx, data_l_off); + else if (po_idx == 3) + res = get_post_op_val(po4_src_, po_idx, data_l_off); + else if (po_idx == 4) + res = get_post_op_val(po5_src_, po_idx, data_l_off); + + post_po_sr[po_idx] = res; + } + return post_po_sr; + } + + float get_post_op_val(const xpu::sycl::in_memory_arg_t &bin_src_op, + dim_t &idx, dim_t offset) const { + auto src1_desc = conf_.binary_src_arr[idx]; + + const auto off = get_binary_src1_off( + src1_desc, offset, dst_md().dims(), dst_md().ndims()); + + auto dst = load_float_value( + src1_desc.data_type(), bin_src_op.get_pointer(), off); + return dst; + } + + dim_t get_binary_src1_off(const xpu::sycl::md_t &src1_md, dim_t l_offset, + const xpu::sycl::md_t::dims32_t &dst_dims, + const xpu::sycl::md_t::dim32_t &dst_ndims) const { + const dim_t mask_binary_po + = get_dims_mask(dst_dims, src1_md.dims(), dst_ndims); + return get_po_tensor_off( + src1_md, l_offset, dst_dims, dst_ndims, mask_binary_po); + } + + inline dim_t get_dims_mask(const xpu::sycl::md_t::dims32_t &dims1, + const xpu::sycl::md_t::dims32_t &dims2, const dim_t &ndims, 
+ bool skip_dim_of_one = false) const { + dim_t mask = 0; + for (dim_t d = 0; d < ndims; ++d) { + // Disable mask_bit for dimensions of `1` by request. + dim_t mask_bit = skip_dim_of_one && dims1[d] == 1 ? 0 : (1 << d); + mask += dims1[d] == dims2[d] ? mask_bit : 0; + } + return mask; + } + + inline dim_t get_po_tensor_off(const xpu::sycl::md_t &tensor_md, + dim_t l_offset, const xpu::sycl::md_t::dims32_t &dst_dims, + const dim_t &dst_ndims, const dim_t &mask) const { + dims_t l_dims_po {}; + get_l_dims_po(l_dims_po, l_offset, dst_dims, dst_ndims, mask); + + return tensor_md.off_v(l_dims_po); + } + + inline void get_l_dims_po(dims_t l_dims_po, dim_t l_offset, + const xpu::sycl::md_t::dims32_t &dst_dims, const dim_t &dst_ndims, + const dim_t &mask) const { + + l_dims_by_l_offset(l_dims_po, l_offset, dst_dims, dst_ndims); + utils::apply_mask_on_dims(l_dims_po, dst_ndims, mask); + } + + inline void l_dims_by_l_offset(dims_t dims_pos, dim_t l_offset, + const xpu::sycl::md_t::dims32_t &dims, const dim_t &ndims) const { + for (dim_t rd = 0; rd < ndims; ++rd) { + const dim_t d = ndims - 1 - rd; + /* switch to faster 32-bit division when possible. 
*/ + if (l_offset <= INT32_MAX && dims[d] <= INT32_MAX) { + dims_pos[d] = (int32_t)l_offset % (int32_t)dims[d]; + l_offset = (int32_t)l_offset / (int32_t)dims[d]; + } else { + dims_pos[d] = l_offset % dims[d]; + l_offset /= dims[d]; + } + } + } + template ::sycl::vec compute_alg(::sycl::vec src0, ::sycl::vec src1, alg_kind_t alg) const { @@ -199,6 +297,11 @@ struct binary_kernel_vec_t { xpu::sycl::in_memory_arg_t src0_scale_; xpu::sycl::in_memory_arg_t src1_scale_; data_type_t scales_dt_; + xpu::sycl::in_memory_arg_t po1_src_; + xpu::sycl::in_memory_arg_t po2_src_; + xpu::sycl::in_memory_arg_t po3_src_; + xpu::sycl::in_memory_arg_t po4_src_; + xpu::sycl::in_memory_arg_t po5_src_; }; } // namespace sycl diff --git a/src/gpu/sycl/ref_binary.cpp b/src/gpu/sycl/ref_binary.cpp index 5a89b009afd..26882e08c9d 100644 --- a/src/gpu/sycl/ref_binary.cpp +++ b/src/gpu/sycl/ref_binary.cpp @@ -52,6 +52,13 @@ status_t ref_binary_t::pd_t::init_conf() { conf_.post_ops = sycl_post_ops_t(attr()); + for (auto i = 0; i < conf_.post_ops.get_post_op(); ++i) { + const auto &e = attr()->post_ops_.entry_[i]; + if (e.is_binary() || e.is_prelu()) { + conf_.binary_src_arr[i] = xpu::sycl::md_t( + arg_md(DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1)); + } + } return status::success; } @@ -62,6 +69,7 @@ status_t ref_binary_t::init(engine_t *engine) { } status_t ref_binary_t::execute(const exec_ctx_t &ctx) const { + parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) { auto src0_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0); auto src1_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_1); @@ -76,9 +84,21 @@ status_t ref_binary_t::execute(const exec_ctx_t &ctx) const { .data_type() : data_type_t::dnnl_f32; + auto src_mem_po_1 = CTX_IN_SYCL_KERNEL_MEMORY( + (DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1)); + auto src_mem_po_2 = CTX_IN_SYCL_KERNEL_MEMORY( + (DNNL_ARG_ATTR_MULTIPLE_POST_OP(1) | DNNL_ARG_SRC_1)); + auto src_mem_po_3 = CTX_IN_SYCL_KERNEL_MEMORY( + 
(DNNL_ARG_ATTR_MULTIPLE_POST_OP(2) | DNNL_ARG_SRC_1)); + auto src_mem_po_4 = CTX_IN_SYCL_KERNEL_MEMORY( + (DNNL_ARG_ATTR_MULTIPLE_POST_OP(3) | DNNL_ARG_SRC_1)); + auto src_mem_po_5 = CTX_IN_SYCL_KERNEL_MEMORY( + (DNNL_ARG_ATTR_MULTIPLE_POST_OP(4) | DNNL_ARG_SRC_1)); + binary_kernel_vec_t binary_kernel(pd()->conf_, src0_mem_arg, src1_mem_arg, dst_mem_arg, src0_scale_mem_arg, - src1_scale_mem_arg, scales_dt); + src1_scale_mem_arg, scales_dt, src_mem_po_1, src_mem_po_2, + src_mem_po_3, src_mem_po_4, src_mem_po_5); const int block_size = pd()->conf_.block_size; const int wg_size = pd()->conf_.wg_size; diff --git a/src/gpu/sycl/ref_binary.hpp b/src/gpu/sycl/ref_binary.hpp index c7c4f90fe52..09bde014c6d 100644 --- a/src/gpu/sycl/ref_binary.hpp +++ b/src/gpu/sycl/ref_binary.hpp @@ -48,6 +48,7 @@ struct ref_binary_t : public sycl_gpu_primitive_t { const memory_desc_wrapper dst_d(dst_md()); const bool ok = set_default_params() == status::success + && attr_.set_default_formats(dst_md()) == status::success && check_data_types(src0_d, src1_d, dst_d) && check_formats(src0_d, src1_d, dst_d) && attr()->has_default_values( @@ -72,18 +73,12 @@ struct ref_binary_t : public sycl_gpu_primitive_t { } bool post_ops_ok() const { - for (int i = 0; i < attr()->post_ops_.len(); i++) { - const auto &e = attr()->post_ops_.entry_[i]; - if (!IMPLICATION(e.is_eltwise(), - utils::one_of(e.eltwise.alg, alg_kind::eltwise_relu, - alg_kind::eltwise_linear))) { - return false; - } - } - // Binary, prelu and dw conv post-ops are not supported. + // Dw conv post-ops are not supported. 
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops && attr()->post_ops_.has_default_values( - {primitive_kind::eltwise}); + {primitive_kind::eltwise, primitive_kind::binary, + primitive_kind::prelu, + primitive_kind::sum}); } static bool check_data_types(const memory_desc_wrapper &src0, @@ -100,7 +95,7 @@ struct ref_binary_t : public sycl_gpu_primitive_t { } return IMPLICATION(utils::one_of(bf16, src0_dt, src1_dt, dst_dt), - src0_dt == src1_dt == dst_dt); + src0_dt == dst_dt && src1_dt == dst_dt); } static bool check_formats(const memory_desc_wrapper &src0, @@ -109,7 +104,7 @@ struct ref_binary_t : public sycl_gpu_primitive_t { using namespace format_tag; for (const auto &mdw : {src0, src1, dst}) { - if (mdw.matches_one_of_tag(ab, abc, abcd, abcde) == undef) { + if (mdw.matches_one_of_tag(a, ab, abc, abcd, abcde) == undef) { return false; } } diff --git a/src/gpu/sycl/sycl_primitive_conf.hpp b/src/gpu/sycl/sycl_primitive_conf.hpp index 2adcfb2034c..d809134905c 100644 --- a/src/gpu/sycl/sycl_primitive_conf.hpp +++ b/src/gpu/sycl/sycl_primitive_conf.hpp @@ -44,6 +44,8 @@ struct sycl_binary_conf_t { int wg_size; int wk_size; + xpu::sycl::md_t binary_src_arr[8]; + sycl_post_ops_t post_ops; }; From 648c8da1535f8f5ab0d652d61454b917701fe507 Mon Sep 17 00:00:00 2001 From: "Wang, Zhitao" Date: Sat, 11 May 2024 01:05:38 +0000 Subject: [PATCH 055/187] tests: benchdnn: graph: relax cmp threshold --- tests/benchdnn/graph/ref_primitive.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchdnn/graph/ref_primitive.cpp b/tests/benchdnn/graph/ref_primitive.cpp index 0e06e037806..020e21f7b0d 100644 --- a/tests/benchdnn/graph/ref_primitive.cpp +++ b/tests/benchdnn/graph/ref_primitive.cpp @@ -298,7 +298,7 @@ void ref_primitive_t::check_correctness( // // Note: the following threshold is obtained from actual runs on // different hardware. 
- cmp.set_threshold(7e-5f); + cmp.set_threshold(8e-5f); cmp.set_norm_validation_mode(true); cmp.compare(mem_fp_abx, mem_dt, attr, res); } From bd248c46e9c9f7f7b2680a3706ba387175d2ea98 Mon Sep 17 00:00:00 2001 From: Ankit Manerikar Date: Wed, 8 May 2024 10:06:08 -0700 Subject: [PATCH 056/187] cpu: eltwise: add int8 support for eltwise clip primitive --- src/common/math_utils.hpp | 2 +- src/cpu/x64/jit_uni_eltwise_int.cpp | 46 ++++++++++++++++++- .../inputs/eltwise/option_set_all_algs_int8 | 4 ++ .../eltwise/option_set_all_algs_int8_ci | 4 ++ 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/common/math_utils.hpp b/src/common/math_utils.hpp index 1f0498a89dc..ff6ed3a3e6a 100644 --- a/src/common/math_utils.hpp +++ b/src/common/math_utils.hpp @@ -430,7 +430,7 @@ inline bool is_eltwise_ok( one_of(alg, eltwise_clip, eltwise_clip_v2), beta >= alpha) && IMPLICATION(alg == eltwise_round, src_dt == dnnl_f32) && IMPLICATION(one_of(src_dt, dnnl_s32, dnnl_s8, dnnl_u8), - one_of(alg, eltwise_relu, eltwise_linear)); + one_of(alg, eltwise_relu, eltwise_linear, eltwise_clip)); const bool eltwise_use_dst = one_of(alg, eltwise_relu_use_dst_for_bwd, diff --git a/src/cpu/x64/jit_uni_eltwise_int.cpp b/src/cpu/x64/jit_uni_eltwise_int.cpp index 5a0025a3bd0..6d112d09995 100644 --- a/src/cpu/x64/jit_uni_eltwise_int.cpp +++ b/src/cpu/x64/jit_uni_eltwise_int.cpp @@ -67,7 +67,7 @@ struct jit_uni_subkernel_int_t : public jit_uni_eltwise_int_kernel { // Relu and linear for int types: s32, s8, u8; Only forward direction assert(utils::one_of(desc().alg_kind, alg_kind::eltwise_relu, - alg_kind::eltwise_linear)); + alg_kind::eltwise_linear, alg_kind::eltwise_clip)); assert(utils::one_of(data_type(), s32, s8, u8)); assert(utils::one_of(isa, sse41, avx2, avx512_core)); } @@ -200,6 +200,7 @@ struct jit_uni_subkernel_int_t : public jit_uni_eltwise_int_kernel { // Processing void process_linear(const Vmm &vr_to, const Vmm &vr_from); void process_relu(const Vmm &vr_to, const Vmm 
&vr_from); + void process_clip(const Vmm &vr_to, const Vmm &vr_from); // Store s32 for any isa void store_32bit( @@ -246,6 +247,10 @@ struct jit_uni_subkernel_int_t : public jit_uni_eltwise_int_kernel { for (size_t i = 0; i < uf; i++) process_relu(vreg_to(i), vreg_from(i)); break; + case alg_kind::eltwise_clip: + for (size_t i = 0; i < uf; i++) + process_clip(vreg_to(i), vreg_from(i)); + break; default: assert(!"unsupported alg"); } @@ -313,6 +318,43 @@ void jit_uni_subkernel_int_t::process_relu( vcvtps2dq(vr_to, vr_to); } +template +void jit_uni_subkernel_int_t::process_clip( + const Vmm &vr_to, const Vmm &vr_from) { + assert(!"unsupported isa"); +} + +template <> +void jit_uni_subkernel_int_t::process_clip( + const Vmm &vr_to, const Vmm &vr_from) { + + cvtdq2ps(vr_from, vr_from); + movups(vr_to, vr_from); + maxps(vr_to, vmm_alpha); + minps(vr_to, vmm_beta); + cvtps2dq(vr_to, vr_to); +} + +template <> +void jit_uni_subkernel_int_t::process_clip( + const Vmm &vr_to, const Vmm &vr_from) { + + vcvtdq2ps(vr_from, vr_from); + vmaxps(vr_to, vr_from, vmm_alpha); + vminps(vr_to, vr_to, vmm_beta); + vcvtps2dq(vr_to, vr_to); +} + +template <> +void jit_uni_subkernel_int_t::process_clip( + const Vmm &vr_to, const Vmm &vr_from) { + + vcvtdq2ps(vr_from, vr_from); + vmaxps(vr_to, vr_from, vmm_alpha); + vminps(vr_to, vr_to, vmm_beta); + vcvtps2dq(vr_to, vr_to); +} + template void jit_uni_subkernel_int_t::store_8bit(const bool vectorize, const Address &mem_to, const Vmm &vr_to, bool is_signed) { @@ -407,7 +449,7 @@ status_t jit_uni_eltwise_int_fwd_t::pd_t::init(engine_t *engine) { VERBOSE_UNSUPPORTED_DT); // only relu and linear so far VDISPATCH_ELTWISE(utils::one_of(desc()->alg_kind, alg_kind::eltwise_relu, - alg_kind::eltwise_linear), + alg_kind::eltwise_linear, alg_kind::eltwise_clip), VERBOSE_BAD_ALGORITHM); VDISPATCH_ELTWISE(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); VDISPATCH_ELTWISE(memory_desc_wrapper(src_md()).is_dense(true), diff --git 
a/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8 b/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8 index 2c8c9a3d7ad..83aad5d929c 100644 --- a/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8 +++ b/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8 @@ -6,3 +6,7 @@ --alpha=0,0.25,-0.25 --beta=0,0.25,-0.25 --alg=linear --batch=shapes_eltwise + +--alpha=0,0.25,-0.25 --beta=0,0.25,-0.25 +--alg=clip +--batch=shapes_eltwise diff --git a/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci b/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci index 82871af7eba..c244ab48481 100644 --- a/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci +++ b/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci @@ -6,3 +6,7 @@ --alpha=1 --beta=2 --alg=linear --batch=shapes_ci + +--alpha=-2 --beta=3 +--alg=clip +--batch=shapes_ci From e02e1d4539d0437f440bec98a70b6672ddd4d5d2 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 26 Apr 2024 16:17:45 -0700 Subject: [PATCH 057/187] gpu: intel: jit: keep order in to_string_int_map() --- src/gpu/intel/jit/conv/config.hpp | 2 +- src/gpu/intel/jit/ir/config.hpp | 2 +- src/gpu/intel/jit/ir/problem.hpp | 2 +- src/gpu/intel/jit/utils/utils.hpp | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpu/intel/jit/conv/config.hpp b/src/gpu/intel/jit/conv/config.hpp index 37f13b4c805..31771ff6197 100644 --- a/src/gpu/intel/jit/conv/config.hpp +++ b/src/gpu/intel/jit/conv/config.hpp @@ -380,7 +380,7 @@ class subtiles_param_t : public param_t { void set_from_str(const std::string &s) override { a_ = 1; b_ = 1; - for (auto &kv : ir_utils::to_string_int_map(s)) { + for (auto &kv : ir_utils::to_string_int_pairs(s)) { if (kv.first == "a") { a_ = kv.second; } else if (kv.first == "b") { diff --git a/src/gpu/intel/jit/ir/config.hpp b/src/gpu/intel/jit/ir/config.hpp index 358910191a9..322aa7490ca 100644 --- a/src/gpu/intel/jit/ir/config.hpp +++ b/src/gpu/intel/jit/ir/config.hpp @@ 
-199,7 +199,7 @@ class tile_param_t : public param_t { void set_from_str(const std::string &s) override { tile_ = prb_tile_t(); - for (auto &kv : ir_utils::to_string_int_map(s)) { + for (auto &kv : ir_utils::to_string_int_pairs(s)) { tile_[prb_dim_t::from_name(kv.first)] = kv.second; } } diff --git a/src/gpu/intel/jit/ir/problem.hpp b/src/gpu/intel/jit/ir/problem.hpp index c526a42076e..c7bfcd9cdda 100644 --- a/src/gpu/intel/jit/ir/problem.hpp +++ b/src/gpu/intel/jit/ir/problem.hpp @@ -227,7 +227,7 @@ class dim_map_t { dim_map_t(const std::string &s) { is_set_.fill(false); values_.fill(ValueT()); - for (auto &kv : ir_utils::to_string_int_map(s)) { + for (auto &kv : ir_utils::to_string_int_pairs(s)) { operator[](KeyT::from_name(kv.first)) = ValueT(kv.second); } } diff --git a/src/gpu/intel/jit/utils/utils.hpp b/src/gpu/intel/jit/utils/utils.hpp index 0cfa2f6b919..d6c424f9d12 100644 --- a/src/gpu/intel/jit/utils/utils.hpp +++ b/src/gpu/intel/jit/utils/utils.hpp @@ -1120,9 +1120,9 @@ class cli_iface_t { std::vector args_; }; -inline std::unordered_map to_string_int_map( +inline std::vector> to_string_int_pairs( const std::string &s) { - std::unordered_map ret; + std::vector> ret; int name_beg = -1; int value_beg = -1; for (int pos = 0; pos < (int)s.size() + 1; pos++) { @@ -1132,7 +1132,7 @@ inline std::unordered_map to_string_int_map( if (name_beg != -1 && value_beg != -1) { auto key = s.substr(name_beg, value_beg - name_beg); auto value = std::stoi(s.substr(value_beg, pos - value_beg)); - ret[key] = value; + ret.emplace_back(key, value); } name_beg = pos; value_beg = -1; From 19ab4de341fd99456b4a58f960df8198dfa3b450 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 23 Apr 2024 12:04:38 -0700 Subject: [PATCH 058/187] gpu: intel: jit: conv: simplify grid binding --- src/gpu/intel/jit/conv/plan.cpp | 90 ++++++++++++++------------ src/gpu/intel/jit/ir/gemm_schedule.hpp | 18 ++++++ 2 files changed, 65 insertions(+), 43 deletions(-) diff --git 
a/src/gpu/intel/jit/conv/plan.cpp b/src/gpu/intel/jit/conv/plan.cpp index ef7a98ce72a..12ff9b592d2 100644 --- a/src/gpu/intel/jit/conv/plan.cpp +++ b/src/gpu/intel/jit/conv/plan.cpp @@ -141,6 +141,36 @@ bool set_g_grid_idx_innermost(const conv_config_t &cfg) { return set_g_grid_idx_innermost(cfg.hw(), cfg.dst_layout().compute()); } +void bind_grid_idx(const conv_config_t &cfg, gemm_schedule_t &gemm_schedule, + const expr_t &var, bool is_tg) { + auto &grid_dims = is_tg ? get_thread_group_grid_conv_dims(cfg.prb()) + : get_kernel_grid_conv_dims(cfg.prb()); + int grid_idx = -1; + for (auto &v : gemm_schedule.get_root_vars(var)) { + auto v_dim = prb_dim_t::from_name(v.as().name); + for (int i = 0; i < 3; i++) { + if (grid_dims[i].has(v_dim)) { + ir_assert(grid_idx == -1 || grid_idx == i); + grid_idx = i; + } + } + } + ir_assert(grid_idx != -1); + gemm_schedule.bind(var, + is_tg ? cfg.thread_group_grid().idx(grid_idx) + : cfg.kernel_grid().idx(grid_idx)); +} + +void bind_kernel_grid_idx(const conv_config_t &cfg, + gemm_schedule_t &gemm_schedule, const expr_t &var) { + bind_grid_idx(cfg, gemm_schedule, var, /*is_tg=*/false); +} + +void bind_thread_group_grid_idx(const conv_config_t &cfg, + gemm_schedule_t &gemm_schedule, const expr_t &var) { + bind_grid_idx(cfg, gemm_schedule, var, /*is_tg=*/true); +} + void init_fwd(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, view_t &src_view, view_t &wei_view, view_t &dst_view) { auto &prb_ = cfg_.prb(); @@ -284,21 +314,12 @@ void init_fwd(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, } auto mb_ow_tg_idx = gemm_schedule.fuse(mb_tile.tg_idx(), ow_tile.tg_idx()); - if (prb_.ab_swap_transpose) { - gemm_schedule.bind(mb_tile.grid_idx(), cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(oc_tile.grid_idx(), cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(g_ow_grid_idx, cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(mb_ow_tg_idx, cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(oc_tile.tg_idx(), 
cfg_.thread_group_grid().idx(1)); - gemm_schedule.bind(ic_tile.tg_idx(), cfg_.thread_group_grid().idx(2)); - } else { - gemm_schedule.bind(oc_tile.grid_idx(), cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(g_ow_grid_idx, cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(mb_tile.grid_idx(), cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(oc_tile.tg_idx(), cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(mb_ow_tg_idx, cfg_.thread_group_grid().idx(1)); - gemm_schedule.bind(ic_tile.tg_idx(), cfg_.thread_group_grid().idx(2)); - } + bind_kernel_grid_idx(cfg_, gemm_schedule, oc_tile.grid_idx()); + bind_kernel_grid_idx(cfg_, gemm_schedule, g_ow_grid_idx); + bind_kernel_grid_idx(cfg_, gemm_schedule, mb_tile.grid_idx()); + bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); + bind_thread_group_grid_idx(cfg_, gemm_schedule, mb_ow_tg_idx); + bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); gemm_schedule.tensorize(g_tile.iter_idx()); gemm_schedule.tensorize(oc_tile.iter_idx()); @@ -476,21 +497,12 @@ void init_bwd_d(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, {g_tile.grid_idx(), id, ih, iw_tile.grid_idx()}); } auto mb_iw_tg_idx = gemm_schedule.fuse(mb_tile.tg_idx(), iw_tile.tg_idx()); - if (prb_.ab_swap_transpose /*.ic < 8 && prb_.mb >= 8*/) { - gemm_schedule.bind(mb_tile.grid_idx(), cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(ic_tile.grid_idx(), cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(g_isp_grid_idx, cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(mb_iw_tg_idx, cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(ic_tile.tg_idx(), cfg_.thread_group_grid().idx(1)); - gemm_schedule.bind(oc_tile.tg_idx(), cfg_.thread_group_grid().idx(2)); - } else { - gemm_schedule.bind(ic_tile.grid_idx(), cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(g_isp_grid_idx, cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(mb_tile.grid_idx(), cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(ic_tile.tg_idx(), 
cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(mb_iw_tg_idx, cfg_.thread_group_grid().idx(1)); - gemm_schedule.bind(oc_tile.tg_idx(), cfg_.thread_group_grid().idx(2)); - } + bind_kernel_grid_idx(cfg_, gemm_schedule, ic_tile.grid_idx()); + bind_kernel_grid_idx(cfg_, gemm_schedule, g_isp_grid_idx); + bind_kernel_grid_idx(cfg_, gemm_schedule, mb_tile.grid_idx()); + bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); + bind_thread_group_grid_idx(cfg_, gemm_schedule, mb_iw_tg_idx); + bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); gemm_schedule.tensorize(g_tile.iter_idx()); gemm_schedule.tensorize(ic_tile.iter_idx()); @@ -684,19 +696,11 @@ void init_bwd_w(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, = gemm_schedule.fuse({g_tile.grid_idx(), mb_tile.grid_idx()}); } - if (prb_.ab_swap_transpose) { - gemm_schedule.bind(osp_ksp_ic_grid_idx, cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(g_mb_grid_idx, cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(oc_tile.grid_idx(), cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(ic_tile.tg_idx(), cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(oc_tile.tg_idx(), cfg_.thread_group_grid().idx(1)); - } else { - gemm_schedule.bind(oc_tile.grid_idx(), cfg_.kernel_grid().idx(0)); - gemm_schedule.bind(osp_ksp_ic_grid_idx, cfg_.kernel_grid().idx(1)); - gemm_schedule.bind(g_mb_grid_idx, cfg_.kernel_grid().idx(2)); - gemm_schedule.bind(oc_tile.tg_idx(), cfg_.thread_group_grid().idx(0)); - gemm_schedule.bind(ic_tile.tg_idx(), cfg_.thread_group_grid().idx(1)); - } + bind_kernel_grid_idx(cfg_, gemm_schedule, oc_tile.grid_idx()); + bind_kernel_grid_idx(cfg_, gemm_schedule, osp_ksp_ic_grid_idx); + bind_kernel_grid_idx(cfg_, gemm_schedule, g_mb_grid_idx); + bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); + bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); gemm_schedule.reorder({od_tile.loop_idx(), oh_tile.loop_idx(), ow_tile.loop_idx(), 
mb_tile.loop_idx()}); diff --git a/src/gpu/intel/jit/ir/gemm_schedule.hpp b/src/gpu/intel/jit/ir/gemm_schedule.hpp index ddae2aa48f2..bc641497551 100644 --- a/src/gpu/intel/jit/ir/gemm_schedule.hpp +++ b/src/gpu/intel/jit/ir/gemm_schedule.hpp @@ -709,6 +709,24 @@ class gemm_schedule_t { return k_loop < k; } + std::vector get_root_vars(const expr_t &var) const { + std::vector ret; + std::function walk; + walk = [&](const expr_t &v) { + auto &loop = find_loop(v); + if (loop.is_root()) { + ret.push_back(loop.var()); + return; + } + ir_assert(loop.is_fused_child() || loop.is_split_child()); + for (auto &pv : loop.parent_vars()) { + walk(pv); + } + }; + walk(var); + return ret; + } + void finalize() { init_problem_tiles(); init_constraint_set(); From 5550e4764725ad30a113081caa76f1a79d9fe8ec Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 23 Apr 2024 12:18:44 -0700 Subject: [PATCH 059/187] gpu: intel: jit: conv: do not change kernel grid assignment with ab_swap_transpose --- src/gpu/intel/jit/conv/config.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/gpu/intel/jit/conv/config.cpp b/src/gpu/intel/jit/conv/config.cpp index 0e27160326a..3077bc88542 100644 --- a/src/gpu/intel/jit/conv/config.cpp +++ b/src/gpu/intel/jit/conv/config.cpp @@ -1219,18 +1219,13 @@ const std::array &get_kernel_grid_conv_dims( static const prb_tile_t bwd_w_2({prb_dims::g, prb_dims::mb}); using prb_tile_3 = std::array; - // non-transposed static const prb_tile_3 fwd = {fwd_0, fwd_1, fwd_2}; static const prb_tile_3 bwd_d = {bwd_d_0, bwd_d_1, bwd_d_2}; static const prb_tile_3 bwd_w = {bwd_w_0, bwd_w_1, bwd_w_2}; - // transposed - static const prb_tile_3 t_fwd = {fwd_2, fwd_0, fwd_1}; - static const prb_tile_3 t_bwd_d = {bwd_d_2, bwd_d_0, bwd_d_1}; - static const prb_tile_3 t_bwd_w = {bwd_w_1, bwd_w_2, bwd_w_0}; - if (prb.is_fwd) return (prb.ab_swap_transpose) ? t_fwd : fwd; - if (prb.is_bwd_d) return (prb.ab_swap_transpose) ? 
t_bwd_d : bwd_d; - if (prb.is_bwd_w) return (prb.ab_swap_transpose) ? t_bwd_w : bwd_w; + if (prb.is_fwd) return fwd; + if (prb.is_bwd_d) return bwd_d; + if (prb.is_bwd_w) return bwd_w; ir_error_not_expected(); return fwd; } From bc976743923d880ead61be573c9194e8048af07a Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Thu, 25 Apr 2024 13:47:01 -0700 Subject: [PATCH 060/187] gpu, sycl: add query for L3 cache size --- src/gpu/intel/compute/device_info.cpp | 26 +++++++++-------------- src/gpu/intel/compute/device_info.hpp | 6 ++++-- src/gpu/intel/jit/ir/hw.hpp | 5 +++++ src/gpu/intel/ocl/ocl_gpu_device_info.cpp | 6 ++++++ src/sycl/sycl_device_info.cpp | 2 ++ 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/gpu/intel/compute/device_info.cpp b/src/gpu/intel/compute/device_info.cpp index c592e5714ed..83c237d47fe 100644 --- a/src/gpu/intel/compute/device_info.cpp +++ b/src/gpu/intel/compute/device_info.cpp @@ -233,20 +233,6 @@ size_t device_info_t::icache_size() const { } status_t device_info_t::init_attributes_common(engine_t *engine) { - // TODO: Fix for discrete GPUs. The code below is written for - // integrated GPUs assuming that last-level cache for GPU is shared - // with CPU. - // Integrated GPUs share LLC with CPU which is L3 cache on CPU. - - // XXX: this is the only place where GPU runtime functionally depends on - // CPU runtime. The `llc_cache_size_` is used only in one kernel for gen9. - // The idea is to use approximate cache size. - - // llc_cache_size_ = cpu::platform::get_per_core_cache_size(3) - // * cpu::platform::get_num_cores(); - // Assumption is that HT is likely enabled on client systems. 
- llc_cache_size_ = std::thread::hardware_concurrency() * (1 << 20); - bool ocl_backend = true; #ifdef DNNL_WITH_SYCL @@ -286,7 +272,7 @@ status_t device_info_t::init_serialized_device_info( serialized_device_info_.write(&max_subgroup_size_); serialized_device_info_.write(&max_exec_size_); serialized_device_info_.write(&max_wg_size_); - serialized_device_info_.write(&llc_cache_size_); + serialized_device_info_.write(&l3_cache_size_); serialized_device_info_.write(&extensions_); serialized_device_info_.write(&native_extensions_); serialized_device_info_.write(&mayiuse_systolic_); @@ -324,7 +310,7 @@ status_t device_info_t::init_from_cache_blob( DESERIALIZE(max_subgroup_size_, int32_t); DESERIALIZE(max_exec_size_, int); DESERIALIZE(max_wg_size_, size_t); - DESERIALIZE(llc_cache_size_, size_t); + DESERIALIZE(l3_cache_size_, size_t); DESERIALIZE(extensions_, uint64_t); DESERIALIZE(native_extensions_, uint64_t); DESERIALIZE(mayiuse_systolic_, bool); @@ -345,6 +331,14 @@ status_t device_info_t::init_from_cache_blob( return status::success; } +void device_info_t::fixup_l3_cache_size() { + // XXX: OpenCL/DPCPP does not report correct cache size for this + // configuration. 
+ if (gpu_arch() == gpu_arch_t::xe2 && eu_count() <= 64) { + l3_cache_size_ = (1 << 23); + } +} + } // namespace compute } // namespace intel } // namespace gpu diff --git a/src/gpu/intel/compute/device_info.hpp b/src/gpu/intel/compute/device_info.hpp index 2774c8f1fc4..4c4ec5a2055 100644 --- a/src/gpu/intel/compute/device_info.hpp +++ b/src/gpu/intel/compute/device_info.hpp @@ -226,6 +226,7 @@ struct device_info_t { CHECK(init_runtime_version(engine)); CHECK(init_extensions(engine)); CHECK(init_attributes(engine)); + fixup_l3_cache_size(); CHECK(init_attributes_common(engine)); @@ -262,7 +263,7 @@ struct device_info_t { static int max_slm_size_per_tg(gpu_arch_t gpu_arch); static int max_slm_size_per_tg( gpu_arch_t gpu_arch, int tg_size, bool large_grf_mode = false); - size_t llc_cache_size() const { return llc_cache_size_; } + size_t l3_cache_size() const { return l3_cache_size_; } size_t icache_size() const; const runtime_version_t &runtime_version() const { @@ -330,7 +331,7 @@ struct device_info_t { int32_t max_subgroup_size_ = 16; int max_exec_size_ = 0; size_t max_wg_size_ = 0; - size_t llc_cache_size_ = 0; + size_t l3_cache_size_ = 0; // extensions_ and gpu_arch_ describe effective extensions and GPU architecture. 
uint64_t extensions_ = 0; @@ -342,6 +343,7 @@ struct device_info_t { status_t init_serialized_device_info( const std::vector &cache_blob = {}); status_t init_from_cache_blob(const std::vector &cache_blob); + void fixup_l3_cache_size(); bool mayiuse_non_uniform_work_groups_ = false; diff --git a/src/gpu/intel/jit/ir/hw.hpp b/src/gpu/intel/jit/ir/hw.hpp index 585bc0c0039..493d2b40711 100644 --- a/src/gpu/intel/jit/ir/hw.hpp +++ b/src/gpu/intel/jit/ir/hw.hpp @@ -84,6 +84,7 @@ class hw_t { eu_count_ = device_info->eu_count(); max_wg_size_ = static_cast( device_info->max_wg_size(/*large_grf_mode=*/false)); + l3_cache_size_ = device_info->l3_cache_size(); large_grf_support_ = compute_engine->mayiuse_large_grf_mode(); systolic_support_ = device_info->mayiuse_systolic(); with_atomic_fp64_ @@ -108,6 +109,7 @@ class hw_t { int large_grf_support() const { return large_grf_support_; } int grf_size() const { return ngen::GRF::bytes(hw_); } int systolic_support() const { return systolic_support_; } + size_t l3_cache_size() const { return l3_cache_size_; } int max_tg_size(int regs, int simd) const { int wg_size = max_wg_size(regs); @@ -177,6 +179,7 @@ class hw_t { ir_utils::serialize(stepping_id_, out); ir_utils::serialize(eu_count_, out); ir_utils::serialize(max_wg_size_, out); + ir_utils::serialize(l3_cache_size_, out); ir_utils::serialize(large_grf_support_, out); ir_utils::serialize(systolic_support_, out); } @@ -186,6 +189,7 @@ class hw_t { ir_utils::deserialize(stepping_id_, in); ir_utils::deserialize(eu_count_, in); ir_utils::deserialize(max_wg_size_, in); + ir_utils::deserialize(l3_cache_size_, in); ir_utils::deserialize(large_grf_support_, in); ir_utils::deserialize(systolic_support_, in); } @@ -200,6 +204,7 @@ class hw_t { int stepping_id_ = -1; int eu_count_ = 0; int max_wg_size_ = 0; + size_t l3_cache_size_ = 0; bool large_grf_support_ = false; bool systolic_support_ = false; bool with_atomic_fp64_ = false; diff --git a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp 
b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp index 39195d76d8e..ff7831843c0 100644 --- a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp @@ -150,6 +150,12 @@ status_t ocl_gpu_device_info_t::init_attributes(engine_t *engine) { OCL_CHECK(err); max_wg_size_ = max_wg_size; + cl_ulong mem_cache_size; + err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, + sizeof(mem_cache_size), &mem_cache_size, nullptr); + OCL_CHECK(err); + l3_cache_size_ = mem_cache_size; + #ifdef cl_intel_unified_shared_memory cl_device_unified_shared_memory_capabilities_intel system_memory_capabilities_intel diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index d71b0f58b78..22cb503dc60 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -142,6 +142,8 @@ status_t sycl_device_info_t::init_attributes(engine_t *engine) { eu_count_ = device.get_info<::sycl::info::device::max_compute_units>(); } max_wg_size_ = device.get_info<::sycl::info::device::max_work_group_size>(); + l3_cache_size_ + = device.get_info<::sycl::info::device::global_mem_cache_size>(); mayiuse_system_memory_allocators_ = device.has(::sycl::aspect::usm_system_allocations); return status::success; From 1f8bf36390afeb6188f9332de4fce18e13e2a372 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 7 May 2024 16:23:12 -0700 Subject: [PATCH 061/187] gpu: intel: jit: add integer division by non-constant via FP inverse --- src/gpu/intel/jit/codegen/kernel.hpp | 50 ++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index bf02a62ecc0..8c331930298 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -725,6 +725,52 @@ class ir_kernel_t : public jit_generator { } // Emulates integer division by a non-constant (rounding towards negative + // infinity). 
This version is based on FP inverse and does not require a + // pre-computed "magic" value. Note, that cr0 register is updated/restored + // to use RTZ mode when converting float -> int. + // Requirements (validated range): + // -2^20 <= x <= 2^20 + // 0 < y <= 2^20 + // Computes: + // qot = x / y + // rem = x % y + void eidiv(const ngen::InstructionModifier &mod, const ngen::RegData &_qot, + const ngen::RegData &rem, const ngen::RegData &x, + const ngen::RegData &_y, bool update_cr0_fp_to_int_rtz = true) { + ir_assert(mod.getExecSize() == 1); + ir_assert(_y.getType() == ngen::DataType::ud); + auto cr0_save = ra_.alloc_sub(); + auto f_tmp = ra_.alloc_sub(); + auto x_tmp = ra_.alloc_sub(); + auto qot_tmp = ra_.alloc_sub(); + auto y = ngen::Subregister(_y, _y.getOffset(), _y.getType()); + mov(1, cr0_save, cr0); + // Set RTZ rounding mode when converting float to int. + and_(1, cr0, cr0, ~0x1000); + mov(1, f_tmp, y); + mov(1, x_tmp, x); + inv(1, f_tmp, f_tmp); + add(1, f_tmp.ud(0), f_tmp.ud(0), 1); + mul(1, f_tmp, x_tmp, f_tmp); + mov(mod, qot_tmp, f_tmp); + if (!rem.isInvalid()) { + auto tmp = ra_.alloc_sub(); + mul(1, tmp.d(0), qot_tmp, y.uw(0)); + mul(1, tmp.d(1), qot_tmp, y.uw(1)); + shl(1, tmp.ud(1), tmp.ud(1), 16); + add(1, tmp.d(0), tmp.d(1), tmp.d(0)); + add(mod, rem, x, -tmp.d(0)); + ra_.safeRelease(tmp); + } + if (!_qot.isInvalid()) mov(mod, _qot, qot_tmp); + mov(1, cr0, cr0_save); + ra_.safeRelease(cr0_save); + ra_.safeRelease(f_tmp); + ra_.safeRelease(x_tmp); + ra_.safeRelease(qot_tmp); + } + + // Emulates integer division by a constant (rounding towards negative // infinity) // Requirements: // INT32_MIN <= x <= UINT32_MAX @@ -740,7 +786,7 @@ class ir_kernel_t : public jit_generator { ir_assert(x.getHS() == 0); if (ngen::utils::is_zero_or_pow2(y)) { auto _x = get_subregister(x); - if (x.getNeg()) { + if (x.getNeg() || (x == qot) || (x == rem)) { // Negation modifier has bitwise semantics with shr/and so x // needs to be arithmetically negated first. 
_x = ra_.alloc_sub(div_type); @@ -773,7 +819,6 @@ class ir_kernel_t : public jit_generator { emul(1, q_tmp[0], _x, m); eshr(1, q_tmp.uq(0), q_tmp.uq(0), p); } - if (!qot.isInvalid()) mov(mod, qot, _qot); if (!rem.isInvalid()) { // rem = x - qot * y @@ -791,6 +836,7 @@ class ir_kernel_t : public jit_generator { ra_.safeRelease(tmp); } } + if (!qot.isInvalid()) mov(mod, qot, _qot); ra_.safeRelease(x_tmp); ra_.safeRelease(qot_tmp); From cef517974c0627ca5a65d2f1c60b98b2b0b3fce0 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 7 May 2024 16:23:57 -0700 Subject: [PATCH 062/187] gpu: intel: jit: introduce spatial_index(prb_dim_t) --- src/gpu/intel/jit/ir/problem.cpp | 9 +++++++++ src/gpu/intel/jit/ir/problem.hpp | 1 + 2 files changed, 10 insertions(+) diff --git a/src/gpu/intel/jit/ir/problem.cpp b/src/gpu/intel/jit/ir/problem.cpp index 44a7ac915b5..cfe8c97ca74 100644 --- a/src/gpu/intel/jit/ir/problem.cpp +++ b/src/gpu/intel/jit/ir/problem.cpp @@ -131,6 +131,15 @@ prb_dim_t n(prb_dim_kind_t::n); prb_dim_t k(prb_dim_kind_t::k); } // namespace prb_dims +int spatial_index(const prb_dim_t &dim) { + switch (to_spatial(dim.kind())) { + case prb_dim_spatial_kind_t::d: return 0; + case prb_dim_spatial_kind_t::h: return 1; + case prb_dim_spatial_kind_t::w: return 2; + default: return -1; + } +} + const expr_t &index_var(const prb_dim_t &prb_dim) { static thread_local dim_map_t index_vars = []() { dim_map_t ret; diff --git a/src/gpu/intel/jit/ir/problem.hpp b/src/gpu/intel/jit/ir/problem.hpp index c7bfcd9cdda..06ba417a595 100644 --- a/src/gpu/intel/jit/ir/problem.hpp +++ b/src/gpu/intel/jit/ir/problem.hpp @@ -440,6 +440,7 @@ bool has_spatial(const dim_map_t &map, return false; } +int spatial_index(const prb_dim_t &dim); const expr_t &index_var(const prb_dim_t &prb_dim); const expr_t &size_var(const prb_dim_t &prb_dim); prb_dim_t index_to_prb_dim(const expr_t &var); From daccb519d0cd2e84f64f01baedfaf00e2f83bfe6 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: 
Tue, 7 May 2024 16:26:44 -0700 Subject: [PATCH 063/187] gpu: intel: jit: introduce get_max_threadgroups_per_wave() --- src/gpu/intel/jit/ir/config.hpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/gpu/intel/jit/ir/config.hpp b/src/gpu/intel/jit/ir/config.hpp index 322aa7490ca..9f9be1959ed 100644 --- a/src/gpu/intel/jit/ir/config.hpp +++ b/src/gpu/intel/jit/ir/config.hpp @@ -345,16 +345,29 @@ class prim_config_t : public container_config_t { return ret; } + static int get_max_threadgroups_per_wave( + const exec_config_t &exec_cfg, int tg_elems) { + auto arch = convert_ngen_arch_to_dnnl(exec_cfg.hw().to_ngen()); + int threads_per_eu = compute::device_info_t::threads_per_eu( + arch, exec_cfg.regs() > 128); + int eus_per_subslice = compute::device_info_t::max_eus_per_wg(arch); + int subslice_count = exec_cfg.hw().eu_count() / eus_per_subslice; + + int tgs_per_subslice = eus_per_subslice * threads_per_eu / tg_elems; + ir_assert(tgs_per_subslice > 0); + return subslice_count * tgs_per_subslice; + } + // Return thread utilization as a percentage. If this value is low, // parallelism is a fundamental limitation to the current work scheduling. 
static float get_thread_utilization( const exec_config_t &exec_cfg, int kg_elems, int tg_elems) { auto arch = convert_ngen_arch_to_dnnl(exec_cfg.hw().to_ngen()); - int eus_per_slice = compute::device_info_t::max_eus_per_wg(arch); - int slice_count = exec_cfg.hw().eu_count() / eus_per_slice; + int eus_per_subslice = compute::device_info_t::max_eus_per_wg(arch); + int subslice_count = exec_cfg.hw().eu_count() / eus_per_subslice; - int min_wg_per_slice_wave = std::max(eus_per_slice / tg_elems, 1); - int min_wg_per_wave = slice_count * min_wg_per_slice_wave; + int min_wg_per_subslice_wave = std::max(eus_per_subslice / tg_elems, 1); + int min_wg_per_wave = subslice_count * min_wg_per_subslice_wave; return (100.f * kg_elems) / utils::rnd_up(kg_elems, min_wg_per_wave); } @@ -362,16 +375,8 @@ class prim_config_t : public container_config_t { // latency may be an issue due to limited use of SMT to hide the latency. static float get_wave_utilization( const exec_config_t &exec_cfg, int kg_elems, int tg_elems) { - auto arch = convert_ngen_arch_to_dnnl(exec_cfg.hw().to_ngen()); - int threads_per_eu = compute::device_info_t::threads_per_eu( - arch, exec_cfg.regs() > 128); - int eus_per_slice = compute::device_info_t::max_eus_per_wg(arch); - int slice_count = exec_cfg.hw().eu_count() / eus_per_slice; - - int wgs_per_slice = eus_per_slice * threads_per_eu / tg_elems; - ir_assert(wgs_per_slice > 0); - int wgs_per_tile = slice_count * wgs_per_slice; - return (100.f * kg_elems) / utils::rnd_up(kg_elems, wgs_per_tile); + int tgs_per_wave = get_max_threadgroups_per_wave(exec_cfg, tg_elems); + return (100.f * kg_elems) / utils::rnd_up(kg_elems, tgs_per_wave); } #define DECL_PARAM(name) \ From c0b25a2c5d673c871ca08ca7c903ecd2ddefb810 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 7 May 2024 16:26:06 -0700 Subject: [PATCH 064/187] gpu: intel: jit: conv: add grid walk order based on L3 cache size --- src/gpu/intel/jit/codegen/kernel.hpp | 145 +++++++++ 
src/gpu/intel/jit/conv/config.cpp | 424 +++++++++++++++++++++++-- src/gpu/intel/jit/conv/config.hpp | 34 +- src/gpu/intel/jit/conv/conv_kernel.hpp | 5 +- src/gpu/intel/jit/conv/ir_builder.cpp | 6 + src/gpu/intel/jit/conv/plan.cpp | 139 ++++---- src/gpu/intel/jit/conv/tiler.cpp | 1 + src/gpu/intel/jit/ir/gemm_schedule.hpp | 17 +- src/gpu/intel/jit/ir/walk_order.hpp | 186 +++++++++++ 9 files changed, 828 insertions(+), 129 deletions(-) create mode 100644 src/gpu/intel/jit/ir/walk_order.hpp diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index 8c331930298..bdcda7a87cb 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -29,6 +29,7 @@ #include "gpu/intel/jit/ir/kernel_info.hpp" #include "gpu/intel/jit/ir/message.hpp" #include "gpu/intel/jit/ir/tensor.hpp" +#include "gpu/intel/jit/ir/walk_order.hpp" #include "gpu/intel/jit/jit_generator.hpp" #include "gpu/intel/jit/ngen/ngen.hpp" #include "gpu/intel/jit/ngen/ngen_register_allocator.hpp" @@ -301,6 +302,18 @@ class ir_kernel_t : public jit_generator { bind_external_vars(kernel_body, grid_ctx, expr_binding); } + void bind_external_vars(const stmt_t &kernel_body, + const walk_order_t &kernel_grid_walk_order, + const std::array &local_id, + expr_binding_t &expr_binding) { + grid_context_t grid_ctx(/*create_empty=*/true); + for (int i = 0; i < 3; i++) { + grid_ctx.set_local_id(i, local_id[i]); + } + bind_external_vars(kernel_body, grid_ctx, expr_binding); + bind_kernel_grid_walk_order(kernel_grid_walk_order, expr_binding); + } + void bind_external_vars(const stmt_t &kernel_body, const grid_context_t &grid_ctx, expr_binding_t &expr_binding) { alloc_manager_t alloc_mgr(kernel_body); @@ -308,6 +321,7 @@ class ir_kernel_t : public jit_generator { // Bind grid indices. 
int r0_sub_idxs[] = {1, 6, 7}; for (int i = 0; i < 3; i++) { + if (grid_ctx.tg_idx(i).is_empty()) continue; auto tmp = ra_.template alloc_sub(); mov(1, tmp, r0.ud(r0_sub_idxs[i])); expr_binding.bind(grid_ctx.tg_idx(i), tmp); @@ -334,6 +348,137 @@ class ir_kernel_t : public jit_generator { if (!slm_buf.is_empty()) expr_binding.bind(slm_buf, to_ngen(expr_t(0))); } + void bind_kernel_grid_walk_order_blocked(const ngen::Subregister &id, + const std::vector> &blocks, + const std::vector &dims, const std::vector &grid_vars, + expr_binding_t &expr_binding) { + int ndims = (int)dims.size(); + int nblocks = (int)blocks.size(); + std::vector rem_dims(ndims); + std::vector dim_idxs(ndims); + for (int i = 0; i < ndims; i++) { + rem_dims[i] = ra_.alloc_sub(); + dim_idxs[i] = ra_.alloc_sub(); + emov(1, rem_dims[i], dims[i]); + emov(1, dim_idxs[i], 0); + } + + auto mul_add = [&](const ngen::Subregister &dst, + const ngen::Subregister &src0, + const ngen::Subregister &src1, uint32_t src2) { + bool is_src2_16_bit + = (src2 <= std::numeric_limits::max()); + if (hw >= ngen::HW::XeLP && is_src2_16_bit && false) { + mad(1, dst, src0, src1, src2); + } else { + auto tmp = ra_.alloc_sub(); + mul(1, tmp.d(0), src1, src2 & 0xFFFF); + mul(1, tmp.d(1), src1, src2 >> 16); + shl(1, tmp.ud(1), tmp.ud(1), 16); + add(1, tmp.d(0), tmp.d(1), tmp.d(0)); + add(1, dst, src0, tmp.d(0)); + ra_.safeRelease(tmp); + } + }; + + auto _id = ra_.alloc_sub(); + auto qot = ra_.alloc_sub(); + auto rem = ra_.alloc_sub(); + auto rem_size = ra_.alloc_sub(); + auto rounded = ra_.alloc_sub(); + emov(1, _id, id); + for (int i = nblocks - 1; i >= 0; i--) { + int dim_idx = blocks[i].first; + int inner_block_size = 1; + for (int j = 0; j < i; j++) { + if (blocks[j].first == dim_idx) + inner_block_size *= blocks[j].second; + } + emov(1, rem_size, inner_block_size); + for (int j = 0; j < ndims; j++) { + if (j == dim_idx) continue; + emul(1, rem_size, rem_size, rem_dims[j]); + } + eidiv(1, qot, rem, _id, rem_size); + emov(1, 
_id, rem); + mul_add(dim_idxs[dim_idx], qot, dim_idxs[dim_idx], + blocks[i].second); + emul(1, rounded, qot, inner_block_size); + eadd(1, rounded, rem_dims[dim_idx], -rounded); + min_(1, rem_dims[dim_idx], rounded, inner_block_size); + } + ra_.safeRelease(_id); + ra_.safeRelease(qot); + ra_.safeRelease(rem); + ra_.safeRelease(rem_size); + ra_.safeRelease(rounded); + + for (int i = 0; i < ndims; i++) + ra_.safeRelease(rem_dims[i]); + + for (int i = 0; i < ndims; i++) { + expr_binding.bind(grid_vars[i], dim_idxs[i]); + } + } + + void bind_kernel_grid_walk_order_non_blocked(const ngen::Subregister &id, + const std::vector> &blocks, + const std::vector &grid_vars, + expr_binding_t &expr_binding) { + int nblocks = (int)blocks.size(); + ir_assert((int)grid_vars.size() == nblocks); + if (nblocks == 1) { + expr_binding.bind(grid_vars[0], id); + return; + } + auto _id = ra_.alloc_sub(); + emov(1, _id, id); + for (int i = 0; i < nblocks; i++) { + int dim_idx = blocks[i].first; + auto idx = ra_.alloc_sub(); + eidiv(1, _id, idx, _id, (uint32_t)blocks[i].second); + expr_binding.bind(grid_vars[dim_idx], idx); + } + ra_.safeRelease(_id); + } + + void bind_kernel_grid_walk_order( + const walk_order_t &walk_order, expr_binding_t &expr_binding) { + const int grid_ndims = 3; + ngen::Subregister grid_ids[grid_ndims] = {r0.ud(1), r0.ud(6), r0.ud(7)}; + for (int i = 0; i < grid_ndims; i++) { + std::vector> blocks; + std::unordered_map> + dim_map; + auto to_dim_idx = [&](const prb_dim_t &dim) { + if (dim_map.count(dim) != 0) return dim_map.at(dim); + int idx = (int)dim_map.size(); + dim_map.emplace(dim, idx); + return idx; + }; + for (auto &b : walk_order.blocks()) { + if (b.grid_id != i) continue; + blocks.emplace_back(to_dim_idx(b.dim), b.size); + } + if (dim_map.empty()) continue; + std::vector dims; + std::vector grid_vars; + dims.resize(dim_map.size()); + grid_vars.resize(dim_map.size()); + for (auto &kv : dim_map) { + dims[kv.second] = walk_order.dim_size(kv.first); + 
grid_vars[kv.second] = walk_order.grid_var(kv.first); + } + if (walk_order.is_blocked(i) || gpu_utils::dev_getenv("B", false)) { + bind_kernel_grid_walk_order_blocked( + grid_ids[i], blocks, dims, grid_vars, expr_binding); + } else { + bind_kernel_grid_walk_order_non_blocked( + grid_ids[i], blocks, grid_vars, expr_binding); + } + } + } + void generate_epilogue() { epilogue(); pad_kernel(); diff --git a/src/gpu/intel/jit/conv/config.cpp b/src/gpu/intel/jit/conv/config.cpp index 3077bc88542..846d483a50f 100644 --- a/src/gpu/intel/jit/conv/config.cpp +++ b/src/gpu/intel/jit/conv/config.cpp @@ -1201,37 +1201,19 @@ void init_params(conv_config_t &cfg) { cfg.tiler().set_params(cfg); } -const std::array &get_kernel_grid_conv_dims( - const conv_problem_t &prb) { - static const prb_tile_t fwd_0({prb_dims::oc}); - static const prb_tile_t fwd_1( - {prb_dims::g, prb_dims::od, prb_dims::oh, prb_dims::ow}); - static const prb_tile_t fwd_2({prb_dims::mb}); - - static const prb_tile_t bwd_d_0({prb_dims::ic}); - static const prb_tile_t bwd_d_1( - {prb_dims::g, prb_dims::id, prb_dims::ih, prb_dims::iw}); - static const prb_tile_t bwd_d_2({prb_dims::mb}); - - static const prb_tile_t bwd_w_0({prb_dims::oc}); - static const prb_tile_t bwd_w_1({prb_dims::ic, prb_dims::kd, prb_dims::kh, - prb_dims::kw, prb_dims::od, prb_dims::oh, prb_dims::ow}); - static const prb_tile_t bwd_w_2({prb_dims::g, prb_dims::mb}); - - using prb_tile_3 = std::array; - static const prb_tile_3 fwd = {fwd_0, fwd_1, fwd_2}; - static const prb_tile_3 bwd_d = {bwd_d_0, bwd_d_1, bwd_d_2}; - static const prb_tile_3 bwd_w = {bwd_w_0, bwd_w_1, bwd_w_2}; - - if (prb.is_fwd) return fwd; - if (prb.is_bwd_d) return bwd_d; - if (prb.is_bwd_w) return bwd_w; - ir_error_not_expected(); - return fwd; +std::array get_kernel_grid_conv_dims(const conv_config_t &cfg) { + std::array grid_dims; + for (int i = 0; i < 3; i++) { + for (auto &d : cfg.walk_order().grid_dims(i)) { + grid_dims[i][d] = 1; + } + } + return grid_dims; } -const 
std::array &get_thread_group_grid_conv_dims( - const conv_problem_t &prb) { +using prb_tile_3 = std::array; + +prb_tile_3 get_thread_group_grid_conv_dims(const conv_config_t &cfg) { static const prb_tile_t fwd_0({prb_dims::oc}); static const prb_tile_t fwd_1({prb_dims::mb, prb_dims::ow}); static const prb_tile_t fwd_2({prb_dims::ic}); @@ -1244,7 +1226,6 @@ const std::array &get_thread_group_grid_conv_dims( static const prb_tile_t bwd_w_1({prb_dims::ic}); static const prb_tile_t bwd_w_2; - using prb_tile_3 = std::array; // non-transposed static const prb_tile_3 fwd = {fwd_0, fwd_1, fwd_2}; static const prb_tile_3 bwd_d = {bwd_d_0, bwd_d_1, bwd_d_2}; @@ -1254,6 +1235,7 @@ const std::array &get_thread_group_grid_conv_dims( static const prb_tile_3 t_bwd_d = {bwd_d_1, bwd_d_0, bwd_d_2}; static const prb_tile_3 t_bwd_w = {bwd_w_1, bwd_w_0, bwd_w_2}; + auto &prb = cfg.prb(); if (prb.is_fwd) return (prb.ab_swap_transpose) ? t_fwd : fwd; if (prb.is_bwd_d) return (prb.ab_swap_transpose) ? t_bwd_d : bwd_d; if (prb.is_bwd_w) return (prb.ab_swap_transpose) ? 
t_bwd_w : bwd_w; @@ -1262,11 +1244,381 @@ const std::array &get_thread_group_grid_conv_dims( } void init_kernel_grid(conv_config_t &cfg) { - cfg.init_kernel_grid(get_kernel_grid_conv_dims(cfg.prb())); + cfg.init_kernel_grid(get_kernel_grid_conv_dims(cfg)); } void init_thread_group_grid(conv_config_t &cfg) { - cfg.init_thread_group_grid(get_thread_group_grid_conv_dims(cfg.prb())); + cfg.init_thread_group_grid(get_thread_group_grid_conv_dims(cfg)); +} + +void get_layout_and_dims(tensor_kind_t ab_kind, const conv_config_t &cfg, + layout_t &layout, std::vector &dims) { + auto &prb = cfg.prb(); + auto &src_dims + = conv_layout_dims(tensor_kind_t::src, /*src_dst_with_group=*/true); + auto &wei_dims + = conv_layout_dims(tensor_kind_t::wei, /*src_dst_with_group=*/true); + auto &dst_dims + = conv_layout_dims(tensor_kind_t::dst, /*src_dst_with_group=*/true); + switch (ab_kind) { + case tensor_kind_t::a: + layout = prb.pick_a(cfg.src_layout(), + cfg.wei_layout(), cfg.dst_layout()) + .compute(); + dims = prb.pick_a &>( + src_dims, wei_dims, dst_dims); + break; + case tensor_kind_t::b: + layout = prb.pick_b(cfg.src_layout(), + cfg.wei_layout(), cfg.dst_layout()) + .compute(); + dims = prb.pick_b &>( + src_dims, wei_dims, dst_dims); + break; + default: ir_error_not_expected(); + } + ir_assert(layout.ndims() == (int)dims.size()); +} + +// Calculates the size of the range for spatial dimensions within a tile. +// For example, consider forward convolution with stride of 2 and tile ow8kw3. +// After mapping (iw = ow * SW + kw), "iw" range is [0, 16] of size 17. 
+int map_spatial(const conv_config_t &cfg, const prb_dim_t &dim, + const prb_tile_t &tile) { + auto &prb = cfg.prb(); + bool is_isp = utils::one_of(dim, prb_dims::id, prb_dims::ih, prb_dims::iw); + bool is_osp = utils::one_of(dim, prb_dims::od, prb_dims::oh, prb_dims::ow); + const prb_dim_t isp_dims[] = {prb_dims::id, prb_dims::ih, prb_dims::iw}; + const prb_dim_t ksp_dims[] = {prb_dims::kd, prb_dims::kh, prb_dims::kw}; + const prb_dim_t osp_dims[] = {prb_dims::od, prb_dims::oh, prb_dims::ow}; + int isp[] = {prb.id, prb.ih, prb.iw}; + int osp[] = {prb.od, prb.oh, prb.ow}; + int padding[] = {prb.pd, prb.ph, prb.pw}; + int stride[] = {prb.sd, prb.sh, prb.sw}; + int dilation[] = {prb.dd, prb.dh, prb.dw}; + int idx = spatial_index(dim); + ir_assert(idx != -1); + int O = tile.get(osp_dims[idx], 1); + int I = tile.get(isp_dims[idx], 1); + int K = tile.get(ksp_dims[idx], 1); + int P = padding[idx]; + int S = stride[idx]; + int D = dilation[idx]; + if (is_isp) { + // Source tensor, map ox, kx to ix. + ir_assert(prb.is_fwd || prb.is_bwd_w); + int i_min = -P; + int i_max = (O - 1) * S - P + (K - 1) * (1 + D); + return std::min(isp[idx], i_max - i_min + 1); + } + // Destination tensor, map ix, kx to ox. 
+ ir_assert(is_osp && prb.is_bwd_d); + int os_min = P - (K - 1) * (1 + D); + int os_max = (I - 1) + P; + return std::min(osp[idx], utils::div_up(os_max - os_min + 1, S)); +} + +bool needs_spatial_mapping(const conv_config_t &cfg, const prb_dim_t &dim) { + auto &prb = cfg.prb(); + switch (dim.kind()) { + case prb_dim_kind_t::od: + case prb_dim_kind_t::oh: + case prb_dim_kind_t::ow: return prb.is_bwd_d; + case prb_dim_kind_t::id: + case prb_dim_kind_t::ih: + case prb_dim_kind_t::iw: return prb.is_fwd || prb.is_bwd_w; + default: return false; + } +} + +size_t get_memory_footprint(const tensor_kind_t &ab_kind, + const conv_config_t &cfg, const prb_tile_t &_tile) { + layout_t layout; + std::vector dims; + get_layout_and_dims(ab_kind, cfg, layout, dims); + dim_t elems = 1; + prb_tile_t tile; + for (int i = 0; i < layout.ndims(); i++) { + auto &d = dims[i]; + dim_t d_size + = (needs_spatial_mapping(cfg, d) ? map_spatial(cfg, d, _tile) + : _tile.get(d, 1)); + tile[d] = d_size; + elems *= std::min(d_size, layout.dim(i)); + } + ir_assert(elems >= 1); + return (size_t)layout.type().size() * elems; +} + +// Returns the memory footprint in bytes for both input tensors accessed inside +// the tile that is combined from tg_tile and grid_tile. 
+size_t get_memory_footprint(const conv_config_t &cfg, const prb_tile_t &tg_tile, + const prb_tile_t &grid_tile) { + prb_tile_t tile; + for (auto &d : tg_tile) { + if (tg_tile[d] == 1) continue; + tile[d] = tg_tile[d]; + } + for (auto &d : grid_tile) { + if (grid_tile[d] == 1) continue; + tile[d] = tile.get(d, 1) * grid_tile[d]; + } + auto a_bytes = get_memory_footprint(tensor_kind_t::a, cfg, tile); + auto b_bytes = get_memory_footprint(tensor_kind_t::b, cfg, tile); + return a_bytes + b_bytes; +} + +prb_tile_t get_grid_tile(const conv_config_t &cfg) { + prb_tile_t grid_tile; + for (auto &d : conv_index_dims(cfg.prb().prop_kind())) { + int size = cfg.grid_dim(d); + if (size == 1) continue; + grid_tile[d] = size; + } + return grid_tile; +} + +// Adjusts walk order to iterate group dimension earlier to ensure better +// access locality for a higher cache hit rate. +walk_order_t maybe_fixup_group_with_small_channels( + const conv_config_t &cfg, const walk_order_t &walk_order) { + auto &prb = cfg.prb(); + auto grid_tile = get_grid_tile(cfg); + if (prb.g == 1 || !grid_tile.has(prb_dims::g)) return walk_order; + + auto &layout = (prb.is_fwd || prb.is_bwd_w) ? cfg.src_layout().compute() + : cfg.dst_layout().compute(); + const int g_dim_idx = 1; + const int c_dim_idx = 2; + if (layout.nblocks() <= 1) return walk_order; + auto &b0 = layout.blocks()[0]; + auto &b1 = layout.blocks()[1]; + // Check that layout has groups followed by channels, i.e. *gc form. + if (b0.dim_idx != c_dim_idx || b1.dim_idx != g_dim_idx) return walk_order; + // If the full channel dimension exceeds the cache line size, cache reuse + // should be already good enough. + // Xe2 has 256 byte L3 cache block so try to span 4 cache lines. + int factor = (cfg.hw() == ngen::HW::Xe2) ? 
4 : 1; + if (layout.type().size() * b0.block >= cfg.hw().cache_line_size() * factor) + return walk_order; + + walk_order_t fixed; + fixed.add(prb_dims::g, grid_tile.at(prb_dims::g), 0); + for (auto &b : walk_order.blocks()) { + if (b.dim == prb_dims::g) continue; + fixed.add(b.dim, b.size, b.grid_id); + } + fixed.finalize(grid_tile); + return fixed; +} + +walk_order_t get_default_walk_order( + const conv_config_t &cfg, const prb_tile_t &grid_tile) { + using vec_t = std::vector; + // Ordered from innermost to outermost. + static const vec_t fwd_0({prb_dims::oc}); + static const vec_t fwd_1( + {prb_dims::ow, prb_dims::oh, prb_dims::od, prb_dims::g}); + static const vec_t fwd_2({prb_dims::mb}); + + static const vec_t bwd_d_0({prb_dims::ic}); + static const vec_t bwd_d_1( + {prb_dims::iw, prb_dims::ih, prb_dims::id, prb_dims::g}); + static const vec_t bwd_d_2({prb_dims::mb}); + + static const vec_t bwd_w_0({prb_dims::oc}); + static const vec_t bwd_w_1({prb_dims::ic, prb_dims::kw, prb_dims::kh, + prb_dims::kd, prb_dims::ow, prb_dims::oh, prb_dims::od}); + static const vec_t bwd_w_2({prb_dims::g, prb_dims::mb}); + static const std::array fwd = {fwd_0, fwd_1, fwd_2}; + static const std::array bwd_d = {bwd_d_0, bwd_d_1, bwd_d_2}; + static const std::array bwd_w = {bwd_w_0, bwd_w_1, bwd_w_2}; + auto grid_dims + = (cfg.prb().is_fwd ? fwd : (cfg.prb().is_bwd_d ? bwd_d : bwd_w)); + walk_order_t walk_order; + for (int i = 0; i < 3; i++) { + for (auto &d : grid_dims[i]) { + if (grid_tile.has(d)) walk_order.add(d, grid_tile[d], i); + } + } + walk_order.finalize(grid_tile); + walk_order = maybe_fixup_group_with_small_channels(cfg, walk_order); + return walk_order; +} + +// Helper class to iterate through M/N problem sizes in blocks to ensure +// squarish (M x N) size for more optimal cache reuse. 
+class mn_walker_t { +public: + struct entry_t { + prb_dim_t dim; + int size = 1; + int tile_size = 1; + prb_dim_kind_t mn_kind = prb_dim_kind_t::undef; + + bool has_next() const { return size < tile_size; } + }; + + mn_walker_t(const prb_tile_t &tile, const conv_problem_t &prb) : prb_(prb) { + for (auto &d : tile) { + auto bmnk = to_gemm(d, prb); + entry_t e; + e.dim = d; + e.tile_size = tile[d]; + e.mn_kind = bmnk.kind(); + if (!utils::one_of(e.mn_kind, prb_dim_kind_t::m, prb_dim_kind_t::n)) + continue; + entries_.push_back(e); + } + // Put through spatial dimensions first and order spatial accordingly + // (WHD, width is first). + std::sort(entries_.begin(), entries_.end(), + [&](const entry_t &a, const entry_t &b) { + int a_sp_idx = spatial_index(a.dim); + int b_sp_idx = spatial_index(b.dim); + if (a_sp_idx >= 0 && b_sp_idx >= 0) + return a_sp_idx > b_sp_idx; + return (a_sp_idx >= 0) && (b_sp_idx < 0); + }); + } + + bool has_next() const { + for (auto &e : entries_) + if (e.has_next()) return true; + return false; + } + + entry_t next(const prb_tile_t &inner) { + int m_size = 1; + int n_size = 1; + for (auto &d : inner) { + auto bmnk = to_gemm(d, prb_); + if (bmnk == prb_dims::m) { + m_size *= inner[d]; + } else if (bmnk == prb_dims::n) { + n_size *= inner[d]; + } + } + auto mn_kind + = (m_size < n_size ? 
prb_dim_kind_t::m : prb_dim_kind_t::n); + for (auto kind : {mn_kind, prb_dim_kind_t::undef}) { + for (auto &e : entries_) { + if (utils::one_of(kind, e.mn_kind, prb_dim_kind_t::undef) + && e.has_next()) { + e.size *= 2; + return e; + } + } + } + ir_error_not_expected(); + return entry_t(); + } + +private: + conv_problem_t prb_; + std::vector entries_; +}; + +walk_order_t compute_walk_order(const conv_config_t &cfg) { + auto &prb = cfg.prb(); + int tg_size = 1; + prb_tile_t inner; + for (auto &d : conv_index_dims(cfg.prb().prop_kind())) { + int iter = cfg.iter_dim(d); + int tg = cfg.thread_group_dim(d); + int loop = cfg.loop_dim(d); + int size = iter * tg * loop; + if (size == 1) continue; + inner[d] = size; + tg_size *= tg; + } + auto grid_tile = get_grid_tile(cfg); + auto default_walk_order = get_default_walk_order(cfg, grid_tile); + + // Depthwise does not expose much reuse so keep the default order. + if (prb.is_dw) return default_walk_order; + + // If threadgroup memory footprint exceeds L3 then L3 blocking is not + // applied. + const size_t l3_size = cfg.hw().l3_cache_size(); + size_t inner_bytes = get_memory_footprint(cfg, inner, prb_tile_t()); + if (inner_bytes > l3_size) return default_walk_order; + + // If input memory fits L3 then no L3 blocking is not applied. + size_t ab_bytes = get_memory_footprint(cfg, inner, grid_tile); + if (ab_bytes <= l3_size) return default_walk_order; + + // If the kernel does not require multiple waves then no L3 blocking is not + // applied. + float max_tgs_per_wave = conv_config_t::get_max_threadgroups_per_wave( + cfg.exec_cfg(), tg_size); + if (grid_tile.elems() <= max_tgs_per_wave) return default_walk_order; + + // Add M/N blocks until the full footprint fits L3 cache. 
+ prb_tile_t grid_inner; + prb_tile_t rem_tile = grid_tile; + ab_bytes = inner_bytes; + mn_walker_t mn_walker(rem_tile, cfg.prb()); + while (mn_walker.has_next()) { + auto entry = mn_walker.next(grid_inner); + auto outer = grid_inner; + outer[entry.dim] = std::min(rem_tile[entry.dim], entry.size); + size_t ab_bytes = get_memory_footprint(cfg, inner, outer); + if (ab_bytes <= l3_size) grid_inner = outer; + } + // Add the blocks in this order: + // - Step 1. Add grid_inner blocks (fitting L3 cache) + // - Step 2. Add the remaining M/N blocks + // - Step 3. Add the remaining B/K blocks + // Within a step follow the default walk order between dimensions. + walk_order_t walk_order; + for (int step = 0; step < 3; step++) { + for (auto &b : default_walk_order.blocks()) { + switch (step) { + case 0: + if (grid_inner.has(b.dim)) { + walk_order.add(b.dim, grid_inner[b.dim], 0); + } + break; + case 1: + case 2: + int rem = utils::div_up( + grid_tile[b.dim], grid_inner.get(b.dim, 1)); + if (rem == 1) continue; + auto bmnk = to_gemm(b.dim, prb); + bool is_bk = utils::one_of(bmnk, prb_dims::b, prb_dims::k); + if ((step == 2) != is_bk) continue; + walk_order.add(b.dim, rem, 0); + break; + } + } + } + walk_order.finalize(grid_tile); + walk_order = maybe_fixup_group_with_small_channels(cfg, walk_order); + + // Emulated integer division can handle a limited range only. 
+ const int max_size_per_grid_id = (1 << 20); + for (int id = 0; id < 3; id++) { + if (!walk_order.is_blocked(id)) continue; + int size = 1; + for (auto &b : walk_order.blocks()) { + if (b.grid_id == id) size *= b.size; + } + if (size > max_size_per_grid_id) return default_walk_order; + } + + return walk_order; +} + +void init_walk_order(conv_config_t &cfg) { + if (cfg.walk_order_param().is_overridden()) { + auto walk_order = cfg.walk_order(); + walk_order.finalize(get_grid_tile(cfg)); + cfg.set_walk_order(walk_order); + return; + } + auto walk_order = compute_walk_order(cfg); + cfg.walk_order_param().set(walk_order); } int fixup_slm_bufs(const conv_problem_t &prb, int slm_bufs, @@ -1371,9 +1723,8 @@ void validate_config_and_plan(conv_config_t &cfg) { if (d == dim) return; ir_error_not_expected() << dim.name(); }; - const auto &prb = cfg.prb(); - const auto &tg_dims = get_thread_group_grid_conv_dims(prb); - const auto &grid_dims = get_kernel_grid_conv_dims(prb); + const auto &tg_dims = get_thread_group_grid_conv_dims(cfg); + const auto &grid_dims = get_kernel_grid_conv_dims(cfg); for (auto &d : cfg.dims()) { if (cfg.thread_group_dim(d) != 1) check_if_in_grid_dims(tg_dims, d); if (cfg.grid_dim(d) != 1) check_if_in_grid_dims(grid_dims, d); @@ -1390,6 +1741,7 @@ void validate_config_and_plan(conv_config_t &cfg) { send_pattern b_load_pattern; bool a_2d = plan.uses_2d_load(abc_kind_t::a); bool b_2d = plan.uses_2d_load(abc_kind_t::b); + auto &prb = cfg.prb(); if (prb.is_fwd) { a_load_pattern = validate_blocking( cfg, conv_stride_layout_t::input_tensor_t::src, a_2d); @@ -1423,6 +1775,7 @@ void validate_config_and_plan(conv_config_t &cfg) { status_t try_init_cfg(conv_config_t &cfg) { init_params(cfg); + init_walk_order(cfg); init_kernel_grid(cfg); init_thread_group_grid(cfg); @@ -1527,6 +1880,7 @@ std::string conv_config_t::str() const { oss << " Subtiles: " << "A: " << subtiles().a() << ", B: " << subtiles().b() << std::endl; oss << " Estimated GRF usage: " << 
estimated_peak_regs << std::endl; oss << " AB Swap Transpose: " << to_string(prb().ab_swap_transpose) << std::endl; + oss << " Kernel grid walk order: " << walk_order() << std::endl; oss << " Configuration line: " << get_config_line() << std::endl; // clang-format on return oss.str(); diff --git a/src/gpu/intel/jit/conv/config.hpp b/src/gpu/intel/jit/conv/config.hpp index 31771ff6197..4d3d3ab54d1 100644 --- a/src/gpu/intel/jit/conv/config.hpp +++ b/src/gpu/intel/jit/conv/config.hpp @@ -29,6 +29,7 @@ #include "gpu/intel/jit/ir/fma.hpp" #include "gpu/intel/jit/ir/hw.hpp" #include "gpu/intel/jit/ir/tensor_config.hpp" +#include "gpu/intel/jit/ir/walk_order.hpp" #include "gpu/intel/jit/utils/utils.hpp" namespace dnnl { @@ -412,6 +413,29 @@ class subtiles_param_t : public param_t { int b_ = 1; }; +class walk_order_param_t : public value_param_t { +public: + using value_param_t::value_param_t; + + std::string name() const override { return "walk-order"; } + std::string desc() const override { + return "Kernel grid walk order (innermost -> outermost)."; + } + bool is_overridable() const override { return true; } + bool is_default() const override { return false; } + + void set_from_str(const std::string &s) override { + if (s.empty()) return; + value_ = walk_order_t(s); + } + + std::string str() const override { + std::ostringstream oss; + oss << short_name() << "=" << value_; + return oss.str(); + } +}; + class wei_layout_param_t : public layout_param_t { std::string name() const override { return "wei"; } std::string desc() const override { return "Weights layout."; } @@ -468,6 +492,7 @@ class conv_config_t : public prim_config_t { DECL_PARAM(fma_kind) DECL_PARAM(pad_slm) DECL_PARAM(prb) + DECL_PARAM(walk_order) DECL_PARAM2(pipeline) DECL_PARAM2(prefetch) DECL_PARAM2(slm) @@ -593,6 +618,7 @@ class conv_config_t : public prim_config_t { INIT_PARAM(unroll) INIT_PARAM(wei_layout) INIT_PARAM(bia_layout) + INIT_PARAM(walk_order) #undef INIT_PARAM }; @@ -632,11 +658,11 @@ 
tensor_config_t get_tensor_config( int estimate_register_count(const conv_config_t &cfg); int default_regs(const conv_config_t &cfg); void init_kernel_grid(conv_config_t &cfg); +void init_walk_order(conv_config_t &cfg); void init_thread_group_grid(conv_config_t &cfg); -const std::array &get_kernel_grid_conv_dims( - const conv_problem_t &prb); -const std::array &get_thread_group_grid_conv_dims( - const conv_problem_t &prb); +std::array get_kernel_grid_conv_dims(const conv_config_t &cfg); +std::array get_thread_group_grid_conv_dims( + const conv_config_t &cfg); } // namespace jit } // namespace intel diff --git a/src/gpu/intel/jit/conv/conv_kernel.hpp b/src/gpu/intel/jit/conv/conv_kernel.hpp index 32e529572d1..4021e5f6ab2 100644 --- a/src/gpu/intel/jit/conv/conv_kernel.hpp +++ b/src/gpu/intel/jit/conv/conv_kernel.hpp @@ -30,6 +30,7 @@ #include "gpu/intel/jit/conv/config.hpp" #include "gpu/intel/jit/conv/grf_usage.hpp" #include "gpu/intel/jit/conv/ir_builder.hpp" +#include "gpu/intel/jit/conv/plan.hpp" namespace dnnl { namespace impl { @@ -85,8 +86,8 @@ conv_kernel_t::conv_kernel_t(const conv_config_t &cfg, // Bind "external" variables. 
expr_binding_t expr_binding(hw); - bind_external_vars( - body, cfg_.kernel_grid(), builder.local_id(), expr_binding); + bind_external_vars(body, cfg_.plan().gemm_schedule.kernel_grid_walk_order(), + builder.local_id(), expr_binding); profile.stamp("Bind Variables"); #ifdef DNNL_DEV_MODE diff --git a/src/gpu/intel/jit/conv/ir_builder.cpp b/src/gpu/intel/jit/conv/ir_builder.cpp index 8ce031a0e5b..626a1f7c791 100644 --- a/src/gpu/intel/jit/conv/ir_builder.cpp +++ b/src/gpu/intel/jit/conv/ir_builder.cpp @@ -674,6 +674,12 @@ void conv_ir_builder_t::build() { init_kernel_grid(cfg_.kernel_grid(), cfg_.thread_group_grid(), cfg_.simd(), init_cset, init_stmts); + auto &walk_order = gemm_schedule.kernel_grid_walk_order(); + for (auto &info : walk_order.dim_infos()) { + init_cset.add_constraint(info.grid_var >= 0); + init_cset.add_constraint(info.grid_var < info.size); + } + // Initialize memory buffers. std::vector inner_lets; diff --git a/src/gpu/intel/jit/conv/plan.cpp b/src/gpu/intel/jit/conv/plan.cpp index 12ff9b592d2..e22e3b76c82 100644 --- a/src/gpu/intel/jit/conv/plan.cpp +++ b/src/gpu/intel/jit/conv/plan.cpp @@ -87,8 +87,8 @@ static dim_tile_t create_tile(gemm_schedule_t &gemm_schedule, bool is_tg = (dim_idx == 2); bool is_iter = (dim_idx == 3); if (is_thr || is_iter) return true; - auto &grid = is_tg ? get_thread_group_grid_conv_dims(cfg.prb()) - : get_kernel_grid_conv_dims(cfg.prb()); + auto grid = is_tg ? get_thread_group_grid_conv_dims(cfg) + : get_kernel_grid_conv_dims(cfg); for (auto &tile : grid) for (auto &d : tile) if (dim_name == d.name()) return true; @@ -115,60 +115,34 @@ static dim_tile_t create_tile(gemm_schedule_t &gemm_schedule, return tile; } -// Checks if groups should be iterated first to ensure better access locality -// for a higher cache hit rate. 
-bool set_g_grid_idx_innermost(const hw_t &hw, const layout_t &layout) { - const int g_dim_idx = 1; - const int c_dim_idx = 2; - if (layout.nblocks() <= 1) return false; - auto &b0 = layout.blocks()[0]; - auto &b1 = layout.blocks()[1]; - // Check that layout has groups followed by channels, i.e. *gc form. - if (b0.dim_idx != c_dim_idx || b1.dim_idx != g_dim_idx) return false; - // If the full channel dimension exceeds the cache line size, cache reuse - // should be already good enough. - if (layout.type().size() * b0.block >= hw.cache_line_size()) return false; - return true; -} - -bool set_g_grid_idx_innermost(const conv_config_t &cfg) { - auto &prb = cfg.prb(); - if (prb.g == 1) return false; - - if (prb.is_fwd || prb.is_bwd_w) { - return set_g_grid_idx_innermost(cfg.hw(), cfg.src_layout().compute()); - } - return set_g_grid_idx_innermost(cfg.hw(), cfg.dst_layout().compute()); -} - -void bind_grid_idx(const conv_config_t &cfg, gemm_schedule_t &gemm_schedule, - const expr_t &var, bool is_tg) { - auto &grid_dims = is_tg ? get_thread_group_grid_conv_dims(cfg.prb()) - : get_kernel_grid_conv_dims(cfg.prb()); - int grid_idx = -1; +void bind_thread_group_grid_idx(const conv_config_t &cfg, + gemm_schedule_t &gemm_schedule, const expr_t &var) { + auto grid_dims = get_thread_group_grid_conv_dims(cfg); + int grid_id = -1; for (auto &v : gemm_schedule.get_root_vars(var)) { auto v_dim = prb_dim_t::from_name(v.as().name); for (int i = 0; i < 3; i++) { if (grid_dims[i].has(v_dim)) { - ir_assert(grid_idx == -1 || grid_idx == i); - grid_idx = i; + ir_assert(grid_id == -1 || grid_id == i); + grid_id = i; } } } - ir_assert(grid_idx != -1); - gemm_schedule.bind(var, - is_tg ? 
cfg.thread_group_grid().idx(grid_idx) - : cfg.kernel_grid().idx(grid_idx)); + ir_assert(grid_id != -1); + gemm_schedule.bind(var, cfg.thread_group_grid().idx(grid_id)); } -void bind_kernel_grid_idx(const conv_config_t &cfg, - gemm_schedule_t &gemm_schedule, const expr_t &var) { - bind_grid_idx(cfg, gemm_schedule, var, /*is_tg=*/false); -} - -void bind_thread_group_grid_idx(const conv_config_t &cfg, - gemm_schedule_t &gemm_schedule, const expr_t &var) { - bind_grid_idx(cfg, gemm_schedule, var, /*is_tg=*/true); +void bind_kernel_grid( + gemm_schedule_t &gemm_schedule, const std::vector &vars) { + for (auto &v : vars) { + if (gemm_schedule.var_bound(v) == 1) continue; + auto root_vars = gemm_schedule.get_root_vars(v); + ir_assert((int)root_vars.size() == 1); + auto v_dim = prb_dim_t::from_name(root_vars[0].as().name); + auto dummy_grid_var + = gemm_schedule.kernel_grid_walk_order().grid_var(v_dim); + gemm_schedule.bind(v, dummy_grid_var); + } } void init_fwd(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, @@ -304,19 +278,17 @@ void init_fwd(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, auto ic_tile = create_tile(gemm_schedule, cfg_, ic); auto kw_tile = create_tile(gemm_schedule, cfg_, kw); - expr_t g_ow_grid_idx; - if (set_g_grid_idx_innermost(cfg_)) { - g_ow_grid_idx = gemm_schedule.fuse( - {od, oh, ow_tile.grid_idx(), g_tile.grid_idx()}); - } else { - g_ow_grid_idx = gemm_schedule.fuse( - {g_tile.grid_idx(), od, oh, ow_tile.grid_idx()}); - } auto mb_ow_tg_idx = gemm_schedule.fuse(mb_tile.tg_idx(), ow_tile.tg_idx()); - bind_kernel_grid_idx(cfg_, gemm_schedule, oc_tile.grid_idx()); - bind_kernel_grid_idx(cfg_, gemm_schedule, g_ow_grid_idx); - bind_kernel_grid_idx(cfg_, gemm_schedule, mb_tile.grid_idx()); + std::vector kernel_grid_vars; + kernel_grid_vars.push_back(oc_tile.grid_idx()); + kernel_grid_vars.push_back(od); + kernel_grid_vars.push_back(oh); + kernel_grid_vars.push_back(ow_tile.grid_idx()); + 
kernel_grid_vars.push_back(g_tile.grid_idx()); + kernel_grid_vars.push_back(mb_tile.grid_idx()); + bind_kernel_grid(gemm_schedule, kernel_grid_vars); + bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); bind_thread_group_grid_idx(cfg_, gemm_schedule, mb_ow_tg_idx); bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); @@ -488,18 +460,16 @@ void init_bwd_d(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, auto iw_tile = create_tile(gemm_schedule, cfg_, iw); auto oc_tile = create_tile(gemm_schedule, cfg_, oc); - expr_t g_isp_grid_idx; - if (set_g_grid_idx_innermost(cfg_)) { - g_isp_grid_idx = gemm_schedule.fuse( - {id, ih, iw_tile.grid_idx(), g_tile.grid_idx()}); - } else { - g_isp_grid_idx = gemm_schedule.fuse( - {g_tile.grid_idx(), id, ih, iw_tile.grid_idx()}); - } + std::vector kernel_grid_vars; + kernel_grid_vars.push_back(ic_tile.grid_idx()); + kernel_grid_vars.push_back(id); + kernel_grid_vars.push_back(ih); + kernel_grid_vars.push_back(iw_tile.grid_idx()); + kernel_grid_vars.push_back(g_tile.grid_idx()); + kernel_grid_vars.push_back(mb_tile.grid_idx()); + bind_kernel_grid(gemm_schedule, kernel_grid_vars); + auto mb_iw_tg_idx = gemm_schedule.fuse(mb_tile.tg_idx(), iw_tile.tg_idx()); - bind_kernel_grid_idx(cfg_, gemm_schedule, ic_tile.grid_idx()); - bind_kernel_grid_idx(cfg_, gemm_schedule, g_isp_grid_idx); - bind_kernel_grid_idx(cfg_, gemm_schedule, mb_tile.grid_idx()); bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); bind_thread_group_grid_idx(cfg_, gemm_schedule, mb_iw_tg_idx); bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); @@ -683,22 +653,19 @@ void init_bwd_w(const conv_config_t &cfg_, gemm_schedule_t &gemm_schedule, auto ow_tile = create_tile(gemm_schedule, cfg_, ow); auto kw_tile = create_tile(gemm_schedule, cfg_, kw); - auto osp_ksp_ic_grid_idx = gemm_schedule.fuse( - {od_tile.grid_idx(), oh_tile.grid_idx(), ow_tile.grid_idx(), kd, kh, - kw_tile.grid_idx(), ic_tile.grid_idx()}); 
- - expr_t g_mb_grid_idx; - if (set_g_grid_idx_innermost(cfg_)) { - g_mb_grid_idx - = gemm_schedule.fuse({mb_tile.grid_idx(), g_tile.grid_idx()}); - } else { - g_mb_grid_idx - = gemm_schedule.fuse({g_tile.grid_idx(), mb_tile.grid_idx()}); - } + std::vector kernel_grid_vars; + kernel_grid_vars.push_back(oc_tile.grid_idx()); + kernel_grid_vars.push_back(od_tile.grid_idx()); + kernel_grid_vars.push_back(oh_tile.grid_idx()); + kernel_grid_vars.push_back(ow_tile.grid_idx()); + kernel_grid_vars.push_back(kd); + kernel_grid_vars.push_back(kh); + kernel_grid_vars.push_back(kw_tile.grid_idx()); + kernel_grid_vars.push_back(ic_tile.grid_idx()); + kernel_grid_vars.push_back(mb_tile.grid_idx()); + kernel_grid_vars.push_back(g_tile.grid_idx()); + bind_kernel_grid(gemm_schedule, kernel_grid_vars); - bind_kernel_grid_idx(cfg_, gemm_schedule, oc_tile.grid_idx()); - bind_kernel_grid_idx(cfg_, gemm_schedule, osp_ksp_ic_grid_idx); - bind_kernel_grid_idx(cfg_, gemm_schedule, g_mb_grid_idx); bind_thread_group_grid_idx(cfg_, gemm_schedule, oc_tile.tg_idx()); bind_thread_group_grid_idx(cfg_, gemm_schedule, ic_tile.tg_idx()); @@ -2416,8 +2383,8 @@ class plan_builder_t { auto &init_cset = plan.init_cset; auto &gemm_schedule = plan.gemm_schedule; - gemm_schedule = gemm_schedule_t( - init_cset, cfg_.kernel_grid(), cfg_.thread_group_grid()); + gemm_schedule = gemm_schedule_t(init_cset, cfg_.kernel_grid(), + cfg_.thread_group_grid(), cfg_.walk_order()); view_t a_view; view_t b_view; view_t c_view; diff --git a/src/gpu/intel/jit/conv/tiler.cpp b/src/gpu/intel/jit/conv/tiler.cpp index b70fab1fbe9..9923b2e5258 100644 --- a/src/gpu/intel/jit/conv/tiler.cpp +++ b/src/gpu/intel/jit/conv/tiler.cpp @@ -1528,6 +1528,7 @@ class conv_tiler_impl_t { void maybe_try_small_grf(conv_config_t &cfg) { auto try_cfg = cfg; + init_walk_order(try_cfg); init_kernel_grid(try_cfg); init_thread_group_grid(try_cfg); int kg_elems = try_cfg.kernel_grid().elems(), diff --git a/src/gpu/intel/jit/ir/gemm_schedule.hpp 
b/src/gpu/intel/jit/ir/gemm_schedule.hpp index bc641497551..72f5e218a5f 100644 --- a/src/gpu/intel/jit/ir/gemm_schedule.hpp +++ b/src/gpu/intel/jit/ir/gemm_schedule.hpp @@ -27,6 +27,7 @@ #include "gpu/intel/jit/ir/ir.hpp" #include "gpu/intel/jit/ir/tensor.hpp" +#include "gpu/intel/jit/ir/walk_order.hpp" #include "gpu/intel/jit/utils/utils.hpp" namespace dnnl { @@ -411,11 +412,18 @@ class gemm_schedule_t { gemm_schedule_t() = default; gemm_schedule_t(constraint_set_t &cset, const grid_info_t &kernel_grid, - const grid_info_t &tg_grid) - : cset_(&cset), kernel_grid_(kernel_grid), tg_grid_(tg_grid) {} + const grid_info_t &tg_grid, + const walk_order_t &kernel_grid_walk_order = {}) + : cset_(&cset) + , kernel_grid_(kernel_grid) + , tg_grid_(tg_grid) + , kernel_grid_walk_order_(kernel_grid_walk_order) {} const grid_info_t &kernel_grid() const { return kernel_grid_; } const grid_info_t &tg_grid() const { return tg_grid_; } + const walk_order_t &kernel_grid_walk_order() const { + return kernel_grid_walk_order_; + } bmnk_kind_t bmnk_kind(const expr_t &var) const { return bmnk_kind(std::vector({var})); @@ -936,6 +944,8 @@ class gemm_schedule_t { for (int i = 0; i < tg_grid_.ndims(); i++) { if (tg_grid_.idx(i).is_same(v)) return loop_kind_t::tg_grid; } + if (kernel_grid_walk_order_.is_grid_var(v)) + return loop_kind_t::kernel_grid; ir_error_not_expected() << "Unknown external variable: " << v; return loop_kind_t::undef; } @@ -947,6 +957,8 @@ class gemm_schedule_t { for (int i = 0; i < tg_grid_.ndims(); i++) { if (tg_grid_.idx(i).is_same(v)) return tg_grid_.dim(i); } + if (kernel_grid_walk_order_.is_grid_var(v)) + return kernel_grid_walk_order_.dim_size(v); ir_error_not_expected() << "Unknown external variable: " << v; return -1; } @@ -1120,6 +1132,7 @@ class gemm_schedule_t { constraint_set_t *cset_ = nullptr; grid_info_t kernel_grid_; grid_info_t tg_grid_; + walk_order_t kernel_grid_walk_order_; // Loop indices, ordered from outermost to innermost. 
std::vector vars_; diff --git a/src/gpu/intel/jit/ir/walk_order.hpp b/src/gpu/intel/jit/ir/walk_order.hpp new file mode 100644 index 00000000000..fdf7d519772 --- /dev/null +++ b/src/gpu/intel/jit/ir/walk_order.hpp @@ -0,0 +1,186 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_INTEL_JIT_IR_WALK_ORDER_HPP +#define GPU_INTEL_JIT_IR_WALK_ORDER_HPP + +#include "gpu/intel/jit/ir/problem.hpp" +#include "gpu/intel/jit/utils/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace jit { + +// Represents blocked kernel grid walk order, together with assignments to +// X/Y/Z grid IDs (0, 1, 2 indices). 
+class walk_order_t { +public: + struct block_t { + block_t() = default; + block_t(const prb_dim_t &dim, int size, int grid_id) + : dim(dim), size(size), grid_id(grid_id) {} + prb_dim_t dim; + int size = 0; + int grid_id = -1; + }; + + struct dim_info_t { + dim_info_t() = default; + dim_info_t(const prb_dim_t &dim, int size) : dim(dim), size(size) { + grid_var = var_t::make(type_t::s32(), dim.str() + "_grid_var"); + } + + prb_dim_t dim; + int size = 0; + expr_t grid_var; + }; + + walk_order_t() = default; + walk_order_t(const std::string &s) { + auto parts = gpu_utils::split(s, ","); + ir_assert(parts.size() <= 3); + for (int i = 0; i < (int)parts.size(); i++) { + for (auto &kv : ir_utils::to_string_int_pairs(parts[i])) { + add(prb_dim_t::from_name(kv.first), kv.second, i); + } + } + } + + void add(const prb_dim_t &dim, int block_size, int grid_id) { + blocks_.emplace_back(dim, block_size, grid_id); + } + + const std::vector &blocks() const { return blocks_; } + const std::vector &dim_infos() const { return dim_infos_; } + + bool has(const prb_dim_t &dim) const { + for (auto &info : dim_infos_) { + if (info.dim == dim) return true; + } + return false; + } + + bool is_blocked(int id) const { + for (auto &info : dim_infos_) { + if (grid_id(info.dim) != id) continue; + int count = 0; + for (auto &b : blocks_) { + if (b.dim != info.dim) continue; + count += 1; + } + if (count > 1) return true; + } + return false; + } + + std::vector grid_dims(int id) const { + std::vector ret; + for (auto &info : dim_infos_) { + if (grid_id(info.dim) == id) ret.push_back(info.dim); + } + return ret; + } + + int grid_id(const prb_dim_t &dim) const { + int id = -1; + for (auto &b : blocks_) { + if (b.dim != dim) continue; + if (id == -1) id = b.grid_id; + ir_assert(b.grid_id == id); + } + ir_assert(id != -1); + return id; + } + + expr_t grid_var(const prb_dim_t &dim) const { + for (auto &info : dim_infos_) { + if (info.dim == dim) return info.grid_var; + } + ir_error_not_expected() << 
"Grid variable not found: " << dim; + return expr_t(); + } + + int dim_size(const expr_t &grid_var) const { + for (auto &info : dim_infos_) { + if (info.grid_var.is_same(grid_var)) return info.size; + } + ir_error_not_expected() << "Grid variable not found: " << grid_var; + return -1; + } + + int dim_size(const prb_dim_t &dim) const { return dim_size(grid_var(dim)); } + + bool is_grid_var(const expr_t &grid_var) const { + for (auto &info : dim_infos_) { + if (info.grid_var.is_same(grid_var)) return true; + } + return false; + } + + void finalize(const prb_tile_t &grid_tile) { + for (auto &d : grid_tile) { + int inner_block = 1; + for (auto &b : blocks_) { + if (b.dim == d) inner_block *= b.size; + } + int outer = utils::div_up(grid_tile[d], inner_block); + int id = (inner_block != 1 ? grid_id(d) : 0); + dim_infos_.emplace_back(d, grid_tile[d]); + if (outer != 1) add(d, outer, id); + } + for (auto &info : dim_infos_) { + int nblocks = 0; + block_t *first_block = nullptr; + for (auto &b : blocks_) { + if (b.dim != info.dim) continue; + if (!first_block) first_block = &b; + nblocks++; + } + if (nblocks == 1) { + first_block->size = std::min(first_block->size, info.size); + } + } + } + + std::string str() const { + std::ostringstream oss; + for (int id = 0; id < 3; id++) { + if (id != 0) oss << ","; + for (auto &b : blocks_) { + if (b.grid_id != id) continue; + oss << b.dim << b.size; + } + } + return oss.str(); + } + + IR_DEFINE_DUMP() + +private: + // Ordered from innermost to outermost. 
+ std::vector blocks_; + std::vector dim_infos_; +}; + +} // namespace jit +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif From 6e248387e6de095dd7c6945be71220359b0f7378 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 7 May 2024 08:22:51 -0700 Subject: [PATCH 065/187] common, gpu: introduce engine impls --- src/common/engine.hpp | 36 ++++----- src/common/engine_impl.hpp | 72 +++++++++++++++++ src/cpu/cpu_engine.hpp | 6 +- src/gpu/intel/compute/compute_engine.hpp | 5 +- src/gpu/intel/ocl/ocl_gpu_engine.cpp | 41 ++-------- src/gpu/intel/ocl/ocl_gpu_engine.hpp | 32 +++----- src/gpu/intel/ocl/ocl_utils.cpp | 1 + src/sycl/sycl_engine_base.hpp | 57 ++++---------- src/xpu/ocl/engine_impl.hpp | 98 ++++++++++++++++++++++++ src/xpu/sycl/engine_impl.hpp | 93 ++++++++++++++++++++++ 10 files changed, 317 insertions(+), 124 deletions(-) create mode 100644 src/common/engine_impl.hpp create mode 100644 src/xpu/ocl/engine_impl.hpp create mode 100644 src/xpu/sycl/engine_impl.hpp diff --git a/src/common/engine.hpp b/src/common/engine.hpp index f71fe674e88..ceffa54a4fd 100644 --- a/src/common/engine.hpp +++ b/src/common/engine.hpp @@ -24,6 +24,7 @@ #endif #include "c_types_map.hpp" +#include "common/engine_impl.hpp" #include "engine_id.hpp" #include "memory.hpp" #include "memory_storage.hpp" @@ -49,21 +50,18 @@ * - Provide engine specific primitive_desc_t creators */ struct dnnl_engine : public dnnl::impl::c_compatible { - dnnl_engine(dnnl::impl::engine_kind_t kind, - dnnl::impl::runtime_kind_t runtime_kind, size_t index) - : kind_(kind) - , runtime_kind_(runtime_kind) - , index_(index) - , counter_(1) {} + dnnl_engine(dnnl::impl::engine_impl_t *impl) : impl_(impl), counter_(1) {} /** get kind of the current engine */ - dnnl::impl::engine_kind_t kind() const { return kind_; } + dnnl::impl::engine_kind_t kind() const { return impl()->kind(); } /** get the runtime kind of the current engine */ - dnnl::impl::runtime_kind_t 
runtime_kind() const { return runtime_kind_; } + dnnl::impl::runtime_kind_t runtime_kind() const { + return impl()->runtime_kind(); + } /** get index of the current engine */ - size_t index() const { return index_; } + size_t index() const { return impl()->index(); } virtual dnnl::impl::device_id_t device_id() const = 0; @@ -130,11 +128,14 @@ struct dnnl_engine : public dnnl::impl::c_compatible { virtual bool mayiuse_f16_accumulator_with_f16() const { return false; } + const dnnl::impl::engine_impl_t *impl() const { return impl_.get(); } + #ifdef ONEDNN_BUILD_GRAPH - /** only used in graph implementation **/ - void *get_allocator() const { return (void *)(&allocator_); }; + // only used in graph implementation + void *get_allocator() const { return impl()->get_allocator(); } + // TODO: consider moving it to constructor. void set_allocator(dnnl::impl::graph::allocator_t *alloc) { - allocator_ = *alloc; + impl_->set_allocator(alloc); } #endif @@ -145,18 +146,11 @@ struct dnnl_engine : public dnnl::impl::c_compatible { } protected: - dnnl::impl::engine_kind_t kind_; - dnnl::impl::runtime_kind_t runtime_kind_; - size_t index_; - -#ifdef ONEDNN_BUILD_GRAPH - /** only used in graph implementation **/ - dnnl::impl::graph::allocator_t allocator_; -#endif - + dnnl::impl::status_t init_impl() { return impl_->init(); } virtual ~dnnl_engine() = default; private: + std::unique_ptr impl_; std::atomic counter_; }; diff --git a/src/common/engine_impl.hpp b/src/common/engine_impl.hpp new file mode 100644 index 00000000000..8c6a910a00b --- /dev/null +++ b/src/common/engine_impl.hpp @@ -0,0 +1,72 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef COMMON_ENGINE_IMPL_HPP +#define COMMON_ENGINE_IMPL_HPP + +#include "common/c_types_map.hpp" +#include "common/utils.hpp" + +#ifdef ONEDNN_BUILD_GRAPH +#include "graph/interface/allocator.hpp" +#endif + +#define VERROR_ENGINE_IMPL(cond, stat, msg, ...) \ + do { \ + if (!(cond)) { \ + VERROR(common, runtime, msg, ##__VA_ARGS__); \ + return stat; \ + } \ + } while (0) + +namespace dnnl { +namespace impl { + +class engine_impl_t { +public: + engine_impl_t() = delete; + engine_impl_t(engine_kind_t kind, runtime_kind_t runtime_kind, size_t index) + : kind_(kind), runtime_kind_(runtime_kind), index_(index) {} + + virtual ~engine_impl_t() = default; + + engine_kind_t kind() const { return kind_; } + runtime_kind_t runtime_kind() const { return runtime_kind_; } + size_t index() const { return index_; } + +#ifdef ONEDNN_BUILD_GRAPH + void *get_allocator() const { return (void *)(&allocator_); }; + void set_allocator(graph::allocator_t *alloc) { allocator_ = *alloc; } +#endif + + virtual status_t init() { return status::success; } + +private: + DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t) + + engine_kind_t kind_; + runtime_kind_t runtime_kind_; + size_t index_; + +#ifdef ONEDNN_BUILD_GRAPH + graph::allocator_t allocator_; +#endif +}; + +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp index 5a17f07ddb3..3de6ca24ad3 100644 --- a/src/cpu/cpu_engine.hpp +++ b/src/cpu/cpu_engine.hpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2016-2023 Intel Corporation +* Copyright 2016-2024 Intel Corporation * Copyright 2020-2023 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -115,7 +115,9 @@ class cpu_engine_impl_list_t { class cpu_engine_t : public engine_t { public: - cpu_engine_t() : engine_t(engine_kind::cpu, get_cpu_native_runtime(), 0) {} + cpu_engine_t() + : engine_t(new impl::engine_impl_t( + engine_kind::cpu, get_cpu_native_runtime(), 0)) {} /* implementation part */ diff --git a/src/gpu/intel/compute/compute_engine.hpp b/src/gpu/intel/compute/compute_engine.hpp index fcbf9289197..368d563f5f4 100644 --- a/src/gpu/intel/compute/compute_engine.hpp +++ b/src/gpu/intel/compute/compute_engine.hpp @@ -23,6 +23,7 @@ #include #include "common/c_types_map.hpp" +#include "common/engine_impl.hpp" #include "common/primitive.hpp" #include "common/primitive_desc_iterator.hpp" #include "common/resource.hpp" @@ -43,9 +44,7 @@ namespace compute { class compute_engine_t : public engine_t { public: - compute_engine_t( - engine_kind_t kind, runtime_kind_t runtime_kind, size_t index) - : engine_t(kind, runtime_kind, index) {} + compute_engine_t(impl::engine_impl_t *impl) : engine_t(impl) {} virtual status_t init(); status_t init(const std::vector &cache_blob); diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.cpp b/src/gpu/intel/ocl/ocl_gpu_engine.cpp index 2b8bc720247..f1cc5a0cb7e 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.cpp @@ -58,37 +58,8 @@ status_t ocl_gpu_engine_t::init() { } status_t ocl_gpu_engine_t::init(const std::vector &cache_blob) { - cl_int err = CL_SUCCESS; - err = clGetDeviceInfo(device_, CL_DEVICE_PLATFORM, sizeof(platform_), - &platform_, nullptr); - if (err != CL_SUCCESS) { - device_ = nullptr; - context_ = nullptr; - } - - OCL_CHECK(err); - - err = clRetainDevice(device_); - if (err != CL_SUCCESS) { - device_ 
= nullptr; - context_ = nullptr; - } - - OCL_CHECK(err); - - if (is_user_context_) { - err = clRetainContext(context_); - if (err != CL_SUCCESS) context_ = nullptr; - } else { - context_ - = clCreateContext(nullptr, 1, &device_, nullptr, nullptr, &err); - } - - OCL_CHECK(err); - - CHECK(xpu::ocl::check_device(engine_kind::gpu, device_, context_)); - compute::compute_engine_t::init(cache_blob); - + CHECK(init_impl()); + CHECK(compute::compute_engine_t::init(cache_blob)); return status::success; } @@ -381,13 +352,13 @@ status_t ocl_gpu_engine_t::init_device_info( status_t ocl_gpu_engine_t::serialize_device( serialization_stream_t &sstream) const { size_t platform_name_len; - cl_int err = clGetPlatformInfo( - platform_, CL_PLATFORM_NAME, 0, nullptr, &platform_name_len); + cl_int err = clGetPlatformInfo(impl()->platform(), CL_PLATFORM_NAME, 0, + nullptr, &platform_name_len); OCL_CHECK(err); std::vector platform_name(platform_name_len); - err = clGetPlatformInfo(platform_, CL_PLATFORM_NAME, platform_name.size(), - platform_name.data(), nullptr); + err = clGetPlatformInfo(impl()->platform(), CL_PLATFORM_NAME, + platform_name.size(), platform_name.data(), nullptr); OCL_CHECK(err); sstream.write(platform_name.data(), platform_name.size()); diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.hpp b/src/gpu/intel/ocl/ocl_gpu_engine.hpp index 7cb6882f705..16c5b897791 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.hpp @@ -23,6 +23,7 @@ #include "gpu/intel/compute/compute_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_engine_id.hpp" #include "gpu/intel/ocl/ocl_utils.hpp" +#include "xpu/ocl/engine_impl.hpp" #include "xpu/utils.hpp" namespace dnnl { @@ -34,10 +35,8 @@ namespace ocl { class ocl_gpu_engine_t : public compute::compute_engine_t { public: ocl_gpu_engine_t(cl_device_id adevice, cl_context acontext, size_t index) - : compute::compute_engine_t(engine_kind::gpu, runtime_kind::ocl, index) - , device_(adevice) - , context_(acontext) - , 
is_user_context_(acontext) {} + : compute::compute_engine_t( + new xpu::ocl::engine_impl_t(adevice, acontext, index)) {} status_t init() override; status_t init(const std::vector &cache_blob); @@ -94,13 +93,11 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { return gpu_impl_list_t::get_implementation_list(desc); } - cl_device_id device() const { return device_; } - cl_context context() const { return context_; } - cl_platform_id platform() const { return platform_; } + cl_device_id device() const { return impl()->device(); } + cl_context context() const { return impl()->context(); } + cl_platform_id platform() const { return impl()->platform(); } - device_id_t device_id() const override { - return std::make_tuple(0, reinterpret_cast(device()), 0); - } + device_id_t device_id() const override { return impl()->device_id(); } status_t serialize_device(serialization_stream_t &sstream) const override; @@ -118,23 +115,18 @@ class ocl_gpu_engine_t : public compute::compute_engine_t { } protected: + const xpu::ocl::engine_impl_t *impl() const { + return (const xpu::ocl::engine_impl_t *)engine_t::impl(); + } + status_t build_program_from_source(xpu::ocl::wrapper_t &program, const char *code_string, const compute::kernel_ctx_t &kernel_ctx) const; - ~ocl_gpu_engine_t() override { - if (device_) { clReleaseDevice(device_); } - if (context_) { clReleaseContext(context_); } - } + ~ocl_gpu_engine_t() override = default; status_t init_device_info() override; status_t init_device_info(const std::vector &cache_blob) override; - -private: - cl_device_id device_; - cl_context context_; - cl_platform_id platform_ = nullptr; - bool is_user_context_; }; } // namespace ocl diff --git a/src/gpu/intel/ocl/ocl_utils.cpp b/src/gpu/intel/ocl/ocl_utils.cpp index e1538766651..7fe2dccc905 100644 --- a/src/gpu/intel/ocl/ocl_utils.cpp +++ b/src/gpu/intel/ocl/ocl_utils.cpp @@ -22,6 +22,7 @@ #include "gpu/intel/ocl/ocl_gpu_engine.hpp" #include "gpu/intel/ocl/ocl_gpu_kernel.hpp" #include 
"gpu/intel/ocl/ocl_utils.hpp" +#include "xpu/ocl/utils.hpp" #ifndef CL_KERNEL_BINARY_PROGRAM_INTEL #define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index ff0f10867db..8a0fd1a4775 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -29,6 +29,7 @@ #include "gpu/intel/sycl/utils.hpp" #include "gpu/sycl/sycl_interop_gpu_kernel.hpp" #include "xpu/sycl/engine_id.hpp" +#include "xpu/sycl/engine_impl.hpp" namespace dnnl { namespace impl { @@ -38,21 +39,11 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { public: sycl_engine_base_t(engine_kind_t kind, const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) - : gpu::intel::compute::compute_engine_t(kind, runtime_kind::sycl, index) - , device_(dev) - , context_(ctx) - , backend_(xpu::sycl::backend_t::unknown) {} + : gpu::intel::compute::compute_engine_t(new xpu::sycl::engine_impl_t( + engine_kind::gpu, dev, ctx, index)) {} status_t init() override { - backend_ = xpu::sycl::get_backend(device_); - VERROR_ENGINE(utils::one_of(backend_, xpu::sycl::backend_t::host, - xpu::sycl::backend_t::opencl, - xpu::sycl::backend_t::level0, - xpu::sycl::backend_t::nvidia, - xpu::sycl::backend_t::amd), - status::invalid_arguments, VERBOSE_UNSUPPORTED_BACKEND, "sycl"); - - CHECK(xpu::sycl::check_device(kind(), device_, context_)); + CHECK(init_impl()); CHECK(gpu::intel::compute::compute_engine_t::init()); return status::success; @@ -163,33 +154,15 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { return status::success; } - const ::sycl::device &device() const { return device_; } - const ::sycl::context &context() const { return context_; } + const ::sycl::device &device() const { return impl()->device(); } + const ::sycl::context &context() const { return impl()->context(); } - xpu::sycl::backend_t backend() const { return backend_; } + xpu::sycl::backend_t backend() const { 
return impl()->backend(); } - cl_device_id ocl_device() const { - if (backend() != xpu::sycl::backend_t::opencl) { - assert(!"not expected"); - return nullptr; - } - assert(device_.is_cpu() || device_.is_gpu()); - return xpu::ocl::make_wrapper( - xpu::sycl::compat::get_native(device())); - } - cl_context ocl_context() const { - if (backend() != xpu::sycl::backend_t::opencl) { - assert(!"not expected"); - return nullptr; - } - assert(device_.is_cpu() || device_.is_gpu()); - return xpu::ocl::make_wrapper( - xpu::sycl::compat::get_native(context())); - } + cl_device_id ocl_device() const { return impl()->ocl_device(); } + cl_context ocl_context() const { return impl()->ocl_context(); } - device_id_t device_id() const override { - return xpu::sycl::device_id(device_); - } + device_id_t device_id() const override { return impl()->device_id(); } engine_id_t engine_id() const override { return engine_id_t(new xpu::sycl::engine_id_impl_t( @@ -197,14 +170,12 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { } protected: + const xpu::sycl::engine_impl_t *impl() const { + return (const xpu::sycl::engine_impl_t *)engine_t::impl(); + } + ~sycl_engine_base_t() override = default; status_t init_device_info() override; - -private: - ::sycl::device device_; - ::sycl::context context_; - - xpu::sycl::backend_t backend_; }; } // namespace sycl diff --git a/src/xpu/ocl/engine_impl.hpp b/src/xpu/ocl/engine_impl.hpp new file mode 100644 index 00000000000..121086d631d --- /dev/null +++ b/src/xpu/ocl/engine_impl.hpp @@ -0,0 +1,98 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef XPU_OCL_ENGINE_IMPL_HPP +#define XPU_OCL_ENGINE_IMPL_HPP + +#include +#include + +#include "common/engine_impl.hpp" +#include "common/utils.hpp" + +#include "xpu/ocl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace xpu { +namespace ocl { + +class engine_impl_t : public impl::engine_impl_t { +public: + engine_impl_t() = delete; + engine_impl_t(cl_device_id device, cl_context context, size_t index) + : impl::engine_impl_t(engine_kind::gpu, runtime_kind::ocl, index) + , device_(device) + , context_(context) + , is_user_context_(context) {} + + ~engine_impl_t() override = default; + + status_t init() override { + cl_int err = CL_SUCCESS; + err = clGetDeviceInfo(device(), CL_DEVICE_PLATFORM, sizeof(platform_), + &platform_, nullptr); + if (err != CL_SUCCESS) { + device_ = nullptr; + context_ = nullptr; + } + + OCL_CHECK(err); + + err = clRetainDevice(device()); + if (err != CL_SUCCESS) { + device_ = nullptr; + context_ = nullptr; + } + + OCL_CHECK(err); + + if (is_user_context_) { + err = clRetainContext(context()); + if (err != CL_SUCCESS) context_ = nullptr; + } else { + context_ = clCreateContext( + nullptr, 1, &device_.unwrap(), nullptr, nullptr, &err); + } + + OCL_CHECK(err); + + CHECK(check_device(engine_kind::gpu, device(), context())); + + return status::success; + } + + cl_device_id device() const { return device_; } + cl_context context() const { return context_; } + cl_platform_id platform() const { return platform_; } + + device_id_t device_id() 
const { + return std::make_tuple(0, reinterpret_cast(device()), 0); + } + +private: + xpu::ocl::wrapper_t device_; + xpu::ocl::wrapper_t context_; + cl_platform_id platform_ = nullptr; + bool is_user_context_; +}; + +} // namespace ocl +} // namespace xpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/xpu/sycl/engine_impl.hpp b/src/xpu/sycl/engine_impl.hpp new file mode 100644 index 00000000000..194195de05b --- /dev/null +++ b/src/xpu/sycl/engine_impl.hpp @@ -0,0 +1,93 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef XPU_SYCL_ENGINE_IMPL_HPP +#define XPU_SYCL_ENGINE_IMPL_HPP + +#include "common/engine_impl.hpp" +#include "common/utils.hpp" + +#include "xpu/sycl/compat.hpp" +#include "xpu/sycl/utils.hpp" + +namespace dnnl { +namespace impl { +namespace xpu { +namespace sycl { + +class engine_impl_t : public impl::engine_impl_t { +public: + engine_impl_t() = delete; + engine_impl_t(engine_kind_t kind, const ::sycl::device &device, + const ::sycl::context &context, size_t index) + : impl::engine_impl_t(kind, runtime_kind::sycl, index) + , device_(device) + , context_(context) + , backend_(backend_t::unknown) {} + + ~engine_impl_t() override = default; + + status_t init() override { + backend_ = xpu::sycl::get_backend(device_); + VERROR_ENGINE_IMPL( + utils::one_of(backend_, backend_t::host, backend_t::opencl, + backend_t::level0, backend_t::nvidia, backend_t::amd), + status::invalid_arguments, VERBOSE_UNSUPPORTED_BACKEND, "sycl"); + + CHECK(check_device(kind(), device_, context_)); + return status::success; + } + + const ::sycl::device &device() const { return device_; } + const ::sycl::context &context() const { return context_; } + + backend_t backend() const { return backend_; } + + cl_device_id ocl_device() const { + if (backend() != backend_t::opencl) { + assert(!"not expected"); + return nullptr; + } + assert(device_.is_cpu() || device_.is_gpu()); + return xpu::ocl::make_wrapper( + compat::get_native(device())); + } + + cl_context ocl_context() const { + if (backend() != backend_t::opencl) { + assert(!"not expected"); + return nullptr; + } + assert(device_.is_cpu() || device_.is_gpu()); + return xpu::ocl::make_wrapper( + compat::get_native(context())); + } + + device_id_t device_id() const { return xpu::sycl::device_id(device_); } + +private: + ::sycl::device device_; + ::sycl::context context_; + + backend_t backend_; +}; + +} // namespace sycl +} // namespace xpu +} // namespace 
impl +} // namespace dnnl + +#endif From 5a9da81fa93d4342fd0ed9ce64977a7a6aeaa9f9 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 16 Apr 2024 22:39:54 -0700 Subject: [PATCH 066/187] cpu: x64: brgemm_container: fix incorrect implicit status conversion Always returned `true`, either because of success, or because of converting error status with value > 0 into `true`. --- src/cpu/x64/brgemm/brgemm.cpp | 5 ++--- src/cpu/x64/brgemm/brgemm.hpp | 14 ++++++++++---- src/cpu/x64/brgemm/brgemm_containers.cpp | 4 +++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/cpu/x64/brgemm/brgemm.cpp b/src/cpu/x64/brgemm/brgemm.cpp index 828248d14da..8e53294f295 100644 --- a/src/cpu/x64/brgemm/brgemm.cpp +++ b/src/cpu/x64/brgemm/brgemm.cpp @@ -617,8 +617,6 @@ status_t brgemm_kernel_destroy(brgemm_kernel_t *brg_kernel) { } status_t brgemm_init_tiles(const brgemm_desc_t &brg, char palette[64]) { - constexpr int max_palette_size_in_bytes = 64; - if (!brg.is_tmm) return status::unimplemented; //TODO: Add support of tail processing by reduction dimension @@ -629,6 +627,7 @@ status_t brgemm_init_tiles(const brgemm_desc_t &brg, char palette[64]) { palette_config_t *buff = (palette_config_t *)(palette); char *_tc = (char *)(buff); + static constexpr int max_palette_size_in_bytes = 64; for (int i = 0; i < max_palette_size_in_bytes; i++) _tc[i] = 0; @@ -646,7 +645,7 @@ status_t brgemm_init_tiles(const brgemm_desc_t &brg, char palette[64]) { if (brg.get_num_A_tiles() + brg.get_num_B_tiles() + brg.get_num_C_tiles() > brgemm_desc_t::AMX_TILES_NUM) { assert(!"brgemm internal error: invalid blocking"); - return status::unimplemented; + return status::runtime_error; } // Due to interleaving tileload/tmul we don't support blocking 1x6 and 6x1 diff --git a/src/cpu/x64/brgemm/brgemm.hpp b/src/cpu/x64/brgemm/brgemm.hpp index c93cd5fb4c8..630520a3656 100644 --- a/src/cpu/x64/brgemm/brgemm.hpp +++ b/src/cpu/x64/brgemm/brgemm.hpp @@ -253,16 +253,22 @@ void DNNL_API 
brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, /// AMX utilities: Creates a palette based on BRGEMM descriptor /// /// @note -/// This call expects brgemm_desc_t object completely set up, thus, used after -/// `brgemm_desc_set_attr` call for non-empty attributes. +/// This call expects brgemm_desc_t object completely set up, thus, must be +/// used after `brgemm_desc_set_attr` call for non-empty attributes. /// /// @note /// Caller is expected to subsequently configure AMX tiles by calling /// amx_tile_configure(palette). /// -/// @param brg BRGEMM descriptor -/// @param palette 64 bytes array contains tiles configuration +/// @param brg Input BRGeMM descriptor +/// @param palette Output 64 bytes array initialized with tile configuration if +/// returned status is status::success. When any other status is returned, +/// the `palette` is not initialized and can't be used. /// +/// TODO: replace `char[64]` with a proper type that can express whether it +/// was properly initialized and whether it's empty. The current API is broken +/// in the sense that several distinct failure scenarios are indistinguishable: +/// not AMX, or blocking that is completely broken or unsupported. 
status_t DNNL_API brgemm_init_tiles(const brgemm_desc_t &brg, char palette[64]); } // namespace x64 diff --git a/src/cpu/x64/brgemm/brgemm_containers.cpp b/src/cpu/x64/brgemm/brgemm_containers.cpp index 6860040a9a6..b0a3d047d43 100644 --- a/src/cpu/x64/brgemm/brgemm_containers.cpp +++ b/src/cpu/x64/brgemm/brgemm_containers.cpp @@ -123,7 +123,9 @@ status_t brgemm_kernel_container_t::insert(int idx, const brgemm_desc_t *brg) { bool brgemm_palette_container_t::insert(int idx, const brgemm_desc_t *brg) { S_t kernel_palette; - CHECK(brgemm_init_tiles(*brg, kernel_palette.data())); + auto status = brgemm_init_tiles(*brg, kernel_palette.data()); + if (status != status::success) return false; + const auto ret = set_.insert(kernel_palette); refs_[idx] = &(*ret.first); return ret.second; From 38347a4e74f8854c8b249375737983096fae5620 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 8 Apr 2024 18:24:24 -0700 Subject: [PATCH 067/187] cpu: x64: matmul: consolidate init logic into function to reuse it --- src/cpu/x64/matmul/brgemm_matmul_reorders.cpp | 147 +++++++++--------- src/cpu/x64/matmul/brgemm_matmul_utils.cpp | 43 +++++ src/cpu/x64/matmul/brgemm_matmul_utils.hpp | 6 + 3 files changed, 125 insertions(+), 71 deletions(-) diff --git a/src/cpu/x64/matmul/brgemm_matmul_reorders.cpp b/src/cpu/x64/matmul/brgemm_matmul_reorders.cpp index bca9a87265e..86201703ca3 100644 --- a/src/cpu/x64/matmul/brgemm_matmul_reorders.cpp +++ b/src/cpu/x64/matmul/brgemm_matmul_reorders.cpp @@ -23,10 +23,81 @@ namespace impl { namespace cpu { namespace x64 { +using namespace format_tag; + +format_tag_t get_otag(const memory_desc_t &dst_md) { + + const memory_desc_wrapper od(dst_md); + const auto vnni_granularity = data_type_vnni_granularity(od.data_type()); + + format_tag_t otag = format_tag::undef; + switch (vnni_granularity) { + case 4: + otag = od.matches_one_of_tag(aCB16b64c4b, BA16a64b4a, aCB16b48c4b, + BA16a48b4a, aCB16b32c4b, BA16a32b4a, aCB16b16c4b, + BA16a16b4a); + break; + case 2: 
+ otag = od.matches_one_of_tag(aCB16b64c2b, BA16a64b2a, aCB16b48c2b, + BA16a48b2a, aCB16b32c2b, BA16a32b2a, aCB16b16c2b, + BA16a16b2a); + break; + case 1: + otag = od.matches_one_of_tag(aCB16b64c, BA16a64b, aCB16b48c, + BA16a48b, aCB16b32c, BA16a32b, aCB16b16c, BA16a16b); + break; + default: otag = format_tag::undef; + } + return otag; +} + +// This function initializes all required fields in the conf object to generate +// copy_b kernel. +// This particular call relies on memory descriptors and is used in this +// implementation. The sub-call is reduced to simplified objects and is used in +// BRGeMM public API implementation for copy routines. +// +// Note: this version has some extra definitions that are available in memory +// descriptors only. +status_t init_conf(matmul::brgemm_matmul_conf_t &conf, + const memory_desc_t &src_md, const memory_desc_t &dst_md) { + const memory_desc_wrapper id(src_md), od(dst_md); + const int ndims = id.ndims(); + const auto &dims = id.dims(); + const auto type_i = id.data_type(); + const auto type_o = od.data_type(); + + const bool is_bf16_with_int_wei = type_o == data_type::bf16 + && utils::one_of(type_i, data_type::s8, data_type::u8); + + format_tag_t otag = get_otag(dst_md); + // TODO: enable for itag = {ba, acb} + format_tag_t itag = id.matches_one_of_tag( + ab, abc, is_bf16_with_int_wei ? otag : format_tag::undef); + if (utils::one_of(format_tag::undef, itag, otag)) + return status::invalid_arguments; + + dim_t batch = ndims > 2 ? dims[ndims - 3] : 1; + dim_t K = dims[ndims - 2]; + dim_t N = dims[ndims - 1]; + + CHECK(matmul::init_conf(conf, batch, K, N, + matmul::get_n_block_from_tag(otag), type_i, type_o, itag)); + + conf.s8s8_compensation_required + = od.extra().flags & memory_extra_flags::compensation_conv_s8s8; + const bool req_asymmetric_comp = od.extra().flags + & memory_extra_flags::compensation_conv_asymmetric_src; + conf.src_zp_type = req_asymmetric_comp ? 
brgemm_broadcast_t::per_tensor + : brgemm_broadcast_t::none; + conf.has_zero_point_a = conf.src_zp_type != brgemm_broadcast_t::none; + + return status::success; +} + status_t brgemm_matmul_matrix_B_reorder_t::pd_t::init( engine_t *engine, engine_t *src_engine, engine_t *dst_engine) { using namespace status; - using namespace format_tag; status_t status = cpu_reorder_pd_t::init(engine, src_engine, dst_engine); if (status != success) return status; @@ -50,8 +121,6 @@ status_t brgemm_matmul_matrix_B_reorder_t::pd_t::init( const bool is_s8s8 = type_i == data_type::s8 && type_o == data_type::s8; const bool is_bf16_with_int_wei = type_o == data_type::bf16 && utils::one_of(type_i, data_type::s8, data_type::u8); - const bool with_wei_decompression = type_i != type_o - && utils::one_of(type_i, data_type::s8, data_type::u8); const bool has_adj_scale = od.extra().flags & memory_extra_flags::scale_adjust; const bool args_ok = true && dt_ok && id.is_dense() @@ -64,79 +133,15 @@ status_t brgemm_matmul_matrix_B_reorder_t::pd_t::init( && !od.has_runtime_dims_or_strides() && !od.has_zero_dim(); if (!args_ok) return invalid_arguments; - const auto &dims = id.dims(); - // TODO: enable for itag = {ba, acb} - format_tag_t itag = id.matches_one_of_tag(ab, abc); - format_tag_t otag = format_tag::undef; - - const auto vnni_granularity = data_type_vnni_granularity(type_o); - switch (vnni_granularity) { - case 4: - otag = od.matches_one_of_tag(aCB16b64c4b, BA16a64b4a, aCB16b48c4b, - BA16a48b4a, aCB16b32c4b, BA16a32b4a, aCB16b16c4b, - BA16a16b4a); - break; - case 2: - otag = od.matches_one_of_tag(aCB16b64c2b, BA16a64b2a, aCB16b48c2b, - BA16a48b2a, aCB16b32c2b, BA16a32b2a, aCB16b16c2b, - BA16a16b2a); - break; - case 1: - otag = od.matches_one_of_tag(aCB16b64c, BA16a64b, aCB16b48c, - BA16a48b, aCB16b32c, BA16a32b, aCB16b16c, BA16a16b); - break; - default: otag = format_tag::undef; - } - if (is_bf16_with_int_wei) itag = id.matches_one_of_tag(ab, abc, otag); - - if 
(utils::one_of(format_tag::undef, itag, otag)) return invalid_arguments; - - // initialize all required fields to generate copy_b kernel - matmul_conf_for_reorder_.blocked_B = !utils::one_of(itag, ab, abc); - matmul_conf_for_reorder_.is_bf16_with_int_wei = is_bf16_with_int_wei; - matmul_conf_for_reorder_.with_wei_decompression = with_wei_decompression; - matmul_conf_for_reorder_.apply_scales_in_buffer_b = false; - matmul_conf_for_reorder_.orig_wei_dt = type_i; - matmul_conf_for_reorder_.wei_tag = itag; - matmul_conf_for_reorder_.batch = ndims > 2 ? dims[ndims - 3] : 1; - matmul_conf_for_reorder_.K = dims[ndims - 2]; - matmul_conf_for_reorder_.N = dims[ndims - 1]; - matmul_conf_for_reorder_.wei_n_blk = matmul_conf_for_reorder_.N_blk - = matmul_conf_for_reorder_.LDB = matmul::get_n_block_from_tag(otag); - matmul_conf_for_reorder_.N_tail - = matmul_conf_for_reorder_.N % matmul_conf_for_reorder_.N_blk; - matmul_conf_for_reorder_.K_blk = 16 * vnni_granularity; - matmul_conf_for_reorder_.K_tail - = matmul_conf_for_reorder_.K % matmul_conf_for_reorder_.K_blk; - matmul_conf_for_reorder_.src_dt = matmul_conf_for_reorder_.wei_dt = type_o; - matmul_conf_for_reorder_.a_dt_sz = matmul_conf_for_reorder_.tr_a_dt_sz - = types::data_type_size(matmul_conf_for_reorder_.src_dt); - matmul_conf_for_reorder_.b_dt_sz = types::data_type_size(type_i); - matmul_conf_for_reorder_.tr_b_dt_sz - = types::data_type_size(matmul_conf_for_reorder_.wei_dt); - matmul_conf_for_reorder_.copy_B_wei_stride - = matmul_conf_for_reorder_.N * matmul_conf_for_reorder_.b_dt_sz; - matmul_conf_for_reorder_.transposed_B = false; - matmul_conf_for_reorder_.s8s8_comp_b_str = utils::rnd_up( - matmul_conf_for_reorder_.N, matmul_conf_for_reorder_.wei_n_blk); - matmul_conf_for_reorder_.s8s8_comp_n_str - = matmul_conf_for_reorder_.wei_n_blk; - matmul_conf_for_reorder_.s8s8_compensation_required - = od.extra().flags & memory_extra_flags::compensation_conv_s8s8; - const bool req_asymmetric_comp = od.extra().flags - & 
memory_extra_flags::compensation_conv_asymmetric_src; - matmul_conf_for_reorder_.src_zp_type = req_asymmetric_comp - ? brgemm_broadcast_t::per_tensor - : brgemm_broadcast_t::none; - matmul_conf_for_reorder_.has_zero_point_a - = matmul_conf_for_reorder_.src_zp_type != brgemm_broadcast_t::none; - matmul_conf_for_reorder_.isa = is_f16 ? avx512_core_fp16 : avx512_core; + CHECK(init_conf(matmul_conf_for_reorder_, src_md_, dst_md_)); auto mask_ok = [&](bool check, int mask) { return IMPLICATION( check, mask == (1 << ndims) - 1 - (1 << (ndims - 2))); }; + const bool req_asymmetric_comp = od.extra().flags + & memory_extra_flags::compensation_conv_asymmetric_src; const bool comp_masks_ok = true && mask_ok(matmul_conf_for_reorder_.s8s8_compensation_required, od.extra().compensation_mask) @@ -201,7 +206,7 @@ status_t brgemm_matmul_matrix_B_reorder_t::execute_body( parallel_nd(kernel_conf.batch, div_up(kernel_conf.N, kernel_conf.N_blk), [&](dim_t batch, dim_t n_blk_idx) { const auto n = n_blk_idx * kernel_conf.N_blk; - const bool is_N_tail = (kernel_conf.N - n < kernel_conf.N_blk); + const bool is_N_tail = (kernel_conf.N - n) < kernel_conf.N_blk; auto ker_exec_ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); ker_exec_ctx.current_N_blk = is_N_tail ? 
kernel_conf.N_tail : kernel_conf.N_blk; diff --git a/src/cpu/x64/matmul/brgemm_matmul_utils.cpp b/src/cpu/x64/matmul/brgemm_matmul_utils.cpp index 20bbda9bdb3..5437d86ab85 100644 --- a/src/cpu/x64/matmul/brgemm_matmul_utils.cpp +++ b/src/cpu/x64/matmul/brgemm_matmul_utils.cpp @@ -1532,6 +1532,49 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc, return status::success; } +status_t init_conf(brgemm_matmul_conf_t &conf, dim_t batch, dim_t K, dim_t N, + dim_t n_blk, data_type_t in_type, data_type_t out_type, + format_tag_t in_tag) { + if (n_blk <= 0) return status::invalid_arguments; + + const bool is_f16 = utils::one_of(data_type::f16, in_type, out_type); + const bool is_bf16_with_int_wei = out_type == data_type::bf16 + && utils::one_of(in_type, data_type::s8, data_type::u8); + const bool with_wei_decompression = in_type != out_type + && utils::one_of(in_type, data_type::s8, data_type::u8); + + conf.blocked_B = !utils::one_of(in_tag, ab, abc); + conf.is_bf16_with_int_wei = is_bf16_with_int_wei; + conf.with_wei_decompression = with_wei_decompression; + conf.orig_wei_dt = in_type; + conf.wei_tag = in_tag; + conf.batch = batch; + conf.K = K; + conf.N = N; + conf.wei_n_blk = conf.N_blk = conf.LDB = n_blk; + conf.N_tail = conf.N % conf.N_blk; + conf.K_blk = 16 * data_type_vnni_granularity(out_type); + conf.K_tail = conf.K % conf.K_blk; + conf.src_dt = conf.wei_dt = out_type; + conf.a_dt_sz = conf.tr_a_dt_sz = types::data_type_size(conf.src_dt); + conf.b_dt_sz = types::data_type_size(in_type); + conf.tr_b_dt_sz = types::data_type_size(conf.wei_dt); + conf.copy_B_wei_stride = conf.N * conf.b_dt_sz; + conf.transposed_B = false; + conf.s8s8_comp_b_str = utils::rnd_up(conf.N, conf.wei_n_blk); + conf.s8s8_comp_n_str = conf.wei_n_blk; + conf.isa = is_f16 ? 
avx512_core_fp16 : avx512_core; + // The following members are different from the upper level `init_conf()` + // call from the reorder implementation due to lacking a memory descriptor + // to tip on compensation. + // TODO: re-consider an interface change to enable these members. + conf.s8s8_compensation_required = false; + conf.src_zp_type = brgemm_broadcast_t::none; + conf.has_zero_point_a = false; + + return status::success; +} + void init_aux_values(brgemm_matmul_conf_t &bgmmc, const memory_desc_wrapper &src_d, const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) { diff --git a/src/cpu/x64/matmul/brgemm_matmul_utils.hpp b/src/cpu/x64/matmul/brgemm_matmul_utils.hpp index 4fc372a8809..d9868e7fc72 100644 --- a/src/cpu/x64/matmul/brgemm_matmul_utils.hpp +++ b/src/cpu/x64/matmul/brgemm_matmul_utils.hpp @@ -333,6 +333,12 @@ struct brgemm_matmul_conf_utils_t { const cpu_isa_t isa_; }; +// This function initializes all required fields in the conf object to generate +// copy_b kernel. Used in this impl and re-used in brgemm kernel API. 
+status_t init_conf(brgemm_matmul_conf_t &conf, dim_t batch, dim_t K, dim_t N, + dim_t n_blk, data_type_t in_type, data_type_t out_type, + format_tag_t in_tag); + void init_aux_values(brgemm_matmul_conf_t &bgmmc, const memory_desc_wrapper &src_d, const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d); From 063432bfa9da299361da232cd7a8d0c4b40884b9 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 9 Apr 2024 15:53:09 -0700 Subject: [PATCH 068/187] cpu: x64: amx_tile_config: introduce lazy initialization support --- src/cpu/x64/amx_tile_configure.cpp | 46 +++++++++++++++++++++++++++--- src/cpu/x64/amx_tile_configure.hpp | 3 +- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/cpu/x64/amx_tile_configure.cpp b/src/cpu/x64/amx_tile_configure.cpp index f50d1a165c8..f250905f2cc 100644 --- a/src/cpu/x64/amx_tile_configure.cpp +++ b/src/cpu/x64/amx_tile_configure.cpp @@ -26,16 +26,48 @@ struct jit_amx_tilecfg_t : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_amx_tilecfg_t) // TODO: Need to check status - jit_amx_tilecfg_t() : jit_generator(jit_name(), avx512_core_amx) { + jit_amx_tilecfg_t(bool lazy = false) + : jit_generator(jit_name(), avx512_core_amx), is_lazy_(lazy) { create_kernel(); } void tile_configure(const char *palette) const { (*this)(palette); } + // TODO: merge into a single call. Keep both versions for now until there's + // a clear path lazy initialization API used across the library. + void tile_lazy_configure(const char *palette) const { + (*this)(palette, palette_store_); + } private: + // Lazy initialization first checks if tile is configured on a core. If it + // is, and the palette loaded is same as palette provided by user, then the + // tileload instruction is skipped. + // According to measurements, the impact on performance is marginal compared + // to manual handling of when palette should be loaded. 
+ bool is_lazy_; + char palette_store_[AMX_PALETTE_SIZE]; + void generate() override { - ldtilecfg(ptr[abi_param1]); - ret(); + if (is_lazy_) { + Xbyak::Label skip_tilecfg; + // Store current tilecfg into `palette_store_`. + sttilecfg(ptr[abi_param2]); + // Move tilecfg into Zmm for further comparison. + vmovdqu64(Xbyak::Zmm(0), ptr[abi_param2]); + // Sets `1` per word if values are equal. + vpcmpeqw(Xbyak::Opmask(0), Xbyak::Zmm(0), ptr[abi_param1]); + // `kortestd` sets CF=1 only if all 32 mask bits (one per word) are `1`. + kortestd(Xbyak::Opmask(0), Xbyak::Opmask(0)); + // Checks if CF=1. If it is, everything matched, skipping config... + jc(skip_tilecfg, T_NEAR); + // ... otherwise, configure tile with user palette. + ldtilecfg(ptr[abi_param1]); + L(skip_tilecfg); + ret(); + } else { + ldtilecfg(ptr[abi_param1]); + ret(); + } } }; @@ -57,11 +89,17 @@ struct jit_amx_tilerelease_t : public jit_generator { }; status_t amx_tile_configure(const char palette[AMX_PALETTE_SIZE]) { - static const jit_amx_tilecfg_t tilecfg; + static const jit_amx_tilecfg_t tilecfg(/* is_lazy = */ false); tilecfg.tile_configure(palette); return status::success; }; +status_t amx_tile_lazy_configure(const char palette[AMX_PALETTE_SIZE]) { + static const jit_amx_tilecfg_t tilecfg(/* is_lazy = */ true); + tilecfg.tile_lazy_configure(palette); + return status::success; +}; + status_t amx_tile_release() { static const jit_amx_tilerelease_t tilerls; tilerls.tile_release(); diff --git a/src/cpu/x64/amx_tile_configure.hpp b/src/cpu/x64/amx_tile_configure.hpp index 5dbd7b2411a..d68071eda53 100644 --- a/src/cpu/x64/amx_tile_configure.hpp +++ b/src/cpu/x64/amx_tile_configure.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2021 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ namespace x64 { static constexpr size_t AMX_PALETTE_SIZE = 64; status_t DNNL_API amx_tile_configure(const char palette[AMX_PALETTE_SIZE]); +status_t DNNL_API amx_tile_lazy_configure(const char palette[AMX_PALETTE_SIZE]); status_t DNNL_API amx_tile_release(); } // namespace x64 From 868664306f18ec7eb2c4c7d944fd1cb1da52d21e Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 8 Apr 2024 18:39:06 -0700 Subject: [PATCH 069/187] cpu: x64: brgemm: missing doc for brgemm arguments --- src/cpu/x64/brgemm/brgemm.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/cpu/x64/brgemm/brgemm.hpp b/src/cpu/x64/brgemm/brgemm.hpp index 630520a3656..084013e13c5 100644 --- a/src/cpu/x64/brgemm/brgemm.hpp +++ b/src/cpu/x64/brgemm/brgemm.hpp @@ -60,6 +60,7 @@ namespace x64 { /// @param K Specifies the number of columns of the matrix A and /// the number of rows of the matrix B /// @param strides Strides between the matrices in the batch. Can be nullptr. +/// TODO: what does "Can be nullptr" mean? /// status_t DNNL_API brgemm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, impl::data_type_t dt_a, @@ -94,6 +95,7 @@ status_t DNNL_API brgemm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, /// LDC must be at least max(1, N) /// @param M Specifies the number of rows of the matrix A and C. /// @param N Specifies the number of columns of the matrix A and C. +/// @param strides - TODO: missing documentation. /// status_t DNNL_API brdgmm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, impl::data_type_t dt_a, @@ -111,6 +113,7 @@ status_t DNNL_API brdgmm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, /// determine dst data type. /// @param LDD Specifies the leading dimension of matrix D /// LDD must be at least max(1, N) +/// TODO: why LDD can't be obtained from dst_md directly? 
/// @param dt_bias Specifies the data type Bias /// Can be u8, s8, s32, bf16, f16 or fp32 /// @@ -161,6 +164,7 @@ status_t DNNL_API brgemm_kernel_destroy(brgemm_kernel_t *brg_kernel); /// @param scratch Scratchpad memory needed in several scenarios: /// * Where: AMX+ hardware; When: always; For: buffer for tiles store. /// * In rest scenarios is not used. +/// @param dynamic_values TODO: missing doc /// void DNNL_API brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, @@ -187,6 +191,7 @@ void DNNL_API brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, /// @param scratch Scratchpad memory needed in several scenarios: /// * Where: AMX+ hardware; When: always; For: buffer for tiles store. /// * In rest scenarios is not used. +/// @param dynamic_values TODO: missing doc /// void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, @@ -214,6 +219,7 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, /// * Where: AMX+ hardware; When: always; For: buffer for tiles store. /// * Where: pre-VNNI hardware; When: s8s8 kernel; For: compensation buffer. /// * In rest scenarios is not used. +/// @param dynamic_values TODO: missing doc /// void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, @@ -243,6 +249,7 @@ void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, /// * Where: AMX+ hardware; When: always; For: buffer for tiles store. /// * Where: pre-VNNI hardware; When: s8s8 kernel; For: compensation buffer. /// * In rest scenarios is not used. 
+/// @param dynamic_values TODO: missing doc /// void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, From 9b61f09c37fd7daf445a21ed5181e98b349ed6e7 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 8 Apr 2024 18:43:37 -0700 Subject: [PATCH 070/187] api: introduce experimental brgemm ukernel support --- cmake/options.cmake | 5 + examples/CMakeLists.txt | 4 + examples/ukernels/brgemm.cpp | 309 +++++++++++++++++++++ include/oneapi/dnnl/dnnl_config.h.in | 3 + include/oneapi/dnnl/dnnl_ukernel.h | 217 +++++++++++++++ include/oneapi/dnnl/dnnl_ukernel.hpp | 307 +++++++++++++++++++++ include/oneapi/dnnl/dnnl_ukernel_types.h | 70 +++++ src/CMakeLists.txt | 7 + src/cpu/x64/brgemm/capi/brgemm_api.cpp | 328 +++++++++++++++++++++++ src/cpu/x64/brgemm/capi/brgemm_api.hpp | 63 +++++ 10 files changed, 1313 insertions(+) create mode 100644 examples/ukernels/brgemm.cpp create mode 100644 include/oneapi/dnnl/dnnl_ukernel.h create mode 100644 include/oneapi/dnnl/dnnl_ukernel.hpp create mode 100644 include/oneapi/dnnl/dnnl_ukernel_types.h create mode 100644 src/cpu/x64/brgemm/capi/brgemm_api.cpp create mode 100644 src/cpu/x64/brgemm/capi/brgemm_api.hpp diff --git a/cmake/options.cmake b/cmake/options.cmake index 91359d92100..8c314507cc9 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -198,6 +198,11 @@ option(DNNL_EXPERIMENTAL_SPARSE independetly from DNNL_EXPERIMENTAL." OFF) # disabled by default +option(DNNL_EXPERIMENTAL_UKERNEL + "Enable experimental functionality for ukernels. This option works + independently from DNNL_EXPERIMENTAL." + OFF) # disabled by default + option(DNNL_EXPERIMENTAL_PROFILING "Enable experimental profiling capabilities. This option works independently from DNNL_EXPERIMENTAL." 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 19aa2991a50..3e5ebfc4fd8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,6 +49,10 @@ if(NOT DNNL_EXPERIMENTAL_SPARSE) list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/cpu_matmul_weights_compression.cpp) endif() +if(NOT DNNL_EXPERIMENTAL_UKERNEL) + list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/ukernels/brgemm.cpp) +endif() + # Remove tests for CUDA which use unimplemented primitives if(DNNL_SYCL_CUDA) list(REMOVE_ITEM sources diff --git a/examples/ukernels/brgemm.cpp b/examples/ukernels/brgemm.cpp new file mode 100644 index 00000000000..4ceffd94d01 --- /dev/null +++ b/examples/ukernels/brgemm.cpp @@ -0,0 +1,309 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @example brgemm.cpp +/// > Annotated version: @ref brgemm_example_cpp +/// +/// This C++ API example demonstrates how to create and execute a BRGeMM +/// ukernel. 
+/// +/// @include brgemm.cpp + +#include +#include +#include +#include +#include +#include + +#include "example_utils.hpp" +#include "oneapi/dnnl/dnnl_ukernel.hpp" + +using namespace dnnl; +using namespace dnnl::ukernel; + +using tag = memory::format_tag; +using dt = memory::data_type; + +void brgemm_example(dnnl::engine::kind engine_kind) { + + // Create execution dnnl::engine. Needed for reorders to operate over input + // data. + dnnl::engine engine(engine_kind, 0); + + // Create dnnl::stream. Needed for reorders for the same reason. + dnnl::stream engine_stream(engine); + + // ukernel dimensions. + // K is for a whole tensor, K_k is for a single ukernel. + const memory::dim M = 8, K = 64, K_k = 32, N = 48; + if (K % K_k != 0) { + printf("K_k must divide K.\n"); + return; + } + const memory::dim n_calls = K / K_k; + + const memory::dim lda = K; + const memory::dim ldb = N; + const memory::dim ldc = N; // Leading dimension for accumulator. + const memory::dim ldd = N; // Leading dimension for an actual output. + const memory::dim batch_size = n_calls - 1; + +#define DT dt::f32 + memory::data_type a_dt = DT; + memory::data_type b_dt = DT; + memory::data_type c_dt = dt::f32; // Accumulator data type. + memory::data_type d_dt = DT; // Output data type. +#undef DT + + // A, B, and C tensors dimensions. + memory::dims A_dims = {M, K}; + memory::dims B_dims = {K, N}; + memory::dims C_dims = {M, N}; + memory::dims D_dims = {M, N}; + memory::dims binary_add_dims = {1, 1}; + + // Allocate buffers with user data. + std::vector A_user_data(product(A_dims)); + std::vector B_user_data(product(B_dims)); + std::vector binary_add_user_data(product(binary_add_dims)); + std::vector D_data(product(D_dims)); // For reference comparison + std::vector D_user_data(product(D_dims)); // For reference comparison + + // Initialize A, B, and binary_add. 
+ std::generate(A_user_data.begin(), A_user_data.end(), []() { + static int i = 0; + return i++ % 4; + }); + std::generate(B_user_data.begin(), B_user_data.end(), []() { + static int i = 6; + static int sign_gen = 0; + int sign = (sign_gen++ % 2) ? -1 : 1; + float val = sign * (i++ % 5); + return val; + }); + std::generate( + binary_add_user_data.begin(), binary_add_user_data.end(), []() { + static int i = 3; + return i++ % 6; + }); + + // Create f32 memories. They are used as data holders and reorder into + // memories passed to the ukernel. + auto A_f32_md = memory::desc(A_dims, dt::f32, tag::ab); + auto B_f32_md = memory::desc(B_dims, dt::f32, tag::ab); + auto binary_add_f32_md = memory::desc(binary_add_dims, dt::f32, tag::ab); + auto D_f32_md = memory::desc(D_dims, dt::f32, tag::ab); + + auto A_f32_mem = memory(A_f32_md, engine, A_user_data.data()); + auto B_f32_mem = memory(B_f32_md, engine, B_user_data.data()); + auto binary_add_f32_mem + = memory(binary_add_f32_md, engine, binary_add_user_data.data()); + auto D_f32_mem = memory(D_f32_md, engine, D_user_data.data()); + + // Create ukernel memories in requested data types. + // Note that all formats are `ab`. 
+ auto A_md = memory::desc(A_dims, a_dt, tag::ab); + auto B_md = memory::desc(B_dims, b_dt, tag::ab); + auto binary_add_md = memory::desc(binary_add_dims, dt::f32, tag::ab); + auto C_md = memory::desc(C_dims, c_dt, tag::ab); + auto D_md = memory::desc(D_dims, d_dt, tag::ab); + + auto A_mem = memory(A_md, engine); + auto B_mem = memory(B_md, engine); + auto binary_add_mem = memory(binary_add_md, engine); + auto C_mem = memory(C_md, engine); + auto D_mem = memory(D_md, engine); + + const auto *A_ptr = reinterpret_cast(A_mem.get_data_handle()); + auto *B_ptr = reinterpret_cast(B_mem.get_data_handle()); + + const size_t a_dt_size + = memory::data_type_size(A_mem.get_desc().get_data_type()); + const size_t b_dt_size + = memory::data_type_size(B_mem.get_desc().get_data_type()); + + // Reorder user data into buffers passed to ukernels in target data types. + reorder(A_f32_mem, A_mem).execute(engine_stream, A_f32_mem, A_mem); + reorder(B_f32_mem, B_mem).execute(engine_stream, B_f32_mem, B_mem); + reorder(binary_add_f32_mem, binary_add_mem) + .execute(engine_stream, binary_add_f32_mem, binary_add_mem); + reorder(D_f32_mem, D_mem).execute(engine_stream, D_f32_mem, D_mem); + // Prepare C buffer. Needed to use a single ukernel in the example with + // `beta = 1.f`. + // Note: to avoid this step, the first ukernel should run `beta = 0`, and it + // will initialize C buffer with intermediate values. + float *C_ptr = reinterpret_cast(C_mem.get_data_handle()); + for (memory::dim i = 0; i < M * N; i++) { + C_ptr[i] = 0; + } + + // Create ukernel post-ops (ReLU + Add). + // It reuses `primitive_attr` abstraction. + post_ops brgemm_ops; + brgemm_ops.append_eltwise( + algorithm::eltwise_relu, /* alpha = */ 0.f, /* beta = */ 0.f); + brgemm_ops.append_binary(algorithm::binary_add, binary_add_md); + primitive_attr brgemm_attr; + brgemm_attr.set_post_ops(brgemm_ops); + + // Create BRGeMM ukernel objects. 
+ // There are two objects: + // * `brg` is the main one which operates over partitioned K dimension. It + // utilizes `beta = 1.f` to accumulate into the same buffer. It also uses + // `batch_size` to process as much as `n_calls - 1` iterations. + // * `brg_po` is the ukernel that would be called the last in the chain + // since it has attributes attached to the object and those will execute + // after all accumulation over K dimension is done. + // Note: `beta = 1.f` makes a ukernel reusable over K but will require + // zeroing the correspondent piece of accumulation buffer. + brgemm brg, brg_po; + if (batch_size > 0) { + brg = brgemm(M, N, K_k, batch_size, lda, ldb, ldc, a_dt, b_dt, c_dt, + /* alpha = */ 1.f, /* beta = */ 1.f); + // Generate the executable JIT code for the objects. + brg.generate(); + } + + brg_po = brgemm(M, N, K_k, 1, lda, ldb, ldc, ldd, a_dt, b_dt, c_dt, d_dt, + 1.f, 1.f, brgemm_attr); + // Generate the executable JIT code for the objects. + brg_po.generate(); + + // Query a scratchpad size and initialize a scratchpad buffer if the ukernel + // is expecting it. This is a service space needed, has nothing in common + // with accumulation buffer. + size_t scratchpad_size = brg_po.get_scratchpad_size(); + std::vector scratchpad(scratchpad_size); + + // Packing B tensor routine. The BRGeMM ukernel expects B passed in a + // special VNNI format for low precision data types, e.g., bf16. + // For f32 data type the routine blocks data in memory friendly way. This + // is beneficial in cases when leading dimension has a big power of 2 which + // leads to cache aliasing effects. + // Note: the routine doesn't take `batch_size` in the constructor as there's + // no performance benefit to copy more data at once. It's user's + // responsibility to iterate pack routine over batch_size provided for a + // ukernel. 
+ brgemm_pack_B pack_B(/* K = */ K_k, /* N = */ N, /* in_ld = */ N, + /* out_ld = */ ldb, /* in_dt = */ b_dt, /* out_dt = */ b_dt); + + uint8_t *B_blocked = nullptr; + void *B_base_ptr = B_ptr; + size_t blocked_B_size = 0; + + if (pack_B.need_pack()) { + // Size of the packed tensor. + blocked_B_size = ldb * K_k * memory::data_type_size(b_dt); + + B_blocked = new uint8_t[blocked_B_size * n_calls]; + B_base_ptr = B_blocked; + + // Pack B routine execution. + // Note: usually should be split to process only that part of B that the + // ukernel will execute. + + pack_B.generate(); + + for (memory::dim i = 0; i < n_calls; i++) { + auto *B_ptr_i = B_ptr + i * N * K_k * b_dt_size; + auto *B_blocked_ptr_i = B_blocked + i * blocked_B_size; + pack_B.execute(B_ptr_i, B_blocked_ptr_i); + } + } + + // BRGeMM ukernel execute section. + // Prepare buffers for execution. + std::vector> A_B_offsets(batch_size); + for (memory::dim i = 0; i < batch_size; i++) { + const memory::dim A_offset_i = i * K_k * a_dt_size; + const memory::dim B_offset_i = pack_B.need_pack() + ? i * blocked_B_size + : i * N * K_k * b_dt_size; + A_B_offsets[i] = std::make_pair(A_offset_i, B_offset_i); + } + + if (brg) { + // Make an object to call HW specialized routines. For example, prepare + // AMX unit. + brg.set_hw_context(); + + // An execute call. `A_B` is a vector of pointers to A and packed B + // tensors. `acc_ptr` is a pointer to an accumulator buffer. + brg.execute(A_ptr, B_base_ptr, A_B_offsets, C_ptr, scratchpad.data()); + } + + // Same set of operations for a ukernel with post-ops. + std::vector> A_B_po_offsets; + const memory::dim A_offset_po = batch_size * K_k * a_dt_size; + const memory::dim B_offset_po = pack_B.need_pack() + ? batch_size * blocked_B_size + : batch_size * N * K_k * b_dt_size; + A_B_po_offsets.emplace_back(A_offset_po, B_offset_po); + + // This object also requires this call. + brg_po.set_hw_context(); + + // An execute call. 
The difference here is an additional D tensor pointer + // to store final output result after finishing accumulation and post-ops + // application. + brg_po.execute(A_ptr, B_base_ptr, A_B_po_offsets, C_ptr, + D_mem.get_data_handle(), scratchpad.data(), + binary_add_mem.get_data_handle()); + + // Once all computations are done, need to release HW context. + brgemm::release_hw_context(); + + // Clean up an extra buffer. + delete B_blocked; + + // Used for verification results, need unconditional reorder. + auto user_D_mem = memory(D_f32_md, engine, D_data.data()); + reorder(D_mem, user_D_mem).execute(engine_stream, D_mem, user_D_mem); + + // A simplified fast verification that ukernel returned expected results. + // Note: potential off-by-1 or 2 errors may pop up. This could be solved + // with more sparse filling. + bool to_throw = false; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + D_user_data[m * N + n] = 0; + for (int k = 0; k < K; k++) { + D_user_data[m * N + n] + += A_user_data[m * K + k] * B_user_data[k * N + n]; + } + // Relu post-op ref + D_user_data[m * N + n] = std::max(D_user_data[m * N + n], 0.f); + // Binary post-op ref + D_user_data[m * N + n] += binary_add_user_data[0]; + + const float diff + = fabsf(D_user_data[m * N + n] - D_data[m * N + n]); + if (diff > 1.19e-7) { + to_throw = true; + if (true) { + printf("Error: [%3d:%3d] Ref:%12g Got:%12g Diff:%12g\n", m, + n, D_user_data[m * N + n], D_data[m * N + n], diff); + } + } + } + } + if (to_throw) { throw status::runtime_error; } +} + +int main(int argc, char **argv) { + return handle_example_errors(brgemm_example, dnnl::engine::kind::cpu); +} diff --git a/include/oneapi/dnnl/dnnl_config.h.in b/include/oneapi/dnnl/dnnl_config.h.in index 30c8a8c0ccf..18fa5cffdec 100644 --- a/include/oneapi/dnnl/dnnl_config.h.in +++ b/include/oneapi/dnnl/dnnl_config.h.in @@ -160,6 +160,9 @@ // When defined, experimental functionality for sparse domain is enabled. 
#cmakedefine DNNL_EXPERIMENTAL_SPARSE +// When defined, experimental functionality for ukernels is enabled. +#cmakedefine DNNL_EXPERIMENTAL_UKERNEL + // When defined, graph component is enabled. #cmakedefine ONEDNN_BUILD_GRAPH diff --git a/include/oneapi/dnnl/dnnl_ukernel.h b/include/oneapi/dnnl/dnnl_ukernel.h new file mode 100644 index 00000000000..0ae5ec52e65 --- /dev/null +++ b/include/oneapi/dnnl/dnnl_ukernel.h @@ -0,0 +1,217 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C API + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_H +#define ONEAPI_DNNL_DNNL_UKERNEL_H + +#include "oneapi/dnnl/dnnl.h" +#include "oneapi/dnnl/dnnl_ukernel_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_ukernel +/// @{ + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// @addtogroup dnnl_api_ukernel_brgemm +/// @{ + +/// Creates a BRGeMM ukernel object. Operates by the following formula: +/// `C = alpha * [A x B] + beta * C`. +/// `D = post-operations(C)`. +/// +/// Post-operations applies if one of the following holds: +/// * Non-empty attributes are specified. +/// * Output data type `d_dt` is different from accumulation data type `c_dt`. 
+/// +/// If any of these conditions holds, the final call of the accumulation chain +/// must be `dnnl_brgemm_execute_postops`; otherwise, `dnnl_brgemm_execute`. +/// +/// @param brgemm Output BRGeMM ukernel object. +/// @param M Dimension M of tensor A. +/// @param N Dimension N of tensor B. +/// @param K Dimension K of tensors A and B. +/// @param batch_size Number of batches to process. +/// @param lda Leading dimension of tensor A. +/// @param ldb Leading dimension of tensor B. +/// @param ldc Leading dimension of tensor C. +/// @param ldd Leading dimension of tensor D. +/// @param a_dt Data type of tensor A. +/// @param b_dt Data type of tensor B. +/// @param c_dt Data type of tensor C. Must be dnnl_f32. +/// @param d_dt Data type of tensor D. +/// @param alpha Scale for an accumulation output. +/// @param beta Scale for a tensor C to append on an accumulation output. +/// @param attr Primitive attributes to extend the kernel operations. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_create(dnnl_brgemm_t *brgemm, dnnl_dim_t M, + dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t batch_size, dnnl_dim_t lda, + dnnl_dim_t ldb, dnnl_dim_t ldc, dnnl_dim_t ldd, dnnl_data_type_t a_dt, + dnnl_data_type_t b_dt, dnnl_data_type_t c_dt, dnnl_data_type_t d_dt, + float alpha, float beta, const_dnnl_primitive_attr_t attr); + +/// Returns the size of a scratchpad memory needed for the BRGeMM ukernel +/// object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param size Output size of a buffer required for the BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_get_scratchpad_size( + const_dnnl_brgemm_t brgemm, size_t *size); + +/// Initializes the hardware-specific context. If no initialization is required, +/// returns the success status. +/// +/// @param brgemm BRGeMM ukernel object. 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_hw_context(const_dnnl_brgemm_t brgemm); + +/// Releases the hardware-specific context. Must be used after all the execution +/// calls to BRGeMM ukernel objects. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_release_hw_context(); + +/// Generates an executable part of BRGeMM ukernel object. +/// @param brgemm BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_generate(dnnl_brgemm_t brgemm); + +/// Executes a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A_ptr Base pointer to a tensor A. +/// @param B_ptr Base pointer to a tensor B. +/// @param A_B_offsets Pointer to the set of tensor A and tensor B offsets for +/// each batch; the set must be contiguous in memory. Single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm, + const void *A_ptr, const void *B_ptr, const dnnl_dim_t *A_B_offsets, + void *C_ptr, void *scratchpad_ptr); + +/// Executes a BRGeMM ukernel object with post operations. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A_ptr Base pointer to a tensor A. +/// @param B_ptr Base pointer to a tensor B. +/// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for +/// each batch. A set must be contiguous in memory. 
A single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param D_ptr Pointer to a tensor D (output buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @param binary_po_ptr Pointer to binary post-op data. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute_postops(const_dnnl_brgemm_t brgemm, + const void *A, const void *B, const dnnl_dim_t *A_B_offsets, + const void *C_ptr, void *D_ptr, void *scratchpad_ptr, + const void *binary_po_ptr); + +/// Destroys a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_destroy(dnnl_brgemm_t brgemm); + +/// Creates a BRGeMM ukernel packing tensor B object. +/// +/// @param brgemm_pack_B Output BRGeMM ukernel packing B object. +/// @param K Dimension K. +/// @param N Dimension N. +/// @param in_ld Input leading dimension. +/// @param out_ld Output leading dimension. Specifies a block by N dimension +/// during data packing. +/// @param in_dt Input data type. +/// @param out_dt Output data type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_pack_B_create( + dnnl_brgemm_pack_B_t *brgemm_pack_B, dnnl_dim_t K, dnnl_dim_t N, + dnnl_dim_t in_ld, dnnl_dim_t out_ld, dnnl_data_type_t in_dt, + dnnl_data_type_t out_dt); + +/// Returns the flag if packing is expected by BRGeMM ukernel kernel. +/// +/// @param brgemm_pack_B BRGeMM ukernel packing B object. +/// @param need_pack Output flag specifying if packing is needed. +/// Possible values are 0 (not needed) and 1 (needed). 
+/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_pack_B_need_pack( + const_dnnl_brgemm_pack_B_t brgemm_pack_B, int *need_pack); + +/// Generates an executable part of BRGeMM ukernel packing B object. +/// @param brgemm_pack_B BRGeMM ukernel packing B object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_pack_B_generate( + dnnl_brgemm_pack_B_t brgemm_pack_B); + +/// Executes a BRGeMM ukernel packing tensor B object. +/// +/// @param brgemm_pack_B BRGeMM ukernel packing B object. +/// @param in_ptr Pointer to an input buffer. +/// @param out_ptr Pointer to an output buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_pack_B_execute( + const_dnnl_brgemm_pack_B_t brgemm_pack_B, const void *in_ptr, + void *out_ptr); + +/// Destroys a BRGeMM ukernel packing tensor B object. +/// +/// @param brgemm_pack_B BRGeMM ukernel packing B object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_pack_B_destroy( + dnnl_brgemm_pack_B_t brgemm_pack_B); + +/// @} dnnl_api_ukernel_brgemm + +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_H */ diff --git a/include/oneapi/dnnl/dnnl_ukernel.hpp b/include/oneapi/dnnl/dnnl_ukernel.hpp new file mode 100644 index 00000000000..4ac0b93c56f --- /dev/null +++ b/include/oneapi/dnnl/dnnl_ukernel.hpp @@ -0,0 +1,307 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C++ API + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_HPP +#define ONEAPI_DNNL_DNNL_UKERNEL_HPP + +#include "oneapi/dnnl/dnnl.hpp" +#include "oneapi/dnnl/dnnl_ukernel.h" + +/// @addtogroup dnnl_api oneDNN API +/// @{ + +/// oneDNN namespace +namespace dnnl { + +/// @addtogroup dnnl_api_utils +/// @{ + +/// @cond DO_NOT_DOCUMENT_THIS + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_brgemm_t p) { + return dnnl_brgemm_destroy(p); + } +}; + +template <> +struct handle_traits { + static dnnl_status_t destructor(dnnl_brgemm_pack_B_t p) { + return dnnl_brgemm_pack_B_destroy(p); + } +}; + +/// @endcond + +/// @} dnnl_api_utils + +/// @addtogroup dnnl_api_ukernel +/// @{ + +/// ukernel namespace +namespace ukernel { + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel +/// @{ + +struct brgemm : public handle { + /// Default constructor. Produces an empty object. + brgemm() = default; + + /// Constructs a BRGeMM ukernel object. Operates by the following formula: + /// `C = alpha * [A x B] + beta * C`. + /// + /// @param M Dimension M of tensor A. + /// @param N Dimension N of tensor B. + /// @param K Dimension K of tensors A and B. + /// @param batch_size Number of batches to process. + /// @param lda Leading dimension of tensor A. + /// @param ldb Leading dimension of tensor B. + /// @param ldc Leading dimension of tensor C. + /// @param a_dt Data type of tensor A. 
+ /// @param b_dt Data type of tensor B. + /// @param c_dt Data type of tensor C. + /// @param alpha Scale for an accumulation output. + /// @param beta Scale for a tensor C to append on an accumulated output. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + brgemm(memory::dim M, memory::dim N, memory::dim K, memory::dim batch_size, + memory::dim lda, memory::dim ldb, memory::dim ldc, + memory::data_type a_dt, memory::data_type b_dt, + memory::data_type c_dt, float alpha, float beta, + bool allow_empty = false) { + + dnnl_brgemm_t brgemm = nullptr; + dnnl_status_t status = dnnl_brgemm_create(&brgemm, M, N, K, batch_size, + lda, ldb, ldc, ldc, memory::convert_to_c(a_dt), + memory::convert_to_c(b_dt), memory::convert_to_c(c_dt), + memory::convert_to_c(c_dt), alpha, beta, nullptr); + + if (!allow_empty) + error::wrap_c_api( + status, "could not create a BRGeMM ukernel object"); + reset(brgemm); + } + + /// Constructs a BRGeMM ukernel object. Operates by the following formula: + /// `C = alpha * [A x B] + beta * C`; + /// `D = post-operations(C)`. + /// + /// @param M Dimension M of tensor A. + /// @param N Dimension N of tensor B. + /// @param K Dimension K of tensors A and B. + /// @param batch_size Number of batches to process. + /// @param lda Leading dimension of tensor A. + /// @param ldb Leading dimension of tensor B. + /// @param ldc Leading dimension of tensor C. + /// @param ldd Leading dimension of tensor D. + /// @param a_dt Data type of tensor A. + /// @param b_dt Data type of tensor B. + /// @param c_dt Data type of tensor C. Must be data_type::f32. + /// @param d_dt Data type of tensor D. + /// @param alpha Scale for an accumulation output. + /// @param beta Scale for a tensor C to append on an accumulated output. 
+ /// @param attr Primitive attributes to extend the kernel operations. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + brgemm(memory::dim M, memory::dim N, memory::dim K, memory::dim batch_size, + memory::dim lda, memory::dim ldb, memory::dim ldc, memory::dim ldd, + memory::data_type a_dt, memory::data_type b_dt, + memory::data_type c_dt, memory::data_type d_dt, float alpha, + float beta, const primitive_attr &attr, bool allow_empty = false) { + + dnnl_brgemm_t brgemm = nullptr; + dnnl_status_t status = dnnl_brgemm_create(&brgemm, M, N, K, batch_size, + lda, ldb, ldc, ldd, memory::convert_to_c(a_dt), + memory::convert_to_c(b_dt), memory::convert_to_c(c_dt), + memory::convert_to_c(d_dt), alpha, beta, attr.get()); + + if (!allow_empty) + error::wrap_c_api( + status, "could not create a BRGeMM ukernel object"); + reset(brgemm); + } + + /// Returns the size of a scratchpad memory needed for the BRGeMM ukernel + /// object. + size_t get_scratchpad_size() const { + size_t size; + dnnl_status_t status = dnnl_brgemm_get_scratchpad_size(get(), &size); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not query a scratchpad size from a BRGeMM ukernel " + "object"); + return size; + } + + /// Initializes the hardware-specific context. Affects the global state for + /// all BRGeMM ukernel objects. If no initialization required, returns. + void set_hw_context() const { + dnnl_status_t status = dnnl_brgemm_set_hw_context(get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not set hardware context"); + } + + /// Releases the hardware-specific context. Affects the global state for + /// all BRGeMM ukernel objects. Must be used after all the execution calls + /// to BRGeMM ukernel objects. 
+ static void release_hw_context() { + dnnl_status_t status = dnnl_brgemm_release_hw_context(); + if (status != dnnl_success) + error::wrap_c_api(status, "could not release hardware context"); + } + + /// Generates an executable part of BRGeMM ukernel object. + void generate() { + dnnl_status_t status = dnnl_brgemm_generate(get()); + if (status != dnnl_success) + error::wrap_c_api(status, "could not generate a kernel"); + } + + /// Executes a BRGeMM ukernel object. + /// + /// @param A Base pointer to a tensor A. + /// @param B Base pointer to a tensor B. + /// @param A_B_offsets Vector of pairs of tensors A and B offsets for + /// each batch. The number of batches must coincide with the + /// `batch_size` value passed at object construction stage. + /// @param C Pointer to a tensor C (accumulation buffer). + /// @param scratchpad Pointer to a scratchpad buffer. + void execute(const void *A, const void *B, + const std::vector> &A_B_offsets, + void *C, void *scratchpad) const { + // TODO: export batch_element to C API later for user to fill it and + // pass directly to the call. + dnnl_status_t status = dnnl_brgemm_execute(get(), A, B, + (const dnnl_dim_t *)A_B_offsets.data(), C, scratchpad); + if (status != dnnl_success) + error::wrap_c_api( + status, "could not execute a BRGeMM ukernel object"); + } + + /// Executes a BRGeMM ukernel object with post operations. + /// + /// @param A Base pointer to a tensor A. + /// @param B Base pointer to a tensor B. + /// @param A_B_offsets Vector of pairs of tensors A and B offsets for + /// each batch. The number of batches must coincide with the + /// `batch_size` value passed at object construction stage. + /// @param C Pointer to a tensor C (accumulation buffer). + /// @param D Pointer to a tensor D (output buffer). + /// @param scratchpad Pointer to a scratchpad buffer. + /// @param binary_po Binary post-op memory buffer. Must be passed If binary + /// post-op was specified at construction call. 
+ void execute(const void *A, const void *B, + const std::vector> &A_B_offsets, + void *C, void *D, void *scratchpad, + const void *binary_po = nullptr) const { + // TODO: export batch_element to C API later for user to fill it and + // pass directly to the call. + dnnl_status_t status = dnnl_brgemm_execute_postops(get(), A, B, + (const dnnl_dim_t *)A_B_offsets.data(), C, D, scratchpad, + binary_po); + if (status != dnnl_success) + error::wrap_c_api( + status, "could not execute a BRGeMM ukernel object"); + } +}; + +struct brgemm_pack_B : public handle { + /// Default constructor. Produces an empty object. + brgemm_pack_B() = default; + + /// Constructs a BRGeMM ukernel packing tensor B object. + /// + /// @param K Dimension K. + /// @param N Dimension N. + /// @param in_ld Input leading dimension. + /// @param out_ld Output leading dimension. Specifies a block by N dimension + /// during data packing. + /// @param in_dt Input data type. + /// @param out_dt Output data type. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + brgemm_pack_B(memory::dim K, memory::dim N, memory::dim in_ld, + memory::dim out_ld, memory::data_type in_dt, + memory::data_type out_dt, bool allow_empty = false) { + + dnnl_brgemm_pack_B_t brgemm_pack_B = nullptr; + dnnl_status_t status = dnnl_brgemm_pack_B_create(&brgemm_pack_B, K, N, + in_ld, out_ld, memory::convert_to_c(in_dt), + memory::convert_to_c(out_dt)); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a BRGeMM ukernel packing B object"); + reset(brgemm_pack_B); + } + + /// Returns the flag if packing is expected by BRGeMM ukernel kernel. 
+ bool need_pack() const { + int flag; + dnnl_status_t status = dnnl_brgemm_pack_B_need_pack(get(), &flag); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not query a need_pack flag from a BRGeMM ukernel " + "packing B object"); + return bool(flag); + } + + /// Generates an executable part of BRGeMM ukernel packing B object. + void generate() { + dnnl_status_t status = dnnl_brgemm_pack_B_generate(get()); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not generate a BRGeMM ukernel packing B object"); + } + + /// Executes a BRGeMM ukernel packing tensor B object. + /// + /// @param in Pointer to an input buffer. + /// @param out Pointer to an output buffer. + void execute(const void *in, void *out) const { + dnnl_status_t status = dnnl_brgemm_pack_B_execute(get(), in, out); + if (status != dnnl_success) + error::wrap_c_api(status, + "could not execute a BRGeMM ukernel packing B object"); + } +}; + +/// @} dnnl_api_ukernel_brgemm + +#endif + +} // namespace ukernel + +/// @} dnnl_api_ukernel + +} // namespace dnnl + +/// @} dnnl_api + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_HPP */ diff --git a/include/oneapi/dnnl/dnnl_ukernel_types.h b/include/oneapi/dnnl/dnnl_ukernel_types.h new file mode 100644 index 00000000000..ca593b3d6aa --- /dev/null +++ b/include/oneapi/dnnl/dnnl_ukernel_types.h @@ -0,0 +1,70 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C API types definitions + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H +#define ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "oneapi/dnnl/dnnl_types.h" + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_ukernel +/// @{ + +#ifdef DNNL_EXPERIMENTAL_UKERNEL +/// @addtogroup dnnl_api_ukernel_brgemm +/// @{ + +/// @struct dnnl_brgemm +/// An opaque structure to describe a brgemm ukernel. +struct dnnl_brgemm; + +/// A brgemm ukernel handle. +typedef struct dnnl_brgemm *dnnl_brgemm_t; + +/// A constant brgemm ukernel handle. +typedef const struct dnnl_brgemm *const_dnnl_brgemm_t; + +/// @struct dnnl_brgemm_pack_B +/// An opaque structure to describe a brgemm ukernel packing B routine. +struct dnnl_brgemm_pack_B; + +/// A brgemm ukernel packing B routine handle. +typedef struct dnnl_brgemm_pack_B *dnnl_brgemm_pack_B_t; + +/// A constant brgemm ukernel packing B routine handle. 
+typedef const struct dnnl_brgemm_pack_B *const_dnnl_brgemm_pack_B_t; + +/// @} dnnl_api_ukernel_brgemm +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0b74e33d563..b060468f0ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -79,6 +79,13 @@ if(DNNL_EXPERIMENTAL_SPARSE) message(STATUS "Experimental functionality for sparse domain is enabled") endif() +if(DNNL_EXPERIMENTAL_UKERNEL) + if(DNNL_TARGET_ARCH STREQUAL "ARCH_GENERIC") + message(FATAL_ERROR "ukernel API does not support generic architecture.") + endif() + message(STATUS "Experimental functionality for ukernels is enabled") +endif() + if(DNNL_EXPERIMENTAL_PROFILING) message(STATUS "Experimental profiling capabilities are enabled") endif() diff --git a/src/cpu/x64/brgemm/capi/brgemm_api.cpp b/src/cpu/x64/brgemm/capi/brgemm_api.cpp new file mode 100644 index 00000000000..fa6af04f6a7 --- /dev/null +++ b/src/cpu/x64/brgemm/capi/brgemm_api.cpp @@ -0,0 +1,328 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_ukernel.h" + +#include "common/c_types_map.hpp" +#include "common/memory_desc_wrapper.hpp" +#include "common/verbose.hpp" + +#include "cpu/x64/amx_tile_configure.hpp" + +#include "cpu/x64/brgemm/brgemm.hpp" + +#include "cpu/x64/brgemm/capi/brgemm_api.hpp" + +using namespace dnnl::impl; +using namespace dnnl::impl::format_tag; +using namespace dnnl::impl::status; +using namespace dnnl::impl::cpu::x64; + +using brgemm_t = dnnl_brgemm; +using brgemm_pack_B_t = dnnl_brgemm_pack_B; + +#define VCHECK_BRGEMM(cond, msg, ...) \ + VCONDCHECK(primitive, create, check, brgemm, (cond), \ + status::invalid_arguments, msg, ##__VA_ARGS__) + +#define VCHECK_BRGEMM_STATUS(status, cond, msg, ...) \ + VCONDCHECK(primitive, create, check, brgemm, (cond), (status), msg, \ + ##__VA_ARGS__) + +status_t dnnl_brgemm_create(brgemm_t **brgemm, dim_t M, dim_t N, dim_t K, + dim_t batch_size, dim_t lda, dim_t ldb, dim_t ldc, dim_t ldd, + data_type_t a_dt, data_type_t b_dt, data_type_t c_dt, data_type_t d_dt, + float alpha, float beta, const primitive_attr_t *attr) { + if (brgemm == nullptr) return invalid_arguments; + + auto _brgemm = new brgemm_t(); + auto &brgemm_desc = _brgemm->brgemm_desc_; + + brgemm_batch_kind_t batch_kind = brgemm_batch_kind_t::brgemm_offs; + + auto status = brgemm_desc_init(&brgemm_desc, cpu_isa_t::isa_undef, + batch_kind, a_dt, b_dt, /* transA = */ false, /* trans_B = */ false, + brgemm_row_major, alpha, beta, lda, ldb, ldc, M, N, K, + /* strides = */ nullptr); + if (status != status::success) { + delete _brgemm; + VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_init failed"); + } + + memory_desc_t D_md; + dims_t dims {M, N}; + dims_t strides {ldc, 1}; + status = memory_desc_init_by_strides( + D_md, /* ndims = */ 2, dims, d_dt, strides); + if (status != status::success) { + delete _brgemm; + VCHECK_BRGEMM_STATUS(status, false, "D_md creation failed"); + } + + 
status = brgemm_desc_set_postops( + &brgemm_desc, attr, &D_md, ldd, data_type::undef); + if (status != status::success) { + delete _brgemm; + VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_set_postops failed"); + } + + VCHECK_BRGEMM(batch_size > 0, "batch size is non-positive"); + brgemm_attr_t brgemm_attr; + brgemm_attr.max_bs = batch_size; + if (mayiuse(avx512_core_amx)) { + brgemm_attr.use_uker = true; + brgemm_attr.use_interleave_stores = true; + brgemm_attr.hint_prefetching = brgemm_kernel_prefetching_t::brgemm_prf0; + } + + status = brgemm_desc_set_attr(&brgemm_desc, brgemm_attr); + if (status != status::success) { + delete _brgemm; + VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_set_attr failed"); + } + + // Note: API can't take a compensation buffer externally. Users must add + // compensation on their own as a binary post-op. + brgemm_desc.req_s8s8_compensation = false; + + *brgemm = _brgemm; + return status::success; +} + +status_t dnnl_brgemm_get_scratchpad_size(const brgemm_t *brgemm, size_t *size) { + if (utils::any_null(brgemm, size)) return invalid_arguments; + + *size = brgemm->brgemm_desc_.get_wsp_buffer_size(); + return status::success; +} + +status_t dnnl_brgemm_set_hw_context(const brgemm_t *brgemm) { + if (brgemm == nullptr) return invalid_arguments; + + const auto &brgemm_desc = brgemm->brgemm_desc_; + char palette[AMX_PALETTE_SIZE] = {}; + auto status = brgemm_init_tiles(brgemm_desc, palette); + if (status == status::success) { + status = amx_tile_lazy_configure(palette); + VCHECK_BRGEMM_STATUS( + status, status == status::success, "amx_tile_configure failed"); + } + + return status::success; +} + +status_t dnnl_brgemm_release_hw_context() { + if (mayiuse(avx512_core_amx)) { + VCHECK_BRGEMM(amx_tile_release() == status::success, + "amx_tile_release failed"); + } + + return status::success; +} + +status_t dnnl_brgemm_generate(brgemm_t *brgemm) { + if (brgemm == nullptr) return invalid_arguments; + + // Re-generation won't take any effect. 
+ if (brgemm->brgemm_kernel_ != nullptr) return status::success; + + auto &brgemm_desc = brgemm->brgemm_desc_; + auto status = brgemm_kernel_create(&brgemm->brgemm_kernel_, brgemm_desc); + VCHECK_BRGEMM_STATUS( + status, status == status::success, "brgemm_kernel_create failed"); + + return status::success; +} + +status_t dnnl_brgemm_execute(const brgemm_t *brgemm, const void *A_ptr, + const void *B_ptr, const dim_t *A_B_offsets, void *C_ptr, + void *scratchpad_ptr) { + const auto &brgemm_desc = brgemm->brgemm_desc_; + const auto &brgemm_kernel = brgemm->brgemm_kernel_; + + const auto batch_size = brgemm_desc.brgattr.max_bs; + + std::vector v_batch_element(batch_size); + for (int i = 0; i < batch_size; i++) { + v_batch_element[i].offset.A = A_B_offsets[2 * i]; + v_batch_element[i].offset.B = A_B_offsets[2 * i + 1]; + } + + brgemm_kernel_execute(brgemm_kernel, batch_size, A_ptr, B_ptr, + v_batch_element.data(), C_ptr, scratchpad_ptr, + /* dynamic_values = */ nullptr); + return status::success; +} + +status_t dnnl_brgemm_execute_postops(const brgemm_t *brgemm, const void *A_ptr, + const void *B_ptr, const dim_t *A_B_offsets, const void *C_ptr, + void *D_ptr, void *scratchpad_ptr, const void *binary_po_ptr) { + const auto &brgemm_desc = brgemm->brgemm_desc_; + const auto &brgemm_kernel = brgemm->brgemm_kernel_; + + const auto batch_size = brgemm_desc.brgattr.max_bs; + + std::vector v_batch_element(batch_size); + for (int i = 0; i < batch_size; i++) { + v_batch_element[i].offset.A = A_B_offsets[2 * i]; + v_batch_element[i].offset.B = A_B_offsets[2 * i + 1]; + } + + brgemm_post_ops_data_t post_ops_data; + post_ops_data.data_C_ptr_ = reinterpret_cast(C_ptr); + // This member expect a pointer to a vector of pointers to binary_po args. 
+ post_ops_data.binary_post_ops_rhs = &binary_po_ptr; + + bool use_D_as_C = false; + if (brgemm_desc.dt_c == brgemm_desc.dt_d + && brgemm_desc.attr()->has_default_values( + primitive_attr_t::skip_mask_t::fpmath_mode)) + use_D_as_C = true; + + if (use_D_as_C) C_ptr = D_ptr; + + brgemm_kernel_execute_postops(brgemm_kernel, batch_size, A_ptr, B_ptr, + v_batch_element.data(), const_cast(C_ptr), D_ptr, + post_ops_data, scratchpad_ptr, + /* dynamic_values = */ nullptr); + return status::success; +} + +status_t dnnl_brgemm_destroy(brgemm_t *brgemm) { + brgemm_kernel_destroy(brgemm->brgemm_kernel_); + delete brgemm; + + return status::success; +} + +dnnl_brgemm_pack_B::dnnl_brgemm_pack_B(dim_t K, dim_t N, dim_t in_ld, + dim_t out_ld, data_type_t in_dt, data_type_t out_dt) { + // So far, only `ab` input format (dense or strided) is supported. + assert(in_ld >= N); + // Only special N_blk sizes are supported by matmul copy routines. Rest + // will crash. + assert(utils::one_of(out_ld, 16, 32, 48, 64)); + + auto status = matmul::init_conf( + bmc_, /* batch = */ 1, K, N, out_ld, in_dt, out_dt, format_tag::ab); + assert(status == status::success); + if (status != status::success) return; +} + +bool brgemm_pack_B_t::need_pack() const { + // TODO: move on unified method from the library. + return bmc_.orig_wei_dt != data_type::f32 + && bmc_.orig_wei_dt != data_type::f16; +} + +void brgemm_pack_B_t::generate() { + // Re-generation won't take any effect. 
+ if (kernel_ != nullptr) return; + + auto status = matmul::create_brgemm_matmul_copy_b(kernel_, &bmc_); + assert(status == status::success); + if (status != status::success) return; +} + +void brgemm_pack_B_t::execute(const void *src, void *dst) const { + const uint8_t *src_ptr = reinterpret_cast(src); + uint8_t *dst_ptr = reinterpret_cast(dst); + + const auto &kernel_conf = bmc_; + const dim_t n_blks = utils::div_up(kernel_conf.N, kernel_conf.N_blk); + const dim_t k_blks = utils::div_up(kernel_conf.K, kernel_conf.K_blk); + const auto blk_size = kernel_conf.K_blk * kernel_conf.N_blk; + + const auto i_dt_sz = kernel_conf.b_dt_sz; + const auto o_dt_sz = kernel_conf.a_dt_sz; + + for (dim_t n_blk_idx = 0; n_blk_idx < n_blks; n_blk_idx++) { + const auto n = n_blk_idx * kernel_conf.N_blk; + const bool is_N_tail = (kernel_conf.N - n) < kernel_conf.N_blk; + auto ker_exec_ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); + ker_exec_ctx.current_N_blk + = is_N_tail ? kernel_conf.N_tail : kernel_conf.N_blk; + + int k_blk_idx = 0; + for (; k_blk_idx < kernel_conf.K / kernel_conf.K_blk; k_blk_idx++) { + const auto k = k_blk_idx * kernel_conf.K_blk; + assert(kernel_conf.wei_tag == format_tag::ab); + // Since only `ab` is supported so far, hard code the stride. + const auto src_offset = i_dt_sz * (k * kernel_conf.N + n); + const auto dst_offset + = o_dt_sz * (k_blk_idx * blk_size + n_blk_idx * k_blks); + ker_exec_ctx.src = &src_ptr[src_offset]; + ker_exec_ctx.tr_src = &dst_ptr[dst_offset]; + ker_exec_ctx.current_K_start = k; + ker_exec_ctx.current_K_iters = kernel_conf.K_blk; + (*kernel_)(&ker_exec_ctx); + } + if (kernel_conf.K_tail > 0) { + const auto k = k_blk_idx * kernel_conf.K_blk; + assert(kernel_conf.wei_tag == format_tag::ab); + // Since only `ab` is supported so far, hard code the stride. 
+ const auto src_offset = i_dt_sz * (k * kernel_conf.N + n); + const auto dst_offset + = o_dt_sz * (k_blk_idx * blk_size + n_blk_idx * k_blks); + ker_exec_ctx.src = &src_ptr[src_offset]; + ker_exec_ctx.tr_src = &dst_ptr[dst_offset]; + ker_exec_ctx.current_K_start = k; + ker_exec_ctx.current_K_iters = kernel_conf.K_tail; + (*kernel_)(&ker_exec_ctx); + } + } +} + +status_t dnnl_brgemm_pack_B_create(brgemm_pack_B_t **brgemm_pack_B, dim_t K, + dim_t N, dim_t in_ld, dim_t out_ld, data_type_t in_dt, + data_type_t out_dt) { + if (brgemm_pack_B == nullptr) return status::invalid_arguments; + + *brgemm_pack_B = new brgemm_pack_B_t(K, N, in_ld, out_ld, in_dt, out_dt); + return status::success; +} + +status_t dnnl_brgemm_pack_B_need_pack( + const brgemm_pack_B_t *brgemm_pack_B, int *need_pack) { + if (utils::any_null(brgemm_pack_B, need_pack)) + return status::invalid_arguments; + + *need_pack = brgemm_pack_B->need_pack(); + return status::success; +} + +status_t dnnl_brgemm_pack_B_generate(brgemm_pack_B_t *brgemm_pack_B) { + if (brgemm_pack_B == nullptr) return status::invalid_arguments; + + brgemm_pack_B->generate(); + return status::success; +} + +status_t dnnl_brgemm_pack_B_execute(const brgemm_pack_B_t *brgemm_pack_B, + const void *in_ptr, void *out_ptr) { + if (utils::any_null(brgemm_pack_B, in_ptr, out_ptr)) + return status::invalid_arguments; + + brgemm_pack_B->execute(in_ptr, out_ptr); + return status::success; +} + +status_t dnnl_brgemm_pack_B_destroy(brgemm_pack_B_t *brgemm_pack_B) { + delete brgemm_pack_B; + return status::success; +} + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/x64/brgemm/capi/brgemm_api.hpp b/src/cpu/x64/brgemm/capi/brgemm_api.hpp new file mode 100644 index 00000000000..40b0f471597 --- /dev/null +++ b/src/cpu/x64/brgemm/capi/brgemm_api.hpp @@ -0,0 +1,63 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 
2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_X64_BRGEMM_CAPI_BRGEMM_API_HPP +#define CPU_X64_BRGEMM_CAPI_BRGEMM_API_HPP + +#include + +#include "cpu/x64/matmul/brgemm_matmul_copy_utils.hpp" +#include "cpu/x64/matmul/brgemm_matmul_utils.hpp" + +#include "cpu/x64/brgemm/brgemm_types.hpp" + +struct dnnl_brgemm : public dnnl::impl::c_compatible { + dnnl_brgemm() = default; + + // Just members here because brgemm API is C-based and should remain until + // this new API becomes a production one. + // Once it becomes one, the internal C API can be re-factored. + dnnl::impl::cpu::x64::brgemm_desc_t brgemm_desc_; + dnnl::impl::cpu::x64::brgemm_kernel_t *brgemm_kernel_; +}; + +struct dnnl_brgemm_pack_B : public dnnl::impl::c_compatible { + dnnl_brgemm_pack_B() = default; + + // Ctor that follows a call to initialize matmul conf struct. + dnnl_brgemm_pack_B(dnnl::impl::dim_t K, dnnl::impl::dim_t N, + dnnl::impl::dim_t in_ld, dnnl::impl::dim_t out_ld, + dnnl::impl::data_type_t in_type, dnnl::impl::data_type_t out_type); + + // Returns a flag indicating whether packing for VNNI is needed. + // Note: not completely aligned with primitives logic. + bool need_pack() const; + + // Generates a copy_b kernel. + void generate(); + + // Executes a copy_b kernel. + void execute(const void *src, void *dst) const; + + dnnl::impl::cpu::x64::matmul::brgemm_matmul_conf_t bmc_; + // unique_ptr is required by API that generates a kernel. 
+ std::unique_ptr + kernel_; +}; + +#endif + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s From ff2f84d23c3dcd64bd8a19e49162a7c87e40d46e Mon Sep 17 00:00:00 2001 From: Mourad Gouicem Date: Fri, 26 Apr 2024 14:58:09 +0200 Subject: [PATCH 071/187] doc: add ukernel dev_guide --- doc/advanced/experimental.md | 15 ++- doc/build/link.md | 2 + doc/programming_model/basic_concepts.md | 8 ++ doc/ukernel/operations/brgemm.md | 91 +++++++++++++++++++ doc/ukernel/operations/transform.md | 42 +++++++++ .../ukernel_basic_concepts.md | 48 ++++++++++ examples/ukernels/brgemm.cpp | 5 + 7 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 doc/ukernel/operations/brgemm.md create mode 100644 doc/ukernel/operations/transform.md create mode 100644 doc/ukernel/programming_model/ukernel_basic_concepts.md diff --git a/doc/advanced/experimental.md b/doc/advanced/experimental.md index 03bb4e720b3..e7383641779 100644 --- a/doc/advanced/experimental.md +++ b/doc/advanced/experimental.md @@ -27,9 +27,11 @@ Both kinds of experimental features can be enabled simultaneously. | Build time option | Description | |:-------------------------------------------|:-------------------------------------------------------------------| | ONEDNN_EXPERIMENTAL_SPARSE | Enable experimental API and functionality for sparse domain. | -| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API. +| ONEDNN_EXPERIMENTAL_UKERNEL | Enable experimental microkernel APIs and functionalities. | +| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API. | | ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND | Enable experimental graph compiler backend of the graph component. | + ## Features details ### ONEDNN_EXPERIMENTAL_SPARSE @@ -192,6 +194,17 @@ destination tensor should also work for the sparse one. 
Multiplication and Reorder primitives * Sparse memory can be created only for a CPU engine +### ONEDNN_EXPERIMENTAL_UKERNEL + +This option enables a new set of CPU-only APIs to support block-level +functionalities. By composing these low-level, sequential operations, users can +implement their own custom operations/fusions, and tailor blocking/threading +logic to their applications. + +More details on this API are available in the [Microkernel APIs +section](@ref dev_guide_ukernel_basic_concepts). + + ### ONEDNN_EXPERIMENTAL_PROFILING This option enables profiling API that can be used to query different profiling data. diff --git a/doc/build/link.md b/doc/build/link.md index 17cd73b806d..aa2f80c50dd 100644 --- a/doc/build/link.md +++ b/doc/build/link.md @@ -17,6 +17,8 @@ on how oneDNN was built. | ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API | | ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API | | ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API | +| ``include/oneapi/dnnl/dnnl_ukernel.h`` | C header with ukernel API | +| ``include/oneapi/dnnl/dnnl_ukernel.hpp`` | C++ header with ukernel API | ## Libraries diff --git a/doc/programming_model/basic_concepts.md b/doc/programming_model/basic_concepts.md index f34651aa703..84a719af3b0 100644 --- a/doc/programming_model/basic_concepts.md +++ b/doc/programming_model/basic_concepts.md @@ -135,3 +135,11 @@ you to make an operation fusion: The programming model for the graph extension is detailed in the [graph basic concepts section](@ref dev_guide_graph_basic_concepts). + +## Micro-kernel Extension + +The Micro-kernel API extension (ukernel API) is a low-level abstraction in +oneDNN that implements sequential, block-level operations. This abstraction +typically allows users to implement custom operations by composing those +block-level computations. 
Users of the ukernel API have full control of the +threading and blocking logic, so they can be tailored to their application. diff --git a/doc/ukernel/operations/brgemm.md b/doc/ukernel/operations/brgemm.md new file mode 100644 index 00000000000..70322633eb8 --- /dev/null +++ b/doc/ukernel/operations/brgemm.md @@ -0,0 +1,91 @@ +Batch-reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm} +======================================= + +> +> [API Reference](@ref dnnl_api_ukernel_brgemm) +> + + +## General + +The batch-reduce General Matrix Multiplication ukernel (BRGeMM) is an +operation that computes a batch of small matrix +multiplications and accumulates their results in the same destination. + +```math +C = \sum_i A_i \cdot B_i +``` + +with +- \f$A_i\f$ a set of matrices of dimension \f$M \times K\f$ +- \f$B_i\f$ a set of matrices of dimension \f$K \times N\f$ +- \f$C\f$ a matrix of dimension \f$M \times N\f$. + +The BRGeMM ukernel also supports accumulation with values already +present in \f$C\f$, as well as post-operation and down-conversion to +another \f$D\f$ matrix: + +```math +D = \operatorname{convert}( \operatorname{post\_ops}(C + \sum_i A_i \cdot B_i, \operatorname{post\_ops\_args})). +``` + +## Data Types + +In general, C represents an accumulation buffer. Hence, when +computations are carried out in floating-point arithmetic, C shall be of +type f32, and when computation is carried out in integer arithmetic, C +should be of type s32. + +The BRGeMM ukernel supports the following combinations of data-types. 
+ +| A | B | C | D | +|:-------|:-------|:----|:----------------------------| +| f32 | f32 | f32 | u8, s8, s32, f32, f16, bf16 | +| f16 | f16 | f32 | u8, s8, s32, f32, f16, bf16 | +| bf16 | bf16 | f32 | u8, s8, s32, f32, f16, bf16 | +| u8, s8 | u8, s8 | s32 | u8, s8, s32, f32, f16, bf16 | + +## Data Representation + +Because of hardware restrictions, the BRGeMM ukernel requires a specific +data layout. 
+ + The +@ref dnnl::ukernel::brgemm_pack_B::need_pack() method can be called to determine +if packing is necessary. If so, a +[packB ukernel](@ref dev_guide_ukernel_transform) shall be created to do the +actual packing. + + + +## Attributes + +The following ukernel attributes can be set through dedicated setters. + +| Type | Operation | Description | Restrictions | +|:----------|:-----------------------------------------------------------|:----------------------------------------------------------|:------------------------------------| +| Attribute | [Scales](@ref dnnl::primitive_attr::set_scales_mask) | Scales the corresponding tensors by given scale factor(s) | | +| Post-op | [Eltwise](@ref dnnl::post_ops::append_eltwise) | Applies an @ref dnnl_api_eltwise operation to the result | | +| Post-op | [Binary](@ref dnnl::post_ops::append_binary) | Applies a @ref dnnl_api_binary operation to the result | General binary post-op restrictions | + + +@note If zero-points are passed for A/B, fpmath_mode should be set for +the computation to happen in floating-point format (so up-conversion +to floating-point format would happen before computation). If +computation in integer format is needed, the BRGeMM ukernel should be +configured without zero-points, and the user should prepare a +compensation term that will be passed to the binary post-op. + +## Implementation limitations + +BRGeMM ukernel has no known limitations. 
+ +## Examples + +[BRGeMM ukernel example](@ref brgemm_example_cpp) + +@copydetails brgemm_example_cpp_short diff --git a/doc/ukernel/operations/transform.md b/doc/ukernel/operations/transform.md new file mode 100644 index 00000000000..49d9f09f26d --- /dev/null +++ b/doc/ukernel/operations/transform.md @@ -0,0 +1,42 @@ +Data transformation {#dev_guide_ukernel_transform} +======================================= + +> +> [API Reference](@ref dnnl_api_ukernel_brgemm) +> + +## General + +The packB ukernel packs BRGeMM B matrices into an optimal layout +before executing the [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm). +This is an out-of-place operation. + +## Data Types + +The packB ukernel does not allow data type conversion. + +## Data Representation + +| src | dst | +|:-----|:-----| +| f32 | f32 | +| f16 | f16 | +| bf16 | bf16 | +| s8 | s8 | +| u8 | u8 | + +## Attributes + +No attributes are supported for the packB ukernel. + +## Implementation limitations + +- Source leading dimension should be greater than or equal to N to return the correct + result. +- Destination leading dimension should be one of 16, 32, 48, or 64. + +## Examples + +[BRGeMM ukernel example](@ref brgemm_example_cpp) + +@copydetails brgemm_example_cpp_short diff --git a/doc/ukernel/programming_model/ukernel_basic_concepts.md b/doc/ukernel/programming_model/ukernel_basic_concepts.md new file mode 100644 index 00000000000..2a5a79b6599 --- /dev/null +++ b/doc/ukernel/programming_model/ukernel_basic_concepts.md @@ -0,0 +1,48 @@ +Basic Concepts {#dev_guide_ukernel_basic_concepts} +================================================== + +## Introduction + +The oneDNN micro-kernel API (also denoted ukernel API) is a low-level, +sequential abstraction for CPU only. This API allows maximum flexibility and +composability with user-provided code. In particular, the user keeps full +control of: +- threading logic, as this API is sequential and independent of any threading + runtime. 
+- blocking logic, as the user can configure ukernel object sizes to fit in + local caches. +- customization, as the user can interleave their custom code with ukernel code within + a parallel region. + +The API is designed to be as simple as possible, with a small number of +abstractions, to have minimal potential overhead. + +## Memory representation + +In the oneDNN ukernel API, there is no dedicated abstraction for memory objects. +Users must describe memory properties for each ukernel operation through: +- a pointer containing the address of the start of a buffer. +- a set of dimensions (1 dimension for vectors, 2 dimensions for matrices). +- a set of strides, which for 2D matrices is the number of elements between two + consecutive rows. + +Some operations might require data in a given layout on some hardware +architectures to benefit from hardware acceleration (e.g., interleaved +rows/columns with a given granularity). This is exposed through a dedicated enum +value. + +## Operation representation + +For all ukernel operations, there are 5 fundamental steps: +- create a dedicated ukernel object. This step uses only fundamental parameters + that define the operation (e.g., memory input/output shapes, datatypes, ...). +- configure the ukernel object. This step is to guide code generation by setting + attributes. Once configured, the user must finalize the object. +- query the ukernel object. This step freezes the configuration of the ukernel + object. At this point, the user can query various information (e.g., if the + ukernel code will require inputs/outputs in a specific format). +- generate binary code. This will effectively generate the code that will be + executed. This operation is time-consuming, so it is advised to hoist it out + of the main computation loop as much as possible. This must happen only once + for each ukernel object. +- execute the generated code. 
diff --git a/examples/ukernels/brgemm.cpp b/examples/ukernels/brgemm.cpp index 4ceffd94d01..d0486e63f4f 100644 --- a/examples/ukernels/brgemm.cpp +++ b/examples/ukernels/brgemm.cpp @@ -17,9 +17,14 @@ /// @example brgemm.cpp /// > Annotated version: @ref brgemm_example_cpp /// +/// @page brgemm_example_cpp_short +/// /// This C++ API example demonstrates how to create and execute a BRGeMM /// ukernel. /// +/// @page brgemm_example_cpp Example of using BRGeMM ukernel to implement Matmul +/// @copydetails brgemm_example_cpp_short +/// /// @include brgemm.cpp #include From b66042fde0bf73ce6283e43a5be837343ef4b185 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 13 May 2024 03:50:07 -0700 Subject: [PATCH 072/187] gpu: jit: gemm: fixup: use scanline ordering in more thin-m/n cases --- src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index 4db55c87876..8c94532d98f 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -123,6 +123,11 @@ status_t gen_gemm_kernel_desc_t::finalize(const char *tags) { bool use_linear = (m_tiles * n_tiles <= tiles_gpu); bool use_linear_m = (m_tiles * m_tiles <= 2 * tiles_gpu); bool use_linear_n = (n_tiles * n_tiles <= 2 * tiles_gpu); + + if (strategy_.fused) + if (strategy_.wg[LoopM] % 2 || strategy_.wg[LoopN] % 2) + use_linear_m = use_linear_n = false; /* cannot swap */ + if (use_linear) { if (strategy_.kParallelVariable) strategy_.cWalkOrder = WalkOrder::SimpleLinear; From cb4cb4e765f795aad55673c9079a275133a08722 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 13 May 2024 01:39:58 -0700 Subject: [PATCH 073/187] cpu: move template argument for use_inversion/is_deconv into conv desc --- src/common/convolution.cpp | 1 + src/common/deconvolution.cpp | 1 + src/common/opdesc.hpp | 4 +- src/common/primitive_hashing.cpp | 2 + 
src/common/serialization.cpp | 2 + src/common/type_helpers.hpp | 3 +- src/cpu/cpu_convolution_list.cpp | 182 ++++++-------------- src/cpu/x64/jit_brgemm_conv.cpp | 145 +++++++--------- src/cpu/x64/jit_brgemm_conv.hpp | 6 +- src/cpu/x64/jit_brgemm_conv_bwd.cpp | 7 +- src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp | 122 ++++++------- src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp | 2 +- src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp | 34 ++-- src/cpu/x64/jit_brgemm_conv_bwd_utils.hpp | 5 +- src/cpu/x64/jit_brgemm_conv_utils.cpp | 8 +- src/cpu/x64/jit_brgemm_conv_utils.hpp | 6 +- src/cpu/x64/jit_brgemm_deconv.cpp | 17 +- 17 files changed, 225 insertions(+), 322 deletions(-) diff --git a/src/common/convolution.cpp b/src/common/convolution.cpp index 3aad49cca1a..722a58ab6a7 100644 --- a/src/common/convolution.cpp +++ b/src/common/convolution.cpp @@ -58,6 +58,7 @@ status_t conv_desc_init(convolution_desc_t *conv_desc, prop_kind_t prop_kind, cd.primitive_kind = primitive_kind::convolution; cd.prop_kind = prop_kind; cd.alg_kind = alg_kind; + cd.use_inversion = false; // Not to be specified by user, only for internal. cd.diff_src_desc = cd.src_desc = zero_md(); cd.diff_dst_desc = cd.dst_desc = zero_md(); diff --git a/src/common/deconvolution.cpp b/src/common/deconvolution.cpp index 97a6eef89b5..00f3f89d037 100644 --- a/src/common/deconvolution.cpp +++ b/src/common/deconvolution.cpp @@ -57,6 +57,7 @@ status_t deconv_desc_init(deconvolution_desc_t *deconv_desc, dd.primitive_kind = primitive_kind::deconvolution; dd.prop_kind = prop_kind; dd.alg_kind = alg_kind; + dd.use_inversion = false; // Must be always `false` for deconv. 
dd.diff_src_desc = dd.src_desc = zero_md(); dd.diff_dst_desc = dd.dst_desc = zero_md(); diff --git a/src/common/opdesc.hpp b/src/common/opdesc.hpp index a0029236f49..872fe9025a7 100644 --- a/src/common/opdesc.hpp +++ b/src/common/opdesc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,6 +139,8 @@ struct convolution_desc_t { dims_t padding[2]; // The accumulator data type. Initialized automatically. data_type_t accum_data_type; + // For internal use only. To mark conv is used for deconv. + bool use_inversion; }; // A descriptor of a deconvolution operation. diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index d28cb4ddd50..90de09081f5 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -382,6 +382,8 @@ size_t get_desc_hash(const convolution_desc_t &desc) { seed = get_array_hash(seed, desc.padding[1], DNNL_MAX_NDIMS); // Accumulator type seed = hash_combine(seed, static_cast(desc.accum_data_type)); + // Internal member + seed = hash_combine(seed, static_cast(desc.use_inversion)); // Combined hash for (de-)convolution desc return seed; } diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index 23d22b0ecdb..bee9d54f205 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -306,6 +306,8 @@ void serialize_desc( sstream.write(desc.padding[1], DNNL_MAX_NDIMS); // Accumulator type sstream.write(&desc.accum_data_type); + // Internal member + sstream.write(&desc.use_inversion); } // Eltwise diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index a950383501f..bb56e1fe3a5 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -518,7 +518,8 @@ 
inline bool operator==( && COMPARE_DESC_ARRAY_MEMBERS(dilates, DNNL_MAX_NDIMS) && COMPARE_DESC_ARRAY_MEMBERS(padding[0], DNNL_MAX_NDIMS) && COMPARE_DESC_ARRAY_MEMBERS(padding[1], DNNL_MAX_NDIMS) - && COMPARE_DESC_MEMBERS(accum_data_type); + && COMPARE_DESC_MEMBERS(accum_data_type) + && COMPARE_DESC_MEMBERS(use_inversion); return ret; } diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index bf06adf2b06..5f44cbe2d6e 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -83,8 +83,6 @@ using namespace dnnl::impl::prop_kind; CPU_INSTANCE_AMX( \ brgemm_1x1_convolution_fwd_t) \ CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) \ - CPU_INSTANCE_AMX( \ - brgemm_convolution_fwd_t) \ CPU_INSTANCE(ref_convolution_fwd_t) nullptr, \ } \ } @@ -98,12 +96,8 @@ using namespace dnnl::impl::prop_kind; CPU_INSTANCE_AMX( \ brgemm_convolution_bwd_strided_t< \ avx10_1_512_amx_fp16>) \ - CPU_INSTANCE_AMX( \ - brgemm_convolution_bwd_strided_t< \ - avx10_1_512_amx_fp16, \ - true>) \ - CPU_INSTANCE( \ - ref_convolution_bwd_data_t) nullptr, \ + CPU_INSTANCE( \ + ref_convolution_bwd_data_t) nullptr, \ }) \ } @@ -116,17 +110,14 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_common_dw_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_common_1x1_convolution_fwd_f32_t) CPU_INSTANCE_AVX512(jit_avx512_common_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_avx2_dw_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_sse41_dw_convolution_fwd_t) 
CPU_INSTANCE_SSE41(jit_sse41_1x1_convolution_fwd_t) @@ -150,19 +141,16 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_fwd_t) CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE(ref_convolution_fwd_t) nullptr, }}, @@ -171,19 +159,16 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_fwd_t) CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE(ref_convolution_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, @@ -193,13 +178,10 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) 
CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE(ref_convolution_fwd_t) nullptr, }}, @@ -208,13 +190,10 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t) CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t) CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t) @@ -244,13 +223,11 @@ const std::map> &impl_list_map() CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX512(jit_avx512_common_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_common_1x1_convolution_bwd_data_f32_t) CPU_INSTANCE_AVX512(jit_avx512_common_convolution_bwd_data_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX2(jit_avx2_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_bwd_data_t) CPU_INSTANCE_SSE41(jit_sse41_dw_convolution_bwd_data_t) @@ -266,18 +243,15 @@ const std::map> &impl_list_map() 
CPU_INSTANCE_X64(ip_convolution_bwd_data_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_data_t) CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, @@ -285,18 +259,15 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_bwd_data_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_data_t) CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, @@ -304,14 +275,11 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_bwd_data_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - 
CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, @@ -319,14 +287,11 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_bwd_data_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, @@ -395,23 +360,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -428,15 +388,12 @@ const std::map> &impl_list_map() CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, @@ -446,23 +403,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -478,23 +430,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -511,23 +458,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) 
CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -544,23 +486,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -574,20 +511,16 @@ const std::map> &impl_list_map() CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, @@ -597,23 +530,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -628,23 +556,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -660,23 +583,18 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) @@ -689,100 +607,100 @@ const std::map> &impl_list_map() }}, // BWD int8 (diff_dst:u8) {{backward_data, f32, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, bf16, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + 
CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s32, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s8, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, u8, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + 
CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, // BWD int8 (diff_dst:s8) {{backward_data, f32, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, bf16, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s32, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + 
CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s8, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, u8, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, diff --git a/src/cpu/x64/jit_brgemm_conv.cpp b/src/cpu/x64/jit_brgemm_conv.cpp index f6d8598e752..0e2aa49ddac 100644 --- a/src/cpu/x64/jit_brgemm_conv.cpp +++ b/src/cpu/x64/jit_brgemm_conv.cpp @@ -42,8 +42,8 @@ using 
namespace jit_uni_brgemm_conv_comp_pad_kernel; #define ndims_pick(v5, v4, v3) \ ((ndims == 5) ? (v5) : (ndims == 4) ? (v4) : (ndims == 3) ? (v3) : 0) -template -int brgemm_convolution_fwd_t::pd_t::get_brg_idx(int m, +template +int brgemm_convolution_fwd_t::pd_t::get_brg_idx(int m, bool do_initialization, bool is_N_tail, bool is_K_tail, int kd_b, int kd_e, int kh_b, int kh_e) const { const auto brg_idx = jcp_.use_uker @@ -55,8 +55,8 @@ int brgemm_convolution_fwd_t::pd_t::get_brg_idx(int m, return brg_idx->second; } -template -int brgemm_convolution_fwd_t::pd_t::get_any_brg_idx( +template +int brgemm_convolution_fwd_t::pd_t::get_any_brg_idx( bool is_N_tail, bool is_K_tail) const { // return first defined brgemm_descriptor for specified parameters for (const auto &key_value_pair : brg_indices) { @@ -69,8 +69,8 @@ int brgemm_convolution_fwd_t::pd_t::get_any_brg_idx( return 0; } -template -void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, +template +void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, const char *src_base, const char *wei_base, int n_ic_blocks, int ic_block_s, int iid_b, int iih_b, int iiw_b, const dim_t *const __restrict kw_top_vpads, @@ -150,8 +150,8 @@ void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, } } -template -inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, +template +inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, const char *src_base, const char *wei_base, int ic_block_s, int iid_b, int iih_b, int iiw_b, int kd_b, int kh_b, const void *&ptrA, const void *&ptrB) const { @@ -179,10 +179,10 @@ inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, ptrB = wei_base_kh + wei_kw * wei_kw_offset; } -template -status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor( - int vM, bool is_N_tail, bool is_K_tail, bool do_init, int kd_b, - int kd_e, int kh_b, int kh_e) { +template +status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor(int vM, + bool is_N_tail, bool is_K_tail, bool do_init, int kd_b, 
int kd_e, + int kh_b, int kh_e) { if (do_init && is_K_tail && jcp_.K > 0) return status::success; @@ -334,8 +334,8 @@ status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor( return status::success; } -template -void brgemm_convolution_fwd_t::pd_t::get_kw_range( +template +void brgemm_convolution_fwd_t::pd_t::get_kw_range( int ow, int &kw_s, int &kw_full_s, int &kw_full_f, int &kw_f) const { // This function is used for exec_base only // TODO: calculate these values instead direct loop by kw @@ -362,8 +362,8 @@ void brgemm_convolution_fwd_t::pd_t::get_kw_range( if (kw_full_f == -1) kw_full_s = kw_full_f = kw_f; } -template -void brgemm_convolution_fwd_t::pd_t::get_ow_range( +template +void brgemm_convolution_fwd_t::pd_t::get_ow_range( int ow, int kw, int &ow_s, int &ow_f) const { // This function is used for exec_base only @@ -392,9 +392,8 @@ void brgemm_convolution_fwd_t::pd_t::get_ow_range( ow_f = nstl::min(nstl::max(ow_f, ow_s), ow + M); } -template -status_t brgemm_convolution_fwd_t::pd_t::init( - engine_t *engine) { +template +status_t brgemm_convolution_fwd_t::pd_t::init(engine_t *engine) { using namespace data_type; using namespace utils; brgemm_descriptors_ @@ -411,7 +410,7 @@ status_t brgemm_convolution_fwd_t::pd_t::init( // executing 'use_inversion == true' as FWD. This can only work if the // diff_src_desc and diff_dst_desc are defined in the aforementioned. 
const convolution_desc_t &cd = *desc(); - if (use_inversion + if (cd.use_inversion && one_of(true, types::is_zero_md(&cd.diff_src_desc), types::is_zero_md(&cd.diff_dst_desc))) return status::unimplemented; @@ -440,9 +439,8 @@ status_t brgemm_convolution_fwd_t::pd_t::init( VDISPATCH_CONV(zero_points_ok(), VERBOSE_UNSUPPORTED_ZP_CFG); VDISPATCH_CONV(arg_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); - CHECK(brgemm_convolution_utils::init_conf(jcp_, use_inversion, isa, *desc(), - src_md_, weights_md_, dst_md_, bias_md_, attr_, - dnnl_get_max_threads())); + CHECK(brgemm_convolution_utils::init_conf(jcp_, isa, *desc(), src_md_, + weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads())); // 1. The unrolled kernel can be used for exec_trans and exec_base and for // amx only. For exec_base it makes sense to use unrolled kernel only if @@ -732,14 +730,12 @@ status_t brgemm_convolution_fwd_t::pd_t::init( return status::success; } -template -brgemm_convolution_fwd_t::brgemm_convolution_fwd_t( - const pd_t *apd) +template +brgemm_convolution_fwd_t::brgemm_convolution_fwd_t(const pd_t *apd) : primitive_t(apd), bias_d(pd()->weights_md(1)) {} -template -status_t brgemm_convolution_fwd_t::add_brg_kernel( - int brg_idx) { +template +status_t brgemm_convolution_fwd_t::add_brg_kernel(int brg_idx) { const auto _pd = pd(); const auto &brgs = *(_pd->brgemm_descriptors_); @@ -752,8 +748,8 @@ status_t brgemm_convolution_fwd_t::add_brg_kernel( return status::success; } -template -status_t brgemm_convolution_fwd_t::add_po_kernel( +template +status_t brgemm_convolution_fwd_t::add_po_kernel( brgemm_desc_t *bcfg, int ker_idx, bool is_init) { if (!bcfg) return status::success; const auto _pd = pd(); @@ -770,8 +766,8 @@ status_t brgemm_convolution_fwd_t::add_po_kernel( return status::success; } -template -void brgemm_convolution_fwd_t::add_po_kernels( +template +void brgemm_convolution_fwd_t::add_po_kernels( int i_N, int init_bcast_dim, int po_bcast_dim) { const auto _pd = pd(); const auto 
&jcp = _pd->jcp_; @@ -806,9 +802,8 @@ void brgemm_convolution_fwd_t::add_po_kernels( } } -template -int brgemm_convolution_fwd_t::get_comp_oh( - const int oh) const { +template +int brgemm_convolution_fwd_t::get_comp_oh(const int oh) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -831,10 +826,10 @@ int brgemm_convolution_fwd_t::get_comp_oh( return comp_oh_e; } -template -int brgemm_convolution_fwd_t::get_comp_ker_idx( - const int kd_b, const int kd_e, const int kh_b, const int kh_e, - const int kw_b, const int kw_e, const int oh) const { +template +int brgemm_convolution_fwd_t::get_comp_ker_idx(const int kd_b, + const int kd_e, const int kh_b, const int kh_e, const int kw_b, + const int kw_e, const int oh) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -852,9 +847,9 @@ int brgemm_convolution_fwd_t::get_comp_ker_idx( return -1; } -template -inline int brgemm_convolution_fwd_t::get_comp_offset( - const int g, const int ocb, const int oh, const int ow, const int kd_b, +template +inline int brgemm_convolution_fwd_t::get_comp_offset(const int g, + const int ocb, const int oh, const int ow, const int kd_b, const int kd_e, const int kh_b, const int kh_e, const int kw_b, const int kw_e) const { const auto _pd = pd(); @@ -871,8 +866,8 @@ inline int brgemm_convolution_fwd_t::get_comp_offset( : (g * jcp.nb_oc + ocb) * jcp.oc_block; } -template -status_t brgemm_convolution_fwd_t::init(engine_t *engine) { +template +status_t brgemm_convolution_fwd_t::init(engine_t *engine) { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1248,8 +1243,8 @@ status_t brgemm_convolution_fwd_t::init(engine_t *engine) { return status::success; } -template -struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { +template +struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { brgemm_thread_ctx_t(brgemm_exec_ctx_t &brgemm_ctx_, int ithr_, brgemm_batch_element_t *__restrict brg_batch_, char *c_buffer_, char *wsp_tile_, const char *__restrict weights_) @@ -1282,9 
+1277,8 @@ struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { void *__restrict inp_buffer_zero {nullptr}; }; -template -status_t brgemm_convolution_fwd_t::execute( - const exec_ctx_t &ctx) const { +template +status_t brgemm_convolution_fwd_t::execute(const exec_ctx_t &ctx) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1485,8 +1479,8 @@ status_t brgemm_convolution_fwd_t::execute( return status::success; } -template -status_t brgemm_convolution_fwd_t::cal_compensation( +template +status_t brgemm_convolution_fwd_t::cal_compensation( const char *__restrict weights, int32_t *src_zp_buffer, int32_t *s8s8_comp_buffer) const { const auto _pd = pd(); @@ -1579,7 +1573,7 @@ status_t brgemm_convolution_fwd_t::cal_compensation( p.kw_l = kw_e - kw_b; p.ker_l = k_l; p.last_ocb = ocb == jcp.nb_oc - 1; - p.use_inversion = use_inversion; + p.use_inversion = _pd->desc()->use_inversion; p.ptr_in = &weights[wei_offs]; p.ptr_zp_out = jcp.src_zero_point ? &src_zp_buffer[buffer_offs] : nullptr; @@ -1594,8 +1588,8 @@ status_t brgemm_convolution_fwd_t::cal_compensation( return status::success; } -template -void brgemm_convolution_fwd_t::perform_outwork( +template +void brgemm_convolution_fwd_t::perform_outwork( const brgemm_thread_ctx_t &btc, char *dst_base, const char *bias_w, int ow, int g_oc, bool is_oc_tail, int ker_ow_s, int ker_ow_f, int kd_l, int kh_l, bool maybe_do_init, bool do_postwork, size_t comp_ker_offs, @@ -1677,8 +1671,8 @@ void brgemm_convolution_fwd_t::perform_outwork( } } -template -inline void brgemm_convolution_fwd_t::call_brgemm_kernel( +template +inline void brgemm_convolution_fwd_t::call_brgemm_kernel( const brgemm_thread_ctx_t &btc, const brgemm_kernel_t *brg_ker, int batch_size, char *ptr_C, char *ptr_D, const char *bias_w, int g_oc, bool do_postops, size_t comp_ker_offs, bool do_only_comp) const { @@ -1728,9 +1722,9 @@ inline void brgemm_convolution_fwd_t::call_brgemm_kernel( ptr_C, static_cast(btc.wsp_tile)); } -template -void 
brgemm_convolution_fwd_t::maybe_conv_weights( - const exec_ctx_t &ctx, const char *__restrict input_weights, +template +void brgemm_convolution_fwd_t::maybe_conv_weights(const exec_ctx_t &ctx, + const char *__restrict input_weights, const char *__restrict &wei) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1784,10 +1778,9 @@ void brgemm_convolution_fwd_t::maybe_conv_weights( wei = wei_buffer; } -template -void brgemm_convolution_fwd_t::maybe_conv_inp( - brgemm_thread_ctx_t &btc, const brgemm_thread_ctx_t &last_btc, - const char *__restrict src) const { +template +void brgemm_convolution_fwd_t::maybe_conv_inp(brgemm_thread_ctx_t &btc, + const brgemm_thread_ctx_t &last_btc, const char *__restrict src) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -2045,9 +2038,8 @@ void brgemm_convolution_fwd_t::maybe_conv_inp( char *ptr_D; \ int kd_b(0), kd_e(0), kh_b(0), kh_e(0), k_l(0), iiw_b(0); -template -void brgemm_convolution_fwd_t::ker_base( - brgemm_thread_ctx_t &btc) const { +template +void brgemm_convolution_fwd_t::ker_base(brgemm_thread_ctx_t &btc) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -2199,9 +2191,8 @@ void brgemm_convolution_fwd_t::ker_base( } } -template -void brgemm_convolution_fwd_t::ker_trans( - brgemm_thread_ctx_t &btc) const { +template +void brgemm_convolution_fwd_t::ker_trans(brgemm_thread_ctx_t &btc) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -2336,9 +2327,8 @@ void brgemm_convolution_fwd_t::ker_trans( } } -template -void brgemm_convolution_fwd_t::ker_vpad( - brgemm_thread_ctx_t &btc) const { +template +void brgemm_convolution_fwd_t::ker_vpad(brgemm_thread_ctx_t &btc) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -2438,23 +2428,14 @@ void brgemm_convolution_fwd_t::ker_vpad( #undef BRGEMM_CONV_KER_HEADER template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct 
brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; template struct brgemm_convolution_fwd_t; -template struct brgemm_convolution_fwd_t; } // namespace x64 diff --git a/src/cpu/x64/jit_brgemm_conv.hpp b/src/cpu/x64/jit_brgemm_conv.hpp index c75a7d9b6a3..2db203e2216 100644 --- a/src/cpu/x64/jit_brgemm_conv.hpp +++ b/src/cpu/x64/jit_brgemm_conv.hpp @@ -47,7 +47,7 @@ namespace impl { namespace cpu { namespace x64 { -template +template struct brgemm_convolution_fwd_t : public primitive_t { struct brgemm_thread_ctx_t; @@ -119,7 +119,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { int get_any_brg_idx(bool is_N_tail, bool is_K_tail) const; inline int maybe_invert(int k, int K) const { - return use_inversion ? K - 1 - k : k; + return desc()->use_inversion ? K - 1 - k : k; }; // This method calculates the value of k_l @@ -203,7 +203,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { } inline int maybe_invert_range(int k, int k_inv, int K) const { - return use_inversion ? K - k_inv : k; + return pd()->desc()->use_inversion ? 
K - k_inv : k; }; void ker_base(brgemm_thread_ctx_t &btc) const; diff --git a/src/cpu/x64/jit_brgemm_conv_bwd.cpp b/src/cpu/x64/jit_brgemm_conv_bwd.cpp index 8da2c8ba92b..1e563794bf4 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd.cpp @@ -93,6 +93,9 @@ status_t fwd_conv_desc_create( fwd_conv_d->diff_src_desc = fwd_conv_d->src_desc; fwd_conv_d->diff_dst_desc = fwd_conv_d->dst_desc; } + // Note: internal field to hint this conv is created from deconv. + fwd_conv_d->use_inversion = true; + return status::success; } } // namespace @@ -122,9 +125,7 @@ status_t brgemm_convolution_bwd_t::pd_t::init(engine_t *engine) { const auto pd_1x1 = dynamic_cast((*it).get()); if (pd_1x1 != nullptr) break; // 1x1 implementation found - constexpr bool use_inversion = true; // invert weights' spatial indices - using fwd_conv_pd_t = - typename brgemm_convolution_fwd_t::pd_t; + using fwd_conv_pd_t = typename brgemm_convolution_fwd_t::pd_t; const auto pd = dynamic_cast((*it).get()); if (pd != nullptr) break; // non-1x1 implementation found } diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp index e76321eb1f0..a4cb990712e 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp @@ -63,9 +63,8 @@ static bool impl_supports_datatype(data_type_t data_type) { } } -template -status_t brgemm_convolution_bwd_strided_t::pd_t::init( - engine_t *engine) { +template +status_t brgemm_convolution_bwd_strided_t::pd_t::init(engine_t *engine) { using namespace data_type; const auto diff_src_type = diff_src_md(0)->data_type; @@ -75,18 +74,19 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( // The following check will detect if this implementation is being // executed through a deconvolution call and prevent the primitive from - // executing 'is_deconv == true' as BWD_D. This can only work if the + // executing 'use_inversion == true' as BWD_D. 
This can only work if the // src_desc and dst_desc are defined in the aforementioned. const convolution_desc_t &cd = *desc(); - if (is_deconv + if (cd.use_inversion && one_of(true, types::is_zero_md(&cd.src_desc), types::is_zero_md(&cd.dst_desc))) return status::unimplemented; using skip_mask_t = primitive_attr_t::skip_mask_t; auto skip_mask = skip_mask_t::fpmath_mode; - if (is_deconv) skip_mask |= skip_mask_t::post_ops | skip_mask_t::sum_dt; - if (is_int8 && is_deconv) + if (cd.use_inversion) + skip_mask |= skip_mask_t::post_ops | skip_mask_t::sum_dt; + if (is_int8 && cd.use_inversion) skip_mask |= skip_mask_t::scales_runtime | skip_mask_t::zero_points_runtime; @@ -103,7 +103,7 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( && wei_type == s8 && is_int8 && IMPLICATION( with_bias(), one_of(bias_md_.data_type, f32, s32, s8, u8)) - && is_deconv /* only deconv uses int8 */; + && cd.use_inversion /* only deconv uses int8 */; const bool is_fp8_supported = one_of(wei_type, f8_e5m2, f8_e4m3) && one_of(diff_dst_type, f8_e5m2, f8_e4m3) @@ -125,7 +125,7 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); VDISPATCH_CONV(attr()->has_default_values(skip_mask, diff_src_type), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_CONV(IMPLICATION(is_deconv, + VDISPATCH_CONV(IMPLICATION(cd.use_inversion, attr()->post_ops_.check_sum_consistency( diff_src_type, is_int8_supported)), VERBOSE_UNSUPPORTED_POSTOP); @@ -134,7 +134,7 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( CHECK(brgemm_convolution_bwd_utils::init_conf(jcp_, isa, desc_, diff_dst_md_, weights_md_, diff_src_md_, bias_md_, attr_, - dnnl_get_max_threads(), is_deconv)); + dnnl_get_max_threads())); const auto adj_M = nstl::max(jcp_.M, jcp_.M_tail); @@ -241,10 +241,9 @@ status_t brgemm_convolution_bwd_strided_t::pd_t::init( return status::success; } -template -void brgemm_convolution_bwd_strided_t::get_kw_range(int iw, - int iw_raw, int &kw_s, int 
&kw_full_s, int &kw_full_f, - int &kw_f) const { +template +void brgemm_convolution_bwd_strided_t::get_kw_range(int iw, int iw_raw, + int &kw_s, int &kw_full_s, int &kw_full_f, int &kw_f) const { // This function is needed for exec_base only const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -253,8 +252,8 @@ void brgemm_convolution_bwd_strided_t::get_kw_range(int iw, jcp, iw, iw_raw, kw_s, kw_full_s, kw_full_f, kw_f); } -template -void brgemm_convolution_bwd_strided_t::get_iw_range( +template +void brgemm_convolution_bwd_strided_t::get_iw_range( int iw, int iw_raw, int kw, int &iw_s, int &M_without_overflow) const { // This function is needed for exec_base only const auto _pd = pd(); @@ -264,8 +263,8 @@ void brgemm_convolution_bwd_strided_t::get_iw_range( jcp, iw, iw_raw, kw, iw_s, M_without_overflow); } -template -status_t brgemm_convolution_bwd_strided_t::add_brg_kernel( +template +status_t brgemm_convolution_bwd_strided_t::add_brg_kernel( int bs, int M, int i_N, int i_K, int i_init) { if (M <= 0) return status::success; const auto _pd = pd(); @@ -285,8 +284,8 @@ status_t brgemm_convolution_bwd_strided_t::add_brg_kernel( return status::success; } -template -status_t brgemm_convolution_bwd_strided_t::add_po_kernel( +template +status_t brgemm_convolution_bwd_strided_t::add_po_kernel( brgemm_desc_t *bcfg, int ker_idx, bool is_init) { if (!bcfg) return status::success; const auto _pd = pd(); @@ -304,8 +303,8 @@ status_t brgemm_convolution_bwd_strided_t::add_po_kernel( return status::success; } -template -void brgemm_convolution_bwd_strided_t::add_po_kernels( +template +void brgemm_convolution_bwd_strided_t::add_po_kernels( int i_N, int init_bcast_dim, int po_bcast_dim) { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -341,10 +340,10 @@ void brgemm_convolution_bwd_strided_t::add_po_kernels( } } } -template -int brgemm_convolution_bwd_strided_t::get_comp_ker_idx( - const int kd_b, const int kd_e, const int kh_b, const int kh_e, - const int kw_b, const int 
kw_e) const { +template +int brgemm_convolution_bwd_strided_t::get_comp_ker_idx(const int kd_b, + const int kd_e, const int kh_b, const int kh_e, const int kw_b, + const int kw_e) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -362,11 +361,10 @@ int brgemm_convolution_bwd_strided_t::get_comp_ker_idx( return -1; } -template -int brgemm_convolution_bwd_strided_t::get_comp_offset( - const int g, const int icb, const int iw, const int kd_b, - const int kd_e, const int kh_b, const int kh_e, const int kw_b, - const int kw_e) const { +template +int brgemm_convolution_bwd_strided_t::get_comp_offset(const int g, + const int icb, const int iw, const int kd_b, const int kd_e, + const int kh_b, const int kh_e, const int kw_b, const int kw_e) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -382,8 +380,8 @@ int brgemm_convolution_bwd_strided_t::get_comp_offset( : (g * jcp.nb_ic + icb) * jcp.ic_block; } -template -void brgemm_convolution_bwd_strided_t::create_kernels() { +template +void brgemm_convolution_bwd_strided_t::create_kernels() { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -484,9 +482,8 @@ void brgemm_convolution_bwd_strided_t::create_kernels() { } } -template -status_t brgemm_convolution_bwd_strided_t::init( - engine_t *engine) { +template +status_t brgemm_convolution_bwd_strided_t::init(engine_t *engine) { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -644,8 +641,8 @@ status_t brgemm_convolution_bwd_strided_t::init( : (((f).ndims() == 4) ? 
(f).blk_off(n, c, h, w) \ : (f).blk_off(n, c, d, h, w))) -template -status_t brgemm_convolution_bwd_strided_t::execute( +template +status_t brgemm_convolution_bwd_strided_t::execute( const exec_ctx_t &ctx) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -877,8 +874,8 @@ status_t brgemm_convolution_bwd_strided_t::execute( #undef data_blk_off -template -void brgemm_convolution_bwd_strided_t::cal_compensation( +template +void brgemm_convolution_bwd_strided_t::cal_compensation( const char *__restrict weights, int32_t *src_zp_buffer, int32_t *s8s8_comp_buffer) const { const auto _pd = pd(); @@ -941,15 +938,15 @@ void brgemm_convolution_bwd_strided_t::cal_compensation( }); } -template -void brgemm_convolution_bwd_strided_t::perform_outwork( - char *dst_base, char *dst, char *c_buffer, const char *bias_w, int id, - int ih, int iw, int iw_raw, int g_ic, bool is_ic_tail, int ker_iw_s, - int ker_iw_f, int kd_l, int kh_l, - const void *post_ops_binary_rhs_arg_vec, const float *oscales, - int32_t src_zp_vals, int32_t *src_zp_ptr, int32_t *dst_zp_ptr, - int32_t *s8s8_compensation, size_t comp_ker_offs, bool maybe_do_init, - bool do_postwork, bool do_post_comp, const float *dst_scales) const { +template +void brgemm_convolution_bwd_strided_t::perform_outwork(char *dst_base, + char *dst, char *c_buffer, const char *bias_w, int id, int ih, int iw, + int iw_raw, int g_ic, bool is_ic_tail, int ker_iw_s, int ker_iw_f, + int kd_l, int kh_l, const void *post_ops_binary_rhs_arg_vec, + const float *oscales, int32_t src_zp_vals, int32_t *src_zp_ptr, + int32_t *dst_zp_ptr, int32_t *s8s8_compensation, size_t comp_ker_offs, + bool maybe_do_init, bool do_postwork, bool do_post_comp, + const float *dst_scales) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1028,8 +1025,8 @@ void brgemm_convolution_bwd_strided_t::perform_outwork( } } -template -void brgemm_convolution_bwd_strided_t::call_brgemm_kernel( +template +void 
brgemm_convolution_bwd_strided_t::call_brgemm_kernel( brgemm_bwd_thread_ctx_t &btc, int brg_idx, int batch_size, char *ptr_C, char *ptr_D, const char *bias_w, int g_ic, bool do_postops, const void *binary_post_ops_rhs, int32_t src_zp_vals, @@ -1074,8 +1071,8 @@ void brgemm_convolution_bwd_strided_t::call_brgemm_kernel( static_cast(btc.wsp_tile)); } -template -void brgemm_convolution_bwd_strided_t::maybe_trans_inp(int ithr, +template +void brgemm_convolution_bwd_strided_t::maybe_trans_inp(int ithr, const char *__restrict src, char *__restrict inp_buffer, uint8_t *__restrict inp_buffer_mask, int g, int n, int occ, int idb, int ihb, int iwb, int last_g, int last_n, int last_occ, int last_idb, @@ -1153,8 +1150,8 @@ void brgemm_convolution_bwd_strided_t::maybe_trans_inp(int ithr, } } -template -void brgemm_convolution_bwd_strided_t::ker_base( +template +void brgemm_convolution_bwd_strided_t::ker_base( brgemm_bwd_thread_ctx_t &btc) const { const auto _pd = pd(); @@ -1404,8 +1401,8 @@ void brgemm_convolution_bwd_strided_t::ker_base( } }; -template -void brgemm_convolution_bwd_strided_t::ker_trans( +template +void brgemm_convolution_bwd_strided_t::ker_trans( brgemm_bwd_thread_ctx_t &btc, char *inp_buffer) const { const auto _pd = pd(); @@ -1604,21 +1601,14 @@ void brgemm_convolution_bwd_strided_t::ker_trans( } template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; +template struct brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; +template struct 
brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; template struct brgemm_convolution_bwd_strided_t; -template struct brgemm_convolution_bwd_strided_t; } // namespace x64 diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp index 7e61440eac6..63cf0aac7b5 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp @@ -40,7 +40,7 @@ namespace impl { namespace cpu { namespace x64 { -template +template struct brgemm_convolution_bwd_strided_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp index 8163e5e1c6c..36459180dbf 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_utils.cpp @@ -64,12 +64,12 @@ bool is_amx(cpu_isa_t isa) { } bool post_ops_ok(jit_brgemm_conv_conf_t &jcp, primitive_attr_t &attr, - const memory_desc_wrapper &dst_d, bool is_deconv) { + const memory_desc_wrapper &dst_d, bool use_inversion) { using namespace injector; const auto &post_ops = attr.post_ops_; - if (post_ops.len() > 0 && !is_deconv) return false; + if (post_ops.len() > 0 && !use_inversion) return false; return injector::post_ops_ok(post_ops_ok_args_t(jcp.isa, {sum, eltwise, binary}, post_ops, &dst_d, @@ -1403,8 +1403,7 @@ brgemm_broadcast_t get_zp_type(const primitive_attr_t &attr, int arg) { status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const convolution_desc_t &cd, memory_desc_t &diff_dst_md, memory_desc_t &weights_md, memory_desc_t &diff_src_md, - memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads, - bool is_deconv) { + memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads) { using namespace prop_kind; brg_blocking_t::L1 = platform::get_per_core_cache_size(1); @@ -1451,7 +1450,7 @@ status_t 
init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const bool has_uneven_spatial = jcp.id % jcp.stride_d != 0 || jcp.ih % jcp.stride_h != 0 || jcp.has_uneven_iw; - if (is_deconv && has_uneven_spatial) return status::unimplemented; + if (cd.use_inversion && has_uneven_spatial) return status::unimplemented; jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0; jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims - 4]; @@ -1623,10 +1622,12 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, jcp.with_binary = !everyone_is(-1, binary_ind, prelu_ind); jcp.src_zero_point - = get_zp_type(attr, is_deconv ? DNNL_ARG_SRC : DNNL_ARG_DIFF_DST) + = get_zp_type( + attr, cd.use_inversion ? DNNL_ARG_SRC : DNNL_ARG_DIFF_DST) != brgemm_broadcast_t::none; jcp.dst_zero_point - = get_zp_type(attr, is_deconv ? DNNL_ARG_DST : DNNL_ARG_DIFF_SRC) + = get_zp_type( + attr, cd.use_inversion ? DNNL_ARG_DST : DNNL_ARG_DIFF_SRC) != brgemm_broadcast_t::none; const bool has_zero_points = jcp.src_zero_point || jcp.dst_zero_point; @@ -1634,11 +1635,13 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const bool params_ok = IMPLICATION(has_zero_points, utils::one_of(jcp.src_dt, u8, s8)) && IMPLICATION(jcp.src_zero_point, - attr.zero_points_.common( - is_deconv ? DNNL_ARG_SRC : DNNL_ARG_DIFF_DST)) + attr.zero_points_.common(cd.use_inversion + ? DNNL_ARG_SRC + : DNNL_ARG_DIFF_DST)) && IMPLICATION(jcp.dst_zero_point, - attr.zero_points_.common( - is_deconv ? DNNL_ARG_DST : DNNL_ARG_DIFF_SRC)); + attr.zero_points_.common(cd.use_inversion + ? 
DNNL_ARG_DST + : DNNL_ARG_DIFF_SRC)); VDISPATCH_CONV_IC(params_ok, VERBOSE_UNSUPPORTED_ZP_CFG); jcp.nthr = nthreads; @@ -1659,7 +1662,7 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, CHECK(init_tag(jcp.dst_tag, diff_src_md, diff_src_d, src_tag)); CHECK(attr.set_default_formats(&diff_src_md)); - VDISPATCH_CONV_IC(post_ops_ok(jcp, attr, diff_src_d, is_deconv), + VDISPATCH_CONV_IC(post_ops_ok(jcp, attr, diff_src_d, cd.use_inversion), VERBOSE_UNSUPPORTED_POSTOP); return status::success; @@ -1892,8 +1895,7 @@ dim_t precalculate_comp_pad_kernels(const jit_brgemm_conv_conf_t &jcp, status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const convolution_desc_t &cd, memory_desc_t &diff_dst_md, memory_desc_t &weights_md, memory_desc_t &diff_src_md, - memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads, - bool is_deconv) { + memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads) { using namespace prop_kind; @@ -1901,7 +1903,7 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, if (!mayiuse(isa)) return status::unimplemented; CHECK(init_jcp(jcp, isa, cd, diff_dst_md, weights_md, diff_src_md, bias_md, - attr, nthreads, is_deconv)); + attr, nthreads)); const memory_desc_wrapper diff_dst_d(&diff_dst_md); const memory_desc_wrapper weights_d(&weights_md); @@ -2040,7 +2042,7 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, ? 
1 / weights_md.extra.scale_adjust : 1.0f; - if (is_deconv) { + if (cd.use_inversion) { const auto &src_scales = attr.scales_.get(DNNL_ARG_SRC); const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); jcp.with_scales = !src_scales.has_default_values() diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_utils.hpp b/src/cpu/x64/jit_brgemm_conv_bwd_utils.hpp index dea1cf1a7a5..08e68a49bdd 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_utils.hpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,8 +59,7 @@ dim_t precalculate_comp_pad_kernels(const jit_brgemm_conv_conf_t &jcp, status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const convolution_desc_t &cd, memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md, - memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads, - bool enable_postops); + memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads); void init_scratchpad(memory_tracking::registrar_t &scratchpad, const jit_brgemm_conv_conf_t &jcp); diff --git a/src/cpu/x64/jit_brgemm_conv_utils.cpp b/src/cpu/x64/jit_brgemm_conv_utils.cpp index 20d029812a0..713061b85cb 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.cpp @@ -1866,8 +1866,8 @@ void adjust_nthr(jit_brgemm_conv_conf_t &jcp, const memory_desc_wrapper &src_d, } } -status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, - cpu_isa_t isa, const convolution_desc_t &cd, memory_desc_t &src_md, +status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, + const convolution_desc_t &cd, memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md, memory_desc_t &bias_md, primitive_attr_t &attr, int 
nthreads) { @@ -2014,7 +2014,7 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, //TODO: support all 3d cases const bool relo_supported_shape = jcp.trans_dim_koef == 1 && IMPLICATION(jcp.id > 1, relo_conv_weights_wi == false) - && !use_inversion && jcp.dilate_w == 0; + && !cd.use_inversion && jcp.dilate_w == 0; const auto rnd_kwic = (float)jcp.kw * rnd_up(jcp.ic, jcp.simd_w); const auto src_per_ic @@ -2055,7 +2055,7 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, //TODO: support 3d cases const bool relo_supported_shape = everyone_is(0, jcp.dilate_h, jcp.dilate_w) - && jcp.trans_dim_koef == 1 && jcp.ndims < 5 && !use_inversion + && jcp.trans_dim_koef == 1 && jcp.ndims < 5 && !cd.use_inversion && IMPLICATION(jcp.s8s8_compensation_required, everyone_is(0, jcp.t_pad, jcp.b_pad)); diff --git a/src/cpu/x64/jit_brgemm_conv_utils.hpp b/src/cpu/x64/jit_brgemm_conv_utils.hpp index 1c9b4bbbe3d..bee9d321917 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.hpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,8 +41,8 @@ bool is_amx(cpu_isa_t isa); bool uses_batch_elements( brgemm_batch_kind_t brg_type, conv_brgemm_exec_type_t exec_type); -status_t init_conf(jit_brgemm_conv_conf_t &jcp, bool use_inversion, - cpu_isa_t isa, const convolution_desc_t &cd, memory_desc_t &src_md, +status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, + const convolution_desc_t &cd, memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md, memory_desc_t &bias_md, primitive_attr_t &attr, int nthreads); diff --git a/src/cpu/x64/jit_brgemm_deconv.cpp b/src/cpu/x64/jit_brgemm_deconv.cpp index 5288674fd41..c8f2c8d8fd8 100644 --- a/src/cpu/x64/jit_brgemm_deconv.cpp +++ b/src/cpu/x64/jit_brgemm_deconv.cpp @@ -89,6 +89,8 @@ status_t fwd_conv_desc_create(const deconvolution_desc_t *fwd_deconv_d, fwd_conv_d->diff_src_desc = fwd_conv_d->src_desc; fwd_conv_d->diff_dst_desc = fwd_conv_d->dst_desc; } + // Note: internal field to hint this conv is created from deconv. + fwd_conv_d->use_inversion = true; return status::success; } @@ -133,6 +135,10 @@ status_t bwd_conv_desc_create(const deconvolution_desc_t *fwd_deconv_d, // directly into bwd conv implementations. bwd_conv_d->src_desc = bwd_conv_d->diff_src_desc; bwd_conv_d->dst_desc = bwd_conv_d->diff_dst_desc; + + // Note: internal field to hint this conv is created from deconv. 
+ bwd_conv_d->use_inversion = true; + return status::success; } } // namespace @@ -192,11 +198,9 @@ status_t brgemm_deconvolution_fwd_t::pd_t::init(engine_t *engine) { while (++it != it.end()) { conv_pd_ = *it; - // flag used to enable post-ops and properly disable zero-points - constexpr bool is_deconv = true; if (check_embedded_impl_init< - typename brgemm_convolution_bwd_strided_t::pd_t>(it) + typename brgemm_convolution_bwd_strided_t::pd_t>( + it) == status::success) break; } @@ -218,9 +222,8 @@ status_t brgemm_deconvolution_fwd_t::pd_t::init(engine_t *engine) { == status::success) break; // try non-1x1 fwd convolution with invert weights' spatial indices - constexpr bool use_inversion = true; - if (check_embedded_impl_init::pd_t>(it) + if (check_embedded_impl_init< + typename brgemm_convolution_fwd_t::pd_t>(it) == status::success) break; } From 3cb934aea8d089709a603507eb2419d03f8d5fda Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Mon, 13 May 2024 12:59:44 +0800 Subject: [PATCH 074/187] graph: backend: dnnl: fix reciprocal + multiply fusion use eltwise + binary po instead of div which will change the calculation order and affect numeric precision. 
--- src/graph/backend/dnnl/kernels/binary.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/graph/backend/dnnl/kernels/binary.hpp b/src/graph/backend/dnnl/kernels/binary.hpp index 7da8f7d9933..d4bc22f5d4f 100644 --- a/src/graph/backend/dnnl/kernels/binary.hpp +++ b/src/graph/backend/dnnl/kernels/binary.hpp @@ -105,7 +105,6 @@ struct binary_t : public kernel_base_t { pass_pipeline_t pipeline(vis); BACKEND_DNNL_ADD_PASS(pipeline, lower_down); - BACKEND_DNNL_ADD_PASS(pipeline, fuse_reciprocal_mul_to_div); BACKEND_DNNL_ADD_PASS(pipeline, fuse_mul_sigmoid_to_swish); BACKEND_DNNL_ADD_PASS(pipeline, binary_canonicalization); From 2fa152e2018f5480921d671f299d4a1353a74b92 Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Mon, 13 May 2024 13:02:17 +0800 Subject: [PATCH 075/187] benchdnn: graph: inputs: set fpmath mode strict as fpmath mode any will be skipped by benchdnn when gc is enabled. --- .../inputs/graph/pattern/bf16/reciprocal_multiply_fusion.json | 2 +- .../inputs/graph/pattern/f16/reciprocal_multiply_fusion.json | 2 +- .../inputs/graph/pattern/f32/reciprocal_multiply_fusion.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/benchdnn/inputs/graph/pattern/bf16/reciprocal_multiply_fusion.json b/tests/benchdnn/inputs/graph/pattern/bf16/reciprocal_multiply_fusion.json index 7e116ae3d68..4ca04a8931a 100644 --- a/tests/benchdnn/inputs/graph/pattern/bf16/reciprocal_multiply_fusion.json +++ b/tests/benchdnn/inputs/graph/pattern/bf16/reciprocal_multiply_fusion.json @@ -1,7 +1,7 @@ { "version": "3.0.0", "engine_kind": "cpu", - "fpmath_mode": "any", + "fpmath_mode": "strict", "graph": [ { "id": 0, diff --git a/tests/benchdnn/inputs/graph/pattern/f16/reciprocal_multiply_fusion.json b/tests/benchdnn/inputs/graph/pattern/f16/reciprocal_multiply_fusion.json index 2678c988d8f..a7effd51560 100644 --- a/tests/benchdnn/inputs/graph/pattern/f16/reciprocal_multiply_fusion.json +++ 
b/tests/benchdnn/inputs/graph/pattern/f16/reciprocal_multiply_fusion.json @@ -1,7 +1,7 @@ { "version": "3.0.0", "engine_kind": "cpu", - "fpmath_mode": "any", + "fpmath_mode": "strict", "graph": [ { "id": 0, diff --git a/tests/benchdnn/inputs/graph/pattern/f32/reciprocal_multiply_fusion.json b/tests/benchdnn/inputs/graph/pattern/f32/reciprocal_multiply_fusion.json index 5e184f5d65b..9f67b5c936f 100644 --- a/tests/benchdnn/inputs/graph/pattern/f32/reciprocal_multiply_fusion.json +++ b/tests/benchdnn/inputs/graph/pattern/f32/reciprocal_multiply_fusion.json @@ -1,7 +1,7 @@ { "version": "3.0.0", "engine_kind": "cpu", - "fpmath_mode": "any", + "fpmath_mode": "strict", "graph": [ { "id": 0, From 2036efbb11b9732315a33bb9b0538732d7585c6e Mon Sep 17 00:00:00 2001 From: yifeizh2 Date: Thu, 16 May 2024 09:58:53 +0800 Subject: [PATCH 076/187] graph: backend: compiler: support optional quantization zps --- src/graph/backend/graph_compiler/compiler_graph.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/graph/backend/graph_compiler/compiler_graph.cpp b/src/graph/backend/graph_compiler/compiler_graph.cpp index d6d912a0a39..db861f42d98 100644 --- a/src/graph/backend/graph_compiler/compiler_graph.cpp +++ b/src/graph/backend/graph_compiler/compiler_graph.cpp @@ -179,7 +179,11 @@ gc::sc_op_ptr compiler_graph_impl_t::make_backend_op(const op_t *aop, std::vector scales = attrs[graph::op_attr::scales].get>(); std::vector zps_int64 - = attrs[graph::op_attr::zps].get>(); + = std::vector(scales.size(), 0); + if (attrs.find(graph::op_attr::zps) != attrs.end()) { + zps_int64 = attrs[graph::op_attr::zps] + .get>(); + } std::vector zps(zps_int64.begin(), zps_int64.end()); backend_attrs.set("scales", scales); backend_attrs.set("zero_points", zps); From 16cc878a46881cd17b78a3f07f5836763ea1cc62 Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Sat, 11 May 2024 10:51:34 +0800 Subject: [PATCH 077/187] doc: graph: fix build warnings --- doc/examples.md | 2 +- 
doc/rst/dev_guide_examples.rst | 2 +- examples/graph/sycl_single_op_partition.cpp | 44 ++++++++++----------- include/oneapi/dnnl/dnnl_graph_ocl.hpp | 12 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/examples.md b/doc/examples.md index 308172486be..a7b118c966d 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -22,4 +22,4 @@ Examples {#dev_guide_examples} | | CPU | @ref graph_cpu_inference_int8_cpp | | | | CPU/GPU | @ref graph_sycl_getting_started_cpp | | | | CPU | @ref graph_cpu_single_op_partition_cpp| | -| | GPU | @ref graph_gpu_single_op_partition_cpp| | +| | GPU | @ref graph_sycl_single_op_partition_cpp| | diff --git a/doc/rst/dev_guide_examples.rst b/doc/rst/dev_guide_examples.rst index 92b3c027c69..af845c01b2a 100644 --- a/doc/rst/dev_guide_examples.rst +++ b/doc/rst/dev_guide_examples.rst @@ -89,7 +89,7 @@ Examples - * - - GPU - - :ref:`doxid-graph_gpu_single_op_partition_cpp` + - :ref:`doxid-graph_sycl_single_op_partition_cpp` - diff --git a/examples/graph/sycl_single_op_partition.cpp b/examples/graph/sycl_single_op_partition.cpp index 3177d58288f..acbb1e223b4 100644 --- a/examples/graph/sycl_single_op_partition.cpp +++ b/examples/graph/sycl_single_op_partition.cpp @@ -14,14 +14,14 @@ * limitations under the License. *******************************************************************************/ -/// @example gpu_single_op_partition.cpp -/// @copybrief graph_gpu_single_op_partition_cpp -/// > Annotated version: @ref graph_gpu_single_op_partition_cpp +/// @example sycl_single_op_partition.cpp +/// @copybrief graph_sycl_single_op_partition_cpp +/// > Annotated version: @ref graph_sycl_single_op_partition_cpp -/// @page graph_gpu_single_op_partition_cpp Single op partition on GPU +/// @page graph_sycl_single_op_partition_cpp Single op partition on GPU /// This is an example to demonstrate how to build a simple op graph and run it on gpu. 
/// -/// > Example code: @ref gpu_single_op_partition.cpp +/// > Example code: @ref sycl_single_op_partition.cpp /// /// Some key take-aways included in this example: /// @@ -36,14 +36,14 @@ /// * Unsupported partitions should be handled by users themselves /// -/// @page graph_gpu_single_op_partition_cpp -/// @section graph_gpu_single_op_partition_cpp_headers Public headers +/// @page graph_sycl_single_op_partition_cpp +/// @section graph_sycl_single_op_partition_cpp_headers Public headers /// /// To start using oneDNN Graph, we must include the @ref dnnl_graph.hpp header /// file in the application. All the C++ APIs reside in namespace `dnnl::graph`. /// -/// @page graph_gpu_single_op_partition_cpp -/// @snippet gpu_single_op_partition.cpp Headers and namespace +/// @page graph_sycl_single_op_partition_cpp +/// @snippet sycl_single_op_partition.cpp Headers and namespace //[Headers and namespace] #include "oneapi/dnnl/dnnl_graph.hpp" #include "oneapi/dnnl/dnnl_graph_sycl.hpp" @@ -68,8 +68,8 @@ using dim = logical_tensor::dim; using dims = logical_tensor::dims; //[Headers and namespace] -/// @page graph_gpu_single_op_partition_cpp -/// @section graph_gpu_single_op_partition_cpp_tutorial gpu_single_op_partition_tutorial() function +/// @page graph_sycl_single_op_partition_cpp +/// @section graph_sycl_single_op_partition_cpp_tutorial sycl_single_op_partition_tutorial() function /// void gpu_single_op_partition_tutorial() { @@ -78,8 +78,8 @@ void gpu_single_op_partition_tutorial() { dims src0_dims {M, K}; dims src1_dims {K, N}; - /// @page graph_gpu_single_op_partition_cpp - /// @subsection graph_gpu_single_op_partition_cpp_get_partition Build Graph and Get Partitions + /// @page graph_sycl_single_op_partition_cpp + /// @subsection graph_sycl_single_op_partition_cpp_get_partition Build Graph and Get Partitions /// /// In this section, we are trying to create a partition containing the /// single op `matmul` without building a graph and getting partition. 
@@ -87,7 +87,7 @@ void gpu_single_op_partition_tutorial() { /// Create first `Matmul` op (#dnnl::graph::op) and attaches attributes /// to it, including `transpose_a` and `transpose_b`. - /// @snippet gpu_single_op_partition.cpp Create matmul + /// @snippet sycl_single_op_partition.cpp Create matmul //[Create matmul] logical_tensor matmul_src0_desc {0, data_type::f32}; logical_tensor matmul_src1_desc {1, data_type::f32}; @@ -98,8 +98,8 @@ void gpu_single_op_partition_tutorial() { matmul.set_attr(op::attr::transpose_b, false); //[Create matmul] - /// @page graph_gpu_single_op_partition_cpp - /// @subsection graph_gpu_single_op_partition_cpp_compile Compile and Execute Partition + /// @page graph_sycl_single_op_partition_cpp + /// @subsection graph_sycl_single_op_partition_cpp_compile Compile and Execute Partition /// /// In the real case, users like framework should provide device information /// at this stage. But in this example, we just use a self-defined device to @@ -109,7 +109,7 @@ void gpu_single_op_partition_tutorial() { /// #dnnl_graph_sycl_allocate_f and #dnnl_graph_sycl_deallocate_f /// call-back functions. 
/// - /// @snippet gpu_single_op_partition.cpp Create allocator + /// @snippet sycl_single_op_partition.cpp Create allocator //[Create allocator] allocator alloc = sycl_interop::make_allocator( sycl_malloc_wrapper, sycl_free_wrapper); @@ -133,7 +133,7 @@ void gpu_single_op_partition_tutorial() { /// Create a #dnnl::stream on a given engine /// - /// @snippet gpu_single_op_partition.cpp Create stream + /// @snippet sycl_single_op_partition.cpp Create stream //[Create stream] dnnl::stream strm = dnnl::sycl_interop::make_stream(eng, q); //[Create stream] @@ -168,7 +168,7 @@ void gpu_single_op_partition_tutorial() { partition part(matmul, dnnl::engine::kind::gpu); //[Create partition] if (!part.is_supported()) { - std::cout << "gpu_single_op_partition: Got unsupported partition, " + std::cout << "sycl_single_op_partition: Got unsupported partition, " "users need to handle the operators by themselves." << std::endl; return; @@ -198,7 +198,7 @@ void gpu_single_op_partition_tutorial() { /// Compile the partition to generate compiled partition with the /// input and output logical tensors. /// - /// @snippet gpu_single_op_partition.cpp Compile partition + /// @snippet sycl_single_op_partition.cpp Compile partition //[Compile partition] compiled_partition cp = part.compile(inputs, outputs, eng); //[Compile partition] @@ -217,7 +217,7 @@ void gpu_single_op_partition_tutorial() { /// Execute the compiled partition on the specified stream. 
/// - /// @snippet gpu_single_op_partition.cpp Execute compiled partition + /// @snippet sycl_single_op_partition.cpp Execute compiled partition //[Execute compiled partition] cp.execute(strm, inputs_ts, outputs_ts); //[Execute compiled partition] @@ -225,7 +225,7 @@ void gpu_single_op_partition_tutorial() { // Wait for all compiled partition's execution finished strm.wait(); - /// @page graph_gpu_single_op_partition_cpp + /// @page graph_sycl_single_op_partition_cpp /// std::cout << "Graph:" << std::endl << " [matmul_src0] [matmul_src1]" << std::endl diff --git a/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/include/oneapi/dnnl/dnnl_graph_ocl.hpp index ac6355096c6..ba5b34a2440 100644 --- a/include/oneapi/dnnl/dnnl_graph_ocl.hpp +++ b/include/oneapi/dnnl/dnnl_graph_ocl.hpp @@ -64,15 +64,15 @@ inline allocator make_allocator(dnnl_graph_ocl_allocate_f ocl_malloc, /// Constructs an engine from an OpenCL device, an OpenCL context, and an /// allocator. /// -/// @param adevice A valid OpenCL device to construct the engine -/// @param acontext A valid OpenCL context to construct the engine +/// @param device A valid OpenCL device to construct the engine +/// @param context A valid OpenCL context to construct the engine /// @param alloc An allocator to associate with the engine /// @returns Created engine inline engine make_engine_with_allocator( - cl_device_id adevice, cl_context acontext, const allocator &alloc) { + cl_device_id device, cl_context context, const allocator &alloc) { dnnl_engine_t c_engine; error::wrap_c_api(dnnl_graph_ocl_interop_make_engine_with_allocator( - &c_engine, adevice, acontext, alloc.get()), + &c_engine, device, context, alloc.get()), "could not make an engine with allocator"); return engine(c_engine); } @@ -80,8 +80,8 @@ inline engine make_engine_with_allocator( /// Constructs an engine from an OpenCL device, an OpenCL context, an /// allocator, and a serialized engine cache blob. 
/// -/// @param adevice A valid OpenCL device to construct the engine -/// @param acontext A valid OpenCL context to construct the engine +/// @param device A valid OpenCL device to construct the engine +/// @param context A valid OpenCL context to construct the engine /// @param alloc An allocator to associate with the engine /// @param cache_blob Cache blob serialized beforehand /// @returns Created engine From 578c258e94d52839dc5bf891014118990cec1cdf Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Tue, 7 May 2024 13:51:45 -0700 Subject: [PATCH 078/187] x64: brgemm kernel: fix reg64_fp8_aux saving on stack --- src/cpu/x64/brgemm/jit_brgemm_kernel.cpp | 32 ++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp index 06f50491258..24efbdbd02a 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp @@ -651,21 +651,15 @@ void jit_brgemm_kernel_t::cvt2ps(data_type_t type_in, const Vmm vmm_in, case data_type::s8: uni_vpmovsxbd(vmm, op); break; case data_type::u8: uni_vpmovzxbd(vmm, op); break; case data_type::f8_e5m2: - if (brg.is_fp8_via_convert()) { - // note: unoptimized, probably move stack use outside loop - mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); + if (brg.is_fp8_via_convert()) f8_e5m2_emulator_->vcvt_f8_to_f32(vmm, op); - mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]); - } else + else assert(!"Error, native conversion unsupported"); break; case data_type::f8_e4m3: - if (brg.is_fp8_via_convert()) { - // note: unoptimized, probably move stack use outside loop - mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); + if (brg.is_fp8_via_convert()) f8_e4m3_emulator_->vcvt_f8_to_f32(vmm, op); - mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]); - } else + else assert(!"Error, native conversion unsupported"); break; @@ -1083,6 +1077,8 @@ void jit_brgemm_kernel_t::apply_alpha_beta( if (brg.is_runtime_ldc && 
bd_block > 1) mov(ptr[rsp + reg_aux_C_backup_offs_], reg_aux_C); + if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); + for_(int bd = 0; bd < bd_block; bd++) for (int ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; @@ -1116,6 +1112,8 @@ void jit_brgemm_kernel_t::apply_alpha_beta( add(reg_aux_C, ptr[rsp + reg_C_shift_bytes_offs_]); } + if (brg.is_fp8_via_convert()) mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]); + if (brg.is_runtime_ldc && bd_block > 1) mov(reg_aux_C, ptr[rsp + reg_aux_C_backup_offs_]); @@ -1195,6 +1193,11 @@ void jit_brgemm_kernel_t::apply_post_ops( } } + // We have to use push/pop to preserve reg64_fp8_aux because we + // are in the range of conditional_register_preserve_guard_t + // objects above that use push/pop + if (brg.is_fp8_via_convert()) push(reg64_fp8_aux); + for_(int bd = bd_start; bd < bd_end; bd++) for (int ld = 0; ld < ld_block2; ld++) { const auto vmm = accm(ld_block2, bd, ld); @@ -1217,6 +1220,7 @@ void jit_brgemm_kernel_t::apply_post_ops( } else uni_vaddps(vmm, vmm, vmm_prev_dst); } + if (brg.is_fp8_via_convert()) pop(reg64_fp8_aux); } if (reset_avx_tail_mask) maybe_set_avx_mask(is_ld_tail); @@ -1277,6 +1281,8 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( } if (brg.with_bias) { mov(reg_aux_bias, ptr[rsp + reg_aux_bias_offs_]); } + + if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); for (int ld = 0; ld < ld_block2; ld++) { auto vmm_bias = vmm_tmp(0); if (brg.with_bias) { @@ -1291,6 +1297,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( if (brg.with_bias) uni_vaddps(vmm, vmm, vmm_bias); } } + if (brg.is_fp8_via_convert()) mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]); if (postops_injector_) apply_post_ops(bd_block, ld_block2, ldb_and_bdb_offset, is_ld_tail); @@ -1320,6 +1327,9 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( uni_vcvtdq2ps(vmm_zp_c, vmm_zp_c); } } + if 
(brg.is_fp8_via_convert()) + mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); + for (int ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; if (brg.zp_type_c == brgemm_broadcast_t::per_n) { @@ -1340,6 +1350,8 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( uni_vaddps(vmm, vmm, vmm_zp_c); } } + if (brg.is_fp8_via_convert()) + mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]); } const bool dt_requires_saturation From 67d58af7153bf5bdd370544ab96de2fabebc5329 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Wed, 3 Apr 2024 15:12:26 -0700 Subject: [PATCH 079/187] common: add internal sdpa primitive --- scripts/generate_dnnl_debug.py | 6 +- src/common/c_types_map.hpp | 1 + src/common/dnnl_debug_autogenerated.cpp | 3 + src/common/dnnl_traits.hpp | 1 + src/common/ittnotify.cpp | 1 + src/common/opdesc.hpp | 3 + src/common/primitive_desc_iface.cpp | 4 +- src/common/primitive_hashing.cpp | 18 +++ src/common/primitive_hashing.hpp | 4 +- src/common/sdpa_pd.hpp | 141 ++++++++++++++++++++++++ src/common/sdpa_types.hpp | 62 +++++++++++ src/common/sdpa_utils.hpp | 79 +++++++++++++ src/common/serialization.cpp | 13 +++ src/common/serialization.hpp | 3 +- src/common/type_helpers.hpp | 14 +++ src/common/verbose.cpp | 3 + src/cpu/cpu_engine.hpp | 3 +- src/gpu/gpu_impl_list.cpp | 1 + src/gpu/gpu_impl_list.hpp | 2 + src/gpu/gpu_sdpa_list.cpp | 41 +++++++ 20 files changed, 397 insertions(+), 6 deletions(-) create mode 100644 src/common/sdpa_pd.hpp create mode 100644 src/common/sdpa_types.hpp create mode 100644 src/common/sdpa_utils.hpp create mode 100755 src/gpu/gpu_sdpa_list.cpp diff --git a/scripts/generate_dnnl_debug.py b/scripts/generate_dnnl_debug.py index e670ed5dc02..ffbf9a571a2 100755 --- a/scripts/generate_dnnl_debug.py +++ b/scripts/generate_dnnl_debug.py @@ -1,6 +1,6 @@ #!/usr/bin/env python ################################################################################ -# Copyright 2018-2023 Intel Corporation +# Copyright 
2018-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,6 +99,8 @@ def source(body): #include "oneapi/dnnl/dnnl_debug.h" #include "oneapi/dnnl/dnnl_types.h" +#include "common/c_types_map.hpp" + %s """ % body @@ -249,6 +251,8 @@ def func_to_str(enum, values): func += func_to_str_decl(enum) + " {\n" for v in values: func += '%sif (v == %s) return "%s";\n' % (indent, v, sanitize_value(v)) + if (enum == "dnnl_primitive_kind_t"): + func += '%sif (v == dnnl::impl::primitive_kind::sdpa) return "sdpa";\n' % indent func += '%sassert(!"unknown %s");\n' % (indent, abbrev) func += '%sreturn "unknown %s";\n}\n' % (indent, abbrev) return func diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp index dc3ac6783f0..d61d0ef6c97 100644 --- a/src/common/c_types_map.hpp +++ b/src/common/c_types_map.hpp @@ -1939,6 +1939,7 @@ const primitive_kind_t group_normalization = dnnl_group_normalization; // Internal only primitive kinds. 
const primitive_kind_t internal_only_start = (primitive_kind_t)(1 << 12); const primitive_kind_t zero_pad = internal_only_start; +const primitive_kind_t sdpa = (primitive_kind_t)(internal_only_start + 1); } // namespace primitive_kind using query_t = dnnl_query_t; diff --git a/src/common/dnnl_debug_autogenerated.cpp b/src/common/dnnl_debug_autogenerated.cpp index aa62dcf40ed..dca7900eb34 100644 --- a/src/common/dnnl_debug_autogenerated.cpp +++ b/src/common/dnnl_debug_autogenerated.cpp @@ -24,6 +24,8 @@ #include "oneapi/dnnl/dnnl_debug.h" #include "oneapi/dnnl/dnnl_types.h" +#include "common/c_types_map.hpp" + const char *dnnl_status2str(dnnl_status_t v) { if (v == dnnl_success) return "success"; if (v == dnnl_out_of_memory) return "out_of_memory"; @@ -1746,6 +1748,7 @@ const char *dnnl_prim_kind2str(dnnl_primitive_kind_t v) { if (v == dnnl_layer_normalization) return "layer_normalization"; if (v == dnnl_group_normalization) return "group_normalization"; if (v == dnnl_primitive_kind_max) return "primitive_kind_max"; + if (v == dnnl::impl::primitive_kind::sdpa) return "sdpa"; assert(!"unknown prim_kind"); return "unknown prim_kind"; } diff --git a/src/common/dnnl_traits.hpp b/src/common/dnnl_traits.hpp index 87ccd5e5ff6..bcebf6fcb01 100644 --- a/src/common/dnnl_traits.hpp +++ b/src/common/dnnl_traits.hpp @@ -174,6 +174,7 @@ PKIND_TRAITS_INST(binary); PKIND_TRAITS_INST(matmul); PKIND_TRAITS_INST(resampling); PKIND_TRAITS_INST(reduction); +PKIND_TRAITS_INST(sdpa); #undef PKIND_TRAITS_INST } // namespace impl diff --git a/src/common/ittnotify.cpp b/src/common/ittnotify.cpp index 4821bc275bd..e9c9dfa8404 100644 --- a/src/common/ittnotify.cpp +++ b/src/common/ittnotify.cpp @@ -79,6 +79,7 @@ void primitive_task_start(primitive_kind_t kind) { CASE(softmax), CASE(layer_normalization), CASE(group_normalization), + CASE(sdpa), }; #undef CASE int kind_idx = (int)kind; diff --git a/src/common/opdesc.hpp b/src/common/opdesc.hpp index 872fe9025a7..8067ae0ddb6 100644 --- 
a/src/common/opdesc.hpp +++ b/src/common/opdesc.hpp @@ -21,6 +21,7 @@ #include "common/c_types_map.hpp" #include "common/gemm_types.hpp" +#include "common/sdpa_types.hpp" namespace dnnl { namespace impl { @@ -616,6 +617,7 @@ struct op_desc_t { resampling_desc_t resampling; zero_pad_desc_t zero_pad; reduction_desc_t reduction; + sdpa_desc_t sdpa; }; #define DECL_CTOR_AND_CONVERTERS(c_type) \ @@ -648,6 +650,7 @@ struct op_desc_t { DECL_CTOR_AND_CONVERTERS(resampling_desc_t); DECL_CTOR_AND_CONVERTERS(zero_pad_desc_t); DECL_CTOR_AND_CONVERTERS(reduction_desc_t); + DECL_CTOR_AND_CONVERTERS(sdpa_desc_t); // concat_desc_t and sum_desc_t have data members which have non-trivial // special member functions hence the default destructor is implicitly diff --git a/src/common/primitive_desc_iface.cpp b/src/common/primitive_desc_iface.cpp index 9b74295c1ac..1dc59f9fb74 100644 --- a/src/common/primitive_desc_iface.cpp +++ b/src/common/primitive_desc_iface.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,7 @@ status_t primitive_desc_create(primitive_desc_iface_t **primitive_desc_iface, const bool known_primitive_kind = utils::one_of(op_desc->kind, batch_normalization, binary, convolution, deconvolution, eltwise, gemm, group_normalization, inner_product, layer_normalization, lrn, - matmul, pooling, prelu, reduction, resampling, rnn, shuffle, + matmul, pooling, prelu, reduction, resampling, rnn, sdpa, shuffle, softmax); if (!known_primitive_kind) return invalid_arguments; diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 90de09081f5..285eb2c930c 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -83,6 +83,7 @@ bool key_t::operator==(const key_t &rhs) const { CASE(reorder) CASE(resampling) CASE(rnn) + CASE(sdpa) CASE(shuffle) CASE(softmax) CASE(sum) @@ -707,6 +708,23 @@ size_t get_desc_hash(const zero_pad_desc_t &desc) { return seed; } +size_t get_desc_hash(const sdpa_desc_t &desc) { + size_t seed = 0; + // Kinds + seed = hash_combine(seed, static_cast(desc.primitive_kind)); + // Memory descriptors + seed = hash_combine(seed, get_md_hash(desc.q_desc)); + seed = hash_combine(seed, get_md_hash(desc.k_desc)); + seed = hash_combine(seed, get_md_hash(desc.v_desc)); + seed = hash_combine(seed, get_md_hash(desc.dst_desc)); + seed = hash_combine(seed, get_md_hash(desc.attn_mask_desc)); + // Scale type + seed = hash_combine(seed, static_cast(desc.scale_dt)); + seed = hash_combine(seed, desc.invert_scale); + // Combined hash for sdpa desc + return seed; +} + } // namespace primitive_hashing } // namespace impl } // namespace dnnl diff --git a/src/common/primitive_hashing.hpp b/src/common/primitive_hashing.hpp index 719a65b5709..fa33f920e55 100644 --- a/src/common/primitive_hashing.hpp +++ b/src/common/primitive_hashing.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel 
Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,6 +91,7 @@ size_t get_desc_hash(const reduction_desc_t &desc); size_t get_desc_hash(const reorder_desc_t &desc); size_t get_desc_hash(const resampling_desc_t &desc); size_t get_desc_hash(const rnn_desc_t &desc); +size_t get_desc_hash(const sdpa_desc_t &desc); size_t get_desc_hash(const shuffle_desc_t &desc); size_t get_desc_hash(const softmax_desc_t &desc); size_t get_desc_hash(const sum_desc_t &desc); @@ -179,6 +180,7 @@ struct hash { CASE(reorder) CASE(resampling) CASE(rnn) + CASE(sdpa) CASE(shuffle) CASE(softmax) CASE(sum) diff --git a/src/common/sdpa_pd.hpp b/src/common/sdpa_pd.hpp new file mode 100644 index 00000000000..06e9ed6b2f0 --- /dev/null +++ b/src/common/sdpa_pd.hpp @@ -0,0 +1,141 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_SDPA_PD_HPP +#define COMMON_SDPA_PD_HPP + +#include "oneapi/dnnl/dnnl.h" + +#include "common/c_types_map.hpp" +#include "common/primitive_desc.hpp" +#include "common/sdpa_utils.hpp" +#include "common/utils.hpp" + +namespace dnnl { +namespace impl { + +#define DNNL_ARG_QUERIES DNNL_ARG_SRC_0 +#define DNNL_ARG_KEYS DNNL_ARG_SRC_1 +#define DNNL_ARG_VALUES DNNL_ARG_SRC_2 +#define DNNL_ARG_ATTN_MASK DNNL_ARG_SHIFT + +#define VDISPATCH_SDPA(cond, msg, ...) \ + VCONDCHECK(primitive, create, dispatch, sdpa, (cond), \ + status::unimplemented, "%s," msg, this->info(engine), \ + ##__VA_ARGS__) + +#define VDISPATCH_SDPA_SC(f, msg, ...) \ + VCHECK(primitive, create, dispatch, sdpa, (f), "%s," msg, \ + this->info(engine), ##__VA_ARGS__) + +struct sdpa_pd_t : public primitive_desc_t { + static constexpr auto base_pkind = primitive_kind::sdpa; + + typedef sdpa_pd_t base_class; + typedef sdpa_pd_t hint_class; + + const sdpa_desc_t *desc() const { return &desc_; } + const op_desc_t *op_desc() const override { + return reinterpret_cast(this->desc()); + } + + arg_usage_t arg_usage(int arg) const override { + if (utils::one_of(arg, DNNL_ARG_QUERIES, DNNL_ARG_KEYS, DNNL_ARG_VALUES, + DNNL_ARG_ATTN_MASK, DNNL_ARG_SCALE)) + return arg_usage_t::input; + + if (arg == DNNL_ARG_DST) return arg_usage_t::output; + + return primitive_desc_t::arg_usage(arg); + } + + const memory_desc_t *arg_md( + int arg, bool user_input = false) const override { + switch (arg) { + case DNNL_ARG_QUERIES: return src_md(0); + case DNNL_ARG_KEYS: return src_md(1); + case DNNL_ARG_VALUES: return src_md(2); + case DNNL_ARG_ATTN_MASK: return src_md(3); + case DNNL_ARG_DST: return dst_md(0, user_input); + default: return primitive_desc_t::arg_md(arg); + } + } + + const memory_desc_t *src_md( + int index = 0, bool user_input = false) const override { + switch (index) { + case 0: return &desc_.q_desc; + case 1: return 
&desc_.k_desc; + case 2: return &desc_.v_desc; + case 3: return &desc_.attn_mask_desc; + default: return &glob_zero_md; + } + } + const memory_desc_t *dst_md( + int index = 0, bool user_input = false) const override { + return index == 0 ? &desc_.dst_desc : &glob_zero_md; + } + + const memory_desc_t *qry_md() const { return &desc_.q_desc; } + const memory_desc_t *key_md() const { return &desc_.k_desc; } + const memory_desc_t *val_md() const { return &desc_.v_desc; } + const memory_desc_t *attn_mask_md() const { return &desc_.attn_mask_desc; } + + int n_inputs() const override { return 3 + int(with_attn_mask()); } + int n_outputs() const override { return 1; } + + bool with_attn_mask() const { + return (attn_mask_md()->data_type != data_type::undef); + } + +protected: + sdpa_desc_t desc_; + + sdpa_pd_t(const sdpa_desc_t *adesc, const primitive_attr_t *attr, + const hint_class *hint_fwd_pd) + : primitive_desc_t(attr, base_pkind), desc_(*adesc) {} + + // By default, we just resolve 'any' with blocked layout and trivial strides + bool set_default_format(memory_desc_t *md) { + memory_desc_wrapper mdw(md); + if (mdw.format_any()) { + if (mdw.has_runtime_dims_or_strides()) return false; + status_t status = memory_desc_init_by_strides(*md, nullptr); + if (status != status::success) return false; + } + + return true; + } + + bool set_default_formats() { + bool ok = true; + + for (auto md : {&desc_.q_desc, &desc_.k_desc, &desc_.v_desc, + &desc_.dst_desc}) { + ok = ok && set_default_format(md); + } + + auto status = attr_.post_ops_.set_default_formats(&desc_.dst_desc); + ok = ok && (status == status::success); + + return ok; + } +}; + +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/common/sdpa_types.hpp b/src/common/sdpa_types.hpp new file mode 100644 index 00000000000..84ee79bcef5 --- /dev/null +++ b/src/common/sdpa_types.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* +* Copyright 2024 Intel 
Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef COMMON_SDPA_TYPES_HPP +#define COMMON_SDPA_TYPES_HPP + +#include +#include "common/c_types_map.hpp" +#include "common/memory_desc.hpp" + +namespace dnnl { +namespace impl { + +// A descriptor for a scaled dot product attention (SDPA) operation. +struct sdpa_desc_t { + // The kind of primitive. Used for self identifying the primitive + // descriptor. Must be sdpa. + dnnl_primitive_kind_t primitive_kind; + memory_desc_t q_desc; /* queries */ + memory_desc_t k_desc; /* keys */ + memory_desc_t v_desc; /* values */ + memory_desc_t dst_desc; + memory_desc_t attn_mask_desc; + data_type_t scale_dt; + // invert_scale = false: multiply by scale + // invert_scale = true: divide by scale + bool invert_scale; + + // Number of queries. + dnnl_dim_t queries() const { return q_desc.dims[q_desc.ndims - 2]; } + // Head size. + dnnl_dim_t head_size() const { return q_desc.dims[q_desc.ndims - 1]; } + // Number of keys. + dnnl_dim_t keys() const { return k_desc.dims[k_desc.ndims - 1]; } + // Number of values. + dnnl_dim_t values() const { return v_desc.dims[v_desc.ndims - 1]; } + // Total batch size. 
+ dnnl_dim_t batch_size() const { + dnnl_dim_t batch = 1; + for (int i = 0; i < dst_desc.ndims - 2; i++) + batch *= dst_desc.dims[i]; + return batch; + } +}; + +} // namespace impl +} // namespace dnnl + +#endif // COMMON_SDPA_TYPES_HPP diff --git a/src/common/sdpa_utils.hpp b/src/common/sdpa_utils.hpp new file mode 100644 index 00000000000..8dd166d6ed7 --- /dev/null +++ b/src/common/sdpa_utils.hpp @@ -0,0 +1,79 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_SDPA_UTILS_HPP +#define COMMON_SDPA_UTILS_HPP + +#include "oneapi/dnnl/dnnl.h" + +#include "common/c_types_map.hpp" +#include "common/nstl.hpp" +#include "common/primitive_desc_iterator.hpp" +#include "common/sdpa_types.hpp" +#include "common/utils.hpp" + +namespace dnnl { +namespace impl { + +static inline sdpa_desc_t create_sdpa_desc(const memory_desc_t *q_md, + const memory_desc_t *k_md, const memory_desc_t *v_md, + const memory_desc_t *dst_md, const memory_desc_t *attn_mask_md, + data_type_t scale_dt, bool invert_scale = false) { + auto sdpa_desc = sdpa_desc_t(); + sdpa_desc.primitive_kind = primitive_kind::sdpa; + sdpa_desc.q_desc = *q_md; + sdpa_desc.k_desc = *k_md; + sdpa_desc.v_desc = *v_md; + sdpa_desc.dst_desc = *dst_md; + if (attn_mask_md) sdpa_desc.attn_mask_desc = *attn_mask_md; + sdpa_desc.scale_dt = scale_dt; + sdpa_desc.invert_scale = invert_scale; + return sdpa_desc; +} + +static inline status_t create_sdpa_pd( + std::shared_ptr &sdpa_pd_, engine_t *engine, + const memory_desc_t *q_md, const memory_desc_t *k_md, + const memory_desc_t *v_md, const memory_desc_t *dst_md, + const memory_desc_t *attn_mask_md, data_type_t scale_dt, + bool invert_scale, const primitive_attr_t *attr) { + auto sdpa_desc = create_sdpa_desc( + q_md, k_md, v_md, dst_md, attn_mask_md, scale_dt, invert_scale); + + int ndims = dst_md->ndims; + int r = ndims - 2, c = ndims - 1; + if (!utils::everyone_is(ndims, q_md->ndims, k_md->ndims, v_md->ndims)) + return status::invalid_arguments; + if (q_md->dims[c] != k_md->dims[r]) return status::invalid_arguments; + if (k_md->dims[c] != v_md->dims[r]) return status::invalid_arguments; + if (dst_md->dims[r] != q_md->dims[r] || dst_md->dims[c] != v_md->dims[c]) + return status::invalid_arguments; + + primitive_attr_t sdpa_attr = *attr; + + primitive_desc_iterator_t it( + engine, (op_desc_t *)&sdpa_desc, &sdpa_attr, nullptr); + + sdpa_pd_ = 
*(++it); + if (!sdpa_pd_) return status::unimplemented; + + return status::success; +} + +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index bee9d54f205..97fc9a1014d 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -48,6 +48,7 @@ status_t serialize_desc( CASE(reorder) CASE(resampling) CASE(rnn) + CASE(sdpa) CASE(shuffle) CASE(softmax) CASE(sum) @@ -586,6 +587,18 @@ void serialize_desc(serialization_stream_t &sstream, const sum_desc_t &desc) { serialize_md(sstream, *desc.src_mds[i]); } +void serialize_desc(serialization_stream_t &sstream, const sdpa_desc_t &desc) { + // Kind + sstream.write(&desc.primitive_kind); + serialize_md(sstream, desc.q_desc); + serialize_md(sstream, desc.k_desc); + serialize_md(sstream, desc.v_desc); + serialize_md(sstream, desc.dst_desc); + serialize_md(sstream, desc.attn_mask_desc); + sstream.write(&desc.scale_dt); + sstream.write(&desc.invert_scale); +} + } // namespace serialization } // namespace impl } // namespace dnnl diff --git a/src/common/serialization.hpp b/src/common/serialization.hpp index e1a353c94f2..afd4ffba136 100644 --- a/src/common/serialization.hpp +++ b/src/common/serialization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,6 +59,7 @@ void serialize_desc( void serialize_desc( serialization_stream_t &sstream, const resampling_desc_t &desc); void serialize_desc(serialization_stream_t &sstream, const rnn_desc_t &desc); +void serialize_desc(serialization_stream_t &sstream, const sdpa_desc_t &desc); void serialize_desc( serialization_stream_t &sstream, const shuffle_desc_t &desc); void serialize_desc( diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index bb56e1fe3a5..4779c331890 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -766,6 +766,19 @@ inline bool operator==(const zero_pad_desc_t &lhs, const zero_pad_desc_t &rhs) { bool ret = COMPARE_DESC_MEMBERS(primitive_kind); return ret; } + +inline bool operator==(const sdpa_desc_t &lhs, const sdpa_desc_t &rhs) { + bool ret = COMPARE_DESC_MEMBERS(primitive_kind) + && COMPARE_DESC_MEMBERS(q_desc) + && COMPARE_DESC_MEMBERS(k_desc) + && COMPARE_DESC_MEMBERS(v_desc) + && COMPARE_DESC_MEMBERS(dst_desc) + && COMPARE_DESC_MEMBERS(attn_mask_desc) + && COMPARE_DESC_MEMBERS(scale_dt) + && COMPARE_DESC_MEMBERS(invert_scale); + return ret; +} + // clang-format on #undef COMPARE_DESC_MEMBERS @@ -1078,6 +1091,7 @@ inline void copy_c_op_desc(op_desc_t *dst, const op_desc_t *src) { CASE_OP_DESC(reduction); CASE_OP_DESC(resampling); CASE_OP_DESC(rnn); + CASE_OP_DESC(sdpa); CASE_OP_DESC(shuffle); CASE_OP_DESC(softmax); diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 684bf974403..bd3318ea391 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -1546,6 +1546,9 @@ void pd_info_t::init(engine_t *engine, const primitive_desc_t *pd) { CASE(shuffle); CASE(softmax); CASE(sum); + case primitive_kind::sdpa: + str_ = "sdpa, unknown info"; + break; case primitive_kind::zero_pad: str_ = "zero_pad, unknown info"; break; diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp index 3de6ca24ad3..3950e6c8dc9 100644 --- a/src/cpu/cpu_engine.hpp +++ b/src/cpu/cpu_engine.hpp @@ -88,7 
+88,7 @@ class cpu_engine_impl_list_t { #define CASE(kind) \ case primitive_kind::kind: \ return get_##kind##_impl_list((const kind##_desc_t *)desc); - switch (desc->kind) { + switch ((int) desc->kind) { CASE(batch_normalization); CASE(binary); CASE(convolution); @@ -106,6 +106,7 @@ class cpu_engine_impl_list_t { CASE(rnn); CASE(shuffle); CASE(softmax); + case primitive_kind::sdpa: return empty_list; default: assert(!"unknown primitive kind"); return empty_list; } #undef CASE diff --git a/src/gpu/gpu_impl_list.cpp b/src/gpu/gpu_impl_list.cpp index a7b23e891e0..e044e7dbf89 100644 --- a/src/gpu/gpu_impl_list.cpp +++ b/src/gpu/gpu_impl_list.cpp @@ -45,6 +45,7 @@ const impl_list_item_t *gpu_impl_list_t::get_implementation_list( CASE(reduction); CASE(resampling); CASE(rnn); + CASE(sdpa); CASE(shuffle); CASE(softmax); CASE(zero_pad); diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp index 8fd04fa5b38..43198956dfd 100644 --- a/src/gpu/gpu_impl_list.hpp +++ b/src/gpu/gpu_impl_list.hpp @@ -23,6 +23,7 @@ #include "common/engine.hpp" #include "common/impl_list_item.hpp" #include "common/impl_registration.hpp" +#include "common/sdpa_types.hpp" namespace dnnl { namespace impl { @@ -51,6 +52,7 @@ DECLARE_IMPL_LIST(prelu); DECLARE_IMPL_LIST(reduction); DECLARE_IMPL_LIST(resampling); DECLARE_IMPL_LIST(rnn); +DECLARE_IMPL_LIST(sdpa); DECLARE_IMPL_LIST(shuffle); DECLARE_IMPL_LIST(softmax); DECLARE_IMPL_LIST(zero_pad); diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp new file mode 100755 index 00000000000..04907620b7f --- /dev/null +++ b/src/gpu/gpu_sdpa_list.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "common/compiler_workarounds.hpp" + +#include "gpu/gpu_impl_list.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +namespace { + +// clang-format off +constexpr impl_list_item_t impl_list[] = { + nullptr, +}; +// clang-format on +} // namespace + +const impl_list_item_t *get_sdpa_impl_list(const sdpa_desc_t *desc) { + UNUSED(desc); + return impl_list; +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl From 109cf499795e1a1382addbeee080bfad88b15564 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 4 Apr 2024 11:57:22 -0700 Subject: [PATCH 080/187] graph: backend: dnnl: enable internal sdpa primitive --- src/graph/backend/dnnl/kernels/sdp_base.hpp | 19 +- .../backend/dnnl/kernels/sdp_primitive.hpp | 496 ++++++++++++++++++ 2 files changed, 510 insertions(+), 5 deletions(-) create mode 100644 src/graph/backend/dnnl/kernels/sdp_primitive.hpp diff --git a/src/graph/backend/dnnl/kernels/sdp_base.hpp b/src/graph/backend/dnnl/kernels/sdp_base.hpp index 20d5a7153f8..1d225152cf4 100644 --- a/src/graph/backend/dnnl/kernels/sdp_base.hpp +++ b/src/graph/backend/dnnl/kernels/sdp_base.hpp @@ -33,6 +33,7 @@ #include "graph/backend/dnnl/utils.hpp" #include "graph/backend/dnnl/kernels/sdp.hpp" +#include "graph/backend/dnnl/kernels/sdp_primitive.hpp" #include "graph/backend/dnnl/passes/compile_ops.hpp" #include "graph/backend/dnnl/passes/constant_propagation.hpp" #include "graph/backend/dnnl/passes/insert_ops.hpp" @@ -63,18 +64,26 @@ struct sdp_base_t : 
public kernel_base_t { const engine_kind_t ekind = g_engine->kind(); const bool enable_decomp = ekind == engine_kind::cpu && enable_decomp_kernel(); - status_t sdp_decomp_status = status::success; - if (enable_decomp) { + const bool enable_prim = (ekind == engine_kind::gpu) && !quantized; + status_t subkernel_status = status::unimplemented; + + if (enable_prim) { + kernel = std::make_shared(); + subkernel_status + = kernel->compile_impl(part, g_engine, inputs, outputs); + } + + if (subkernel_status != status::success && enable_decomp) { kernel = std::make_shared>(); - sdp_decomp_status + subkernel_status = kernel->compile_impl(part, g_engine, inputs, outputs); } - if (!enable_decomp || sdp_decomp_status != status::success) { + if (subkernel_status != status::success) { kernel = std::make_shared(); return kernel->compile_impl(part, g_engine, inputs, outputs); } - return sdp_decomp_status; + return subkernel_status; } // The fuction is used to check if enable the decompostion kernel based on diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive.hpp b/src/graph/backend/dnnl/kernels/sdp_primitive.hpp new file mode 100644 index 00000000000..800f0062a98 --- /dev/null +++ b/src/graph/backend/dnnl/kernels/sdp_primitive.hpp @@ -0,0 +1,496 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GRAPH_BACKEND_DNNL_KERNELS_SDP_PRIMITIVE_HPP +#define GRAPH_BACKEND_DNNL_KERNELS_SDP_PRIMITIVE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/primitive.hpp" +#include "common/primitive_exec_types.hpp" +#include "common/sdpa_pd.hpp" +#include "common/sdpa_types.hpp" +#include "common/sdpa_utils.hpp" +#include "common/utils.hpp" +#include "cpu/cpu_stream.hpp" + +#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL +#include "gpu/intel/ocl/ocl_stream.hpp" +#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL +#include "sycl/sycl_stream.hpp" +#endif + +#include "graph/interface/backend.hpp" +#include "graph/interface/graph.hpp" + +#include "graph/backend/dnnl/common.hpp" +#include "graph/backend/dnnl/dnnl_constant_tensor_cache.hpp" +#include "graph/backend/dnnl/dnnl_partition_impl.hpp" +#include "graph/backend/dnnl/op_executable.hpp" +#include "graph/backend/dnnl/scratchpad.hpp" +#include "graph/backend/dnnl/thread_local_cache.hpp" +#include "graph/backend/dnnl/utils.hpp" + +#include "graph/backend/dnnl/passes/compile_ops.hpp" +#include "graph/backend/dnnl/passes/insert_ops.hpp" +#include "graph/backend/dnnl/passes/lower.hpp" +#include "graph/backend/dnnl/passes/memory_planning.hpp" +#include "graph/backend/dnnl/passes/transform.hpp" +#include "graph/backend/dnnl/passes/utils.hpp" + +namespace dnnl { +namespace impl { +namespace graph { +namespace dnnl_impl { +using ltw = logical_tensor_wrapper_t; +using op_ptr = std::shared_ptr; +using registry_key = size_t; + +struct sdp_primitive_config_t { +public: + sdp_primitive_config_t() = default; + + std::shared_ptr q_ = nullptr; + std::shared_ptr k_ = nullptr; + std::shared_ptr v_ = nullptr; + std::shared_ptr dst_ = nullptr; + std::shared_ptr scale_ = nullptr; + std::shared_ptr attn_mask_ = nullptr; + bool invert_scale_ = false; + + // SDP pd and primitive. 
+ std::shared_ptr sdpa_pd_; + std::shared_ptr sdpa_prim_; + +private: + op_ptr get_post_op(const op_ptr &op) const { + const auto out_val = op->get_output_value(0); + const auto &consumers = out_val->get_consumers(); + if (consumers.size() != 1) return nullptr; + return consumers[0].get_op().shared_from_this(); + } + +public: + status_t locate_io(std::shared_ptr &sg, + const std::vector &inputs, + const std::vector &outputs) { + + using dnnl::impl::utils::one_of; + + auto follow_back = [](std::shared_ptr val) { + while (val->has_producer() && val->get_producer().num_inputs() == 1) + val = val->get_producer().get_input_value(0); + return val; + }; + + auto in_tensor_list + = [](const value_t *val, + const std::vector &list) { + for (auto &t : list) + if (val->get_logical_tensor().id == t.id) return true; + return false; + }; + + // Locate ops of interest: matmuls, scale, mask + op_ptr mm1, mm2, scale, add, final_op; + for (const auto &cur_op : sg->get_ops()) { + if (in_tensor_list(cur_op->get_output_value(0).get(), outputs)) + final_op = cur_op; + if (cur_op->get_kind() != op_kind::dnnl_matmul) continue; + auto post_op = get_post_op(cur_op); + if (post_op && post_op->get_kind() == op_kind::dnnl_binary) { + if (mm1) return status::unimplemented; + mm1 = cur_op; + scale = post_op; + + auto scale_alg = static_cast( + post_op->get_attr(op_attr::alg_kind)); + if (!one_of(scale_alg, alg_kind::binary_mul, + alg_kind::binary_div)) + return status::unimplemented; + invert_scale_ = (scale_alg == alg_kind::binary_div); + + if (get_post_op(post_op)->get_kind() == op_kind::dnnl_binary) + add = get_post_op(post_op); + } else { + if (mm2) return status::unimplemented; + mm2 = cur_op; + } + } + + // Locate input/outputs: Q, K, V, dst, scale, mask + if (!mm1 || !mm2 || !final_op) return status::unimplemented; + q_ = mm1->get_input_value(0); + k_ = mm1->get_input_value(1); + v_ = mm2->get_input_value(1); + dst_ = (final_op->get_kind() == op_kind::dnnl_transpose) + ? 
final_op->get_input_value(0) + : final_op->get_output_value( + 0); /* for some reason final transpose is not fused into mm2 */ + + if (scale) { + auto s0 = follow_back(scale->get_input_value(0)); + auto s1 = follow_back(scale->get_input_value(1)); + scale_ = in_tensor_list(s1.get(), inputs) ? s1 : s0; + } + + if (add) { + auto m0 = add->get_input_value(0), m1 = add->get_input_value(1); + attn_mask_ = in_tensor_list(m1.get(), inputs) ? m1 : m0; + } + + return status::success; + } + + // Initialize parameters and primitive. + status_t init(std::shared_ptr &sg, const dnnl::engine &p_engine, + const std::vector &inputs, + const std::vector &outputs) { + + CHECK(locate_io(sg, inputs, outputs)); + + // Retrieve mds and create pd, primitive + auto md_q = make_dnnl_memory_desc(q_->get_logical_tensor()); + auto md_k = make_dnnl_memory_desc(k_->get_logical_tensor()); + auto md_v = make_dnnl_memory_desc(v_->get_logical_tensor()); + auto md_dst = make_dnnl_memory_desc(dst_->get_logical_tensor()); + + dnnl::memory::desc md_mask; + if (attn_mask_) + md_mask = make_dnnl_memory_desc(attn_mask_->get_logical_tensor()); + + auto scale_dt = impl::data_type::undef; + if (scale_) scale_dt = scale_->get_logical_tensor().data_type; + + dnnl::primitive_attr attr; + + auto &mgr = sg->fusion_info_mgr_; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + attr.set_fpmath_mode( + static_cast(mgr.get_fpmath_mode())); + + CHECK(create_sdpa_pd(sdpa_pd_, p_engine.get(), md_q.get(), md_k.get(), + md_v.get(), md_dst.get(), md_mask.get(), scale_dt, + invert_scale_, attr.get())); + + auto status = sdpa_pd_->create_primitive(sdpa_prim_, p_engine.get()); + + if (status != status::success) { + if (get_verbose(verbose_t::create_dispatch, component_t::graph)) { + printf("onednn_verbose,graph,create:dispatch,sdpa,could not " + "create primitive, falling back\n"); + } + } + + return status; + } +}; + +class sdp_primitive_kernel_t : public kernel_base_t { +private: + allocator_t *g_alloc_ = nullptr; + + 
std::shared_ptr subgraph_; + memory_planner_t memory_planner_; + std::function()> resource_ctor_; + + sdp_primitive_config_t cfg_; + +public: + sdp_primitive_kernel_t() { + thread_local_cache_t res_cache; + res_cache.retain(); + } + + ~sdp_primitive_kernel_t() override { + thread_local_cache_t res_cache; + res_cache.remove_if_exist(reinterpret_cast(this)); + res_cache.release(); + } + + status_t compile_impl(const dnnl_partition_impl_t *part, + const engine_t *g_engine, + const std::vector &inputs, + const std::vector &outputs) override { + p_engine_ = make_dnnl_engine(*g_engine); + g_alloc_ = reinterpret_cast( + g_engine->get_allocator()); + + // First, dry run on a deep copy + subgraph_ = std::make_shared( + graph_t::deep_copy(part->get_ops()), p_engine_, + part->get_fpmath_mode(), part->get_use_blocked_layout(), true); + CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs)); + + subgraph_visualizer_t vis(part->id(), [this](const value_t *val) { + return this->memory_planner_.get_memory_info(val); + }); + pass_pipeline_t pipeline = pass_pipeline_t(vis); + + BACKEND_DNNL_ADD_PASS(pipeline, lower_down); + BACKEND_DNNL_ADD_PASS(pipeline, binary_canonicalization); + BACKEND_DNNL_ADD_PASS(pipeline, insert_permute_for_matmul); + + pipeline.reset_visualize_arg(true, false); + BACKEND_DNNL_ADD_PASS(pipeline, infer_shape); + BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul); + BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation); + + // bind the memory for each op + auto memory_plan = [&](std::shared_ptr &sg) { + return memory_planner_.run(sg); + }; + pipeline.reset_visualize_arg(true, true); + BACKEND_DNNL_ADD_PASS(pipeline, memory_plan); + + auto modify_subgraph = [&] { + // Run the added passes + CHECK(pipeline.run(subgraph_)); + + // fill information for inputs logical tensors + for (size_t i = 0; i < inputs.size(); i++) { + auto &in = const_cast(inputs[i]); + in = subgraph_->ins_[i]; + } + + // fill information for outputs logical tensors + for 
(size_t i = 0; i < outputs.size(); i++) {
+                auto &out = const_cast(outputs[i]);
+                out = subgraph_->outs_[i];
+            }
+
+            return status::success;
+        };
+
+        resource_ctor_ = [this]() {
+            return this->memory_planner_.get_exec_args_set().clone();
+        };
+
+        CHECK(modify_subgraph());
+        CHECK(cfg_.init(subgraph_, p_engine_, inputs, outputs));
+
+        // Successfully created the primitive. Rerun the passes again, modifying
+        // the original ops.
+        subgraph_ = std::make_shared(part->get_ops(), p_engine_,
+                part->get_fpmath_mode(), part->get_use_blocked_layout(), true);
+        CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs));
+        CHECK(modify_subgraph());
+        CHECK(cfg_.locate_io(subgraph_, inputs, outputs));
+
+        return status::success;
+    }
+
+    // Rebinds the memories held by `res` to the buffers of the current call:
+    // external inputs/outputs take the data handles of `inputs`/`outputs`
+    // (selected by the stored index), and internal temporaries take the
+    // addresses handed out by a grantor built on the scratchpad buffer.
+    void prepare_args_set(const execution_args_set_t *res,
+            const std::vector &inputs,
+            const std::vector &outputs,
+            const scratchpad_t &scratchpad) {
+        // update the data of partition in/outputs args
+        for (const auto &mem_idx : res->get_mems_use_external_inputs()) {
+            mem_idx.first.set_data_handle(
+                    inputs[mem_idx.second].get_data_handle());
+        }
+        for (const auto &mem_idx : res->get_mems_use_external_outputs()) {
+            mem_idx.first.set_data_handle(
+                    outputs[mem_idx.second].get_data_handle());
+        }
+
+        grantor_t var_grantor = memory_planner_.internal_temporary_grantor(
+                scratchpad.get_buffer());
+
+        for (auto &mem_offkey : res->get_mems_use_internal_temporary()) {
+            mem_offkey.first.set_data_handle(
+                    var_grantor.get(mem_offkey.second));
+        }
+    }
+
+    // Fills `args` with the primitive's execution arguments.
+    // The memories for q/k/v/dst (and for scale / attn_mask when those are
+    // configured) are looked up in `res` and stored in `mem_storage` slots
+    // 0..5. Returns status::runtime_error if any lookup fails.
+    status_t get_prim_exec_args(exec_args_t &args, memory (&mem_storage)[6],
+            const execution_args_set_t *res) {
+        bool ok = res->find_value_mem_map(cfg_.q_.get(), mem_storage[0])
+                && res->find_value_mem_map(cfg_.k_.get(), mem_storage[1])
+                && res->find_value_mem_map(cfg_.v_.get(), mem_storage[2])
+                && res->find_value_mem_map(cfg_.dst_.get(), mem_storage[3]);
+
+        if (cfg_.scale_)
+            ok = ok
+                    && res->find_value_mem_map(
+                            cfg_.scale_.get(), mem_storage[4]);
+        if (cfg_.attn_mask_)
+            ok = ok
+                    && res->find_value_mem_map(
+                            cfg_.attn_mask_.get(), mem_storage[5]);
+
+        if (!ok) return status::runtime_error;
+
+        memory_arg_t mem_arg_q = {mem_storage[0].get(), true};
+        memory_arg_t mem_arg_k = {mem_storage[1].get(), true};
+        memory_arg_t mem_arg_v = {mem_storage[2].get(), true};
+        memory_arg_t mem_arg_dst = {mem_storage[3].get(), false};
+        // Slots 4 and 5 are only filled when scale / mask are configured;
+        // NOTE(review): get(true) presumably tolerates an empty memory -- confirm.
+        memory_arg_t mem_arg_scale = {mem_storage[4].get(true), true};
+        memory_arg_t mem_arg_mask = {mem_storage[5].get(true), true};
+
+        args.clear();
+        args[DNNL_ARG_QUERIES] = mem_arg_q;
+        args[DNNL_ARG_KEYS] = mem_arg_k;
+        args[DNNL_ARG_VALUES] = mem_arg_v;
+        args[DNNL_ARG_DST] = mem_arg_dst;
+        args[DNNL_ARG_SCALE] = mem_arg_scale;
+        args[DNNL_ARG_ATTN_MASK] = mem_arg_mask;
+
+        return status::success;
+    }
+
+    // Executes the SDPA primitive on `g_stream`: fetches (or creates through
+    // resource_ctor_) the cached argument set keyed by `this`, allocates a
+    // temporary scratchpad, rebinds the memories and runs the primitive.
+    status_t execute_impl(const stream_t *g_stream,
+            const std::vector &inputs,
+            const std::vector &outputs) override {
+        dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
+
+        thread_local_cache_t res_cache;
+        execution_args_set_t *res = res_cache.get_or_add(
+                reinterpret_cast(this), resource_ctor_);
+
+        temporary_scratchpad_t scratchpad(
+                memory_planner_.total_internal_temporary_size(), p_engine_,
+                *g_alloc_);
+        assertm(scratchpad.size()
+                        >= memory_planner_.total_internal_temporary_size(),
+                "not enough scratchpad memory");
+        prepare_args_set(res, inputs, outputs, scratchpad);
+
+        memory mem_storage[6];
+        exec_args_t args;
+        CHECK(get_prim_exec_args(args, mem_storage, res));
+        exec_ctx_t ctx(p_stream.get(), std::move(args));
+
+        return cfg_.sdpa_prim_->execute(ctx);
+    }
+
+#ifdef DNNL_WITH_SYCL
+    // SYCL variant of execute_impl. Registers `sycl_deps` on the stream's
+    // SYCL context before running, hands the stream's output event to the
+    // scratchpad, and returns it through `sycl_event` when that is non-null.
+    status_t sycl_execute_impl(const stream_t *g_stream,
+            const std::vector &inputs,
+            const std::vector &outputs,
+            const std::vector<::sycl::event> &sycl_deps,
+            ::sycl::event *sycl_event) override {
+
+        dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
+
+        thread_local_cache_t res_cache;
+        execution_args_set_t *res = res_cache.get_or_add(
+                reinterpret_cast(this), resource_ctor_);
+
+        temporary_scratchpad_t scratchpad(
+                memory_planner_.total_internal_temporary_size(), p_engine_,
+                *g_alloc_);
+        assertm(scratchpad.size()
+                        >= memory_planner_.total_internal_temporary_size(),
+                "not enough scratchpad memory");
+        prepare_args_set(res, inputs, outputs, scratchpad);
+
+        memory mem_storage[6];
+        exec_args_t args;
+        CHECK(get_prim_exec_args(args, mem_storage, res));
+        exec_ctx_t ctx(p_stream.get(), std::move(args));
+
+        auto *sycl_stream = dnnl::impl::utils::downcast<
+                dnnl::impl::sycl::sycl_stream_t *>(p_stream.get());
+
+        sycl_stream->before_exec_hook();
+
+        if (!sycl_deps.empty()) sycl_stream->sycl_ctx().set_deps(sycl_deps);
+
+        auto status = cfg_.sdpa_prim_->execute(ctx);
+
+        auto return_event = sycl_stream->get_output_event();
+
+        scratchpad.set_deps(return_event);
+        if (sycl_event) *sycl_event = return_event;
+
+        sycl_stream->after_exec_hook();
+
+        return status;
+    }
+#endif
+
+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
+    // OpenCL variant of execute_impl. Wraps `cl_deps` and registers them on
+    // the stream's OpenCL context before running. The returned event stays
+    // nullptr when the stream has the in_order flag set; otherwise it is the
+    // stream's output event, released from its wrapper.
+    status_t ocl_execute_impl(const stream_t *g_stream,
+            const std::vector &inputs,
+            const std::vector &outputs,
+            const std::vector &cl_deps,
+            cl_event *ret_event) override {
+
+        dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
+
+        thread_local_cache_t res_cache;
+        execution_args_set_t *res = res_cache.get_or_add(
+                reinterpret_cast(this), resource_ctor_);
+
+        temporary_scratchpad_t scratchpad(
+                memory_planner_.total_internal_temporary_size(), p_engine_,
+                *g_alloc_);
+        assertm(scratchpad.size()
+                        >= memory_planner_.total_internal_temporary_size(),
+                "not enough scratchpad memory");
+        prepare_args_set(res, inputs, outputs, scratchpad);
+
+        memory mem_storage[6];
+        exec_args_t args;
+        CHECK(get_prim_exec_args(args, mem_storage, res));
+        exec_ctx_t ctx(p_stream.get(), std::move(args));
+
+        // TODO (pc): refactor
+        namespace ocl = gpu::intel::ocl;
+        auto *ocl_stream = dnnl::impl::utils::downcast(
+                p_stream.get());
+
+        ocl_stream->before_exec_hook();
+
+        if (!cl_deps.empty()) {
+            std::vector> events(cl_deps.size());
+            for (size_t i = 0; i < cl_deps.size(); i++)
+                events[i] = xpu::ocl::wrapper_t(cl_deps[i], true);
+            ocl_stream->ocl_ctx().set_deps(events);
+        }
+
+        auto status = cfg_.sdpa_prim_->execute(ctx);
+
+        cl_event return_event = nullptr;
+        if ((ocl_stream->flags() & stream_flags::in_order) == 0) {
+            auto last = ocl_stream->get_output_event();
+            return_event = last.release();
+        }
+
+        scratchpad.set_deps(return_event);
+        if (ret_event) *ret_event = return_event;
+
+        ocl_stream->after_exec_hook();
+
+        return status;
+    }
+#endif
+};
+
+} // namespace dnnl_impl
+} // namespace graph
+} // namespace impl
+} // namespace dnnl
+
+#endif
From 39d33dbbd9b852ae71da7bf87c3474fc10afc4f1 Mon Sep 17 00:00:00 2001
From: Peter Caday
Date: Mon, 8 Apr 2024 06:49:04 -0700
Subject: [PATCH 081/187] gpu: ocl: reference sdpa implementation

---
 src/gpu/gpu_sdpa_list.cpp | 2 +
 src/gpu/intel/ocl/ref_sdpa.cl | 85 +++++++++++++++++++++
 src/gpu/intel/ocl/ref_sdpa.cpp | 66 +++++++++++++++++
 src/gpu/intel/ocl/ref_sdpa.hpp | 122 +++++++++++++++++++++++++++++++
 src/gpu/intel/primitive_conf.hpp | 5 +-
 5 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 src/gpu/intel/ocl/ref_sdpa.cl
 create mode 100644 src/gpu/intel/ocl/ref_sdpa.cpp
 create mode 100644 src/gpu/intel/ocl/ref_sdpa.hpp

diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp
index 04907620b7f..173a58df8e8 100755
--- a/src/gpu/gpu_sdpa_list.cpp
+++ b/src/gpu/gpu_sdpa_list.cpp
@@ -17,6 +17,7 @@
 #include "common/compiler_workarounds.hpp"
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/ref_sdpa.hpp"
 
 namespace dnnl {
 namespace impl {
@@ -26,6 +27,7 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = {
+        INSTANCE(intel::ocl::ref_sdpa_t)
         nullptr,
 };
 // clang-format on
diff --git a/src/gpu/intel/ocl/ref_sdpa.cl b/src/gpu/intel/ocl/ref_sdpa.cl
new file mode 100644
index 00000000000..d801f84b901
--- /dev/null
+++ b/src/gpu/intel/ocl/ref_sdpa.cl
@@ -0,0 +1,85 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "gpu/intel/ocl/ocl_post_ops.h"
+#include "gpu/intel/ocl/ocl_types.h"
+
+// Element offset into a 4D blocked tensor; per-dimension block sizes (_Bn),
+// inner strides (_SBn) and outer strides (_Sn) come from <tag>-prefixed
+// build-time defines.
+#define _4D_OFF(tag, x0, x1, x2, x3) \
+    (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \
+            + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \
+            + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \
+            + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3)
+
+#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3)
+#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3)
+#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3)
+#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3)
+
+// Reference scaled dot-product attention. Each work-item computes one row of
+// the output: global id 0 selects the query row, ids 1 and 2 select the two
+// batch dimensions. Batch indices are taken modulo each tensor's own batch
+// extents (<TAG>_D0 / <TAG>_D1).
+//   nv - number of output columns written per row (inner V dimension)
+//   nd - reduction length of the Q*K product
+__kernel void ref_sdpa(const __global QRY_DATA_T *Q,
+        const __global KEY_DATA_T *K, const __global VAL_DATA_T *V,
+        __global DST_DATA_T *dst, const __global SCALE_DATA_T *scale_ptr,
+        const __global MSK_DATA_T *mask, long nv, long nd) {
+
+    long q = get_global_id(0);
+    long b0 = get_global_id(1);
+    long b1 = get_global_id(2);
+    SCALE_DATA_T scale = *scale_ptr;
+
+#if INVERT_SCALE
+    scale = 1 / scale;
+#endif
+
+    float s[SIZE_K];
+
+    // Multiply (row of Q)*K
+    for (long k = 0; k < SIZE_K; k++) {
+        float acc = 0;
+        for (long h = 0; h < nd; h++) {
+            long qry_off = QRY_OFF(b1 % QRY_D0, b0 % QRY_D1, q, h);
+            long key_off = KEY_OFF(b1 % KEY_D0, b0 % KEY_D1, h, k);
+
+            acc += convert_float(Q[qry_off]) * convert_float(K[key_off]);
+        }
+        s[k] = acc;
+    }
+
+    // Scale + shift, tracking the row maximum.
+    float s_max = -INFINITY;
+    for (long k = 0; k < SIZE_K; k++) {
+        s[k] *= scale;
+#if WITH_ATTN_MASK
+        long msk_off = MSK_OFF(b1 % MSK_D0, b0 % MSK_D1, q, k);
+        s[k] += convert_float(mask[msk_off]);
+#endif
+        s_max = fmax(s_max, s[k]);
+    }
+
+    // Softmax. Subtracting the row maximum leaves the result mathematically
+    // unchanged but keeps exp() from overflowing to inf (and the row from
+    // turning into NaN) when the scores are large.
+    float s_sum = 0;
+    for (long k = 0; k < SIZE_K; k++) {
+        s[k] = exp(s[k] - s_max);
+        s_sum += s[k];
+    }
+
+    for (long k = 0; k < SIZE_K; k++)
+        s[k] /= s_sum;
+
+    // Multiply (row of S)*V
+    for (long v = 0; v < nv; v++) {
+        float acc = 0;
+        for (long k = 0; k < SIZE_K; k++) {
+            long val_off = VAL_OFF(b1 % VAL_D0, b0 % VAL_D1, k, v);
+            acc += convert_float(V[val_off]) * s[k];
+        }
+
+        long dst_off = DST_OFF(b1 % DST_D0, b0 % DST_D1, 0, q, v);
+        dst[dst_off] = TO_DST(acc);
+    }
+}
diff --git a/src/gpu/intel/ocl/ref_sdpa.cpp b/src/gpu/intel/ocl/ref_sdpa.cpp
new file mode 100644
index 00000000000..2218c8eba21
--- /dev/null
+++ b/src/gpu/intel/ocl/ref_sdpa.cpp
@@ -0,0 +1,66 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "gpu/intel/ocl/ref_sdpa.hpp"
+
+#include "common/c_types_map.hpp"
+#include "common/type_helpers.hpp"
+#include "gpu/intel/compute/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace ocl {
+
+// Launches the reference SDPA kernel with one work-item per
+// (query row, B0, B1), where B0/B1 are the two dimensions of dst that precede
+// its last two (1 when dst has too few dimensions).
+status_t ref_sdpa_t::execute_ref(const exec_ctx_t &ctx) const {
+    const auto &qry = CTX_IN_STORAGE(DNNL_ARG_QUERIES);
+    const auto &key = CTX_IN_STORAGE(DNNL_ARG_KEYS);
+    const auto &val = CTX_IN_STORAGE(DNNL_ARG_VALUES);
+    auto &dst = CTX_OUT_STORAGE(DNNL_ARG_DST);
+    const auto &scale = CTX_IN_STORAGE(DNNL_ARG_SCALE);
+    const auto &attn_mask = CTX_IN_STORAGE(DNNL_ARG_ATTN_MASK);
+
+    const auto dst_mdw = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
+
+    const int last = dst_mdw.ndims() - 1;
+    const dim_t B1 = dst_mdw.ndims() > 3 ? dst_mdw.dims()[last - 3] : 1;
+    const dim_t B0 = dst_mdw.ndims() > 2 ? dst_mdw.dims()[last - 2] : 1;
+    const dim_t V = pd()->desc()->values();
+    const dim_t D = pd()->desc()->head_size();
+    const dim_t Q = pd()->desc()->queries();
+
+    // Argument order must match the ref_sdpa kernel signature; V and D are
+    // passed as its trailing `nv` and `nd` parameters.
+    compute::kernel_arg_list_t arg_list;
+    arg_list.set(0, qry);
+    arg_list.set(1, key);
+    arg_list.set(2, val);
+    arg_list.set(3, dst);
+    arg_list.set(4, scale);
+    arg_list.set(5, attn_mask);
+    arg_list.set(6, V);
+    arg_list.set(7, D);
+
+    compute::range_t gws = {(size_t)Q, (size_t)B0, (size_t)B1};
+    auto nd_range = compute::nd_range_t(gws);
+
+    return parallel_for(ctx, nd_range, kernel_, arg_list);
+}
+
+} // namespace ocl
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/ocl/ref_sdpa.hpp b/src/gpu/intel/ocl/ref_sdpa.hpp
new file mode 100644
index 00000000000..208a3fdf7b1
--- /dev/null
+++ b/src/gpu/intel/ocl/ref_sdpa.hpp
@@ -0,0 +1,122 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+*
you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_OCL_REF_SDPA_HPP
+#define GPU_OCL_REF_SDPA_HPP
+
+#include
+
+#include "common/c_types_map.hpp"
+#include "common/primitive.hpp"
+#include "common/sdpa_pd.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "gpu/intel/gpu_primitive.hpp"
+#include "gpu/intel/gpu_resource.hpp"
+#include "gpu/intel/ocl/ocl_utils.hpp"
+#include "gpu/intel/primitive_conf.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace ocl {
+
+// Reference OpenCL implementation of scaled dot-product attention.
+struct ref_sdpa_t : public gpu_primitive_t {
+    using gpu_primitive_t::gpu_primitive_t;
+    struct pd_t : public sdpa_pd_t {
+        using sdpa_pd_t::sdpa_pd_t;
+
+        DECLARE_COMMON_PD_T("ocl:ref:any", ref_sdpa_t);
+
+        // Accepts the problem only when the attributes are default (apart
+        // from runtime scales), Q/K/V/dst -- and the attention mask, if
+        // present -- are all 4D, and default formats can be set.
+        status_t init(engine_t *engine) {
+            using namespace data_type;
+            using smask_t = primitive_attr_t::skip_mask_t;
+
+            VDISPATCH_SDPA(attr()->has_default_values(smask_t::scales_runtime),
+                    VERBOSE_UNSUPPORTED_ATTR);
+            VDISPATCH_SDPA(
+                    utils::everyone_is(4, qry_md()->ndims, key_md()->ndims,
+                            val_md()->ndims, dst_md()->ndims),
+                    VERBOSE_UNSUPPORTED_TAG);
+            if (with_attn_mask()) {
+                VDISPATCH_SDPA(
+                        attn_mask_md()->ndims == 4, VERBOSE_UNSUPPORTED_TAG);
+            }
+            VDISPATCH_SDPA(set_default_formats(), VERBOSE_UNSUPPORTED_TAG);
+
+            return status::success;
+        }
+    };
+
+    // Builds the "ref_sdpa" kernel. Tensor layouts (QRY/KEY/VAL/DST/MSK
+    // offset defines), data types, the key count (SIZE_K) and the
+    // INVERT_SCALE / WITH_ATTN_MASK switches are all baked in as
+    // compile-time definitions.
+    status_t init(engine_t *engine) override {
+        compute::kernel_ctx_t kernel_ctx;
+
+        kernel_ctx.set_data_type(pd()->dst_md()->data_type);
+
+        int ndims = 4;
+
+        const memory_desc_wrapper qry_mdw(pd()->qry_md());
+        const memory_desc_wrapper key_mdw(pd()->key_md());
+        const memory_desc_wrapper val_mdw(pd()->val_md());
+        const memory_desc_wrapper dst_mdw(pd()->dst_md());
+        const memory_desc_wrapper msk_mdw(pd()->attn_mask_md());
+        using offset_t = decltype(offsets_t().src_off);
+        offset_t qry_off, key_off, val_off, dst_off, msk_off;
+        set_offsets(qry_mdw, qry_off);
+        set_offsets(key_mdw, key_off);
+        set_offsets(val_mdw, val_off);
+        set_offsets(dst_mdw, dst_off);
+        set_offsets(msk_mdw, msk_off);
+        def_offsets(qry_off, kernel_ctx, "QRY", ndims);
+        def_offsets(key_off, kernel_ctx, "KEY", ndims);
+        def_offsets(val_off, kernel_ctx, "VAL", ndims);
+        def_offsets(dst_off, kernel_ctx, "DST", ndims);
+        def_offsets(msk_off, kernel_ctx, "MSK", ndims);
+        kernel_ctx.define_int("NDIMS", ndims);
+
+        kernel_ctx.define_int("SIZE_K", pd()->desc()->keys());
+        kernel_ctx.define_int("INVERT_SCALE", pd()->desc()->invert_scale);
+        kernel_ctx.define_int("WITH_ATTN_MASK", pd()->with_attn_mask());
+
+        def_data_type(kernel_ctx, pd()->qry_md()->data_type, "QRY");
+        def_data_type(kernel_ctx, pd()->key_md()->data_type, "KEY");
+        def_data_type(kernel_ctx, pd()->val_md()->data_type, "VAL");
+        def_data_type(kernel_ctx, pd()->dst_md()->data_type, "DST");
+        // NOTE(review): the mask type is defined even when no mask is used;
+        // presumably it is data_type::undef in that case -- confirm.
+        def_data_type(kernel_ctx, pd()->attn_mask_md()->data_type, "MSK");
+        def_data_type(kernel_ctx, pd()->desc()->scale_dt, "SCALE");
+        CHECK(create_kernel(engine, &kernel_, "ref_sdpa", kernel_ctx));
+        if (!kernel_) return status::runtime_error;
+        return status::success;
+    }
+
+    status_t execute(const exec_ctx_t &ctx) const override {
+        return execute_ref(ctx);
+    }
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    status_t execute_ref(const exec_ctx_t &ctx) const;
+    compute::kernel_t kernel_;
+};
+
+} // namespace ocl
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/primitive_conf.hpp
b/src/gpu/intel/primitive_conf.hpp index 90402d4719e..a85a8a81250 100644 --- a/src/gpu/intel/primitive_conf.hpp +++ b/src/gpu/intel/primitive_conf.hpp @@ -1095,6 +1095,10 @@ inline void def_block_offsets(const block_layout_t &layout, inline void def_data_type( compute::kernel_ctx_t &kernel_ctx, data_type_t dt, const char *str) { switch (dt) { + case data_type::undef: + kernel_ctx.add_option( + utils::format("-D%s_DATA_T=void -D%s_DT_UNDEF", str, str)); + break; case data_type::bf16: kernel_ctx.add_option( utils::format("-D%s_DATA_T=ushort -D%s_DT_BF16", str, str)); @@ -1139,7 +1143,6 @@ inline void def_data_type( kernel_ctx.add_option( utils::format("-D%s_DATA_T=int -D%s_DT_S32", str, str)); break; - case data_type::undef: break; default: gpu_error_not_expected() << "Unexpected data type " << dnnl_dt2str(dt); From 4ee0caa6891f8fea527fc638f619abdca67fdb4a Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Wed, 10 Apr 2024 23:07:54 -0700 Subject: [PATCH 082/187] gpu: jit: ngen: decoder component --- src/gpu/intel/jit/ngen/ngen_decoder.hpp | 94 +++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/gpu/intel/jit/ngen/ngen_decoder.hpp diff --git a/src/gpu/intel/jit/ngen/ngen_decoder.hpp b/src/gpu/intel/jit/ngen/ngen_decoder.hpp new file mode 100644 index 00000000000..b0a08c0acb9 --- /dev/null +++ b/src/gpu/intel/jit/ngen/ngen_decoder.hpp @@ -0,0 +1,94 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef NGEN_DECODER_HPP
+#define NGEN_DECODER_HPP
+
+#include "ngen_core.hpp"
+#include "ngen_auto_swsb.hpp"
+
+namespace NGEN_NAMESPACE {
+
+#include "ngen_gen8.hpp"
+#include "ngen_gen12.hpp"
+
+using DependencyRegion = autoswsb::DependencyRegion;
+
+#ifdef NGEN_SAFE
+class unsupported_compaction : public std::runtime_error {
+public:
+    unsupported_compaction() : std::runtime_error("Compacted instructions are not supported") {}
+};
+class unimplemented : public std::runtime_error {
+public:
+    unimplemented() : std::runtime_error("Operation is not implemented") {}
+};
+#endif
+
+// Forward-only cursor over a buffer of uncompacted 16-byte instructions.
+// The buffer is not owned and must outlive the decoder.
+// NOTE(review): in this copy `®ion` looks like a mis-decoded `&region`, and
+// template argument lists (std::vector, static_cast, get) appear to have
+// been stripped -- restore them before applying the patch.
+class Decoder
+{
+public:
+    Decoder(HW hw_, const std::vector &program) : Decoder(hw_, program.data(), program.size()) {}
+    Decoder(HW hw_, const uint8_t *program, size_t bytes)
+        : hw(hw_), current(program), end(program + bytes) {}
+
+    // Steps to the next instruction (0x10 bytes further on).
+    void advance() { checkCompaction(); current += 0x10; }
+    bool done() const { return current >= end; }
+
+    // The opcode is the low 7 bits of the first instruction byte.
+    Opcode opcode() const { return static_cast(*current & 0x7F); }
+    inline bool getOperandRegion(autoswsb::DependencyRegion ®ion, int opNum) const;
+
+protected:
+    HW hw;
+    const uint8_t *current, *end;
+
+    // Throws unsupported_compaction for a compacted instruction; only active
+    // when NGEN_SAFE is defined.
+    void checkCompaction() const {
+#ifdef NGEN_SAFE
+        if (get().common.cmptCtrl) /* same bit pre-Gen12 */
+            throw unsupported_compaction();
+#endif
+    }
+
+    // Copies the current instruction out with memcpy rather than casting
+    // the byte pointer.
+    template
+    Instruction get() const {
+        Instruction i;
+        std::memcpy(&i, current, sizeof(i));
+        return i;
+    }
+};
+
+// Decodes operand `opNum` of the current instruction into `region`,
+// dispatching on the hardware generation. For hardware older than Gen12LP
+// this throws unimplemented under NGEN_SAFE and returns false otherwise.
+bool Decoder::getOperandRegion(autoswsb::DependencyRegion ®ion, int opNum) const
+{
+    checkCompaction();
+    region.hw = hw;
+#if XE3P
+    if (hw >= HW::Xe3p)
+        return get().getOperandRegion(region, opNum);
+#endif
+    if (hw >= HW::XeHPC)
+        return get().getOperandRegion(region, opNum);
+    if (hw >= HW::Gen12LP)
+        return get().getOperandRegion(region, opNum);
+#ifdef NGEN_SAFE
+    throw unimplemented();
+#else
+    return false;
+#endif
+}
+
+} /* namespace NGEN_NAMESPACE */
+
+#endif /* header guard */
From 8963d058a63889cd688d99f7f39cc134e4d76587 Mon Sep 17 00:00:00 2001
From: Peter Caday
Date: Mon, 8 Apr 2024 08:16:06 -0700
Subject: [PATCH 083/187] gpu: microkernels: service library

---
 src/gpu/gpu_sdpa_list.cpp | 0
 src/gpu/intel/CMakeLists.txt | 1 +
 src/gpu/intel/microkernels/CMakeLists.txt | 25 +
 src/gpu/intel/microkernels/elf.hpp | 85 ++
 src/gpu/intel/microkernels/entrance_agent.cpp | 112 +++
 src/gpu/intel/microkernels/entrance_agent.hpp | 48 ++
 src/gpu/intel/microkernels/fuser.cpp | 209 +++++
 src/gpu/intel/microkernels/fuser.hpp | 48 ++
 .../intel/microkernels/internal_utilities.hpp | 37 +
 src/gpu/intel/microkernels/package.hpp | 146 ++++
 src/gpu/intel/microkernels/protocol.cpp | 127 +++
 src/gpu/intel/microkernels/protocol.hpp | 118 +++
 src/gpu/intel/microkernels/shim.cpp | 723 ++++++++++++++++++
 src/gpu/intel/microkernels/shim.hpp | 51 ++
 src/gpu/intel/ocl/ocl_gpu_engine.cpp | 38 +-
 15 files changed, 1766 insertions(+), 2 deletions(-)
 mode change 100755 => 100644 src/gpu/gpu_sdpa_list.cpp
 create mode 100644 src/gpu/intel/microkernels/CMakeLists.txt
 create mode 100644 src/gpu/intel/microkernels/elf.hpp
 create mode 100644 src/gpu/intel/microkernels/entrance_agent.cpp
 create mode 100644 src/gpu/intel/microkernels/entrance_agent.hpp
 create mode 100644 src/gpu/intel/microkernels/fuser.cpp
 create mode 100644 src/gpu/intel/microkernels/fuser.hpp
 create mode 100644 src/gpu/intel/microkernels/internal_utilities.hpp
 create mode 100644 src/gpu/intel/microkernels/package.hpp
 create mode 100644 src/gpu/intel/microkernels/protocol.cpp
 create mode 100644 src/gpu/intel/microkernels/protocol.hpp
 create mode 100644 src/gpu/intel/microkernels/shim.cpp
 create mode 100644 src/gpu/intel/microkernels/shim.hpp

diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp
old mode 100755
new mode 100644
diff --git a/src/gpu/intel/CMakeLists.txt b/src/gpu/intel/CMakeLists.txt
index
1b8c3b5d594..994bfb9af39 100644 --- a/src/gpu/intel/CMakeLists.txt +++ b/src/gpu/intel/CMakeLists.txt @@ -26,6 +26,7 @@ add_definitions_with_host_compiler(-DNGEN_NO_OP_NAMES) add_definitions_with_host_compiler(-DNGEN_WINDOWS_COMPAT) add_subdirectory(compute) +add_subdirectory(microkernels) add_subdirectory(jit) add_subdirectory(ocl) diff --git a/src/gpu/intel/microkernels/CMakeLists.txt b/src/gpu/intel/microkernels/CMakeLists.txt new file mode 100644 index 00000000000..a450fe006cd --- /dev/null +++ b/src/gpu/intel/microkernels/CMakeLists.txt @@ -0,0 +1,25 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#===============================================================================
+
+file(GLOB SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+    )
+
+# Build the microkernel sources as an object library and append its object
+# files to the global DNNL_LIB_DEPS property.
+set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_microkernels)
+add_library(${OBJ_LIB} OBJECT ${SOURCES})
+set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
+    $<TARGET_OBJECTS:${OBJ_LIB}>)
diff --git a/src/gpu/intel/microkernels/elf.hpp b/src/gpu/intel/microkernels/elf.hpp
new file mode 100644
index 00000000000..5714744fafc
--- /dev/null
+++ b/src/gpu/intel/microkernels/elf.hpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_ELF_HPP
+#define GPU_MICROKERNELS_ELF_HPP
+
+#include <cstdint>
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Values expected in the identification fields of the file header.
+enum {
+    ELFMagic = 0x464C457F, // '\x7FELF'
+    ELFClass64 = 2,
+    ELFLittleEndian = 1,
+    ELFVersion1 = 1,
+    ELFRelocatable = 1,
+};
+// Machine and file type values used by zebin program binaries.
+enum { MachineIntelGT = 205, ZebinExec = 0xFF12 };
+
+// 64-bit ELF file header; this struct is overlaid directly on the binary.
+struct FileHeader {
+    uint32_t magic;
+    uint8_t elfClass;
+    uint8_t endian;
+    uint8_t version;
+    uint8_t osABI;
+    uint64_t pad;
+    uint16_t type;
+    uint16_t machine;
+    uint32_t version2;
+    uint64_t entrypoint;
+    uint64_t programHeaderOff;
+    uint64_t sectionTableOff;
+    uint32_t flags;
+    uint16_t size;
+    uint16_t programHeaderSize;
+    uint16_t programTableEntries;
+    uint16_t sectionHeaderSize;
+    uint16_t sectionCount;
+    uint16_t strTableIndex;
+};
+
+// 64-bit ELF section header; `name` is an offset into the section name
+// string table.
+struct SectionHeader {
+    uint32_t name;
+    enum Type : uint32_t {
+        Null,
+        Program,
+        SymbolTable = 2,
+        StringTable = 3,
+        Note = 7,
+        ZeInfo = 0xFF000011
+    } type;
+    uint64_t flags;
+    uint64_t addr;
+    uint64_t offset;
+    uint64_t size;
+    uint32_t link;
+    uint32_t info;
+    uint64_t alignx10;
+    uint64_t entrySize;
+};
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/entrance_agent.cpp b/src/gpu/intel/microkernels/entrance_agent.cpp
new file mode 100644
index 00000000000..0af9ed7b8d8
--- /dev/null
+++ b/src/gpu/intel/microkernels/entrance_agent.cpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "entrance_agent.hpp"
+
+#include
+
+#include "gpu/intel/jit/ngen/ngen_config.hpp"
+#include "gpu/intel/jit/ngen/ngen_decoder.hpp"
+#include "gpu/intel/jit/ngen/npack/neo_packager.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Analyzes the raw microkernel binary in `package` and fills in the derived
+// fields: systolic usage, clobbered register ranges, minimum GRF count and a
+// hash-based LUID.
+// Returns UnsupportedHW when the hardware is unknown or older than Gen12LP,
+// UncertainClobbers when an indirect destination was seen (the clobber list
+// may then be incomplete), and Success otherwise.
+EntranceAgent::Status EntranceAgent::scan(Package &package) {
+    using namespace ngen;
+
+    auto status = Status::Success;
+
+    auto product = npack::decodeHWIPVersion(package.gmdidCompat);
+    auto hw = getCore(product.family);
+
+    if (hw == HW::Unknown || hw < HW::Gen12LP) return Status::UnsupportedHW;
+
+    Decoder decoder(hw, package.binary);
+    DependencyRegion dstRegion;
+
+    /* Track clobbered registers at full register granularity for simplicity. */
+    // NOTE(review): the std::array template arguments were lost in this copy
+    // of the patch -- restore them before applying.
+    std::array clobbered = {false};
+
+    for (; !decoder.done(); decoder.advance()) {
+        // Check for systolic usage.
+        auto op = decoder.opcode();
+        package.systolic |= (op == Opcode::dpas || op == Opcode::dpasw);
+
+        // Get destination region and add to clobbers.
+        if (decoder.getOperandRegion(dstRegion, -1)) {
+            if (dstRegion.unspecified) {
+                // Indirect destination -- cannot reliably detect clobbers.
+                status = Status::UncertainClobbers;
+            } else
+                for (int j = 0; j < dstRegion.size; j++)
+                    clobbered[dstRegion.base + j] = true;
+        }
+    }
+
+    // Group clobber array into consecutive ranges.
+    package.clobbers.clear();
+
+    int regBytes = GRF::bytes(hw);
+    int base = 0, len = 0;
+    for (int j = 0; j < int(clobbered.size()); j++) {
+        if (clobbered[j]) {
+            if (len > 0)
+                len++;
+            else
+                base = j, len = 1;
+        } else if (len > 0) {
+            package.clobbers.emplace_back(base * regBytes, len * regBytes);
+            len = 0;
+        }
+    }
+    // Flush a run that extends through the last register; the loop above
+    // only emits a range when it sees the first unclobbered register after it.
+    if (len > 0)
+        package.clobbers.emplace_back(base * regBytes, len * regBytes);
+
+    // Capture GRF usage from clobbers and arguments.
+    uint32_t last = 0;
+    if (!package.clobbers.empty()) {
+        auto &final = package.clobbers.back();
+        last = final.boffset + final.blen;
+    }
+    for (const auto &argument : package.arguments)
+        for (auto &range : argument.location)
+            last = std::max(last, range.boffset + range.blen);
+
+    package.grfMin = (last + regBytes - 1) / regBytes;
+
+    // Generate LUID from hash of kernel. Later, the cataloguer can update it in case of collisions.
+    uint32_t luid = 0;
+    uint32_t multiplier = 1357;
+
+    // Only whole 32-bit words are hashed; trailing bytes are ignored.
+    auto *u32ptr = (const uint32_t *)package.binary.data();
+    for (size_t i = 0; i < (package.binary.size() >> 2); i++) {
+        luid ^= u32ptr[i] * multiplier;
+        multiplier += 2;
+        luid = (luid << 3) | (luid >> 29);
+    }
+
+    package.luid = luid;
+
+    return status;
+}
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/microkernels/entrance_agent.hpp b/src/gpu/intel/microkernels/entrance_agent.hpp
new file mode 100644
index 00000000000..0bb9dc654f9
--- /dev/null
+++ b/src/gpu/intel/microkernels/entrance_agent.hpp
@@ -0,0 +1,48 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_ENTRANCE_AGENT_HPP
+#define GPU_MICROKERNELS_ENTRANCE_AGENT_HPP
+
+#include "package.hpp"
+
+// The entrance agent is a stateless class that analyzes an incoming package from the microkernel provider,
+// deducing information from the raw microkernel binary.
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+class EntranceAgent {
+public:
+    // Outcome of scan().
+    enum class Status {
+        Success,
+        UncertainClobbers, // an indirect destination was seen
+        UnsupportedHW, // hardware unknown or older than Gen12LP
+    };
+
+    // Scans package.binary and updates the package's derived fields in place.
+    static Status scan(Package &package);
+};
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/fuser.cpp b/src/gpu/intel/microkernels/fuser.cpp
new file mode 100644
index 00000000000..0e7dfbb0381
--- /dev/null
+++ b/src/gpu/intel/microkernels/fuser.cpp
@@ -0,0 +1,209 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "elf.hpp"
+#include "fuser.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+static void fixupJumpTargets(uint8_t *start, size_t len, ptrdiff_t adjust);
+
+// Splices `microkernel` into the zebin program `binary`, replacing the
+// instruction range delimited by the (sigilStart ^ id) and (sigilEnd ^ id)
+// marker instructions, markers included. Jump offsets crossing the splice,
+// the section table offset and the section offsets/sizes are adjusted for
+// the size change. Every instance of the marker pair is replaced; if none is
+// found the binary is left untouched.
+// Throws std::runtime_error if the binary is not a valid single-kernel zebin
+// or contains compacted instructions.
+void fuseMicrokernel(std::vector<uint8_t> &binary,
+        const std::vector<uint8_t> &microkernel, int id) {
+    auto base = binary.data();
+    auto bytes = binary.size();
+
+    auto fheaderPtr = reinterpret_cast<FileHeader *>(base);
+
+    // The size test comes first so the header fields are only read when the
+    // buffer really holds a full FileHeader (not merely pointer-size bytes),
+    // and the whole section table must lie inside the buffer.
+    bool ok = bytes >= sizeof(FileHeader) && fheaderPtr->magic == ELFMagic
+            && fheaderPtr->elfClass == ELFClass64
+            && fheaderPtr->endian == ELFLittleEndian
+            && fheaderPtr->sectionHeaderSize == sizeof(SectionHeader)
+            && (fheaderPtr->version == 0 || fheaderPtr->version == ELFVersion1)
+            && (fheaderPtr->type == ZebinExec
+                    || fheaderPtr->type == ELFRelocatable)
+            && fheaderPtr->sectionTableOff <= bytes
+            && sizeof(SectionHeader) * fheaderPtr->sectionCount
+                    <= bytes - fheaderPtr->sectionTableOff;
+
+    if (!ok)
+        throw std::runtime_error(
+                "IGC did not generate a valid zebin program binary");
+
+    bool foundZeInfo = false;
+    SectionHeader *text = nullptr;
+    const char *snames = nullptr;
+    int textSectionID = -1;
+
+    auto *sheaderPtr = reinterpret_cast<SectionHeader *>(
+            base + fheaderPtr->sectionTableOff);
+
+    // Index 0 (SHN_UNDEF) means the file has no section name table. The
+    // field is unsigned, so a ">= 0" test would accept every value.
+    const auto strIndex = fheaderPtr->strTableIndex;
+    if (strIndex != 0 && strIndex < fheaderPtr->sectionCount
+            && sheaderPtr[strIndex].offset < bytes)
+        snames = reinterpret_cast<const char *>(
+                base + sheaderPtr[strIndex].offset);
+
+    for (int s = 0; s < fheaderPtr->sectionCount; s++, sheaderPtr++) {
+        switch (sheaderPtr->type) {
+            case SectionHeader::Type::ZeInfo: foundZeInfo = true; break;
+            case SectionHeader::Type::Program: {
+                if (snames) {
+                    std::string sname(snames + sheaderPtr->name);
+                    if (sname == ".text.Intel_Symbol_Table_Void_Program")
+                        continue;
+                    if (sname.substr(0, 6) != ".text.") continue;
+                }
+                if (text)
+                    throw std::runtime_error("Multiple kernels in program");
+                text = sheaderPtr;
+                textSectionID = s;
+                break;
+            }
+            default: break;
+        }
+    }
+
+    if (!foundZeInfo || !text || text->offset + text->size > bytes)
+        throw std::runtime_error(
+                "IGC did not generate a valid zebin program binary");
+
+    // Walk the kernel one 16-byte instruction (four dwords) at a time.
+    auto *insn = reinterpret_cast<const uint32_t *>(base + text->offset);
+    int icount = text->size >> 4;
+
+    const uint8_t *spliceStart = nullptr;
+    const uint8_t *spliceEnd = nullptr;
+
+    for (int inum = 0; inum < icount; inum++, insn += 4) {
+        if (insn[0] & (1u << 29))
+            throw std::runtime_error(
+                    "Found a compacted instruction. Please run with the "
+                    "environment variable IGC_disableCompaction=1");
+        if (insn[3] == (sigilStart ^ id))
+            spliceStart = reinterpret_cast<const uint8_t *>(insn);
+        else if (insn[3] == (sigilEnd ^ id)) {
+            spliceEnd = reinterpret_cast<const uint8_t *>(insn);
+            break;
+        }
+    }
+
+    if (!spliceStart || !spliceEnd) return;
+
+    // The removed range includes the end marker instruction (+16).
+    auto removeBytes = spliceEnd - spliceStart + 16;
+
+    size_t before = spliceStart - base;
+    auto after = bytes - before - removeBytes;
+    ptrdiff_t sizeAdjust = microkernel.size() - removeBytes;
+
+    auto kbefore = before - text->offset;
+    auto kafter = text->size - kbefore - removeBytes;
+
+    std::vector<uint8_t> newBinary(bytes + sizeAdjust);
+    auto newBase = newBinary.data();
+
+    memmove(newBase, base, before);
+    memmove(newBase + before, microkernel.data(), microkernel.size());
+    memmove(newBase + before + microkernel.size(), spliceStart + removeBytes,
+            after);
+
+    // Host code before the splice jumps further forward, code after it
+    // jumps further backward, to get across the resized region.
+    fixupJumpTargets(newBase + text->offset, kbefore, +sizeAdjust);
+    fixupJumpTargets(
+            newBase + before + microkernel.size(), kafter, -sizeAdjust);
+
+    fheaderPtr = reinterpret_cast<FileHeader *>(newBase);
+
+    if (fheaderPtr->sectionTableOff > before)
+        fheaderPtr->sectionTableOff += sizeAdjust;
+
+    sheaderPtr = reinterpret_cast<SectionHeader *>(
+            newBase + fheaderPtr->sectionTableOff);
+    sheaderPtr[textSectionID].size += sizeAdjust;
+    for (int s = 0; s < fheaderPtr->sectionCount; s++, sheaderPtr++)
+        if (sheaderPtr->offset > before) sheaderPtr->offset += sizeAdjust;
+
+#ifdef SPLICE_DEBUG
+    // Binary mode keeps the dumps byte-exact on platforms that translate
+    // line endings.
+    std::ofstream dump0("original.bin", std::ios::binary);
+    dump0.write((const char *)binary.data(), binary.size());
+
+    std::ofstream dump("patched.bin", std::ios::binary);
+    dump.write((const char *)newBinary.data(), newBinary.size());
+#endif
+
+    std::swap(binary, newBinary);
+
+    // Tail-recurse to handle any further instances of this microkernel
+    fuseMicrokernel(binary, microkernel, id);
+}
+
+// Finds every microkernel embedded in `source` and fuses it into `binary`.
+// An embedded microkernel is the sigilBinary marker, a decimal ID, one
+// separator character, and then hex digit pairs up to the end of the line.
+void fuseMicrokernels(std::vector<uint8_t> &binary, const char *source) {
+    std::vector<uint8_t> microkernel;
+    const auto sigilLen = strlen(sigilBinary);
+
+    auto toNybble = [](char c) {
+        return ((c >= 'A') ? (c - 'A' + 10) : (c - '0')) & 0xF;
+    };
+
+    for (const char *s = std::strstr(source, sigilBinary); s;
+            s = std::strstr(s, sigilBinary)) {
+        s += sigilLen;
+        char *after;
+        int id = strtol(s, &after, 10);
+        microkernel.clear();
+        // Skip the separator after the ID, but never step past the
+        // terminating NUL.
+        s = after;
+        if (*s) s++;
+        for (; *s && *s != '\n'; s += 2) {
+            // A dangling half byte ends the payload; do not consume the
+            // newline or the terminator as a hex digit.
+            if (!s[1] || s[1] == '\n') break;
+            microkernel.push_back((toNybble(s[0]) << 4) | toNybble(s[1]));
+        }
+        fuseMicrokernel(binary, microkernel, id);
+    }
+}
+
+// Adds `adjust` to the jump offsets of the `len` bytes of instructions at
+// `start` whose targets fall outside [start, start + len).
+static void fixupJumpTargets(uint8_t *start, size_t len, ptrdiff_t adjust) {
+    auto insn = reinterpret_cast<int32_t *>(start);
+    auto icount = len >> 4;
+
+    for (size_t inum = 0; inum < icount; inum++, insn += 4) {
+        uint8_t op = insn[0] & 0xFF;
+        if ((op & 0xF0) != 0x20) continue; /* skip non-jumps */
+        if (op == 0x2B || op == 0x2D) continue; /* skip ret/calla */
+        bool hasUIP = (op == 0x22 || op == 0x23 || op == 0x24 || op == 0x28
+                || op == 0x2A || op == 0x2E);
+
+        auto jumpFixup = [=](int32_t &ip) {
+            auto target = ptrdiff_t(inum << 4) + ip;
+            if (target < 0 || target >= ptrdiff_t(len)) ip += adjust;
+        };
+
+        if (hasUIP) jumpFixup(insn[2]);
+        jumpFixup(insn[3]);
+    }
+}
+
+// True if `source` contains at least one embedded microkernel marker.
+bool hasMicrokernels(const char *source) {
+    return std::strstr(source, sigilBinary) != nullptr;
+}
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/microkernels/fuser.hpp
b/src/gpu/intel/microkernels/fuser.hpp
@@ -0,0 +1,48 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_FUSER_HPP
+#define GPU_MICROKERNELS_FUSER_HPP
+
+#include <cstdint>
+#include <vector>
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Markers for patch sections (sigilStart/sigilEnd are XORed with the microkernel ID).
+static constexpr uint32_t sigilStart = 0xCAFEFADE;
+static constexpr uint32_t sigilEnd = 0xFADECAFE;
+static constexpr const char *sigilBinary = "@_u_@";
+
+// Fuse the microkernel machine code into the program binary of a compiled host kernel.
+void fuseMicrokernel(std::vector<uint8_t> &binary,
+        const std::vector<uint8_t> &microkernel, int id = 0);
+
+// Fuse microkernels that were embedded directly in source code (marked by sigilBinary).
+void fuseMicrokernels(std::vector<uint8_t> &binary, const char *source);
+bool hasMicrokernels(const char *source);
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/internal_utilities.hpp b/src/gpu/intel/microkernels/internal_utilities.hpp
new file mode 100644
index 00000000000..bb0d98d52b3
--- /dev/null
+++ b/src/gpu/intel/microkernels/internal_utilities.hpp
@@ -0,0 +1,37 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_INTERNAL_UTILITIES_HPP
+#define GPU_MICROKERNELS_INTERNAL_UTILITIES_HPP
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Divide num by denom, rounding the quotient up: (num + denom - 1) / denom.
+// The result is converted to the type of the numerator.
+template <typename T, typename U>
+static inline T divideUp(T num, U denom) {
+    return (num + denom - 1) / denom;
+}
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/package.hpp b/src/gpu/intel/microkernels/package.hpp
new file mode 100644
index 00000000000..f96c171526b
--- /dev/null
+++ b/src/gpu/intel/microkernels/package.hpp
@@ -0,0 +1,146 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_PACKAGE_HPP
+#define GPU_MICROKERNELS_PACKAGE_HPP
+
+#include <array>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "protocol.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+struct Argument;
+struct RegisterRange;
+struct Setting;
+
+// Microkernel package.
+//
+// Fields marked [*] are automatically filled in by the entrance agent.
+struct Package {
+    /* Identifiers */
+    Protocol protocol; // Protocol implemented by microkernel
+    uint64_t luid = 0; // Unique package ID for use in catalog [*]
+    // NOTE(review): byte element type assumed for providerID and binary --
+    // confirm against the code that fills them in.
+    std::vector<uint8_t>
+            providerID; // Optional free-form identifier for use by microkernel provider
+
+    /* Code */
+    std::vector<uint8_t> binary; // Raw binary blob
+
+    /* Register usage */
+    std::vector<Argument>
+            arguments; // Input and output arguments for microkernel
+    std::vector<RegisterRange>
+            clobbers; // Registers clobbered by microkernel (includes arguments) [*]
+
+    /* Requirements */
+    uint32_t gmdidCompat = 0; // Compatible GMDID
+    int grfMin = 0; // Minimum GRF size [*]
+    int barrierCount = 0; // Number of barriers used by microkernel
+    bool systolic = false; // Does microkernel use systolic array? [*]
+
+    /* Configuration */
+    std::vector<Setting>
+            settings; // Description of this microkernel's configuration (WG size, tile size, etc.) for host kernel to interpret
+
+    // Look up a setting by name; throws std::runtime_error if it is absent.
+    inline int getSetting(const char *name) const;
+};
+
+// Contiguous span of register space.
+struct RegisterRange {
+    uint32_t boffset; // Byte offset into GRF
+    uint32_t blen; // Length of range in bytes
+
+    // Default-construct as an empty range rather than leaving the fields
+    // indeterminate.
+    RegisterRange() : boffset(0), blen(0) {}
+    RegisterRange(uint32_t boffset_, uint32_t blen_)
+        : boffset(boffset_), blen(blen_) {}
+};
+
+// Encapsulation of tensor size information.
+struct TensorConfig {
+    static constexpr int maxDims = 4;
+    std::array<int, maxDims> dims
+            = {1, 1, 1, 1}; // Tensor tile size (elements per dimension)
+    std::array<int, maxDims> block = {1, 1, 1,
+            1}; // Block sizes within tile (equal to dims if only one block)
+
+    // Total number of elements in the tile (product of dims).
+    int elements() const {
+        int result = 1;
+        for (auto d : dims)
+            result *= d;
+        return result;
+    }
+
+    // Number of elements in one block (product of block).
+    int blockElements() const {
+        int result = 1;
+        for (auto d : block)
+            result *= d;
+        return result;
+    }
+
+    // True if any dimension is split into more than one block.
+    bool blocked() const {
+        for (int i = 0; i < maxDims; i++)
+            if (block[i] < dims[i]) return true;
+        return false;
+    }
+
+    // Number of blocks in the tile (product of dims[i] / block[i]).
+    int blocks() const {
+        int result = 1;
+        for (int i = 0; i < maxDims; i++)
+            result *= dims[i] / block[i];
+        return result;
+    }
+};
+
+// Information on a single argument (input/output).
+struct Argument {
+    std::string name; // Argument name
+    std::vector<RegisterRange> location; // Register location(s)
+    StructuredType::Type actualType
+            = StructuredType::any; // Type, if not specified by protocol
+    TensorConfig sizes; // Tensor size, for tensor arguments
+};
+
+// Information on a single configuration setting.
+struct Setting {
+    std::string name; // Setting name
+    int value; // Setting numeric value
+};
+
+// Return the value of the setting called `name`; throw std::runtime_error
+// if the package has no setting with that name.
+int Package::getSetting(const char *name) const {
+    for (auto it = settings.begin(); it != settings.end(); ++it) {
+        if (it->name == name) return it->value;
+    }
+
+    std::string message(
+            "Microkernel package does not provide requested setting: ");
+    message += name;
+    throw std::runtime_error(message);
+}
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/protocol.cpp b/src/gpu/intel/microkernels/protocol.cpp
new file mode 100644
index 00000000000..aff7f93f9c4
--- /dev/null
+++ b/src/gpu/intel/microkernels/protocol.cpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "protocol.hpp"
+
+#include <stdexcept>
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Raised when a Protocol object carries a family tag no dispatcher knows.
+[[noreturn]] void unknownProtocol() {
+    throw std::runtime_error("Unknown protocol");
+}
+
+// Pack the GEMM options into the ioptions bit mask:
+// bit 0 = localA, bit 1 = localB, bit 2 = addToC, bit 3 = slmPtr.
+GEMMProtocol::GEMMProtocol(const Options &options) {
+    family = Family::GEMM;
+    ioptions = 0;
+    if (options.localA) ioptions |= (1 << 0);
+    if (options.localB) ioptions |= (1 << 1);
+    if (options.addToC) ioptions |= (1 << 2);
+    if (options.slmPtr) ioptions |= (1 << 3);
+}
+
+// Decode the ioptions bit mask back into an Options structure.
+GEMMProtocol::Options GEMMProtocol::options() const {
+    Options options {};
+    options.localA = (ioptions & (1 << 0));
+    options.localB = (ioptions & (1 << 1));
+    options.addToC = (ioptions & (1 << 2));
+    options.slmPtr = (ioptions & (1 << 3));
+    return options;
+}
+
+// Forward `routine` to the protocol subclass selected by the family tag.
+// NOTE(review): cast target reconstructed as `const <cand>Protocol *` --
+// confirm against the upstream source.
+#define PDISPATCH(routine, cand) \
+    if (family == Family::cand) \
+    return reinterpret_cast<const cand##Protocol *>(this)->routine()
+
+const char *Protocol::kernelBaseName() const {
+    PDISPATCH(kernelBaseName, GEMM);
+    unknownProtocol();
+}
+
+std::vector<ProtocolArgument> Protocol::arguments() const {
+    PDISPATCH(arguments, GEMM);
+    unknownProtocol();
+}
+
+std::vector<ProtocolSetting> Protocol::settings() const {
+    PDISPATCH(settings, GEMM);
+    unknownProtocol();
+}
+
+#undef PDISPATCH
+
+const char *GEMMProtocol::kernelBaseName() const {
+    return "ugemm";
+}
+
+// Build the GEMM argument prototype, then specialize it for the options
+// encoded in this protocol object.
+std::vector<ProtocolArgument> GEMMProtocol::arguments() const {
+    auto In = ProtocolArgument::In;
+    auto Out = ProtocolArgument::Out;
+
+    auto LocalPointer = StructuredType::LocalPointer;
+    auto GlobalPointer = StructuredType::GlobalPointer;
+    auto s32 = StructuredType::s32;
+
+    static const ProtocolArgument args[] = {
+            {"a", In, GlobalPointer},
+            {"lda", In, s32},
+            {"b", In, GlobalPointer},
+            {"ldb", In, s32},
+            {"c", Out, 2},
+            {"m", In, s32},
+            {"n", In, s32},
+            {"k", In, s32},
+            {"i0", In, s32},
+            {"j0", In, s32},
+            {"h0", In, s32},
+            {"local_id_m", In, s32},
+            {"local_id_n", In, s32},
+    };
+    std::vector<ProtocolArgument> argsV(
+            args, args + sizeof(args) / sizeof(args[0]));
+
+    // Indices 0, 2 and 4 refer to "a", "b" and "c" in the table above.
+    const Options opts = options();
+    if (opts.localA) argsV[0].stype.format = LocalPointer;
+    if (opts.localB) argsV[2].stype.format = LocalPointer;
+    if (opts.addToC) argsV[4].direction = ProtocolArgument::InOut;
+    if (opts.slmPtr) argsV.push_back({"slm", In, LocalPointer});
+
+    return argsV;
+}
+
+// Names of the settings every GEMM microkernel package must provide.
+std::vector<ProtocolSetting> GEMMProtocol::settings() const {
+    static const ProtocolSetting settings[] = {
+            {"sg_tile_m"},
+            {"sg_tile_n"},
+            {"wg_tile_m"},
+            {"wg_tile_n"},
+            {"sg_per_wg_m"},
+            {"sg_per_wg_n"},
+            {"sg_per_wg_k"},
+            {"slm_size"},
+    };
+    static const std::vector<ProtocolSetting> settingsV(
+            settings, settings + sizeof(settings) / sizeof(settings[0]));
+    return settingsV;
+}
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/gpu/intel/microkernels/protocol.hpp b/src/gpu/intel/microkernels/protocol.hpp
new file mode 100644
index 00000000000..9a0b2b9cfcb
--- /dev/null
+++ b/src/gpu/intel/microkernels/protocol.hpp
@@ -0,0 +1,118 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_MICROKERNELS_PROTOCOL_HPP
+#define GPU_MICROKERNELS_PROTOCOL_HPP
+
+#include <cstdint>
+#include <vector>
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+struct ProtocolArgument;
+struct ProtocolSetting;
+
+// A protocol describes a class of microkernels that provide the same functionality
+// and share a high-level interface.
+// A Protocol object should not be created directly; instead protocols are created via
+// a specific subclass (e.g. GEMMProtocol).
+class Protocol {
+public:
+    // Each accessor dispatches on `family` to the matching subclass and
+    // throws std::runtime_error("Unknown protocol") for an unknown family.
+    const char *kernelBaseName() const;
+    std::vector<ProtocolArgument> arguments() const;
+    std::vector<ProtocolSetting> settings() const;
+
+protected:
+    enum Family : uint32_t { Invalid = 0, GEMM = 0x39bfca02 };
+
+    uint32_t family = Family::Invalid; // Family tag used for dispatch
+    uint32_t ioptions = 0; // Family-specific option bits
+};
+
+class GEMMProtocol : public Protocol {
+public:
+    struct Options {
+        bool localA = false;
+        bool localB = false;
+        bool addToC = false;
+        bool slmPtr = false;
+    };
+
+    GEMMProtocol() : GEMMProtocol(Options {}) {}
+    GEMMProtocol(const Options &options);
+
+    Options options() const;
+
+protected:
+    // Protocol dispatches to these implementations, hence the friendship.
+    friend class Protocol;
+    const char *kernelBaseName() const;
+    std::vector<ProtocolArgument> arguments() const;
+    std::vector<ProtocolSetting> settings() const;
+};
+
+// Describes the type of a microkernel argument (scalar/pointer/tensor).
+struct StructuredType {
+    enum Type { // Element data type
+        u64,
+        s64,
+        u32,
+        s32,
+        u16,
+        s16,
+        u8,
+        s8, // integral
+        f64,
+        f32,
+        f16,
+        bf16, // floating-point
+        any, // unspecified
+    } type
+            = Type::any;
+    enum Format { Scalar, GlobalPointer, LocalPointer, Tensor } format = Scalar;
+    int ndims = 1;
+
+    // The converting constructors are intentionally implicit: protocol
+    // argument tables are brace-initialized from a Type, a Format, or a
+    // tensor dimension count.
+    StructuredType() {}
+    StructuredType(Type type_) : type(type_) {}
+    StructuredType(Format format_) : format(format_) {}
+    StructuredType(int ndims_) : format(Tensor), ndims(ndims_) {}
+};
+
+// Description of a single argument from a protocol's prototype.
+struct ProtocolArgument {
+    const char *name;
+    enum { In = 0b01, Out = 0b10, InOut = In | Out } direction;
+    StructuredType stype;
+
+    // Direction tests: InOut arguments report true for both.
+    bool in() const { return (direction & In) != 0; }
+    bool out() const { return (direction & Out) != 0; }
+};
+
+// Description of a single protocol setting.
+struct ProtocolSetting {
+    const char *name;
+};
+
+} /* namespace micro */
+} // namespace intel
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/gpu/intel/microkernels/shim.cpp b/src/gpu/intel/microkernels/shim.cpp
new file mode 100644
index 00000000000..0ea0360fad3
--- /dev/null
+++ b/src/gpu/intel/microkernels/shim.cpp
@@ -0,0 +1,723 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "shim.hpp"
+#include "fuser.hpp"
+#include "internal_utilities.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace intel {
+namespace micro {
+
+// Return the GRF register width in bytes for the device identified by
+// `gmdid`: 64 for XeHPC (architecture 12, release 60..69) and for
+// architecture 20 and later, 32 otherwise.
+//
+// The GMDID fields are extracted with explicit shifts (revision: bits 0-5,
+// reserved: bits 6-13, release: bits 14-21, architecture: bits 22-31) so
+// that the result does not depend on the compiler's bit-field layout.
+int grfWidth(uint32_t gmdid) {
+    const uint32_t release = (gmdid >> 14) & 0xFFu;
+    const uint32_t architecture = (gmdid >> 22) & 0x3FFu;
+
+    if (architecture == 12 && release >= 60 && release < 70) /* XeHPC */
+        return 64;
+    if (architecture >= 20) /* Xe2 */
+        return 64;
+
+    return 32;
+}
+
+// True for the host languages handled here as SIMT (OpenCL C and SYCL).
+bool isSIMT(HostLanguage language) {
+    switch (language) {
+        case HostLanguage::OpenCL_C:
+        case HostLanguage::SYCL: return true;
+        default: return false;
+    }
+}
+
+// Address space qualifier (with trailing space) for global pointers;
+// empty for every language other than OpenCL C.
+const char *globalAddrSpaceDecorator(HostLanguage language) {
+    if (language == HostLanguage::OpenCL_C) return "global ";
+    return "";
+}
+
+// Address space qualifier (with trailing space) for local pointers;
+// empty for every language other than OpenCL C.
+const char *localAddrSpaceDecorator(HostLanguage language) {
+    if (language == HostLanguage::OpenCL_C) return "local ";
+    return "";
+}
+
+// Size in bytes of one element of the given type; throws
+// std::runtime_error for types without a size (e.g. `any`).
+int typeSize(StructuredType::Type type) {
+    switch (type) {
+        case StructuredType::s64:
+        case StructuredType::u64:
+        case StructuredType::f64: return 8;
+        case StructuredType::s32:
+        case StructuredType::u32:
+        case StructuredType::f32: return 4;
+        case StructuredType::s16:
+        case StructuredType::u16:
+        case StructuredType::f16:
+        case StructuredType::bf16: return 2;
+        case StructuredType::s8:
+        case StructuredType::u8: return 1;
+        default: throw std::runtime_error("Unknown type");
+    }
+}
+
+// Size in bytes of a scalar or pointer argument: 8 for global pointers,
+// 4 for local pointers, the element size for scalars. Tensors have no
+// single size and raise std::runtime_error.
+int typeSize(StructuredType stype) {
+    switch (stype.format) {
+        case StructuredType::GlobalPointer: return 8;
+        case StructuredType::LocalPointer: return 4;
+        case StructuredType::Scalar: return typeSize(stype.type);
+        default: throw std::runtime_error("Unexpected format");
+    }
+}
+
+const char *typeName(
StructuredType::Type type, HostLanguage language = HostLanguage::None) { + if (language == HostLanguage::vISA) switch (type) { + case StructuredType::s64: return "q"; + case StructuredType::s32: return "d"; + case StructuredType::s16: return "w"; + case StructuredType::s8: return "b"; + case StructuredType::u64: return "uq"; + case StructuredType::u32: return "ud"; + case StructuredType::u16: return "uw"; + case StructuredType::u8: return "ub"; + case StructuredType::f64: return "df"; + case StructuredType::f32: return "f"; + case StructuredType::f16: return "hf"; + case StructuredType::bf16: return "bf"; + default: throw std::runtime_error("Unknown type"); + } + else + switch (type) { + case StructuredType::s64: return "long"; + case StructuredType::s32: return "int"; + case StructuredType::s16: return "short"; + case StructuredType::s8: return "char"; + case StructuredType::u64: return "ulong"; + case StructuredType::u32: return "uint"; + case StructuredType::u16: return "ushort"; + case StructuredType::u8: return "uchar"; + case StructuredType::f64: return "double"; + case StructuredType::f32: return "float"; + case StructuredType::f16: return "half"; + case StructuredType::bf16: + return (language == HostLanguage::None) ? 
"bfloat16" : "ushort"; + default: return "char"; + } +} + +std::string typeName(StructuredType stype, HostLanguage language, + const TensorConfig *sizes = nullptr) { + switch (stype.format) { + case StructuredType::Scalar: return typeName(stype.type, language); + case StructuredType::GlobalPointer: + return std::string(globalAddrSpaceDecorator(language)) + + typeName(stype.type, language) + '*'; + case StructuredType::LocalPointer: + return std::string(localAddrSpaceDecorator(language)) + + typeName(stype.type, language) + '*'; + case StructuredType::Tensor: { + auto name = std::string(typeName(stype.type)) + "_tile_"; + assert(sizes && "Sizes not provided"); + + for (int i = 0; i < stype.ndims; i++) { + if (i > 0) name += 'x'; + name += std::to_string(sizes->dims[i]); + } + + if (sizes->blocked()) { + name += "_blocked_"; + for (int i = 0; i < stype.ndims; i++) { + if (i > 0) name += 'x'; + name += std::to_string(sizes->block[i]); + } + } + + return name; + } + default: throw std::runtime_error("Unknown format"); + } +} + +template +std::vector matchProtocol( + const std::vector &plist, const std::vector &list) { + int n = int(plist.size()); + + std::vector result(n); + + for (int i = 0; i < n; i++) { + for (auto &item : list) { + if (item.name == plist[i].name) { + if (result[i]) + throw std::runtime_error( + "Microkernel has a duplicate argument/setting: " + + item.name); + result[i] = &item; + } + } + if (!result[i]) + throw std::runtime_error( + std::string("Microkernel missing a required " + "argument/setting for its protocol: ") + + plist[i].name); + } + + return result; +} + +std::string generateShim(const Package &package, HostLanguage language, + const ShimOptions &options) { + std::stringstream shim; + + bool cpp = (language == HostLanguage::SYCL); + + /* Match up protocol args with microkernel args */ + auto pargs = package.protocol.arguments(); + auto args = matchProtocol(pargs, package.arguments); + auto nargs = int(pargs.size()); + + /* Match up 
protocol settings with microkernel settings */ + auto psettings = package.protocol.settings(); + auto settings = matchProtocol(psettings, package.settings); + + /* Collect actual argument types */ + std::vector stypes(pargs.size()); + for (size_t i = 0; i < pargs.size(); i++) { + stypes[i] = pargs[i].stype; + if (stypes[i].type == StructuredType::any) + stypes[i].type = args[i]->actualType; + else if (args[i]->actualType != StructuredType::any + && stypes[i].type != args[i]->actualType) + throw std::runtime_error( + "Microkernel argument type does not match its protocol"); + } + + /* Get decorated kernel name */ + std::string kname = package.protocol.kernelBaseName(); + if (!options.decorator.empty()) { + kname += '_'; + kname += options.decorator; + } + + /* Helper to construct "nice" argument type names for tensor types (e.g. gemm_c_type) */ + auto argTypeName = [&](int i) { + if (stypes[i].format == StructuredType::Tensor) + return kname + '_' + pargs[i].name + "_type"; + else + return typeName(stypes[i], language, &args[i]->sizes); + }; + + if (options.subgroupSize == 0 && isSIMT(language)) + throw std::runtime_error("Subgroup size must be specified."); + + /* OpenCL C: Round up tensor structures and generate type definitions for them */ + if (language == HostLanguage::OpenCL_C) { + /* to do: de-duplicate identical tensor structures */ + for (int i = 0; i < nargs; i++) { + auto &sizes = args[i]->sizes; + + if (stypes[i].format != StructuredType::Tensor) continue; + + auto sname = typeName(stypes[i], language, &sizes); + auto ename = typeName(stypes[i].type, language); + int vlen = divideUp(sizes.blockElements(), options.subgroupSize); + + shim << "#ifndef MICRO_DECL_" << sname + << "\n" + "#define MICRO_DECL_" + << sname + << "\n" + "typedef struct {\n" + " " + << ename; + if (vlen > 1) shim << vlen; + shim << " x[" << sizes.blocks() + << "];\n" + "} " + << sname << ";\n"; + + if (options.useTileOps) { + int ndims = stypes[i].ndims; + shim << "DECLARE_" << 
ndims << "D_TILE_OPS("; + shim << sname << ',' << ename << ',' << options.subgroupSize; + for (int d = 0; d < ndims; d++) + shim << ',' << sizes.block[d]; + for (int d = 0; d < ndims; d++) + shim << ',' << sizes.dims[d] / sizes.block[d]; + shim << ")\n"; + } + + shim << "#endif\n"; + } + } + + std::string returnType; + int returnArg = -1; + + if (language != HostLanguage::None) { + /* Create a definition for each setting */ + auto sintro = cpp ? "static constexpr int " : "#define "; + auto ssep = cpp ? " = " : " "; + + for (auto &setting : settings) + shim << sintro << kname << '_' << setting->name << ssep + << setting->value << '\n'; + + /* Create definitions for some additional package flags */ + shim << sintro << kname << "_barrier_count " << ssep + << package.barrierCount << '\n'; + shim << sintro << kname << "_systolic " << ssep << int(package.systolic) + << '\n'; + + /* Generate typedefs for the tensor argument types, and #defines for blocking sizes */ + for (int i = 0; i < nargs; i++) { + if (stypes[i].format != StructuredType::Tensor) continue; + + auto &sizes = args[i]->sizes; + auto tname = typeName(stypes[i], language, &sizes); + auto aname = argTypeName(i); + shim << "typedef " << tname << ' ' << aname << ";\n"; + for (int d = 0; d < stypes[i].ndims; d++) { + shim << sintro << aname << "_block" << d << ssep + << sizes.block[d] << '\n'; + shim << sintro << aname << "_nblock" << d << ssep + << (sizes.dims[d] / sizes.block[d]) << '\n'; + } + } + + /* Locate return type. Return types are used in the case of a single output. 
*/ + for (int i = 0; i < nargs; i++) { + if (pargs[i].direction == ProtocolArgument::Out) { + if (!returnType.empty()) { + returnArg = -1; + returnType = ""; + break; + } + returnArg = i; + returnType = argTypeName(i); + } + } + + if (returnType.empty()) returnType = "void"; + + /* Synthesize wrapper function declaration */ + bool firstArg = true; + shim << returnType << ' ' << kname << '('; + for (int i = 0; i < nargs; i++) { + if (i == returnArg) continue; + if (!firstArg) shim << ", "; + if (!pargs[i].out() + && (pargs[i].stype.format == StructuredType::GlobalPointer + || pargs[i].stype.format + == StructuredType::LocalPointer)) { + shim << "const "; + } + shim << argTypeName(i) << ' '; + if (pargs[i].out()) shim << (cpp ? '&' : '*'); + shim << pargs[i].name; + firstArg = false; + } + shim << ") {\n"; + + if (returnArg >= 0) + shim << " " << returnType << ' ' << pargs[returnArg].name + << ";\n"; + } + + /* Gather underlying vISA shim args, one for each vISA variable. */ + /* There will be one vISA shim arg for each scalar/pointer argument or tensor block. */ + struct v_shim_argument_t { + bool in, out, uniform, copy; + RegisterRange location; + std::string name; + StructuredType::Type type; + }; + + std::vector vargs, vargsIn; + + for (int i = 0; i < nargs; i++) { + v_shim_argument_t varg; + + varg.in = pargs[i].in(); + varg.out = pargs[i].out(); + + varg.type = stypes[i].type; + if (stypes[i].format == StructuredType::GlobalPointer) + varg.type = StructuredType::u64; + else if (stypes[i].format == StructuredType::LocalPointer) + varg.type = StructuredType::u32; + + auto &vargList = varg.out ? 
vargs : vargsIn; + + bool byPtr = varg.out && !cpp && (i != returnArg); + + if (stypes[i].format == StructuredType::Tensor) { + int rangeIdx = 0, rangeOffset = 0; + int blockBytes + = args[i]->sizes.blockElements() * typeSize(stypes[i].type); + + varg.uniform = false; + varg.copy = options.copyTensorArgs; + varg.location.blen = blockBytes; + + /* Create vISA variable for each block */ + for (int iblock = 0; iblock < args[i]->sizes.blocks(); iblock++) { + auto &range = args[i]->location[rangeIdx]; + uint32_t noffset = rangeOffset + blockBytes; + if (range.blen < noffset) + throw std::runtime_error( + "Tensor block not contiguous in registers"); + + varg.location.boffset = range.boffset + rangeOffset; + varg.name = pargs[i].name; + varg.name += byPtr ? "->x[" : ".x["; + varg.name += std::to_string(iblock); + varg.name += ']'; + + vargList.push_back(varg); + + if (range.blen > noffset) + rangeOffset = noffset; + else { + rangeOffset = 0; + rangeIdx++; + } + + bool bdone = (iblock + 1 >= args[i]->sizes.blocks()); + bool rdone = (rangeIdx >= int(args[i]->location.size())); + if (rdone && !bdone) + throw std::runtime_error( + "Not enough registers allocated for declared " + "tensor size"); + else if (bdone && !rdone) + throw std::runtime_error( + "Too many registers allocated for declared tensor " + "size"); + } + } else { + if (args[i]->location.size() != 1) + throw std::runtime_error( + "Microkernel scalar argument is not contiguous in " + "registers"); + if (int(args[i]->location[0].blen) != typeSize(pargs[i].stype)) + throw std::runtime_error( + "Microkernel argument does not have expected size"); + varg.location = args[i]->location[0]; + varg.uniform = true; + varg.copy = options.copyScalarArgs; + varg.name = pargs[i].name; + if (byPtr) varg.name = '*' + varg.name; + vargList.push_back(varg); + } + } + + /* Concatenate output and input arguments */ + vargs.insert(vargs.end(), vargsIn.begin(), vargsIn.end()); + + /* Start vISA shim */ + shim << " __asm__ 
volatile(\"{\\n\"\n"; + + /* Tie arguments to physical registers */ + int gwidth = grfWidth(package.gmdidCompat); + for (int i = 0; i < int(vargs.size()); i++) { + auto &range = vargs[i].location; + auto goffset = range.boffset % gwidth; + + /* Check that arg can be covered by a vISA variable */ + if (goffset != 0 && int(goffset + range.blen) > gwidth) + throw std::runtime_error( + "Microkernel tensor argument misaligned in registers"); + + if (vargs[i].copy) { + shim << " \".decl COPY" << i << " v_type=G type=" + << typeName(vargs[i].type, HostLanguage::vISA) << " num_elts=" + << (vargs[i].location.blen / typeSize(vargs[i].type)) + << "\\n\"\n"; + } + + shim << " \".implicit_PSEUDO_INPUT " + << (vargs[i].copy ? "COPY" : "%") << i + << " offset=" << range.boffset << " size=" << range.blen + << "\\n\"\n"; + } + + /* Check whether any inputs/outputs need copying */ + bool anyCopyIn = false, anyCopyOut = false; + for (auto &varg : vargs) { + if (varg.in) anyCopyIn |= varg.copy; + if (varg.out) anyCopyOut |= varg.copy; + } + + /* Protect input copies from preceding code */ + if (anyCopyIn) shim << " \"fence_sw\\n\"\n"; + + /* Copy inputs as needed */ + auto copyArg = [&](int i, const char *from, const char *to) { + int remaining = vargs[i].location.blen; + int tsize = typeSize(vargs[i].type); + int offset = 0; + while (offset < (int)vargs[i].location.blen) { + int chunk = std::min(remaining, gwidth * 2); + int esize = std::min(chunk / tsize, 32); + chunk = esize * tsize; + int r = offset / gwidth; + int c = (offset % gwidth) / tsize; + shim << " \"mov (M1_NM, " << esize << ") " << to << i + << '(' << r << ',' << c << ")<1> " << from << i << '(' << r + << ',' << c << ")<1;1,0>\\n\"\n"; + offset += chunk; + } + }; + + for (int i = 0; i < int(vargs.size()); i++) + if (vargs[i].copy && vargs[i].in) copyArg(i, "%", "COPY"); + + /* Wrangle clobber regions. 
*/ + struct clobber_t { + RegisterRange location; + std::string name; + bool arg; + bool preclobbered = false; + }; + + /* Temporarily, we need to subtract argument ranges from clobber ranges due to vISA restrictions. + Sort arguments by location in preparation for that. */ + std::vector vargclobber_ts; + for (int i = 0; i < int(vargs.size()); i++) { + clobber_t clobber; + clobber.location = vargs[i].location; + clobber.name = (vargs[i].copy ? "COPY" : "%") + std::to_string(i); + clobber.arg = false; // Reuse 'arg' field as flag + clobber.preclobbered = vargs[i].copy && vargs[i].in; + vargclobber_ts.push_back(clobber); + } + + std::sort(vargclobber_ts.begin(), vargclobber_ts.end(), + [](const clobber_t &vc1, const clobber_t &vc2) { + return (vc1.location.boffset < vc2.location.boffset); + }); + + /* Expand clobber ranges to legal vISA variables */ + std::vector clobbers; + + if (package.clobbers.empty()) + throw std::runtime_error( + "Microkernel does not define any clobber regions"); + + auto vargIter = vargclobber_ts.begin(); + for (auto &range : package.clobbers) { + uint32_t offset = range.boffset; + int remaining = range.blen; + + while (remaining > 0) { + /* Subtract argument ranges */ + while (vargIter != vargclobber_ts.end() + && vargIter->location.boffset + vargIter->location.blen + <= offset) + ++vargIter; + + int nextOffset = 0, nextRemaining = 0; + if (vargIter != vargclobber_ts.end() + && vargIter->location.boffset < offset + remaining) { + nextOffset + = vargIter->location.boffset + vargIter->location.blen; + nextRemaining = offset + remaining - nextOffset; + remaining = std::min( + remaining, vargIter->location.boffset - offset); + vargIter->arg = true; + } + + while (remaining > 0) { + /* DWord align for convenience */ + remaining += offset & 3; + offset &= ~3; + remaining = (remaining + 3) & ~3; + + /* Carve off an aligned power-of-2 or GRF-aligned chunk */ + int chunk = remaining; + if (offset % gwidth) { + chunk = std::min(remaining, gwidth >> 
1); + while (offset & (std::min(chunk, gwidth) - 1)) + chunk >>= 1; + } + + clobber_t clobber; + clobber.location = RegisterRange(offset, chunk); + clobber.name = "CLOBBER" + std::to_string(clobbers.size()); + clobber.arg = false; + clobbers.push_back(std::move(clobber)); + + remaining -= chunk; + offset += chunk; + } + + if (nextRemaining > 0) + offset = nextOffset, remaining = nextRemaining; + } + } + + /* Add clobbered arguments to list */ + for (auto &vargclobber_t : vargclobber_ts) + if (vargclobber_t.arg) clobbers.push_back(std::move(vargclobber_t)); + + /* Declare clobbers and tie them to physical registers */ + for (int i = 0; i < int(clobbers.size()); i++) { + if (clobbers[i].arg) continue; + auto &cname = clobbers[i].name; + auto &range = clobbers[i].location; + + shim << " \".decl " << cname + << " v_type=G type=ud num_elts=" << (range.blen >> 2) << "\\n\"\n"; + shim << " \".implicit_PSEUDO_INPUT " << cname + << " offset=" << range.boffset << " size=" << range.blen + << "\\n\"\n"; + } + + /* Mark beginning of patch region */ + shim << " \"fence_sw\\n\"\n" + " \"add (M1,1) CLOBBER0(0,0)<1> CLOBBER0(0,0)<0;1,0> 0x" + << std::hex << (sigilStart ^ options.microkernelID) << std::dec + << ":ud\\n\"\n" + " \"fence_sw\\n\"\n"; + + /* Overwrite clobbers to ensure vISA considers their ranges live */ + for (int i = 0; i < int(clobbers.size()); i++) { + if (clobbers[i].preclobbered) continue; + auto &cname = clobbers[i].name; + auto &range = clobbers[i].location; + uint32_t offset = 0; + while (offset < range.blen) { + int chunk = std::min(range.blen - offset, gwidth * 2); + if (chunk & (chunk - 1)) { + int chunk2; + for (chunk2 = 2; chunk2 <= chunk; chunk2 <<= 1) + ; + chunk = chunk2 >> 1; + } + + int r = offset / gwidth; + int c = offset % gwidth; + shim << " \"mov (M1," << (chunk >> 2) << ") " << cname + << '(' << r << ',' << (c >> 2) << ")<1> 0xAAAAAAAA:ud\\n\"\n"; + offset += chunk; + } + } + + /* Add dummy instructions to enable kernel features as needed */ + 
if (package.systolic) { + // Find 8 consecutive clobber registers for dummy DPAS. + uint32_t dlen = gwidth * 8; + int idst = -1; + for (int i = 0; i < int(clobbers.size()); i++) { + if (clobbers[i].location.blen >= dlen) { + idst = i; + break; + } + } + shim << " \".decl DUMMY_DPAS_SRC v_type=G type=ud num_elts=" + << (dlen >> 2); + if (idst >= 0) shim << " alias=<" << clobbers[idst].name << ",0>"; + shim << "\\n\"\n" + " \".decl DUMMY_DPAS_DST v_type=G type=f num_elts=" + << (dlen >> 2) + << " alias=\\n\"\n" + " \"dpas.bf.bf.8.1 (M1," + << (gwidth >> 2) + << ") DUMMY_DPAS_DST.0 V0.0 DUMMY_DPAS_SRC.0 " + "DUMMY_DPAS_SRC(0,0)\\n\"\n"; + } + + if (package.barrierCount == 1) { + shim << " \"barrier\\n\"\n"; + } else if (package.barrierCount > 1) { + // Named barriers -- TBD. + // .kernel_attr NBarrierCnt=<...> + throw std::runtime_error("Named barriers not yet implemented"); + } + + /* Mark end of patch region */ + shim << " \"fence_sw\\n\"\n" + " \"add (M1,1) CLOBBER0(0,0)<1> CLOBBER0(0,0)<0;1,0> 0x" + << std::hex << (sigilEnd ^ options.microkernelID) << std::dec + << ":ud\\n\"\n" + " \"fence_sw\\n\"\n"; + + /* Copy output arguments as needed */ + for (int i = 0; i < int(vargs.size()); i++) + if (vargs[i].copy && vargs[i].out) copyArg(i, "COPY", "%"); + + /* Protect output copies from preceding code */ + if (anyCopyOut) shim << " \"fence_sw\\n\"\n"; + + /* End of inline vISA string */ + shim << " \"}\\n\"\n" + " "; + + /* Enumerate inline vISA arguments */ + for (bool doOutputs : {true, false}) { + bool first = true; + shim << " : "; + for (auto &varg : vargs) { + if (doOutputs != varg.out) continue; + if (!first) shim << ", "; + shim << '\"'; + if (varg.out) shim << (varg.in ? '+' : '='); + shim << (varg.uniform ? 
"rw.u" : "rw"); + shim << "\"(" << varg.name << ')'; + first = false; + } + } + + shim << ");\n"; + + /* Insert binary code in comment for fuser */ + const char hexChars[] = "0123456789abcdef"; + shim << " // " << sigilBinary << options.microkernelID << ' '; + for (auto b : package.binary) + shim << hexChars[(b >> 4) & 0xF] << hexChars[b & 0xF]; + shim << '\n'; + + /* End function declaration */ + if (language != HostLanguage::None) { + if (returnArg >= 0) + shim << " return " << pargs[returnArg].name << ";\n"; + shim << "}\n"; + } + + return shim.str(); +} + +} /* namespace micro */ +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/microkernels/shim.hpp b/src/gpu/intel/microkernels/shim.hpp new file mode 100644 index 00000000000..1ac91d723c3 --- /dev/null +++ b/src/gpu/intel/microkernels/shim.hpp @@ -0,0 +1,51 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
/* Language of the host code that a shim is generated for.
   generateShim() also passes vISA to typeName() when it spells element types
   inside inline vISA declarations, and skips the trailing `return`/closing
   brace of the wrapper function when the language is None. */
enum class HostLanguage { None, OpenCL_C, SYCL, vISA };

/* Knobs for generateShim(). A default-constructed object is what callers get
   when they omit the argument. */
struct ShimOptions {
    /* NOTE(review): presumably extra text attached to the generated function
       declaration -- the code consuming it is not visible here; confirm. */
    std::string decorator;
    /* NOTE(review): 0 presumably means "not specified" -- confirm. */
    int subgroupSize = 0;
    /* NOTE(review): presumably select whether scalar/tensor arguments are
       staged through the COPY<n> vISA variables that generateShim() declares
       for arguments flagged `copy` -- confirm where the flag is set. */
    bool copyScalarArgs = true;
    bool copyTensorArgs = false;
    /* NOTE(review): usage not visible in this part of the file. */
    bool useTileOps = false;
    /* XORed into the start/end sigil constants that bracket the patch region,
       and printed after the binary sigil in the trailing comment that carries
       the microkernel binary. */
    uint32_t microkernelID = 0;
};

/* Generate the host-language shim for `package` and return it as source text.
   Throws std::runtime_error for inputs the generator cannot express (e.g. a
   tensor argument that straddles registers, a package without clobber
   regions, or more than one barrier). */
std::string generateShim(const Package &package, HostLanguage language,
        const ShimOptions &options = ShimOptions());
{ return status::runtime_error; } + + auto nbinary_size = binary.size(); + auto nbinary_data = const_cast(binary.data()); + + program = xpu::ocl::make_wrapper(clCreateProgramWithBinary(context, 1, + &device, &nbinary_size, &nbinary_data, nullptr, &status)); + OCL_CHECK(status); + OCL_CHECK(clBuildProgram(program, 1, &device, "", nullptr, nullptr)); + } + + return status::success; +} + } // namespace status_t ocl_gpu_engine_t::build_program_from_source( @@ -223,13 +253,17 @@ status_t ocl_gpu_engine_t::build_program_from_source( debugdump_processed_source( pp_code_str, options, dev_info->get_cl_ext_options()); - program = xpu::ocl::make_wrapper(clCreateProgramWithSource( - context(), 1, &pp_code_str_ptr, nullptr, &err)); + auto ctx = context(); + program = xpu::ocl::make_wrapper( + clCreateProgramWithSource(ctx, 1, &pp_code_str_ptr, nullptr, &err)); OCL_CHECK(err); auto dev = device(); err = clBuildProgram(program, 1, &dev, options.c_str(), nullptr, nullptr); OCL_CHECK(maybe_print_debug_info(err, program, dev)); + + CHECK(fuse_microkernels(ctx, dev, program, pp_code_str_ptr)); + return status::success; } From 98c686a781278d3ee86747b241d74dc4f71e7a8b Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 8 Apr 2024 15:13:04 -0700 Subject: [PATCH 084/187] gpu: runtime-generated kernel headers --- src/gpu/intel/compute/kernel_ctx.hpp | 15 +++++++++++++++ src/gpu/intel/ocl/ocl_gpu_engine.cpp | 14 +++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/gpu/intel/compute/kernel_ctx.hpp b/src/gpu/intel/compute/kernel_ctx.hpp index 97941478d96..c4a1c1e3b8d 100644 --- a/src/gpu/intel/compute/kernel_ctx.hpp +++ b/src/gpu/intel/compute/kernel_ctx.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "common/bit_cast.hpp" #include "gpu/intel/gpu_primitive_attr.hpp" @@ -121,6 +122,19 @@ class kernel_ctx_t { return ""; } + void add_custom_header( + const std::string &header_name, std::string &&source) { + custom_headers_[header_name] = 
source; + } + + const char *get_custom_header(const std::string &header_name) const { + auto iter = custom_headers_.find(header_name); + if (iter != custom_headers_.end()) return iter->second.c_str(); + return nullptr; + } + + bool has_custom_headers() const { return !custom_headers_.empty(); } + private: void set_default_options(const primitive_attr_t *attr) { // By default fp32 division and sqrt are not IEEE-compliant @@ -150,6 +164,7 @@ class kernel_ctx_t { std::map int_var_map_; std::map float_var_map_; std::set option_set_; + std::unordered_map custom_headers_; }; template <> diff --git a/src/gpu/intel/ocl/ocl_gpu_engine.cpp b/src/gpu/intel/ocl/ocl_gpu_engine.cpp index d2385acf6c5..2083fbbd7e3 100644 --- a/src/gpu/intel/ocl/ocl_gpu_engine.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_engine.cpp @@ -174,8 +174,8 @@ cl_int maybe_print_debug_info( return err_; }; -inline status_t preprocess_headers( - std::stringstream &pp_code, const char *code) { +inline status_t preprocess_headers(std::stringstream &pp_code, const char *code, + const compute::kernel_ctx_t &kernel_ctx) { std::stringstream code_stream(code); for (std::string line; std::getline(code_stream, line);) { @@ -190,7 +190,10 @@ inline status_t preprocess_headers( = second_quote_pos - first_quote_pos - 1; const auto header_name = line.substr(first_quote_pos + 1, kernel_name_len); - CHECK(preprocess_headers(pp_code, get_kernel_header(header_name))); + const char *header_source + = kernel_ctx.get_custom_header(header_name); + if (!header_source) header_source = get_kernel_header(header_name); + CHECK(preprocess_headers(pp_code, header_source, kernel_ctx)); } else { pp_code << line << std::endl; } @@ -246,7 +249,7 @@ status_t ocl_gpu_engine_t::build_program_from_source( // `clCompileProgram` `clBuildProgram` doesn't take headers. Because of // that, a manual preprocessing of `include` header directives in the // OpenCL kernels is required. 
- CHECK(preprocess_headers(pp_code, code_string)); + CHECK(preprocess_headers(pp_code, code_string, kernel_ctx)); std::string pp_code_str = pp_code.str(); const char *pp_code_str_ptr = pp_code_str.c_str(); @@ -262,7 +265,8 @@ status_t ocl_gpu_engine_t::build_program_from_source( err = clBuildProgram(program, 1, &dev, options.c_str(), nullptr, nullptr); OCL_CHECK(maybe_print_debug_info(err, program, dev)); - CHECK(fuse_microkernels(ctx, dev, program, pp_code_str_ptr)); + if (kernel_ctx.has_custom_headers()) + CHECK(fuse_microkernels(ctx, dev, program, pp_code_str_ptr)); return status::success; } From c869f7226eb1a94fe36b2d349976eb30138a5857 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 8 Apr 2024 15:42:48 -0700 Subject: [PATCH 085/187] gpu: jit: ngen: configurable argument base registers --- src/gpu/intel/jit/ngen/ngen_interface.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/ngen/ngen_interface.hpp b/src/gpu/intel/jit/ngen/ngen_interface.hpp index a7f74e141cd..89cb7b390d6 100644 --- a/src/gpu/intel/jit/ngen/ngen_interface.hpp +++ b/src/gpu/intel/jit/ngen/ngen_interface.hpp @@ -121,6 +121,7 @@ class InterfaceHandler void requireWorkgroup(size_t x, size_t y = 1, size_t z = 1) { wg[0] = x; wg[1] = y; wg[2] = z; } + void setArgumentBase(RegData base) { baseOverride = base; } void setInlineGRFCount(int grfs) { requestedInlineGRFs = grfs; } void setSkipPerThreadOffset(int32_t offset) { offsetSkipPerThread = offset; } void setSkipCrossThreadOffset(int32_t offset) { offsetSkipCrossThread = offset; } @@ -169,6 +170,7 @@ class InterfaceHandler bool allow64BitBuffers = false; ThreadArbitrationMode arbitrationMode = ThreadArbitrationMode::Default; int barrierCount = 0; + RegData baseOverride; bool needDPAS = false; bool needGlobalAtomics = false; int32_t needGRF = 128; @@ -407,7 +409,7 @@ void InterfaceHandler::finalize() // r3 (no local IDs) // r5 (SIMD8/16, local IDs) // r8 (SIMD32, local IDs) - // [- assign 
local ptr arguments left-to-right? not checked] + // - assign local ptr arguments left-to-right // - assign global pointer arguments left-to-right // - assign scalar arguments left-to-right // - assign surface indices left-to-right for global pointers @@ -417,11 +419,19 @@ void InterfaceHandler::finalize() static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"}; static const std::string scratchSizeArg = "__scratch_size"; - GRF base = getCrossthreadBase(); - int offset = 32; + GRF base; + int offset; int nextSurface = 0; const int grfSize = GRF::bytes(hw); + if (baseOverride.isValid()) { + base = GRF(baseOverride.getBase()); + offset = baseOverride.getByteOffset(); + } else { + base = getCrossthreadBase(); + offset = 32; + } + auto assignArgsOfType = [&](ExternalArgumentType which) { for (auto &assignment : assignments) { auto exttype = assignment.exttype; From bc5e20bab0c805bd7d27f23a9bfe2ad9c1ef0026 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 8 Apr 2024 15:24:07 -0700 Subject: [PATCH 086/187] gpu: jit: gemm: microkernel provider --- .../jit/gemm/gen_gemm_kernel_generator.cpp | 512 +++++++++++- .../jit/gemm/gen_gemm_kernel_generator.hpp | 37 +- src/gpu/intel/jit/gemm/kernel_catalog.hpp | 1 + src/gpu/intel/jit/gemm/kernel_selector.cpp | 43 ++ src/gpu/intel/jit/gemm/kernel_selector.hpp | 36 + .../intel/jit/gemm/microkernel_provider.cpp | 335 ++++++++ .../intel/jit/gemm/microkernel_provider.hpp | 55 ++ src/gpu/intel/jit/gemm/strategy_parser.cpp | 2 + src/gpu/intel/jit/gemm/ukernel_lmr.db | 21 + src/gpu/intel/jit/gemm/ukernel_mlr.db | 30 + src/gpu/intel/jit/gemm/ukernel_mmr.db | 726 ++++++++++++++++++ 11 files changed, 1768 insertions(+), 30 deletions(-) create mode 100644 src/gpu/intel/jit/gemm/microkernel_provider.cpp create mode 100644 src/gpu/intel/jit/gemm/microkernel_provider.hpp create mode 100644 src/gpu/intel/jit/gemm/ukernel_lmr.db create mode 100644 src/gpu/intel/jit/gemm/ukernel_mlr.db create mode 100644 
src/gpu/intel/jit/gemm/ukernel_mmr.db diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp index 976623a618c..dcfcdc93575 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp @@ -837,7 +837,7 @@ void gemm_kernel_generator_t::mulConstant(const InstructionModifier &mod, mul
// Offset an address by the optional `slm_base` kernel argument.
// If the interface provided that argument (state.inputs.slmBase is valid),
// a single add updates `addr` in place; otherwise no instruction is emitted
// and `addr` is left as is.
template
void gemm_kernel_generator_t::makeSLMBaseRelative(
        Subregister addr, const GEMMState &state) {
    if (state.inputs.slmBase.isValid())
        add(1, addr, addr, state.inputs.slmBase);
}
astrategy.tileR @@ -10425,7 +10435,7 @@ bool gemm_kernel_generator_t::gemmFinalizeSums(const GEMMProblem &problem, : mulConstant(1, adjBase, state.lidM, state.nb_slm * Tc); add(1, adjBase, adjBase, ABs_base[isB]); } - + makeSLMBaseRelative(adjBase, state); allocAddrRegs(ABs_addrs[isB], ABs_layoutSLM[isB], ABs_SLM[isB], ABs_strategySLMAtomic[isB], state); setupAddr(Tc, ABs_addrs[isB], adjBase, ABs_layoutSLM[isB], @@ -11708,6 +11718,7 @@ void gemm_kernel_generator_t::gemmKReduce(const GEMMProblem &problem, emad(1, addr0, state.lidM, state.lidN, strategy.wg[LoopM], strategy, state); emad(1, addr0, addr0, state.lidK, strategy.wg[LoopM] * strategy.wg[LoopN], strategy, state); + makeSLMBaseRelative(addr0, state); mulConstant(1, addr0, addr0, sliceRegs * GRF::bytes(hw)); int unrollKSLMStride = strategy.wg[LoopM] * strategy.wg[LoopN] * sliceRegs @@ -12601,7 +12612,9 @@ void gemm_kernel_generator_t::gemmCheck32( }; if (checkA) { - add(1, temp2, state.effA.ud(), state.offsetA.ud()); + state.offsetA.isValid() + ? add(1, temp2, state.effA.ud(), state.offsetA.ud()) + : mov(1, temp2, state.effA.ud()); switch (problem.A .layout) { // Conservatively estimate upper bound for size of A. case MatrixLayout::N: mulHigh(temp1, lda, k); break; @@ -12624,7 +12637,9 @@ void gemm_kernel_generator_t::gemmCheck32( } if (checkB) { - add(1, temp2, state.effB.ud(), state.offsetB.ud()); + state.offsetB.isValid() + ? 
add(1, temp2, state.effB.ud(), state.offsetB.ud()) + : mov(1, temp2, state.effB.ud()); switch (problem.B.layout) { case MatrixLayout::T: mulHigh(temp1, ldb, k); break; case MatrixLayout::N: mulHigh(temp1, ldb, n); break; @@ -13886,6 +13901,9 @@ bool gemm_kernel_generator_t::kLoopSetup(const GEMMProblem &problem, getGranularities(problem.A, ignore, ka_loadRem); getGranularities(problem.B, kb_loadRem, ignore); + ka_loadRem = std::min(ka_loadRem, strategy.ka_load); + kb_loadRem = std::min(kb_loadRem, strategy.kb_load); + // With 2D block loads, extend k unroll to at least a full block (array). bool a2D = isBlock2D(strategy.A.accessType); bool b2D = isBlock2D(strategy.B.accessType); @@ -14686,7 +14704,7 @@ void gemm_kernel_generator_t::kLoop(KLoop type, const GEMMProblem &problem, if (calcBSums) zeroMatrix(state.Bs_regs, strategy); // Zero out C, if not loading ahead of time. - if (!cLoadAhead) { + if (!cLoadAhead && !(strategy.registerOutput() && problem.beta1())) { for (int i = 0; i < state.C_accCount; i += 2) mov(2 * elementsPerGRF(hw), AccumulatorRegister(i), uint16_t(0)); @@ -16514,6 +16532,7 @@ bool gemm_kernel_generator_t::gemmAccumulateCSetup( add(1, Ai_params.offC, Ai_params.offC, A_params.offC); } else eadd(1, state.effAi, state.effAi, temp2, strategy, state); + makeSLMBaseRelative(state.effA, state); add(1, state.effAo, state.effA, temp); if (problem.backward()) add(1, state.effA, state.effA, @@ -16705,6 +16724,7 @@ bool gemm_kernel_generator_t::gemmAccumulateCSetup( add(1, state.effB, state.effB, strategy.slmABufSize(problem)); } + makeSLMBaseRelative(state.effB, state); add(1, state.effBo, state.effB, temp); if (problem.backward()) add(1, state.effB, state.effB, @@ -18386,7 +18406,8 @@ bool gemm_kernel_generator_t::gemmBodyInternal( // Late exit. 
Label labelLateExit; - if (state.doLateExit && !strategy.fusePostOps) + if (state.doLateExit && !strategy.fusePostOps + && !(strategy.registerOutput() && outputCRange.empty())) gemmOOBExit(labelLateExit, strategy, state); // Handle fused post-ops for atomic update kernels. @@ -18395,8 +18416,19 @@ bool gemm_kernel_generator_t::gemmBodyInternal( return false; } - // C update. - if (!gemmUpdateCDispatch(problem, strategy, state)) return false; + if (strategy.registerOutput()) { + // Marshal C into output registers. The main path defines the output registers. + if (outputCRange.empty()) { + outputCRange = state.C_regs[0]; + outputCLayout = state.C_layout; + } else { + // FIXME: check that layouts are compatible, and rearrange if not. + overlappedCopy(state.C_regs[0], outputCRange, state); + } + } else { + // Regular C update into memory. + if (!gemmUpdateCDispatch(problem, strategy, state)) return false; + } // Cleanup. if (remaskC_M) @@ -18945,6 +18977,7 @@ void gemm_kernel_generator_t::gemmInitInterface(GEMMProblem &problem, state.inputs.diagB = interface.getArgumentIfExists("diag_B"); state.inputs.diagC = interface.getArgumentIfExists("diag_C"); state.inputs.flags = interface.getArgumentIfExists("flags"); + state.inputs.slmBase = interface.getArgumentIfExists("slm_base"); if (strategy.linearOrder()) { state.inputs.groupCountM = interface.getArgument("group_count_m"); @@ -19029,6 +19062,14 @@ void gemm_kernel_generator_t::gemmInitInterface(GEMMProblem &problem, state.inputs.groupIDN = invalid; } + // Move SLM pointers to offset arguments. + if (strategy.A.base.getModel() == ModelSLM) + std::swap(state.inputs.A, state.inputs.offsetA); + if (strategy.B.base.getModel() == ModelSLM) + std::swap(state.inputs.B, state.inputs.offsetB); + if (strategy.C.base.getModel() == ModelSLM) + std::swap(state.inputs.C[0], state.inputs.offsetC[0]); + // Downgrade offsets to 32 bits for non-A64 accesses. 
if (strategy.A.base.getModel() != ModelA64) state.inputs.offsetA = state.inputs.offsetA.d(); @@ -19151,6 +19192,7 @@ void gemm_kernel_generator_t::gemmInitInterface(GEMMProblem &problem, } if (state.inputs.flags.isValid()) state.ra.claim(state.inputs.flags); + if (state.inputs.slmBase.isValid()) state.ra.claim(state.inputs.slmBase); if (problem.batch == BatchMode::Strided) { for (int i = 0; i < problem.batchDims; i++) { @@ -19218,13 +19260,13 @@ void gemm_kernel_generator_t::gemmInitInterface(GEMMProblem &problem, // Return amount of SLM needed by a GEMM kernel. template -size_t gemm_kernel_generator_t::gemmSLMSize( - const GEMMProblem &problem, const GEMMStrategy &strategy) { +size_t gemm_kernel_generator_t::gemmSLMSize(const GEMMProblem &problem, + const GEMMStrategy &strategy, bool computeMax) { size_t slmSize = 0; // Space needed by SLM copies. slmSize = strategy.slmABufSize(problem) + strategy.slmBBufSize(problem); - if (strategy.kParallelLocal) slmSize /= strategy.wg[LoopK]; + if (strategy.kParallelLocal && !computeMax) slmSize /= strategy.wg[LoopK]; // Space needed for row/column sum reduction/sharing. if ((problem.needsASums() && strategy.slmA) @@ -19323,7 +19365,8 @@ void gemm_kernel_generator_t::gemmInitState(GEMMProblem &problem, state.Tacc = problem.Tc; state.copyC = (problem.Tc != problem.Tc_ext) || (!strategy.altCRemainder && (Tc.size() < 4)) - || strategy.forceCopyC; + || strategy.forceCopyC + || (strategy.C.base.getModel() == ModelInvalid); state.broadcast = strategy.doubleWA; @@ -19834,6 +19877,7 @@ void gemm_kernel_generator_t::gemmRestoreOffsets(const GEMMProblem &problem, zeroOrRestore(strategy.CO, state.saveOffsetCO, state.inputs.offsetCO); } +// Prepare final A/B/C pointers for a GEMM-like inner loop. 
// Calculate workgroup m/n remainders.
//
// When wgRemCheck() requests them, state.remaindersWG[LoopM] and
// state.remaindersWG[LoopN] are allocated and set to (m - wgI0) and
// (n - wgJ0) respectively. Afterwards the workgroup origins wgI0/wgJ0 are
// handed back to the register allocator unless the matching cooperative
// split is CoopSplit::FullK.
template
void gemm_kernel_generator_t::gemmCalcWGRemainders(
        const GEMMProblem &problem, const GEMMStrategy &strategy,
        GEMMState &state) {
    if (wgRemCheck(problem, strategy)) {
        // Both allocations happen before either add is emitted.
        state.remaindersWG[LoopM] = state.ra.alloc_sub(
                getHint(HintType::TempComp1, strategy));
        state.remaindersWG[LoopN] = state.ra.alloc_sub(
                getHint(HintType::TempComp0, strategy));
        // remainder = size - origin, computed as (-origin) + size with the
        // `sat` modifier applied to the result.
        add(1 | sat, state.remaindersWG[LoopM], -state.wgI0, state.inputs.m);
        add(1 | sat, state.remaindersWG[LoopN], -state.wgJ0, state.inputs.n);
    }
    // Callers keep using wgI0/wgJ0 as the A/B offset origin when the split is
    // FullK, so the registers must survive in that case.
    // NOTE(review): these releases run even when wgRemCheck() is false and the
    // caller may not have set wgI0/wgJ0 -- presumably safeRelease() tolerates
    // an invalid register; confirm.
    if (strategy.coopA != CoopSplit::FullK) state.ra.safeRelease(state.wgI0);
    if (strategy.coopB != CoopSplit::FullK) state.ra.safeRelease(state.wgJ0);
}
- if (wgCheck) { - state.remaindersWG[LoopM] = state.ra.alloc_sub( - getHint(HintType::TempComp1, strategy)); - state.remaindersWG[LoopN] = state.ra.alloc_sub( - getHint(HintType::TempComp0, strategy)); - add(1 | sat, state.remaindersWG[LoopM], -state.wgI0, state.inputs.m); - add(1 | sat, state.remaindersWG[LoopN], -state.wgJ0, state.inputs.n); - } - if (strategy.coopA != CoopSplit::FullK) state.ra.safeRelease(state.wgI0); - if (strategy.coopB != CoopSplit::FullK) state.ra.safeRelease(state.wgJ0); + // Calculate workgroup remainders if needed. + gemmCalcWGRemainders(problem, strategy, state); // Compute base addresses for A, B, C. auto &i0p = (strategy.coopA == CoopSplit::FullK) ? state.wgI0 : state.i0; @@ -21773,6 +21824,145 @@ void gemm_kernel_generator_t::gemmSubkernel( gemmFreeIncrements(problem, strategy, state); } +template +void gemm_kernel_generator_t::gemmMicrokernel(GEMMProblem problem, + GEMMStrategy strategy, const ngen::InterfaceHandler &interface_) { + GEMMState state(hw); + + interface = interface_; + + gemmAutoTypeConversions(problem, strategy); + gemmInitState(problem, strategy, state); + for (int q = 0; q < 2; q++) + state.ra.safeRelease(state.emulate.temp[q]); + + outputCRange = GRFMultirange(); + outputCLayout.clear(); + + strategy.forceWGUpdate = WGFixed; + + state.isNested = true; + state.ra.claim(r0 - r6); /* Leave some space for host kernel arguments */ + + state.fullK = state.inputs.k; + + bool registerC = strategy.registerOutput(); + + // Locate and claim additional inputs. + auto getAndClaim = [&](const char *name) { + auto sub = interface.getArgument(name); + state.ra.claim(sub); + return sub; + }; + + state.i0 = getAndClaim("i0"); + state.j0 = getAndClaim("j0"); + state.h0 = getAndClaim("h0"); + + state.lidM = getAndClaim("local_id_m").uw(); + state.lidN = getAndClaim("local_id_n").uw(); + + state.allocEmulate64Temp(strategy.emulate); + + setDefaultNoMask(); + setDefaultAutoSWSB(); + + // Save and modify dispatch mask as needed. 
+ Subregister dmaskSave; + int minSIMD = GRF::bytes(hw) >> 2; + if (minSIMD < state.internalSIMD()) { + dmaskSave = state.ra.alloc_sub(); + mov(1, dmaskSave, sr0[2]); + mov(1 | SWSB(1), sr0[2], + uint32_t(uint64_t(1) << state.internalSIMD()) - 1); + } + + // Synchronize and save flag registers from host kernel. + syncall(); + Subregister flagSave[4]; + for (int i = 0; i < FlagRegister::count(hw); i++) { + flagSave[i] = state.ra.alloc_sub(); + mov(1, flagSave[i], FlagRegister(i)); + } + + // Beginning of microkernel: + // - check32 + // - fused ID calculation + // - ld scaling + // - i0/j0/h0 calculations (inside WG) + // - A/B/C offsets + bool wgCheck = wgRemCheck(problem, strategy); + bool gemmtBarriers = problem.gemmt() && strategy.needsBarrier(); + + auto &k = state.inputs.k; + auto &k0 = state.inputs.k0; + + state.lid0 = (strategy.fusedLoop == LoopN) ? state.lidN : state.lidM; + getFusedID(strategy.unroll[strategy.fusedLoop], problem, strategy, state); + + emulConstant(1, state.inputs.lda, state.inputs.lda, problem.Ta_ext.size(), + strategy, state); + emulConstant(1, state.inputs.ldb, state.inputs.ldb, problem.Tb_ext.size(), + strategy, state); + if (!registerC) + emulConstant(1, state.inputs.ldc[0], state.inputs.ldc[0], + problem.Tc_ext.size(), strategy, state); + + if (wgCheck || gemmtBarriers) { + state.wgI0 = copySubregister(state.i0, state); + state.wgJ0 = copySubregister(state.j0, state); + } + + if (strategy.kParallelLocal) { + /* Select k0 automatically -- also need to compute lidK */ + int wgK = strategy.wg[LoopK]; + if (!is_zero_or_pow2(wgK)) stub(); + k0 = state.ra.alloc_sub(); + add(1, k0, k, wgK - 1); + shr(1, k0, k0, log2(wgK)); + alignUp(k0, k0, strategy.kAlign(problem), strategy, state); + } + + emad(1, state.i0, state.i0, state.lidM, strategy.unroll[LoopM], strategy, + state); + emad(1, state.j0, state.j0, state.lidN, strategy.unroll[LoopN], strategy, + state); + if (strategy.kParallelLocal) { + emad(1, state.h0, state.h0, k0, state.lidK, 
strategy, state); + add(1 | sat, k.ud(), k, -state.h0); + min_(1, k, k, k0); + if (strategy.barrierFreq > 0 || strategy.slmBuffers > 0) + state.ra.safeRelease(k0); + else + state.threadK0 = k0; + } + + gemmCalcWGRemainders(problem, strategy, state); + gemmCheck32(problem, strategy, state); + + auto &i0p = (strategy.coopA == CoopSplit::FullK) ? state.wgI0 : state.i0; + auto &j0p = (strategy.coopB == CoopSplit::FullK) ? state.wgJ0 : state.j0; + + gemmOffsetABC(false, state.i0, state.j0, state.h0, i0p, j0p, problem, + strategy, state, true, true, !registerC); + + if (!(strategy.prefetchA && strategy.A_prefetch.address2D)) + state.ra.safeRelease(state.wgI0); + if (!(strategy.prefetchB && strategy.B_prefetch.address2D)) + state.ra.safeRelease(state.wgJ0); + + if (strategy.prefetchA && state.effAp.isInvalid()) state.effAp = state.effA; + if (strategy.prefetchB && state.effBp.isInvalid()) state.effBp = state.effB; + + gemmSubkernel(problem, strategy, state); + + // Restore flag registers and dispatch mask and return to host kernel. 
+ for (int i = 0; i < FlagRegister::count(hw); i++) + mov(1, FlagRegister(i), flagSave[i]); + if (dmaskSave.isValid()) mov(1, sr0[2], dmaskSave); + syncall(); +} + template void gemm_kernel_generator_t::gemmSuperkernelInitState( GEMMSuperkernelProblem &problem, GEMMSuperkernelStrategy &strategy, @@ -22090,6 +22280,184 @@ int GEMMStrategy::kInterleaveChunk(const GEMMProblem &problem) const { return chunk; } +static inline micro::StructuredType::Type microType(Type T) { + using ST = micro::StructuredType::Type; +#define CASE(x) \ + case Type::x: return ST::x; + switch (T) { + CASE(f32) + CASE(f16) + CASE(bf16) + CASE(s32) + CASE(s16) + CASE(s8) + CASE(u32) + CASE(u16) + CASE(u8) + default: throw std::runtime_error("Unsupported type"); + } +#undef CASE +} + +template +micro::Package gemm_kernel_generator_t::gemmMicrokernelPackage( + const GEMMProblem &problem_, const GEMMStrategy &strategy, + const ngen::InterfaceHandler &interface_, micro::GEMMProtocol protocol, + uint32_t gmdid, bool transposeC) { + using namespace micro; + Package package; + + auto problem = problem_; + gemmAutoTypeConversions(problem, strategy); + gemmMicrokernel(problem, strategy, interface_); + + package.protocol = protocol; + package.gmdidCompat = gmdid; + package.binary = this->getCode(); + + for (auto parg : package.protocol.arguments()) { + Argument arg; + arg.name = parg.name; + + if (arg.name == "c") { + int tileM = strategy.unroll[LoopM]; + int tileN = strategy.unroll[LoopN]; + int blockM = outputCLayout[0].nr; + int blockN = outputCLayout[0].nc; + + for (auto &block : outputCLayout) { + if (blockM != block.nr) stub(); + if (blockN != block.nc) stub(); + } + if (!isLayoutColMajor(outputCLayout)) + stub(); /* Swap dims and block ordering */ + + int blockGRF = 8 / problem.Tc; + for (auto &r : outputCRange.ranges) + blockGRF = gcd(blockGRF, r.getLen()); + int maxBlock = blockGRF * GRF::bytes(hw); + + if (blockM > maxBlock) { + if (blockM % maxBlock) stub(); + blockM = maxBlock; + blockN = 1; 
+ } else if (blockM * blockN > maxBlock) { + int split = (blockM * blockN / maxBlock); + if (blockM * blockN % maxBlock || blockN % split) stub(); + blockN /= split; + } + + arg.location.resize((tileM * tileN) / (blockM * blockN)); + int idx = 0; + for (int bj = 0; bj < tileN; bj += blockN) { + for (int bi = 0; bi < tileM; bi += blockM) { + const RegisterBlock *block; + int ne; + auto topLeft = findBlockReg(problem.Tc, outputCLayout, bi, + bj, outputCRange, ne, block); + arg.location[idx].boffset + = topLeft.getBase() * GRF::bytes(hw) + + topLeft.getByteOffset(); + arg.location[idx].blen = blockM * blockN * problem.Tc; + idx++; + } + } + + arg.sizes.dims[0] = tileM; + arg.sizes.dims[1] = tileN; + arg.sizes.block[0] = blockM; + arg.sizes.block[1] = blockN; + } else { + const char *aname = parg.name; + if (arg.name == "a") aname = "A"; + if (arg.name == "b") aname = "B"; + if (arg.name == "slm") aname = "slm_base"; + auto reg = interface.getArgument(aname); + arg.location.resize(1); + arg.location[0].boffset + = reg.getBase() * GRF::bytes(hw) + reg.getByteOffset(); + arg.location[0].blen = reg.getBytes(); + } + + if (arg.name == "a") arg.actualType = microType(problem.Ta_ext); + if (arg.name == "b") arg.actualType = microType(problem.Tb_ext); + if (arg.name == "c") arg.actualType = microType(problem.Tc); + + if (transposeC) { + if (arg.name == "a") + arg.name = "b"; + else if (arg.name == "b") + arg.name = "a"; + else if (arg.name == "lda") + arg.name = "ldb"; + else if (arg.name == "ldb") + arg.name = "lda"; + else if (arg.name == "m") + arg.name = "n"; + else if (arg.name == "n") + arg.name = "m"; + else if (arg.name == "i0") + arg.name = "j0"; + else if (arg.name == "j0") + arg.name = "i0"; + else if (arg.name == "local_id_m") + arg.name = "local_id_n"; + else if (arg.name == "local_id_n") + arg.name = "local_id_m"; + } + + package.arguments.push_back(std::move(arg)); + } + + auto effLoopM = !transposeC ? LoopM : LoopN; + auto effLoopN = !transposeC ? 
LoopN : LoopM; + package.settings.push_back({"sg_tile_m", strategy.unroll[effLoopM]}); + package.settings.push_back({"sg_tile_n", strategy.unroll[effLoopN]}); + package.settings.push_back({"wg_tile_m", strategy.wgTile(effLoopM)}); + package.settings.push_back({"wg_tile_n", strategy.wgTile(effLoopN)}); + package.settings.push_back({"sg_per_wg_m", strategy.wg[effLoopM]}); + package.settings.push_back({"sg_per_wg_n", strategy.wg[effLoopN]}); + package.settings.push_back({"sg_per_wg_k", strategy.wg[LoopK]}); + package.settings.push_back( + {"slm_size", int(gemmSLMSize(problem, strategy, true))}); + + package.barrierCount = interface.getBarrierCount(); + + EntranceAgent::scan(package); + + return package; +} + +// Transpose a GEMM problem. +void GEMMProblem::transpose() { + std::swap(A, B); + std::swap(AO, BO); + std::swap(A_scale, B_scale); + std::swap(Ta, Tb); + std::swap(Ta_ext, Tb_ext); + std::swap(Tao, Tbo); + std::swap(Ta_scale, Tb_scale); + std::swap(aOffset, bOffset); + std::swap(aoPtrDims, boPtrDims); + std::swap(aScale2D, bScale2D); + std::swap(sumA, sumB); + std::swap(binaryRow, binaryCol); + binaryTrans.flip(); + for (auto &bsrc : binary) + bsrc.transpose(); + A.transpose(); + B.transpose(); + C.transpose(); + AO.transpose(); + BO.transpose(); + CO.transpose(); +} + +void MatrixAddressing::transpose() { + layout = transposeLayout(layout); + std::swap(tileR, tileC); +} + // Check if a non-named barrier is needed in addition to named barriers. 
bool GEMMStrategy::needsUnnamedBarrier(const GEMMProblem &problem) const { if (needsKLoopBarrier() && (!namedBarriers[LoopM] || !namedBarriers[LoopN])) @@ -22591,6 +22959,8 @@ void GEMMSuperkernelStrategy::preflight(HW hw, const GEMMProblem &problem) { void MatrixAddressingStrategy::preflight(HW hw) { newDP |= isBlock2D(accessType); + padded |= (base.getModel() == ModelSLM); + if (prefetch && newDP && cachingR == CacheSettingsLSC::Default) cachingR = CacheSettingsLSC::L1C_L3C; @@ -27515,6 +27885,96 @@ bool gemm_kernel_generator_t::copyRegisters(Type Ts, Type Td, return true; // Success } +// Copy one GRFMultirange to another, allowing overlap between the two. map[s] is the pending dst register for src register s (-1: nothing pending). +template +void gemm_kernel_generator_t::overlappedCopy(const GRFMultirange &src, + const GRFMultirange &dst, CommonState &state) { + constexpr int regs = GRF::maxRegs(); + std::array map; + + std::vector temps; + temps.reserve(src.getLen()); + + std::vector alloced; + + for (auto &m : map) + m = -1; + + for (int i = 0; i < src.getLen(); i++) + if (src[i].getBase() != dst[i].getBase()) + map[src[i].getBase()] = dst[i].getBase(); + + int n = 1, ne = elementsPerGRF(hw); + bool done = false; + bool useFloat = false; + + while (!done) { + bool progress = false; + done = true; + + // Copy registers where dst doesn't overlap src, then clear associated entries. + for (int i = 0; i < regs; i += n) { + n = 1; + if (map[i] >= 0) done = false; + + if (map[i] >= 0 && map[map[i]] < 0) { + temps.push_back(i); + if (i + 1 < regs && map[i + 1] == map[i] + 1 && map[map[i + 1]] < 0) { + /* copy 2 consecutive registers at once (2nd dst must not be a pending src); record both moved src registers */ + temps.push_back(i + 1); + map[i + 1] = -1; + n++; + } + + auto dt = useFloat ? DataType::f : DataType::ud; + useFloat = !useFloat; + + mov(n * ne, GRF(map[i]).retype(dt), GRF(i).retype(dt)); + map[i] = -1; + progress = true; + } + } + + if (!progress && !done) { + // Get a few temporaries to break cycles, copy and continue. + // Grab temporaries from already-moved src registers if available. NOTE(review): a moved src register can also be another element's dst and may already hold final data -- confirm such registers cannot be picked here. 
+ int unstuck = 0; + constexpr int maxUnstuck = 8; + std::array from, to; + + for (int i = 0; i < regs; i++) + if (map[i] >= 0) { + GRF temp; + if (temps.empty()) { + temp = state.ra.tryAlloc(); + if (temp.isInvalid()) { + if (unstuck == 0) + throw out_of_registers_exception(); + break; + } + alloced.push_back(temp); + } else { + temp = GRF(temps.back()); + temps.pop_back(); + } + + mov(ne, temp, GRF(i)); + from[unstuck] = temp.getBase(); + to[unstuck] = map[i]; + map[i] = -1; + if (++unstuck >= maxUnstuck) + break; /* that's enough for now */ + } + + for (int j = 0; j < unstuck; j++) + map[from[j]] = to[j]; + } + } + + for (auto &r : alloced) + state.ra.release(r); +} + // Get driver information from this strategy. template CommonDriverInfo gemm_kernel_generator_t::driverInfo( @@ -27627,7 +28087,7 @@ void gemm_kernel_generator_t::prologue( template void gemm_kernel_generator_t::prologue( const GEMMStrategy &strategy, GEMMState &state) { - prologue(strategy, state.simd32KMasks ? 32 : 16); + prologue(strategy, state.internalSIMD()); } // Generate the kernel epilogue. diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp index 7141e156321..0372132c4c3 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp @@ -42,6 +42,9 @@ #include "gpu/intel/jit/emulation.hpp" +#include "gpu/intel/microkernels/entrance_agent.hpp" +#include "gpu/intel/microkernels/package.hpp" + #include #include #include @@ -444,8 +447,9 @@ class MultishiftSubregister { struct MatrixAddressing { MatrixLayout layout; // Layout type (N/T/Pr/Pc) - uint8_t packSize; // # of elements in a packed row/column for packed layouts. - uint8_t crosspack; // Crosspack for packed layouts. + uint8_t packSize + = 0; // # of elements in a packed row/column for packed layouts. + uint8_t crosspack = 1; // Crosspack for packed layouts. 
uint8_t alignment; // Alignment for all addresses, offsets, and leading dimensions. uint8_t tileR = 0, tileC = 0; // Tiling (0 if none) for packed layouts. uint8_t panelLength @@ -457,6 +461,8 @@ struct MatrixAddressing { (isPacked(layout) ? (packSize * crosspack) : 1) * T); } + void transpose(); + private: static int sanitizeAlign(int align) { return std::min(128, largest_pow2_divisor(align)); @@ -985,6 +991,8 @@ struct GEMMProblem : public CommonProblem { bool quantized2DA() const { return (aoPtrDims == 2) || aScale2D; } bool quantized2DB() const { return (boPtrDims == 2) || bScale2D; } + void transpose(); + /* Kernel cache helpers. */ void serialize(serialized_data_t &s) const { s.append(Ta, Tb, Tc, Ts); @@ -1316,6 +1324,10 @@ struct GEMMStrategy : public GEMMStrategyPOD { bool checkAdd32Rem() const { return checkAdd32 && emulate.emulate64; } + bool registerOutput() const { + return C.base.getModel() == ngen::ModelInvalid; + } + int aqGroupKGranularity() const { return groupKReduce(slmA ? unrollKSLM : ka_load); } @@ -1390,6 +1402,7 @@ struct GEMMState : public CommonState { incr_beta; // ud, used for non-strided variable batch. ngen::Subregister alpha_array, beta_array; // q, used for non-strided variable batch. + ngen::Subregister slmBase; // ud std::vector binarySrcs; // q std::vector binaryOffsets; // q/d std::vector binaryLDs; // d @@ -1570,6 +1583,8 @@ struct GEMMState : public CommonState { } sysgemm; GEMMState(ngen::HW hw) : CommonState(hw) {} + + int internalSIMD() const { return simd32KMasks ? 32 : 16; } }; // GEMM superkernel problem. 
@@ -1723,6 +1738,13 @@ class gemm_kernel_generator_t : public jit_generator { const ngen::InterfaceHandler &interface_); void copy(CopyProblem problem, CopyStrategy strategy, const ngen::InterfaceHandler &interface_); + void gemmMicrokernel(GEMMProblem problem, GEMMStrategy strategy, + const ngen::InterfaceHandler &interface_); + micro::Package gemmMicrokernelPackage(const GEMMProblem &problem, + const GEMMStrategy &strategy, + const ngen::InterfaceHandler &interface_, + micro::GEMMProtocol protocol, uint32_t gmdid, + bool transposeC = false); static CommonDriverInfo driverInfo( GEMMProblem problem, const GEMMStrategy &strategy); @@ -1736,6 +1758,8 @@ class gemm_kernel_generator_t : public jit_generator { &interface = ngen::OpenCLCodeGenerator::interface_; std::exception_ptr lastException; + GRFMultirange outputCRange; + std::vector outputCLayout; std::ostream &getOutStream() const { return std::cerr; } @@ -2080,6 +2104,7 @@ class gemm_kernel_generator_t : public jit_generator { void saveMNLocalIDs(const GEMMStrategy &strategy, GEMMState &state); void saveKLocalIDSize(const GEMMStrategy &strategy, GEMMState &state); void releaseSavedMNLocalIDs(GEMMState &state); + void makeSLMBaseRelative(ngen::Subregister addr, const GEMMState &state); void doReadSuppressionWA( const CommonStrategy &strategy, CommonState &state); @@ -2818,14 +2843,16 @@ class gemm_kernel_generator_t : public jit_generator { bool prefetch = false); void gemmScaleInputs(const GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state); + void gemmCalcWGRemainders(const GEMMProblem &problem, + const GEMMStrategy &strategy, GEMMState &state); void gemmReverseLoops(const GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state); void gemmDowngradeAccess(const GEMMProblem &problem, GEMMStrategy &strategy, GEMMState &state); void gemmSubkernel( GEMMProblem &problem, GEMMStrategy &strategy, GEMMState state); - static size_t gemmSLMSize( - const GEMMProblem &problem, const 
GEMMStrategy &strategy); + static size_t gemmSLMSize(const GEMMProblem &problem, + const GEMMStrategy &strategy, bool computeMax = false); static size_t gemmPerKSLMSize( const GEMMProblem &problem, const GEMMStrategy &strategy); void gemmInitInterface(GEMMProblem &problem, GEMMStrategy &strategy, @@ -2927,6 +2954,8 @@ class gemm_kernel_generator_t : public jit_generator { const SubregisterPair &alpha_imag, bool conjugate, const CommonStrategy &strategy, CommonState &state, bool preserveSrc = false); + void overlappedCopy(const GRFMultirange &src, const GRFMultirange &dst, + CommonState &state); bool copyBody( CopyProblem &problem, CopyStrategy &strategy, CopyState &state); diff --git a/src/gpu/intel/jit/gemm/kernel_catalog.hpp b/src/gpu/intel/jit/gemm/kernel_catalog.hpp index d9bcc0dafeb..e103926bab7 100644 --- a/src/gpu/intel/jit/gemm/kernel_catalog.hpp +++ b/src/gpu/intel/jit/gemm/kernel_catalog.hpp @@ -56,6 +56,7 @@ struct Restrictions { }; enum RestrictionTags : char { + ReqDisabled = '@', ReqAlignFallback = '#', ReqBlock2DA = 'A', ReqNoBlock2DA = 'a', diff --git a/src/gpu/intel/jit/gemm/kernel_selector.cpp b/src/gpu/intel/jit/gemm/kernel_selector.cpp index e08fbabcdba..5cd3c7b631e 100644 --- a/src/gpu/intel/jit/gemm/kernel_selector.cpp +++ b/src/gpu/intel/jit/gemm/kernel_selector.cpp @@ -83,6 +83,34 @@ inline bool tagMatch(const char *tref, const char *tpattern) { return true; } +inline bool strategyMatch( + const CommonDriverInfo &info, const StrategyRequirement &req) { + int actual = 0; + switch (req.param) { + case StrategyRequirement::UnrollM: actual = info.unroll[LoopM]; break; + case StrategyRequirement::UnrollN: actual = info.unroll[LoopN]; break; + case StrategyRequirement::WGTileM: actual = info.wgTile(LoopM); break; + case StrategyRequirement::WGTileN: actual = info.wgTile(LoopN); break; + case StrategyRequirement::WGTileMN: + actual = info.wgTile(LoopM) * info.wgTile(LoopN); + break; + case StrategyRequirement::WGM: actual = info.wg[LoopM]; break; + 
case StrategyRequirement::WGN: actual = info.wg[LoopN]; break; + case StrategyRequirement::WGK: actual = info.wg[LoopK]; break; + case StrategyRequirement::WG: + actual = info.wg[LoopM] * info.wg[LoopN] * info.wg[LoopK]; + break; + default: return false; + } + + switch (req.relation) { + case StrategyRequirement::Equals: return (actual == req.value); + case StrategyRequirement::AtLeast: return (actual >= req.value); + case StrategyRequirement::AtMost: return (actual <= req.value); + default: return false; + } +} + bool matches(const kcatalog::Entry &e, const MatchParams &pattern) { bool ok = true; @@ -120,6 +148,9 @@ bool matches(const kcatalog::Entry &e, const MatchParams &pattern) { } } + for (int i = 0; i < pattern.nExtraReqs; i++) + ok = ok && strategyMatch(e.driverInfo, pattern.extraReqs[i]); + // Should already be matched. ok = ok && (e.selector.hw == pattern.selector.hw); ok = ok @@ -320,6 +351,18 @@ MatchParamsBase::MatchParamsBase(ngen::HW hw, const GEMMProblem &problem) { sizes.batch = sizes.m = sizes.n = sizes.k = 0; } +void StrategyRequirement::transpose() { + switch (param) { + case UnrollM: param = UnrollN; break; + case UnrollN: param = UnrollM; break; + case WGTileM: param = WGTileN; break; + case WGTileN: param = WGTileM; break; + case WGM: param = WGN; break; + case WGN: param = WGM; break; + default: break; + } +} + } // namespace jit } // namespace intel } // namespace gpu diff --git a/src/gpu/intel/jit/gemm/kernel_selector.hpp b/src/gpu/intel/jit/gemm/kernel_selector.hpp index 42df510e191..c6c121e9573 100644 --- a/src/gpu/intel/jit/gemm/kernel_selector.hpp +++ b/src/gpu/intel/jit/gemm/kernel_selector.hpp @@ -31,6 +31,40 @@ namespace intel { namespace jit { // Basic kernel selection API. 
+struct StrategyRequirement { + enum Parameter { + UnrollM, + UnrollN, + WGTileM, + WGTileN, + WGTileMN, + WGM, + WGN, + WGK, + WG + } param; + enum Relation { Equals, AtLeast, AtMost } relation; + int value; + + StrategyRequirement(Parameter param_, Relation relation_, int value_) + : param(param_), relation(relation_), value(value_) {} + + template + friend StrategyRequirement operator==(Parameter param_, T value_) { + return StrategyRequirement(param_, Equals, int(value_)); + } + template + friend StrategyRequirement operator<=(Parameter param_, T value_) { + return StrategyRequirement(param_, AtMost, int(value_)); + } + template + friend StrategyRequirement operator>=(Parameter param_, T value_) { + return StrategyRequirement(param_, AtLeast, int(value_)); + } + + void transpose(); +}; + struct MatchParamsBase { kcatalog::Selector selector; SizeParams sizes; @@ -40,6 +74,8 @@ struct MatchParamsBase { int alignment[3] = {0, 0, 0}; kcatalog::string tags, lateTags; int unroll[2] = {0, 0}; + int nExtraReqs = 0; + const StrategyRequirement *extraReqs = nullptr; MatchParamsBase() {} MatchParamsBase(ngen::HW hw, const GEMMProblem &problem); diff --git a/src/gpu/intel/jit/gemm/microkernel_provider.cpp b/src/gpu/intel/jit/gemm/microkernel_provider.cpp new file mode 100644 index 00000000000..dcd3a5f068c --- /dev/null +++ b/src/gpu/intel/jit/gemm/microkernel_provider.cpp @@ -0,0 +1,335 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "microkernel_provider.hpp" +#include "gen_gemm_kernel_generator.hpp" +#include "gpu/intel/jit/ngen/npack/neo_packager.hpp" +#include "kernel_selector.hpp" +#include "strategy_parser.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace jit { + +#define _CATALOG_ CatalogMMR +#include "ukernel_mmr.db" +; +#undef _CATALOG_ + +#define _CATALOG_ CatalogLMR +#include "ukernel_lmr.db" +; +#undef _CATALOG_ + +#define _CATALOG_ CatalogMLR +#include "ukernel_mlr.db" +; +#undef _CATALOG_ + +using namespace ngen; +using namespace micro; + +static inline bool getStrategyByHeuristics(HW hw, GEMMStrategy &strategy, + bool localA, bool localB, GEMMProblem &problem, HWInformation hwInfo, + SizeParams sizes, const std::vector &reqs); + +Package selectGEMMMicrokernel(GEMMProtocol protocol, HWInformation hwInfo, + SizeParams sizes, const GEMMProblem &problem_, + const std::vector &reqs_) { + kcatalog::Catalog catalog; + + bool localA = protocol.options().localA; + bool localB = protocol.options().localB; + bool beta1 = protocol.options().addToC; + bool transC = !isColMajor(problem_.C.layout); + + auto problem = problem_; + auto reqs = reqs_; + + problem.alpha = 1; + problem.beta = beta1 ? 
1 : 0; + + problem.C.setAlignment(4); + + if (transC) { + problem.transpose(); + std::swap(localA, localB); + std::swap(sizes.m, sizes.n); + for (auto &req : reqs) + req.transpose(); + } + + /* Get hardware information */ + auto product = npack::decodeHWIPVersion(hwInfo.gmdid); + auto hw = getCore(product.family); + auto stepping = hwInfo.gmdid & 0xFF; + + /* Create catalog matcher */ + MatchParams matchParams(hw, problem); + + matchParams.sizes = sizes; + matchParams.stepping = stepping; + matchParams.nExtraReqs = int(reqs.size()); + matchParams.extraReqs = reqs.data(); + + auto tags = const_cast(matchParams.tags); + while (*tags) + tags++; + *tags++ = kcatalog::ReqBlock2DA; + *tags++ = kcatalog::ReqBlock2DB; + if (hwInfo.systolicAvailable) *tags++ = kcatalog::ReqSystolic; + + /* Provide information for kernel selection */ + EvaluateParams evalParams; + evalParams.sizes = matchParams.sizes; + evalParams.alpha = 1; + evalParams.beta = 0; + evalParams.euCount = hwInfo.euCount; + + /* Locate appropriate kernel catalog */ + if (localA && localB) throw std::runtime_error("Unsupported protocol"); + + if (localA) + catalog = CatalogLMR; + else if (localB) + catalog = CatalogMLR; + else + catalog = CatalogMMR; + + /* Call kernel selector */ + EvaluateAuxOutput auxParams; + auto entry = select(catalog, 1, &matchParams, evalParams, auxParams); + + GEMMStrategy strategy(hw, stepping); + + if (entry) { + problem.A.setAlignment( + std::max(problem.Ta.size(), entry->driverInfo.alignment[0])); + problem.B.setAlignment( + std::max(problem.Tb.size(), entry->driverInfo.alignment[1])); + + /* Prepare strategy parameters */ + strategy.unroll[LoopM] = entry->driverInfo.unroll[LoopM]; + strategy.unroll[LoopN] = entry->driverInfo.unroll[LoopN]; + parseStrategy(entry->strategy, hw, problem, strategy); + adjustStrategy(hw, problem, strategy); + modifyStrategy(strategy, auxParams); + + /* Xe2-XeHPC compatibility logic */ + if (hw == ngen::HW::Xe2) { + // Use XeHPC register banking on Xe2, 
in order + // to successfully reuse XeHPC strategies. + strategy.raHW = ngen::HW::XeHPC; + + // Bump up alignments to 16 bytes for block 2D if available. + bool block2DA = false, block2DB = false; + for (auto c = entry->restrictions.tags; *c; c++) { + block2DA |= (*c == kcatalog::ReqBlock2DA); + block2DB |= (*c == kcatalog::ReqBlock2DB); + } + if (block2DA && strategy.legalAAlignment(problem, 16)) + problem.A.setAlignment(std::max(problem.A.alignment, 16)); + if (block2DB && strategy.legalBAlignment(problem, 16)) + problem.B.setAlignment(std::max(problem.B.alignment, 16)); + } + } else if (!getStrategyByHeuristics(hw, strategy, localA, localB, problem, + hwInfo, sizes, reqs)) + throw std::runtime_error("No matching kernel"); + + strategy.systolicAvailable &= hwInfo.systolicAvailable; + + /* Disable strategies not related to microkernels */ + strategy.kParallel = strategy.kParallelVariable = strategy.persistent + = false; + strategy.cWalkOrder = WalkOrder::HW2D; + + /* Adjust strategy for performance */ + if (strategy.barrierFreq > 0 && sizes.k < 4 * strategy.barrierFreq) + strategy.barrierFreq = 0; + + /* Keep size down by only using checkAdd32 when really needed */ + strategy.checkAdd32 &= (hw != HW::XeHPC); + + /* C output in registers */ + strategy.C.base = AddressBase {}; + + strategy.preflight(hw, problem); + + /* Set up arguments for microkernel */ + InterfaceHandler interface(hw); + + interface.setArgumentBase(ngen::GRF(8)); + interface.newArgument("A", + localA ? ExternalArgumentType::LocalPtr + : ExternalArgumentType::GlobalPtr); + interface.newArgument("lda", DataType::d); + interface.newArgument("B", + localB ? 
ExternalArgumentType::LocalPtr + : ExternalArgumentType::GlobalPtr); + interface.newArgument("ldb", DataType::d); + interface.newArgument("m", DataType::d); + interface.newArgument("n", DataType::d); + interface.newArgument("k", DataType::d); + interface.newArgument("i0", DataType::d); + interface.newArgument("j0", DataType::d); + interface.newArgument("h0", DataType::d); + interface.newArgument("local_id_m", DataType::d); + interface.newArgument("local_id_n", DataType::d); + if (protocol.options().slmPtr) + interface.newArgument("slm_base", ExternalArgumentType::LocalPtr); + + /* Update problem from strategy */ + if (isPacked(problem.A.layout)) problem.A.packSize = strategy.unroll[LoopM]; + if (isPacked(problem.B.layout)) problem.B.packSize = strategy.unroll[LoopN]; + + /* Generate microkernel */ +#define ARCH_DISPATCH(arch) \ + case HW::arch: { \ + gemm_kernel_generator_t generator; \ + generator.setStepping(stepping); \ + return generator.gemmMicrokernelPackage( \ + problem, strategy, interface, protocol, hwInfo.gmdid, transC); \ + } + + switch (hw) { + ARCH_DISPATCH(Gen9) + ARCH_DISPATCH(Gen11) + ARCH_DISPATCH(XeLP) + ARCH_DISPATCH(XeHP) + ARCH_DISPATCH(XeHPG) + ARCH_DISPATCH(XeHPC) + ARCH_DISPATCH(Xe2) + default: throw std::runtime_error("Unsupported architecture"); + } +#undef ARCH_DISPATCH +} + +static inline bool getStrategyByHeuristics(HW hw, GEMMStrategy &strategy, + bool localA, bool localB, GEMMProblem &problem, HWInformation hwInfo, + SizeParams sizes, const std::vector &reqs) { + if (hw < HW::XeHPG) return false; + if (problem.C.layout == MatrixLayout::T) return false; + if (!hwInfo.systolicAvailable) return false; + if (problem.Ta.size() != 2 || problem.Tb.size() != 2) return false; + + bool block2DA = (hw >= HW::XeHPC) && (problem.A.alignment % 16) == 0; + bool block2DB = (hw >= HW::XeHPC) && (problem.B.alignment % 16) == 0; + + problem.A.alignment = std::min(problem.A.alignment, 16); + problem.B.alignment = std::min(problem.B.alignment, 16); + + 
auto &s = strategy; + + s.ka_load = s.kb_load = 16; + + if (problem.A.layout == MatrixLayout::Pc) { + s.A.accessType = AccessType::Block; + s.A_copies = 2; + s.A.padded = true; + } else if (!block2DA) { + s.A.accessType = AccessType::Block; + s.ka_load = (problem.A.layout == MatrixLayout::T) ? 32 : 16; + s.slmA = true; + } else if (problem.A.layout == MatrixLayout::T) { + s.A.accessType = AccessType::Block2DTranspose; + s.ka_load = 32; + } else if (problem.A.layout == MatrixLayout::N) { + s.A.accessType = AccessType::Block2DVNNI; + s.A_copies = 2; + } + + if (problem.B.layout == MatrixLayout::Pr) { + s.B.accessType = AccessType::Block; + s.B.padded = true; + s.B_copies = 2; + } else if (!block2DB) { + s.B.accessType = AccessType::Block; + s.doubleMasking = true; + s.kb_load = (problem.B.layout == MatrixLayout::N) ? 32 : 16; + s.slmB = true; + } else if (problem.B.layout == MatrixLayout::T) + s.B.accessType = AccessType::Block2DTranspose; + else if (problem.B.layout == MatrixLayout::N) { + s.B.accessType = AccessType::Block2D; + s.kb_load = 32; + } + + s.C.accessType = AccessType::Block; + + s.A.base = localA ? AddressBase::createSLM() : AddressBase::createA64(true); + s.B.base = localB ? 
AddressBase::createSLM() : AddressBase::createA64(true); + s.A.newDP = true; + s.B.newDP = true; + s.A.cachingR = s.B.cachingR = CacheSettingsLSC::L1C_L3C; + + s.A_prefetch = s.A; + s.B_prefetch = s.B; + s.A_prefetch.prefetch = s.B_prefetch.prefetch = true; + + if (!localA && block2DA) { + if (!isPacked(problem.A.layout)) + s.A_prefetch.accessType = AccessType::Block2D; + s.prefetchA = s.prefetchAMasked = 2 * s.ka_load; + s.ka_pfStride = s.ka_prefetch = s.ka_load; + } + + if (!localB && block2DB) { + if (!isPacked(problem.B.layout)) + s.B_prefetch.accessType = AccessType::Block2D; + s.prefetchB = s.prefetchBMasked = 2 * s.kb_load; + s.kb_pfStride = s.kb_prefetch = s.kb_load; + } + + s.unroll[LoopK] = 1; + s.wg[LoopK] = 1; + s.unroll[LoopM] = s.unroll[LoopN] = 0; + s.wg[LoopM] = s.wg[LoopN] = 0; + + for (auto &req : reqs) + switch (req.param) { + case StrategyRequirement::UnrollM: + s.unroll[LoopM] = req.value; + break; + case StrategyRequirement::UnrollN: + s.unroll[LoopN] = req.value; + break; + case StrategyRequirement::WGM: s.wg[LoopM] = req.value; break; + case StrategyRequirement::WGN: s.wg[LoopN] = req.value; break; + case StrategyRequirement::WGK: s.wg[LoopK] = req.value; break; + default: break; + } + + if (s.wgTile(LoopM) * s.wgTile(LoopN) == 0) return false; + + s.systolic = true; + s.registerScheme = GEMMStrategy::VAvoid; + if (s.wgTile(LoopM) * s.wgTile(LoopN) > 512) s.GRFs = 256; + if (localA && !localB) s.loadBFirst = true; + + if (s.slmA || s.slmB) s.slmBuffers = 1; + + adjustStrategy(hw, problem, strategy); + + return true; +} + +} // namespace jit +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/jit/gemm/microkernel_provider.hpp b/src/gpu/intel/jit/gemm/microkernel_provider.hpp new file mode 100644 index 00000000000..93168559d78 --- /dev/null +++ b/src/gpu/intel/jit/gemm/microkernel_provider.hpp @@ -0,0 +1,55 @@ 
+/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef MICROKERNEL_PROVIDER_HPP +#define MICROKERNEL_PROVIDER_HPP + +#include "gpu/intel/microkernels/package.hpp" +#include "kernel_evaluator.hpp" +#include "kernel_selector.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace jit { + +/* Hardware information for microkernel provider */ +struct HWInformation { + uint32_t gmdid; + int euCount; + bool systolicAvailable; +}; + +micro::Package selectGEMMMicrokernel(micro::GEMMProtocol protocol, + HWInformation hwInfo, SizeParams sizes, const GEMMProblem &problem, + const std::vector &reqs + = std::vector()); + +/* Helpers: alignmentForLD returns the lowest set bit of ld among 1..64, else 128 (so ld == 0 gives 128) */ +static inline int alignmentForLD(int ld) { + for (int x = 1; x <= 64; x <<= 1) + if (ld & x) return x; + return 128; +} + +} // namespace jit +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif /* header guard */ diff --git a/src/gpu/intel/jit/gemm/strategy_parser.cpp b/src/gpu/intel/jit/gemm/strategy_parser.cpp index 194689f3508..5922cb95e2d 100644 --- a/src/gpu/intel/jit/gemm/strategy_parser.cpp +++ b/src/gpu/intel/jit/gemm/strategy_parser.cpp @@ -60,7 +60,9 @@ AddressBase getAddressBase(char c) { switch (c) { case 'a': return AddressBase::createA64(true); case 'c': return 
AddressBase::createCC(0); + case 'l': return AddressBase::createSLM(); case 'm': return AddressBase::createSC(0); + case 'r': return AddressBase {}; case 's': return AddressBase::createBTS(0); default: throw std::runtime_error("Unknown address space."); } diff --git a/src/gpu/intel/jit/gemm/ukernel_lmr.db b/src/gpu/intel/jit/gemm/ukernel_lmr.db new file mode 100644 index 00000000000..03bafb96895 --- /dev/null +++ b/src/gpu/intel/jit/gemm/ukernel_lmr.db @@ -0,0 +1,21 @@ +/******************************************************************************* +* INTEL CONFIDENTIAL +* Copyright 2024 Intel Corporation. +* +* This software and the related documents are Intel copyrighted materials, and +* your use of them is governed by the express license under which they were +* provided to you (License). Unless the License provides otherwise, you may not +* use, modify, copy, publish, distribute, disclose or transmit this software or +* the related documents without Intel's prior written permission. +* +* This software and the related documents are provided as is, with no express +* or implied warranties, other than those that are expressly stated in the +* License. 
+*******************************************************************************/ + +/*@kcatalog@*/ +kcatalog::FlatCatalog<2> _CATALOG_ +{1, 4, 2, { +{{'F', "ugemm", {"H", "H", "S"}, {"A2#16,64", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "xzIB"}, "lB16x2 am32+m32@64 rB sys wg 2x4 vav grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {128, 8, 4}, {false, true, false}}, {'W', 1, {1024}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "xzIB"}, "lS16x2 am32+m32@64 rB sys wg 2x4 vav grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {false, true, false}}, {'W', 1, {1024}}} +}} diff --git a/src/gpu/intel/jit/gemm/ukernel_mlr.db b/src/gpu/intel/jit/gemm/ukernel_mlr.db new file mode 100644 index 00000000000..bc6ecf4c9e8 --- /dev/null +++ b/src/gpu/intel/jit/gemm/ukernel_mlr.db @@ -0,0 +1,30 @@ +/******************************************************************************* +* INTEL CONFIDENTIAL +* Copyright 2024 Intel Corporation. +* +* This software and the related documents are Intel copyrighted materials, and +* your use of them is governed by the express license under which they were +* provided to you (License). Unless the License provides otherwise, you may not +* use, modify, copy, publish, distribute, disclose or transmit this software or +* the related documents without Intel's prior written permission. +* +* This software and the related documents are provided as is, with no express +* or implied warranties, other than those that are expressly stated in the +* License. 
+*******************************************************************************/ + +/*@kcatalog@*/ +kcatalog::FlatCatalog<11> _CATALOG_ +{1, 18, 11, { +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "yI"}, "aB16x2 lB32 rB sys dw wg 4x8 vav", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, false, true}}, {'W', 1, {64}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 1, 1}, "yzIA"}, "av16x2+m16@32 lB16x2 rB sys wg 2x4 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 128, 4}, {true, false, false}}, {'W', 1, {512}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m16@32 lB32x2 rB sys wg 4x8 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {128}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m16@32 lB32x2 rB sys wg 2x4 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m16@32 lB32 rB sys wg 2x2 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, 
{32, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {512}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m16@32 lB32 rB sys wg 2x4 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {512}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m16@32 lB32 rB sys wg 2x8 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {512}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yIA"}, "av16x2+m32@32 lB32 rB sys wg 2x16 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, true}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AIy"}, "aB16x2+m32@32 lB32 rB sys wg 2x16 vav l4 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {2, 4, 4}, {true, false, true}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yzIA"}, "av16x2+m16@32 lS16x2 rB sys wg 2x4 vav", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 4, 1}, 1, 
(WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, false}}, {'W', 1, {512}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "yzIA"}, "at16x2+m16@32 lB32 rB sys wg 2x4 vav grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, false, false}}, {'W', 1, {1024}}} +}} diff --git a/src/gpu/intel/jit/gemm/ukernel_mmr.db b/src/gpu/intel/jit/gemm/ukernel_mmr.db new file mode 100644 index 00000000000..461b5c3a0fd --- /dev/null +++ b/src/gpu/intel/jit/gemm/ukernel_mmr.db @@ -0,0 +1,726 @@ +/******************************************************************************* +* INTEL CONFIDENTIAL +* Copyright 2022-2024 Intel Corporation. +* +* This software and the related documents are Intel copyrighted materials, and +* your use of them is governed by the express license under which they were +* provided to you (License). Unless the License provides otherwise, you may not +* use, modify, copy, publish, distribute, disclose or transmit this software or +* the related documents without Intel's prior written permission. +* +* This software and the related documents are provided as is, with no express +* or implied warranties, other than those that are expressly stated in the +* License. 
+*******************************************************************************/ + +/*@kcatalog@*/ +kcatalog::FlatCatalog<707> _CATALOG_ +{1, 10447, 707, { +{{'C', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 as8 rb l4 ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2 ab8 rb l4 cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8x2 ab16/8 rb l4 cb1 wg 8x2 vnc nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 16}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab4 rs l4 cb1 wg 8x2 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab4x2 ab4 rb k8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 768}, {4096, 4096, 768}, {8, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 8}, 
{true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab4 ab4x2 rb k8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 768}, {4096, 4096, 768}, {16, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {128, 64, 8}, {true, true, false}}, {'W', 1, {1e+06}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab2x2 ab8/4 rb k16 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 512}, {4096, 4096, 512}, {16, 8, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab2x2 ab4x2 rb k8 int ns64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 768}, {4096, 4096, 768}, {16, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab8/4 as2x2 rs k16 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {8, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"C", "C", "C"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "ab8/4 ab4 rs k8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 512}, {4096, 4096, 512}, {8, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
""}, "ab4x2 ab4x2 rb k8 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "ab4x2 ab32/8 rb k64 l4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 64}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1024, 32, -1}, {1, 1, 1}, ""}, "ab4x2 as8x2 rb l4 int nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 4, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {1, 1, 1}, ""}, "ab2x2 as16 rb l4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 8, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qxy"}, "ab2/1 as8x2 rb l4 ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {64, 16, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'C', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 as8 rb l4 ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 
4096, 2048}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 ab2x2 rb k4 l4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 32, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'C', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2 ab8 rb l4 cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4 as8 rb k8 l4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'C', "ugemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 32, -1}, {1, 1, 1}, "p"}, "as8x2 ab32 rb l4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8x2 ab16/8 rb l4 cb1 wg 8x2 vnc nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 16}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", 
{"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab2x2 rb k16 l4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'C', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab4 rs l4 cb1 wg 8x2 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16x2 ab8x2 rb int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "ab4 ab8 rb l4 int k32 cab1 wg 4x4 ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {2048, 8, -1}, {1, 1, 1}, "xyz"}, "ab8x2 as16x2 rb l4 ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {5280, 32, -1}, {1, 1, 
1}, ""}, "ab4 as32 rb l4 cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 8, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 32, -1}, {1, 1, 1}, "@nxyz"}, "ab4x2 as16 rb l4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {64, 8, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "ab8 ab4 rb l4 int k16 cab1 wg 4x4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "ab8x2 ab8x2 rb l4 vnc k32 cab1 wg 4x4 ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "ab8 ab4 rb l4 int k32 cab1 wg 4x4 fn nmk ek", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 ab2x2 rb int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 
1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {16, 32, 4}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 rb k8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {128, 64, 4}, {true, true, false}}, {'W', 1, {1e+06}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 as4x2 rb int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 4, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {64}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2 ab32 rb ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 8, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@pq"}, "ab4x2 ab16/8 rb k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 12, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {384}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 32, -1}, {1, 1, 1}, ""}, "ab2x2 ab8 rs cb1 wg 8x2 int nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 1024, 0, {4, 4, 4}, {true, true, false}}, 
{'W', 1, {512}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4096, -1, -1}, {1, 1, 1}, ""}, "ab8 ab8 rb int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, -1, -1}, {1, 1, 1}, ""}, "ab16 ab32/16x2 rb ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, -1, -1}, {1, 1, 1}, "v"}, "ab16 ab32/16x2 rb ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "ab32 ab32 rb ca1 wg 2x8 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 8, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {64}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab16 rb cab1 wg 4x4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {64}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {1, 1, 1}, "v"}, "ab4 ab16 rb cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {8, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 ab4x2 rb k8 int ns64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {512}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab16 rb cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {8, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 rb vnc nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {16, 16, 4}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab8 ab8 rb k16 cab1 wg 4x4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 31, -1}, {1, 1, 1}, ""}, "ab8x2 as8x2 rs cab1 wg 4x4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 
1536}, {4096, 4096, 1536}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {128}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16 ab32 rb ca1 wg 2x8 int ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "ab16 ab32 rb ca1 wg 2x8 int ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {32}}}, +{{'C', "ugemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qz"}, "ab8 ab4x2 rs k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {12, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {1e+06}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 rB wg 8x8 cab4x2 ks32 xaf dw vav di sn grf256 sys l4 dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {579626, 1.30873e+06, 0, 0, 0, 0, 6.16503, 6.88343, 4.54054, 13.1904, 0.0528026, 0.0528026, 0, 1, 1.21396, 1.2014, 5.22092e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 rB wg 2x16 cab4 ks64 af dw vav di sn grf256 sys dm", {8, 
(LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.13494e+06, 565723, 0, 0, 0, 0, 6.58651, 4.63582, 2.77482, 9.77739, 0.0825648, 0.0364345, 0.0773913, 0.987187, 1.21508, 1.20179, 4.02242e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 rB wg 4x8 cab4 ks64 af dw vav di sn grf256 sys l4 dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.17572e+06, 607273, 0, 0, 0, 0, 6.45464, 5.31845, 3.15766, 10.158, 0.132027, 0.0429529, 0.158416, 0.988589, 1.21634, 1.20182, 5.20966e-16}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB64 rB wg 2x16 cab4x2 ks64 af dw vav di sn grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 2, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.12082e+06, 548504, 0, 0, 0, 0, 8.01234, 4.39347, 4.40929, 17.0427, 0.211072, 0.304088, 0.114402, 0.99056, 1.22213, 1.20282, -6.02264e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 2x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.4135e+06, 253984, 0, 0, 0, 0, 10.236, 15.0824, 3.09469, 9.93698, 0.171264, 0.0287612, 0.217312, 1, 1.20862, 1.20257, -4.41257e-15}}}, +{{'E', 
"ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 8x4 cab4x2 ks16 xaf st dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08792e+06, 567270, 0, 0, 0, 0, 5.72096, 5.41042, 6.54953, 18.1672, 0.017198, 0.00716878, 0.0131277, 0.81547, 1.46216, 1.16767, 2.15216e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "aB16 aB16 rB wg 4x8 cab3x2 ks32 xaf dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.12984e+06, 584334, 0, 0, 0, 0, 5.52191, 5.62337, 6.60649, 17.5355, 0.0194272, 0.00902898, 0.0139238, 0.977022, 1.42548, 1.17566, 1.22702e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab4x2 ks32 xaf dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08172e+06, 516945, 0, 0, 0, 0, 5.56094, 5.32487, 6.43486, 16.8147, 0.0232929, 0.0282432, 0.0105684, 0.710233, 1.39477, 1.17707, 1.37161e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 
257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {568975, 1.35348e+06, 0, 0, 0, 0, 5.67121, 5.37584, 6.51139, 18.2712, 0.0196859, 0.0196859, 0, 1, 1.35089, 1.18161, 7.86876e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 4x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.04074e+06, 498949, 0, 0, 0, 0, 5.38936, 5.23734, 5.98628, 16.2827, 0.0274674, 0.010762, 0.0237025, 0.945984, 1.24132, 1.19515, 1.88597e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {64, 64, 64}, {-1, -1, -1}, {64, 64, 64}, {1, 1, 1}, "V"}, "aB8 aB8 rB wg 4x8 kc8 cab4 ks8 nse sn l4 dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4x2 aB4x2 rB wg 4x8 kc4 cab4 ks8 nse di sn grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 11.0509, 
6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {548489, 1.09288e+06, 0, 0, 0, 0, 8.2211, 16.0374, 5.20066, 13.9736, 0.103914, 0.103914, 0, 0.941962, 1.20843, 1.20104, 5.39326e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {563029, 1.05131e+06, 0, 0, 0, 0, 14.5376, 15.5776, 3.59843, 13.665, 0.178851, 0.178851, 0, 1, 1.20856, 1.20159, 4.80437e-16}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 8x4 cab4x2 ks16 af dw vav di sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.26456e+06, 286183, 0, 0, 0, 0, 5.81633, 8.57816, 4.66116, 13.8076, 0.137423, 0.0613044, 0.0898491, 0.892134, 1.20738, 1.20318, -9.09531e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 af dw vav di sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 
16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.26179e+06, 282811, 0, 0, 0, 0, 5.93672, 11.9594, 3.04212, 9.63196, 0.237524, 0.1208, 0.174179, 0.963212, 1.20937, 1.20459, -2.01852e-14}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {573184, 1.12993e+06, 0, 0, 0, 0, 10.2158, 17.4327, 5.79394, 15.404, 0.0613861, 0.0613861, 0, 1, 1.20766, 1.20072, 1.08665e-14}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 rB wg 4x8 kc2 cab4 ks8 nse di l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.22155e+06, 318136, 0, 0, 0, 0, 11.9818, 10.4458, 6.43557, 17.6607, 0.129335, 0.122263, 0.016479, 0.835522, 1.17418, 1.03038, 5.72485e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {982395, 479263, 0, 0, 0, 0, 6.10212, 6.1884, 6.59917, 17.8536, 0.0214419, 0.00652312, 0.0168029, 0.617745, 1.33872, 1.18408, 8.9311e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab3 
ks32 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {993932, 437006, 0, 0, 0, 0, 5.67036, 6.48396, 6.52602, 16.8605, 0.0283748, -0.00314213, 0.0387446, 0.666054, 1.28983, 1.18846, 4.68723e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {581508, 1.13635e+06, 0, 0, 0, 0, 6.25209, 6.61747, 6.57388, 18.2853, 0.0236389, 0.0236389, 0, 0.993208, 1.26513, 1.19125, 2.88352e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab3 ks32 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {981474, 437790, 0, 0, 0, 0, 6.24369, 6.37532, 6.49232, 16.9191, 0.0283921, 0.0130048, 0.0211151, 0.927229, 1.298, 1.18778, 5.89935e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 rB wg 8x4 cab4 ks32 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.09148e+06, 534838, 0, 0, 0, 0, 5.22674, 5.22482, 6.50064, 16.4865, 0.0218546, 0.00768442, 0.0199595, 0.840879, 1.41374, 1.18184, 
1.21865e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB32 rB wg 8x4 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.0738e+06, 487395, 0, 0, 0, 0, 4.60875, 4.65483, 5.61489, 14.464, 0.0516162, 0.0102235, 0.0545566, 0.938533, 1.22067, 1.20222, 3.04423e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 8x8 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {566031, 1.06674e+06, 0, 0, 0, 0, 6.12801, 5.51734, 3.1656, 13.6348, 0.0846202, 0.0846202, 0, 1, 1.21348, 1.20076, 2.80227e-14}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB64 rB wg 8x4 cab4 ks64 af dw vav di sm sn sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.10031e+06, 465178, 0, 0, 0, 0, 4.66255, 5.56717, 3.95779, 12.18, 0.0860145, 0.0695545, 0.0336857, 1, 1.21111, 1.20136, 5.42319e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 2x8 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 
0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.33233e+06, 224556, 0, 0, 0, 0, 4.71443, 4.70675, 3.65673, 10.5957, 0.105005, 0.0590905, 0.0614575, 1, 1.21201, 1.20236, -1.02685e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 rB wg 8x4 cab4 ks32 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08668e+06, 501544, 0, 0, 0, 0, 5.24563, 5.63352, 6.20576, 15.4182, 0.0341144, 0.0170101, 0.0263965, 1, 1.29682, 1.18968, 3.46159e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 4x4 cab3 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.28342e+06, 207114, 0, 0, 0, 0, 5.20558, 6.84739, 3.90187, 10.8408, 0.114743, 0.00818414, 0.107703, 0.865881, 1.21171, 1.20195, 3.61977e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 rB wg 4x4 cab3 ks64 af vav di sm grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "@Inpqxy"}, "aB16 aB16 rB wg 8x4 cab4 ks16 xaf dw vav sm dm 
grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.11178e+06, 622060, 0, 0, 0, 0, 5.62948, 5.39308, 6.34783, 18.0651, 0.0188949, 0.00722523, 0.0160939, 0.792179, 1.4111, 1.16673, 1.76659e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 4, 1}, "xyI"}, "aS16x2 aB16 rB wg 16x2 cb4x2 ks16 xaf dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {16, 4, 4}, {true, true, false}}, {'E', 17, {1.07596e+06, 547281, 0, 0, 0, 0, 5.3341, 5.22645, 6.47481, 17.5871, 0.0187721, 0.00841977, 0.0135396, 0.854887, 1.37769, 1.17708, 1.45209e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "aB32 aB16 rB wg 8x4 cab3 ks32 xaf dw vav sm dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05966e+06, 526716, 0, 0, 0, 0, 5.25431, 5.59994, 6.71912, 17.4322, 0.0194231, 0.00993451, 0.0135184, 0.980104, 1.41981, 1.17834, 1.31956e-12}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 xaf dw vav di sm sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {596002, 1.33567e+06, 0, 0, 0, 0, 5.53572, 5.47044, 6.55621, 18.2618, 0.0211424, 0.0211424, 
0, 1, 1.31489, 1.18381, 6.85524e-13}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4 aB4 rB wg 4x8 kc4 cab4 ks8 nse di sm sn l4 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x4 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.31167e+06, 785275, 0, 0, 0, 0, 7.11381, 8.75643, 6.11098, 15.9972, 0.0503546, 0.0303966, 0.0484271, 0.842682, 1.20649, 1.2023, -1.8357e-15}}}, +{{'E', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4x2 aB4x2 rB wg 8x4 kc4 cab4 ks8 nse di sm l4 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "uxyz"}, "ab2x2 ab4x2 rb ca1x2 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 8}, {2, 8, 1}, 1, (WGType) 1, 0, 2048, 0, {128, 64, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "uxyz"}, "ab4x2 ab2x2 rb cs di", {8, (LoopType) 0, 128, {(LoopType) 0, 
(LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 16, 4}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 8}, {true, true, false}}, {'W', 1, {1e+06}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "nuxyz"}, "ab2 ab8 rb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Nu"}, "ab2 ab8/4 rb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@Nu"}, "ab4/2 ab8/4 rb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 6144, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@nuxyz"}, "ab4 ab8/4 rb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "uxyz"}, "ab4 ab8 rb cab1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 8}, {2, 8, 1}, 1, 
(WGType) 1, 1, 5120, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Nu"}, "ab8/4 ab2 rs cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"C", "C", "C"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "nuxy"}, "ab8 ab2 rs cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {1, 1, 1}, ""}, "aS4x2 aS4x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB4x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@ipq"}, "aB8/4 aB8 rB wg 8x4 kc8 cab4 ks8 nse di sn l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 rB wg 8x8 cab4x2 ks32 xaf dw vav di sn grf256 sys l4 dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {577632, 1.31042e+06, 0, 0, 0, 0, 6.15959, 6.87283, 4.53581, 13.1549, 0.0527369, 0.0527369, 0, 1, 1.2126, 1.20117, 9.25713e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 rB wg 2x16 cab4 ks64 af dw vav di sn grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.14032e+06, 564642, 0, 0, 0, 0, 6.56444, 4.63397, 
2.77135, 9.77858, 0.0825907, 0.035527, 0.0779663, 0.969555, 1.21564, 1.20197, 9.03249e-16}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 rB wg 4x8 cab4 ks64 af dw vav di sn grf256 sys l4 dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.16856e+06, 607673, 0, 0, 0, 0, 6.45483, 5.30858, 3.13856, 10.1588, 0.13211, -0.0608739, 0.253501, 0.955862, 1.21927, 1.20201, -3.92067e-16}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB64 rB wg 2x16 cab4x2 ks64 af dw vav di sn grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 2, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.11663e+06, 549008, 0, 0, 0, 0, 7.99056, 4.41464, 4.40063, 17.071, 0.210664, 0.0956988, 0.192101, 1, 1.22479, 1.20258, -6.10193e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 2x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.41774e+06, 253424, 0, 0, 0, 0, 10.239, 15.1219, 3.1142, 9.95356, 0.171272, 0.0208162, 0.21871, 0.99885, 1.21298, 1.20221, -2.86036e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 8x4 cab4x2 ks16 xaf st dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 
255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08421e+06, 567300, 0, 0, 0, 0, 5.72582, 5.41621, 6.5602, 18.1823, 0.0171802, 0.00649357, 0.0137923, 0.748807, 1.55527, 1.15042, 2.94264e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "aB16 aB16 rB wg 4x8 cab3x2 ks32 xaf dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.13571e+06, 583999, 0, 0, 0, 0, 5.51731, 5.62733, 6.63464, 17.5494, 0.0194115, 0.00899104, 0.0142425, 0.950599, 1.50472, 1.17739, 2.4916e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab4x2 ks32 xaf dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.0878e+06, 516397, 0, 0, 0, 0, 5.57335, 5.3253, 6.42346, 16.8151, 0.0232576, 0.0155179, 0.01274, 0.807047, 1.46838, 1.16337, 1.98005e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {572492, 1.35365e+06, 0, 0, 0, 0, 5.65906, 5.37186, 6.53579, 18.2267, 0.0196679, 0.0196679, 0, 1, 1.44388, 1.17216, 1.57451e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 4x8 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.03854e+06, 499303, 0, 0, 0, 0, 5.39147, 5.24663, 6.00784, 16.2676, 0.0273688, 0.01059, 0.0237607, 0.949573, 1.33386, 1.18647, 7.31943e-13}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {64, 64, 64}, {-1, -1, -1}, {64, 64, 64}, {1, 1, 1}, "V"}, "aB8 aB8 rB wg 4x8 kc8 cab4 ks8 nse sn l4 dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {128}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4x2 aB4x2 rB wg 4x8 kc4 cab4 ks8 nse di sn grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@i"}, "aB8 aB8 rB wg 8x4 kc8 cab4 ks8 nse di l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 16}, {8, 
8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {568679, 1.11539e+06, 0, 0, 0, 0, 8.57882, 11.0683, 6.15089, 15.6211, 0.0531227, 0.0531227, 0, 1, 1.20724, 1.20171, 2.80388e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {549980, 1.09293e+06, 0, 0, 0, 0, 8.24445, 16.052, 5.21936, 14.0172, 0.104056, 0.104056, 0, 0.927309, 1.21031, 1.20229, -3.26446e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {563998, 1.05034e+06, 0, 0, 0, 0, 14.5229, 15.5849, 3.5838, 13.7015, 0.178533, 0.178533, 0, 1, 1.20753, 1.20157, 1.35023e-17}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 8x4 cab4x2 ks16 af dw vav di sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.25687e+06, 286505, 0, 0, 0, 0, 5.80146, 8.50854, 4.6718, 13.7597, 0.13751, 0.0574509, 0.0903835, 0.891908, 1.21163, 1.20152, 2.71443e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 af dw vav 
di sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.26051e+06, 282866, 0, 0, 0, 0, 5.93413, 11.8843, 3.04053, 9.63044, 0.237344, 0.0914411, 0.191295, 0.942713, 1.21014, 1.20249, -5.81423e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {571150, 1.13043e+06, 0, 0, 0, 0, 10.2194, 17.4328, 5.7629, 15.4125, 0.0613411, 0.0613411, 0, 1, 1.20744, 1.20181, 2.57881e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 rB wg 4x8 kc2 cab4 ks8 nse di l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.22683e+06, 317772, 0, 0, 0, 0, 11.9975, 10.4327, 6.41073, 17.69, 0.130587, 0.124094, 0.0159742, 0.849448, 1.19435, 1.02497, 8.14606e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {979393, 479807, 0, 0, 0, 0, 6.09588, 6.17619, 6.59461, 17.8681, 0.0214328, 0.00474331, 0.0185975, 0.676442, 1.42917, 1.17445, 1.63534e-12}}}, +{{'E', "ugemm", {"H", "H", 
"S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab3 ks32 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {986809, 437370, 0, 0, 0, 0, 5.68695, 6.48375, 6.53936, 16.9925, 0.0282225, 0.0226525, 0.0239336, 0.736827, 1.37762, 1.17817, 1.10401e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {585301, 1.13594e+06, 0, 0, 0, 0, 6.24472, 6.60826, 6.57509, 18.329, 0.0236345, 0.0236345, 0, 0.985563, 1.32741, 1.18438, 6.65402e-13}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 4x8 cab3 ks32 xaf dw vav grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {982154, 437724, 0, 0, 0, 0, 6.2565, 6.37164, 6.49676, 16.9924, 0.0283206, 0.0139661, 0.0208344, 0.966273, 1.36049, 1.17681, 1.17759e-12}}}, +{{'E', "ugemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@ipq"}, "aB8/4 aS8 rB wg 4x8 kc8 ca4 ks8 nse di sm dm l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', 
"ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 rB wg 8x4 cab4 ks32 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.0836e+06, 535346, 0, 0, 0, 0, 5.23021, 5.23405, 6.51087, 16.4718, 0.0217916, 0.0114948, 0.0166387, 0.885878, 1.48291, 1.16519, 1.92343e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB32 rB wg 8x4 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.06855e+06, 487726, 0, 0, 0, 0, 4.61972, 4.65428, 5.62471, 14.4914, 0.0515614, 0.0277546, 0.0342646, 0.995874, 1.21142, 1.20166, 4.60295e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 8x8 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {577238, 1.06536e+06, 0, 0, 0, 0, 6.1297, 5.50253, 3.15556, 13.6879, 0.0845357, 0.0845357, 0, 1, 1.21247, 1.20135, 6.92574e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB64 rB wg 8x4 cab4 ks64 af dw vav di sm sn sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, 
{true, true, false}}, {'E', 17, {1.10783e+06, 465033, 0, 0, 0, 0, 4.65887, 5.56569, 3.95764, 12.1666, 0.0862301, 0.0694934, 0.0295468, 1, 1.21388, 1.20186, 3.63753e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 2x8 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.33132e+06, 224343, 0, 0, 0, 0, 4.74023, 4.70375, 3.6736, 10.5877, 0.105009, 0.0387103, 0.0712581, 0.996133, 1.21448, 1.20153, 2.71117e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 rB wg 8x4 cab4 ks32 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08886e+06, 501207, 0, 0, 0, 0, 5.25936, 5.63221, 6.18774, 15.4045, 0.0340166, 0.0129824, 0.0294272, 1, 1.34352, 1.18285, 7.68453e-13}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 4x4 cab3 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.27453e+06, 207401, 0, 0, 0, 0, 5.20901, 6.85109, 3.83997, 10.8185, 0.114739, 0.0329578, 0.0902767, 0.944924, 1.21429, 1.202, -4.89835e-16}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 rB wg 4x4 cab3 ks64 af vav di sm 
grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.36326e+06, 242779, 0, 0, 0, 0, 5.88843, 23.0453, 1.43253, 1.41056, 0.411444, 0.0655792, 0.343105, 0.89732, 1.20931, 0, 0}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "@Inpqxy"}, "aB16 aB16 rB wg 8x4 cab4 ks16 xaf dw vav sm dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.117e+06, 621749, 0, 0, 0, 0, 5.63531, 5.38748, 6.34314, 18.0359, 0.0187448, 0.0158313, 0.0117899, 0.815598, 1.50444, 1.15596, 2.96809e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 4, 1}, "xyI"}, "aS16x2 aB16 rB wg 16x2 cb4x2 ks16 xaf dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {16, 4, 4}, {true, true, false}}, {'E', 17, {1.08445e+06, 546785, 0, 0, 0, 0, 5.33037, 5.24024, 6.46841, 17.5866, 0.0186739, 0.00499822, 0.0174057, 0.790315, 1.46202, 1.17542, 3.02139e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "aB32 aB16 rB wg 8x4 cab3 ks32 xaf dw vav sm dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05615e+06, 527155, 0, 0, 0, 0, 5.24242, 5.61507, 6.74453, 17.432, 0.0191776, 0.0115211, 0.0120279, 1, 1.50506, 
1.17144, 2.14593e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "aB16 aB16 rB wg 8x8 cab4 ks16 xaf dw vav di sm sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {599248, 1.33458e+06, 0, 0, 0, 0, 5.52852, 5.45748, 6.54024, 18.2766, 0.0211426, 0.0211426, 0, 1, 1.40808, 1.175, 1.4855e-12}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4 aB4 rB wg 4x8 kc4 cab4 ks8 nse di sm sn l4 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aS16 aB16 rB wg 16x2 cb4 ks32 af dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05478e+06, 575218, 0, 0, 0, 0, 5.74371, 5.08922, 6.19581, 17.2554, 0.0334411, 0.0196105, 0.0113473, 0.677275, 1.20569, 1.20089, 8.36593e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aS16x2 aB16 rB wg 16x2 cb4 ks32 af dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04765e+06, 516617, 0, 0, 0, 0, 5.80456, 6.92892, 6.12461, 16.5118, 0.0555797, 
0.0380168, 0.0166268, 0.662109, 1.20624, 1.20065, 1.4513e-14}}}, +{{'E', "ugemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@ipq"}, "aB8 aB8/4 rB wg 4x8 kc8 cab4 ks8 nse di sm l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 rB wg 8x4 cab4 ks16 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.30894e+06, 785550, 0, 0, 0, 0, 7.11935, 8.75656, 6.13129, 16.041, 0.0504259, 0.0420693, 0.0674256, 0.73758, 1.20696, 1.20187, 1.15297e-15}}}, +{{'E', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iz"}, "aB4x2 aB4x2 rB wg 8x4 kc4 cab4 ks8 nse di sm l4 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16x2 rB wg 8x4 nse di sb32 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 8x8 cab4 ks32 af dw vav di 
sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {552020, 1.12529e+06, 0, 0, 0, 0, 5.16852, 2.80811, 6.1605, 15.5799, 0.0215997, 0.0215997, 0, 1, 1.2111, 1.20564, -3.21736e-14}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB32 aB64 rB wg 4x16 cab4 ks64 af dw vav di sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 4, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {545420, 1.0938e+06, 0, 0, 0, 0, 3.27178, 5.27297, 4.83132, 13.78, 0.0374045, 0.0374045, 0, 1, 1.21335, 1.20085, 1.04884e-14}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB128 aB32 rB wg 4x16 cab4 ks128 af dw vav di sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 128}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {558963, 1.08579e+06, 0, 0, 0, 0, 4.41143, 3.44507, 2.93976, 12.2012, 0.0459434, 0.0459434, 0, 1, 1.2227, 1.20171, 7.0997e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB128 aB32 rB wg 2x16 cab4 ks128 af dw vav di sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06759e+06, 469635, 0, 0, 0, 0, 4.35669, 3.29013, 2.80884, 10.6334, 0.0840327, -0.117777, 0.237041, 0.791787, 1.22717, 1.20241, 4.66387e-16}}}, +{{'E', "ugemm", {"O", 
"O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB64 aB32 rB wg 2x16 cab4 ks128 af dw vav di sn grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06794e+06, 471134, 0, 0, 0, 0, 4.47378, 3.21765, 2.79237, 10.5871, 0.0837144, -0.00244281, 0.11683, 0.989653, 1.22114, 1.20202, 2.17019e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 2x8 cab4 ks32 af dw vav di sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.30659e+06, 218350, 0, 0, 0, 0, 3.0037, 5.69885, 3.5694, 10.4219, 0.0863226, -0.00310343, 0.129888, 0.997265, 1.21109, 1.20141, 4.79304e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB8 rB wg 8x4 cab4 ks16 nse di sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.1687e+06, 680034, 0, 0, 0, 0, 3.04718, 3.55084, 6.45873, 17.7424, 0.0351353, 0.0322876, 0.00620525, 0.874922, 1.23355, 1.02944, 3.75965e-12}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 64, 1}, "@Inopqxy"}, "aB32 aB32x2 rB wg 2x16 ca3 ks64 xaf dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 
24576, 0, {64, 64, 4}, {false, false, true}}, {'E', 17, {1.00154e+06, 513574, 0, 0, 0, 0, 2.73866, 2.65797, 6.51583, 18.2287, 0.00996636, 0.00691341, 0.00686787, 0.986643, 1.39382, 1.18598, 1.13183e-12}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "@Inopqxy"}, "aB32 aB32 rB wg 2x16 ca4 ks128 xaf dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {991698, 514197, 0, 0, 0, 0, 2.9311, 2.62991, 6.73613, 18.2732, 0.00996362, -0.00212772, 0.0186261, 0.740865, 1.40306, 1.17934, 9.21266e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Inopxy"}, "aB32 aB32 rB wg 4x8 cab3 ks64 xaf dw vav dm grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inopxy"}, "aB32 aB64x2 rB wg 4x8 ca4 ks128/64 xaf st dw vav dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.00948e+06, 476755, 0, 0, 0, 0, 3.40086, 2.5695, 6.47179, 16.9, 0.0122444, 0.00343553, 0.0130855, 0.923057, 1.31761, 1.18319, 6.35596e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {1, 1, 1}, "I"}, "aB32x2 aB32x2 rB wg 8x8 cab4 ks32 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {563627, 1.13673e+06, 0, 0, 0, 0, 4.3956, 9.25108, 5.16051, 13.8631, 0.0539423, 0.0539423, 0, 1, 1.21113, 1.20077, 9.36221e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 8x4 cab4 ks32 af dw vav di sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.34117e+06, 329996, 0, 0, 0, 0, 4.49293, 4.23853, 6.05585, 15.6037, 0.057776, 0.0380034, 0.0257597, 1, 1.20975, 1.20161, 4.0607e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 8x8 cab4 ks32 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 12288, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {549881, 1.11668e+06, 0, 0, 0, 0, 7.90656, 17.6481, 3.58236, 14.012, 0.170192, 0.170192, 0, 1, 1.21497, 1.20301, -7.65714e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 rB wg 4x8 cab4 ks32 af dw vav di grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.09363e+06, 526547, 0, 0, 0, 0, 8.12314, 11.9414, 3.64859, 10.586, 0.206556, 0.126187, 0.252045, 1, 
1.21209, 1.20195, 3.97516e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 rB wg 4x4 cab4 ks16 nse di grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.325e+06, 271215, 0, 0, 0, 0, 3.1067, 4.51079, 6.43656, 17.4888, 0.0350908, 0.0307137, 0.00851217, 0.777143, 1.26441, 1.04115, 3.50353e-12}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@Inxy"}, "aB32 aB32 rB wg 8x4 cab4 ks32 xaf st dw vav grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.01391e+06, 509797, 0, 0, 0, 0, 3.18295, 3.81981, 6.57362, 18.2308, 0.0132914, 0.00669773, 0.00903364, 0.924525, 1.25888, 1.19572, 2.0617e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "aB32 aB32 rB wg 4x8 cab4 ks64 xaf dw vav grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.03222e+06, 489032, 0, 0, 0, 0, 3.6293, 4.30606, 6.4398, 16.8446, 0.0180904, 0.00975268, 0.0127029, 0.887231, 1.21983, 1.20127, -3.55483e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB32 rB wg 8x4 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 64}, {8, 
4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.08485e+06, 534090, 0, 0, 0, 0, 2.26038, 2.28822, 6.54241, 16.5807, 0.0108941, -0.00616224, 0.023865, 0.743702, 1.35093, 1.1815, 6.50109e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB64 rB wg 8x8 cab4 ks64 af dw vav di sm sn grf256 sys l4 dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {546628, 1.16417e+06, 0, 0, 0, 0, 2.69496, 2.82552, 5.90182, 15.4398, 0.0165818, 0.0165818, 0, 1, 1.23827, 1.20187, -6.19745e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB128 aB64 rB wg 8x4 cab4 ks128 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.18851e+06, 476513, 0, 0, 0, 0, 2.34744, 2.36664, 5.65933, 14.5041, 0.0251992, 0.0179545, 0.0233066, 0.991699, 1.2182, 1.2025, -6.17732e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB64 rB wg 8x8 cab4 ks128 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 128}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {560263, 1.07097e+06, 0, 0, 0, 0, 3.06127, 2.86759, 3.07389, 13.5153, 0.0419574, 0.0419574, 0, 1, 1.22194, 1.20059, 1.54207e-14}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, 
"aB64 aB128 rB wg 8x4 cab4 ks128 af dw vav di sm sn sys l4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {8, 8, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.09557e+06, 462553, 0, 0, 0, 0, 2.17329, 2.8532, 3.94421, 12.162, 0.0422784, 0.00389671, 0.0410451, 0.844249, 1.21956, 1.20074, 1.36525e-14}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB64 rB wg 2x8 cab4 ks128 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 128}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.32689e+06, 222623, 0, 0, 0, 0, 2.31002, 2.30138, 3.66802, 10.5821, 0.0515592, -0.0216913, 0.0738923, 0.87394, 1.219, 1.20127, 6.74603e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB32 rB wg 8x4 cab4 ks64 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.07668e+06, 500480, 0, 0, 0, 0, 2.2985, 2.85765, 6.19196, 15.4234, 0.0169503, 0.00108194, 0.0249639, 0.993166, 1.24039, 1.19939, 4.69831e-14}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB64 rB wg 4x4 cab3 ks128 af dw vav di sm sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.26622e+06, 205878, 0, 0, 0, 0, 2.41748, 3.44852, 3.85081, 10.8513, 
0.0565124, 0.0196373, 0.0513865, 0.975821, 1.22132, 1.20149, 6.09696e-15}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "opI"}, "aB64 aB32 rB wg 4x4 cab3 ks128 af vav di sm grf256 sys dm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 1, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.35303e+06, 235808, 0, 0, 0, 0, 3.04897, 11.8664, 1.44909, 1.44909, 0.203641, 0.052135, 0.168812, 0.965466, 1.22155, 0, 0}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2 aB8x2 rB wg 8x8 cab4 ks16 nse di sm sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {582554, 1.48982e+06, 0, 0, 0, 0, 3.76758, 3.77334, 6.62494, 17.6039, 0.0363451, 0.0363451, 0, 1, 1.20147, 1.02529, 3.01473e-12}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 4, 1}, "xyI"}, "aS32x2 aB32 rB wg 16x2 cb4x2 ks32 xaf dw vav sn dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {64, 4, 4}, {true, true, false}}, {'E', 17, {1.06789e+06, 533234, 0, 0, 0, 0, 2.49913, 2.42261, 6.47452, 17.5868, 0.00865422, 0.00164569, 0.0106404, 0.852414, 1.37422, 1.18475, 8.41776e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Iopxy"}, "aB64 aB32 rB wg 8x4 cab3 ks64 xaf dw vav sm dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 
8192, 2048}, {16, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.0553e+06, 523707, 0, 0, 0, 0, 2.27443, 2.47386, 6.75043, 17.4507, 0.00973723, 0.0173445, 0.00450943, 0.936605, 1.39613, 1.18223, 6.42735e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "ab32 ab32 rb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 4096, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.01946e+06, 85192.9, 0, 0, 0, 0, 3.69873, 4.09617, 6.42674, 17.041, 0.0424222, 0.0270009, 0.0195008, 0.698122, 1.40723, 1.13886, 7.68905e-13}}}, +{{'E', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB16 rB wg 4x8 cab4 ks16 nse di sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.31114e+06, 828815, 0, 0, 0, 0, 4.12868, 4.53677, 6.35113, 17.6714, 0.04015, 0.0278237, 0.024414, 0.810338, 1.20421, 1.02447, 3.99205e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 4x8 kc4 nse di sb64 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 4}, {4, 8, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 8x4 kc4 cab4x2 ks8 nse di sn dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, 
{8192, 8192, 1536}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.3546e+06, 315336, 0, 0, 0, 0, 19.529, 12.4603, 5.98766, 15.7596, 0.125914, 0.121288, 0.0101254, 0.873111, 1.32691, 1.13232, 2.47813e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 rB wg 8x4 kc8 cab4x2 ks8 nse di sn", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.31235e+06, 288775, 0, 0, 0, 0, 18.7973, 21.3706, 5.54761, 14.4861, 0.147428, 0.107901, 0.033153, 1, 1.33996, 1.1511, 2.16816e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 4x8 kc4 cab4x2 ks8 nse di sn", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.33992e+06, 296845, 0, 0, 0, 0, 20.3993, 13.23, 5.72152, 14.4818, 0.143213, 0.116468, 0.028474, 0.9246, 1.33587, 1.1524, 2.2794e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2 aB8x2 rB wg 4x4 kc8 cab4x2 ks8 nse di sn", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.42251e+06, 151979, 0, 0, 0, 0, 18.9169, 13.1915, 5.54022, 14.3571, 0.153986, 0.0983515, 0.0423043, 0.945168, 1.35424, 1.15241, 2.28656e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 2x8 kc4 
cab4x2 ks8 nse di sn", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.3331e+06, 140159, 0, 0, 0, 0, 19.7793, 12.1262, 4.33379, 12.2776, 0.235893, 0.114077, 0.0969446, 0.891475, 1.27279, 1.13859, 7.22824e-13}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8x2 rB wg 4x1 kc4 nse di sb64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.16919e+06, 27880.4, 0, 0, 0, 0, 11.5008, 22.3297, 0.230151, 0.23026, 1.23566, 0.719548, 0.513589, 0.5, 1.11024, 0, 0}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aB4x2 aB4x2 rB wg 8x4 kc4 cb4 ks8 nse di sn grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08605e+06, 604168, 0, 0, 0, 0, 29.905, 14.9021, 6.30949, 17.3852, 0.127384, 0.123773, 0.010991, 0.905338, 1.1857, 1.02549, 5.95169e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4/2x2 aB4x2 rB wg 8x4 kc4 nse di sb64 grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 4}, {8, 4, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {989969, 519612, 0, 0, 0, 0, 16.7886, 17.9252, 6.45578, 17.4626, 0.12653, 0.124499, 0.0102408, 0.914174, 1.18982, 1.01788, 6.88478e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 8x4 kc4 cab4x2 ks8 vav di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.14372e+06, 268176, 0, 0, 0, 0, 19.0558, 21.4548, 6.11653, 15.9061, 0.125914, 0.1165, 0.0129836, 0.766057, 1.30565, 1.10714, 3.63242e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 rB wg 8x4 kc8 cab4x2 ks8 nse di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.14681e+06, 252797, 0, 0, 0, 0, 18.7556, 22.0632, 5.55765, 14.7217, 0.129576, 0.0903504, 0.0430783, 0.608364, 1.37895, 1.18499, 2.72061e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 4x8 kc4 cab4x2 ks8 nse di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.1555e+06, 250247, 0, 0, 0, 0, 20.2189, 17.5095, 5.50136, 14.7401, 0.130247, 0.121949, 0.0272943, 0.822585, 1.38372, 1.17114, 2.4229e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 8x4 kc4 cab4x2 ks8 nse di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {8, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.14264e+06, 242222, 0, 0, 0, 0, 11.8247, 19.2893, 4.69171, 13.5115, 0.206087, 0.141717, 
0.0799846, 1, 1.23922, 1.11159, 2.37456e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 4x4 kc4 cab4x2 ks8 nse di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {8, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.16938e+06, 118796, 0, 0, 0, 0, 12.0084, 17.0436, 4.73424, 13.6962, 0.223144, 0.129865, 0.0961405, 0.885772, 1.23839, 1.03623, 1.12784e-11}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 rP wg 4x8 kc4 ca4 ks8 nse di sm grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.12828, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 rB wg 8x4 kc8 cab4x2 ks8 nse di sm sn dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.3433e+06, 325408, 0, 0, 0, 0, 13.5129, 12.9096, 5.98721, 15.7372, 0.12661, 0.0960571, 0.0304219, 0.583089, 1.34377, 1.11974, 3.39207e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 rB wg 4x8 kc8 cab4x2 ks8 nse di sm sn dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 
4, 4}, {true, true, false}}, {'E', 17, {1.33091e+06, 294791, 0, 0, 0, 0, 12.113, 12.1053, 5.76463, 14.438, 0.174666, 0.153877, 0.0284394, 1, 1.21846, 1.09172, 2.58579e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 rB wg 2x8 kc4 cab4x2 ks8 nse di sm sn dm", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.37172e+06, 143581, 0, 0, 0, 0, 12.0273, 11.939, 4.30531, 12.11, 0.300404, 0.190268, 0.0847494, 0.961414, 1.14371, 1.0774, 1.20649e-12}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8 aB8x2 rB wg 4x1 kc8 nse di kd", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18688e+06, 30123, 0, 0, 0, 0, 11.8183, 1.08352, -3.02547, -3.02137, 3.45176, 1.04334, 0.610591, 0, 1.00207, 0, 0}}}, +{{'E', "ugemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aS4x2 aB8/4x2 rB wg 8x4 kc4 cb4 ks8 nse di grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {981989, 555240, 0, 0, 0, 0, 15.2859, 19.3328, 6.46524, 20.9306, 0.131163, 0.116872, 0.0205996, 0.805001, 1.19768, 1.00218, 8.63883e-12}}}, +{{'E', "ugemm", {"Z", "Z", "Z"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB2x2 aB2x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {8, 8, 
2}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {128, 128, 16}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB2x2 aS2x2 rB xb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {8, 8, 2}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB2x2 aB2x2 rB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {8, 8, 2}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS2x2 aS2x2 rB xb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {8, 8, 2}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'W', 1, {64}}}, +{{'E', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS2x2 aB2x2 rS xa", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {8, 8, 2}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'W', 1, {64}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 rB vav di sys grf256 af wg 4x8 sb256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {2048}}}, +{{'F', "ugemm", {"B", "B", "S"}, 
{"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 rB vav sb256 wg 4x8 di sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 rB wg 8x4 cb4x2 ks32 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {997384, 523666, 0, 0, 0, 0, 0.725909, 1.44913, 0.969352, 1.7371, 0.00684326, 0.00684326, 0, 0.948745, 1.28965, 1.02213, 2.98663e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 rB wg 4x8 af vav di sb256 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {884175, 432893, 0, 0, 0, 0, 0.691705, 0.930636, 0.67952, 1.28623, 0.00834359, 0.00834359, 0, 0.829417, 1.2862, 0.998202, 1.83773e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 am32+m16@32 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {886608, 390320, 0, 0, 0, 0, 0.857697, 1.69524, 0.882366, 1.49272, 0.0152973, 0.0152973, 0, 0.87518, 1.13598, 
0.985664, 7.22943e-13}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {892820, 387589, 0, 0, 0, 0, 1.37414, 1.37091, 1.57321, 2.49804, 0.0230095, 0.0230095, 0, 0.798198, 1.0699, 0.943911, 8.62443e-13}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 rB wg 4x8 af vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {889098, 373289, 0, 0, 0, 0, 1.28342, 2.80624, 2.15509, 3.83409, 0.037508, 0.037508, 0, 0.905233, 1.05006, -0.205455, 1.1953e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 rB wg 4x8 cab3x2 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.50672e+06, 832915, 0, 0, 0, 0, 0.987646, 1.02217, 0.93956, 1.85826, 0.00865513, 0.00865513, 0, 0.919371, 1.38478, 0.970416, 3.20276e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 rB wg 4x8 cab4 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 
1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 rB wg 2x16 cab3x2 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 rB wg 8x2 af vav di sb256 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, false}}, {'E', 17, {722541, 469395, 0, 0, 0, 0, 0.464722, 19.0441, 0.942118, 2.23237, 0.0530335, 0.0530335, 0, 0.0814462, 1.17896, -0.305624, 2.12632e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@72 am32+m32@64 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {901925, 700635, 0, 0, 9.16685e+06, 1.23699e+07, 0.721771, 0.719501, 0.918422, 1.55461, 0.00404125, 0.00404125, 0, 0.990031, 1.64922, 1.16161, 1.79221e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 
8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 rB wg 4x8 xaf st vav di sb32 sn grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {887798, 736266, 0, 0, 8.25754e+06, 1.06742e+07, 0.730652, 0.777015, 0.882231, 1.50445, 0.00406892, 0.00406892, 0, 0.972567, 1.60737, 1.13182, 3.10708e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m16@64 am32+m32@72 rB wg 8x4 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {904313, 557140, 0, 0, 6.41761e+06, 7.86432e+06, 0.630762, 0.876288, 0.890116, 1.496, 0.00449959, 0.00449959, 0, 0.908549, 1.99433, 1.14102, 2.53933e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 rB wg 8x2x2 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.08493e+06, 292142, 0, 0, 7.6546e+06, 9.12589e+06, 0.650233, 1.15949, 0.898431, 1.61747, 0.00551188, 0.000223805, 0.00523812, 0.479798, 1.50717, 1.16902, 1.70322e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 rB wg 4x4 cb3 ks64 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, 
true, false}}, {'E', 17, {1.39811e+06, -511740, -162098, 776498, 3.66838e+06, 3.61759e+06, 0.615352, 0.834481, 0.947487, 1.59755, 0.00699936, 0.00120041, 0.00602363, 0.481222, 1.38965, 1.12497, 1.73138e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 rB wg 4x4 cb3x2 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.20728e+06, 220586, 0, 0, 3.33332e+06, 3.75194e+06, 0.652532, 1.23869, 0.91158, 1.56406, 0.0117214, 0.00109919, 0.0112405, 0.60464, 1.3107, 0.968174, 2.71041e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 rB wg 2x4 ca3x2 ks16 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 6144, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.32904e+06, -190080, -59958.5, 292338, 3.36691e+06, 2.61734e+06, 0.669227, 0.829673, 0.945076, 1.6481, 0.0148001, 0.00579724, 0.00959387, 0.821019, 1.23241, -0.89576, 6.77612e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 rB wg 2x4 af vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.14845e+06, -147110, -26044.3, 242040, 2.51494e+06, 1.99066e+06, 0.589278, 0.947466, 0.896827, 1.473, 0.0220317, 0.0143856, 0.00994993, 0.785111, 1.09932, 0.536897, 2.83611e-11}}}, +{{'F', "ugemm", 
{"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 rB wg 2x4 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.14345e+06, -137745, -33413, 227017, 2.82624e+06, 1.87433e+06, 0.82029, 0.877149, 1.66039, 2.54748, 0.0298959, 0.00933916, 0.0216032, 0.865203, 1.23765, 0.941145, 3.68181e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 am32 rB wg 2x8 af vav di nmk sb64 sys", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.04885e+06, 107041, 0, 0, 2.47316e+06, 3.29318e+06, 0.654999, 3.81135, 0.843479, 1.3591, 0.038736, 0.023268, 0.0232375, 0.751197, 1.17223, 0.496182, 1.38208e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 xaf vav di sn dm grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04424e+06, 973248, 0, 0, 7.84302e+06, 1.21242e+07, 0.953302, 1.18133, 1.01052, 1.70291, 0.00479485, 0.00479485, 0, 0.800073, 1.59939, 1.10436, 6.87712e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 rB wg 4x8 cab4 ks32 xaf st vav di sn dm grf256 sys rr l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 
2048}, {8192, 8192, 2048}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.05252e+06, 860723, 0, 0, 4.41549e+06, 6.58637e+06, 0.871936, 1.44298, 0.913188, 1.80988, 0.00617766, 0.00617766, 0, 0.994839, 1.55547, 1.04563, 3.62354e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 rB wg 4x4 cab3 ks32 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.81493e+06, -570976, -358468, 965905, 3.85024e+06, 4.4073e+06, 0.888133, 0.947947, 0.979375, 1.62192, 0.00765437, 0.00116269, 0.00661508, 0.746058, 1.42628, 1.00722, 4.56276e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 rB wg 4x4 cab4x2 ks32 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.73798e+06, -437907, -318331, 822139, 2.89178e+06, 3.01466e+06, 0.809057, 1.16853, 0.915011, 1.60895, 0.0117213, 0.00153425, 0.0104969, 0.446714, 1.33828, 0.977686, 2.82538e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 rB wg 2x8 cab4x2 ks32 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.6828e+06, -385192, -295917, 726406, 2.7435e+06, 2.62144e+06, 0.822119, 0.992845, 0.985335, 1.93178, 
0.0162175, 0.0015617, 0.0148216, 0.580448, 1.2832, 0.963286, 2.7298e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 rB wg 1x4 cab3 ks32 xaf vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.64155e+06, -103333, -66679.8, 190802, 3.23584e+06, 2.12992e+06, 0.833578, 0.885824, 0.842945, 1.56023, 0.0285113, 0.00669822, 0.0219822, 0.893832, 1.23518, -0.453662, 6.29633e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#AI"}, "aB16+m32@32 aB32 rB wg 2x8 af vav di nmk ca3 sys dm l4", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 2, 4}, {true, true, false}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AIp"}, "aB16+m32@32 aB64 rB wg 1x4 af vav di sb64 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.19025e+06, -94418.5, -16051.4, 154897, 2.84262e+06, 1.62529e+06, 1.82485, 0.600489, 0.594593, 1.14879, 0.0290928, 0.0261561, 0.0155745, 1, 1.18676, 0.246189, 2.30089e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32 aB32 rB wg 8x4 cab3 ks32 af vav di sn dm 
grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 86016, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.07095e+06, 921756, 0, 0, 5.66559e+06, 9.26515e+06, 0.874939, 1.19488, 1.04455, 1.6478, 0.00468774, 0.00468774, 0, 0.993164, 1.66254, 1.1552, 3.26236e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 rB wg 8x2 af vav di sb256 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {879752, 350276, 0, 0, 0, 0, 0.455042, 18.3558, 0.734906, 2.00265, 0.0538244, 0.0538244, 0, 0.0478715, 1.11789, 0.996902, 3.19089e-13}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 rB wg 4x8 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {884302, 427442, 0, 0, 0, 0, 1.10721, 1.21304, 0.68084, 1.29522, 0.00895717, 0.00895717, 0, 0.917109, 1.35302, 0.98768, 3.54927e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 rB wg 4x8 xaf vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {882279, 402227, 0, 0, 0, 0, 1.096, 2.25336, 0.87212, 1.48589, 0.0146719, 0.0146719, 0, 0.967546, 1.27381, 0.958332, 
1.758e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16+m32@48 at32+m32@48 rB wg 8x4 cab4 ks32 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.07446e+06, 778165, 0, 0, 0, 0, 1.04453, 2.06434, 0.993405, 1.67185, 0.00809777, 0.00809777, 0, 0.966922, 1.35225, 0.983739, 2.64748e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32 at32+m32@48 rB wg 4x8 cab4 ks64 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08109e+06, 676708, 0, 0, 0, 0, 1.46721, 1.49766, 0.911352, 1.83399, 0.0123018, 0.0123018, 0, 0.930092, 1.22199, 0.994137, 1.11339e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 rB wg 4x8 cab4x2 ks64 xaf st vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08079e+06, 618844, 0, 0, 0, 0, 1.25013, 3.49017, 1.047, 2.28189, 0.0251551, 0.0251551, 0, 0.955754, 1.00322, 1.00053, 1.67181e-14}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 rB wg 4x16 cab4 ks64 af vav di sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 
2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.0392e+06, 894929, 0, 0, 0, 0, 1.62399, 5.20002, 2.09792, 4.09594, 0.063358, 0.063358, 0, 0.952898, 1.11995, 0.921533, 1.32456e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 rB wg 4x8 cb4x2 ks32 xaf vav di grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.03592e+06, 704672, 0, 0, 7.32365e+06, 1.08544e+07, 0.894904, 1.09998, 0.983005, 1.70679, 0.00421397, 0.00421397, 0, 1, 1.62696, 1.15394, 2.31737e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 rB wg 8x4 cb4 ks32 xaf st vav di grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.00518e+06, 647557, 0, 0, 5.6361e+06, 8.82278e+06, 0.824036, 1.52239, 1.05594, 1.7661, 0.00544276, 0.00544276, 0, 0.821492, 1.58157, 1.1307, 1.25438e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 rB wg 8x4 cb3 ks64 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.01546e+06, 551460, 0, 0, 5.50994e+06, 5.39853e+06, 0.786066, 1.60403, 1.03564, 1.72987, 0.00627267, 0.00627267, 0, 0.945912, 1.4299, 1.12176, 2.119e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 rB wg 4x4 cb3 ks32 xaf vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.34327e+06, -452800, -134815, 707026, 3.90758e+06, 4.02227e+06, 0.83768, 0.940982, 0.959085, 1.58238, 0.00734203, 0.000556818, 0.00668236, 0.749254, 1.38444, 1.02403, 3.33022e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 rB wg 4x4 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {970705, -384977, 57542, 601914, 2.51904e+06, 2.77955e+06, 0.70559, 1.27931, 0.712808, 1.42004, 0.0108625, 0.000797412, 0.0106601, 0.747036, 1.45242, 0.89344, 7.5701e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 rB wg 4x2 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.108e+06, -159098, -1506.45, 263377, 3.35462e+06, 2.46088e+06, 0.968705, 0.943372, 0.747695, 1.27763, 0.0163497, 0.00954244, 0.00740148, 0.567584, 1.32154, 0.910281, 9.36606e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 rB wg 4x2 xaf vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 
2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.13295e+06, -131068, -24845.2, 227188, 2.85082e+06, 2.12828e+06, 0.759763, 1.40657, 0.884702, 1.49164, 0.0269937, 0.0148542, 0.0126213, 0.40094, 1.20735, 0.590554, 2.76112e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 rB wg 4x2 af vav di nmk sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 8, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {774823, -159604, 298303, 251381, 2.48054e+06, 3.06135e+06, 0.49199, 4.0514, 0.750455, 1.55245, 0.0179351, 0.0189282, 0.0113946, 1, 1.20887, 0.775438, 7.38219e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 rB wg 4x8 cab3 ks32 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04733e+06, 850139, 0, 0, 7.14342e+06, 1.02482e+07, 1.35439, 1.1268, 0.968002, 1.56332, 0.0049908, 0.0049908, 0, 1, 1.5967, 1.09547, 2.65254e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 rB wg 8x2 cab3 ks16 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 36864, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.62752e+06, -1.34654e+06, -263082, 1.70692e+06, 5.27729e+06, 7.17619e+06, 1.02404, 1.43681, 1.0099, 1.67277, 0.00669992, 
0.000934616, 0.00604576, 0.813201, 1.39102, 1.03493, 5.39508e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m32@64 at32+m16@64 rB wg 4x4 cab3 ks32 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.66893e+06, -477403, -298914, 842863, 4.01408e+06, 4.51379e+06, 1.32266, 0.980356, 0.957919, 1.60267, 0.00874929, 0.000443986, 0.00826981, 0.601212, 1.33967, 0.974327, 4.12771e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 2x4 cab4 ks16 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.53714e+06, -226698, -110555, 381984, 4.01408e+06, 3.08838e+06, 1.02014, 1.2588, 0.99644, 1.73352, 0.0120789, 0.00866919, 0.00421095, 0.619534, 1.33753, 0.955156, 5.93572e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 2x4 cab3x2 ks32 xaf vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.59894e+06, -165886, -123352, 328698, 3.3538e+06, 2.68698e+06, 1.29378, 0.935395, 0.914424, 1.63719, 0.0161479, 0.0115282, 0.00624899, 0.60152, 1.23882, 0.978417, 2.1418e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 rB wg 2x2 cab4x2 ks16 xaf vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.46165e+06, -119762, -43325.5, 189248, 3.71917e+06, 2.31834e+06, 0.816729, 1.39844, 0.648548, 1.38561, 0.0254865, 0.0233101, 0.00872739, 0.939824, 1.2192, -0.0765997, 2.67484e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 rB wg 1x2 cab3x2 ks16 af vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {1, 2, 1}, 1, (WGType) 1, 257, 6144, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#AI"}, "aB32+m16@48 aS16 rB wg 16x1 cb4x2 ks16 xaf vav di nmk grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {16, 1, 1}, 1, (WGType) 1, 257, 2048, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.4553e+06, -444405, -228349, 795645, 2.85082e+06, 2.41664e+06, 0.798343, 8.5823, 0.957266, 1.78402, 0.0570677, 0.00285529, 0.0528963, 0.711108, 1.00269, 0.593861, 3.05276e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 1x16 xaf st vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 
16, 16}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.03388e+06, 205982, 0, 0, 2.69353e+06, 2.9311e+06, 5.87694, 0.62071, 0.59549, 1.16678, 0.0302158, 0.00216455, 0.029364, 0.590076, 1.35965, 0.82526, 6.19021e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 rB wg 2x2 xaf cs di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.12446e+06, -88323.8, -10316.2, 139507, 2.89997e+06, 1.7449e+06, 0.820255, 0.848685, 0.849377, 1.53444, 0.0324984, 0.0285972, 0.0106938, 0.747769, 1.17424, 0.248229, 3.53369e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 rB wg 16x2 cb3x2 ks32 af vav di sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.00078e+06, 522321, 0, 0, 0, 0, 0.682391, 1.20192, 1.01335, 1.54654, 0.0071952, 0.0071952, 0, 0.953151, 1.34794, 1.05729, 2.20536e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 rB wg 8x4 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {885956, 430908, 0, 0, 0, 0, 0.914642, 0.833252, 0.684654, 1.25859, 0.00868088, 0.00868088, 0, 0.977653, 1.23981, 0.98687, 1.9126e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 rB wg 8x4 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {894564, 394448, 0, 0, 0, 0, 0.740088, 1.32909, 0.850093, 1.46728, 0.0134182, 0.0134182, 0, 0.841095, 1.18393, 0.950667, 1.90558e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 rB wg 4x8 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {884081, 380764, 0, 0, 0, 0, 1.36831, 1.27261, 1.55496, 2.49777, 0.0278439, 0.0278439, 0, 0.839345, 1.01284, 0.740188, 3.17551e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@48 aB32 rB wg 8x4 cb4x2 ks32 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.00274e+06, 774511, 0, 0, 0, 0, 0.955103, 0.935315, 0.976805, 1.72673, 0.00884678, 0.00884678, 0, 1, 1.29366, 0.922321, 3.85241e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@64 aB32x2 rB wg 8x4 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 
2, 4}, {true, true, false}}, {'E', 17, {965920, 744570, 0, 0, 0, 0, 0.791602, 1.36895, 0.948542, 2.16931, 0.0146708, 0.0146708, 0, 0.923434, 1.19189, 0.953727, 1.73617e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@64 aB64 rB wg 4x8 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {994587, 706072, 0, 0, 0, 0, 1.16172, 1.12925, 1.50732, 3.11789, 0.0225454, 0.0225454, 0, 0.932581, 1.112, -1.70688, 2.19465e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 rB wg 4x1 af vav nmk sb256 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 rB wg 4x8 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {891672, 716622, 0, 0, 9.15866e+06, 1.23699e+07, 1.01186, 0.784937, 0.920647, 1.54228, 0.00412477, 0.00412477, 0, 1, 1.49446, 1.10147, 4.62125e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 
am32+m32@64 rB wg 8x4 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {878563, 640029, 0, 0, 7.94624e+06, 1.00516e+07, 0.792598, 0.747562, 0.882257, 1.4892, 0.00427013, 0.00427013, 0, 0.943254, 1.61821, 1.15931, 2.48177e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m32@32 am32+m64@48 rB wg 8x4 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {874741, 575243, 0, 0, 5.8327e+06, 8.40499e+06, 0.789453, 0.905132, 0.884603, 1.48669, 0.00505136, 0.00505136, 0, 0.980586, 1.4651, 1.12795, 2.59575e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 rB wg 8x2 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.1314e+06, -878271, -20025.1, 1.16609e+06, 4.7276e+06, 6.41434e+06, 0.491471, 0.826693, 0.899462, 1.61955, 0.00543978, 0.000623448, 0.00517972, 0.502855, 1.47936, 1.13884, 2.33468e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 rB wg 4x4 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, 
false}}, {'E', 17, {1.03784e+06, -450885, 26588.1, 694717, 3.98295e+06, 3.92397e+06, 0.81751, 0.78514, 0.845116, 1.57539, 0.00816675, 0.000557843, 0.00774396, 0.425496, 1.24902, 0.987854, 2.88769e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 rB wg 4x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.12707e+06, -214752, -7383.8, 333698, 3.94035e+06, 3.08838e+06, 0.484581, 0.643672, 0.911706, 1.72451, 0.00993042, 0.0077074, 0.00295794, 0.390742, 1.38294, 0.925243, 9.30327e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 rB wg 2x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.12909e+06, -145288, -3837.8, 206075, 4.36634e+06, 2.87539e+06, 0.472236, 0.412666, 0.84861, 1.70891, 0.0122108, 0.0128924, 0.00189719, 0.642098, 1.34608, 0.926981, 8.76031e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 rB wg 4x2 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, 
+{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 rB wg 2x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.15274e+06, -78649.5, -7526.38, 127813, 3.10477e+06, 2.06438e+06, 0.435838, 0.461228, 0.861245, 1.55509, 0.0239119, 0.0234452, 0.00542143, 0.84336, 1.27869, 0.990632, 3.60161e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 rB wg 8x4 af vav di nmk sb64 sm sys", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.03147e+06, 232150, 0, 0, 2.25853e+06, 2.39288e+06, 0.636799, 3.56799, 1.54923, 2.28375, 0.0371495, 0.00331608, 0.0414442, 0.906021, 1.00388, 1.00126, 2.00587e-14}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 rB wg 8x4 cb4x2 ks32 af vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05957e+06, 973070, 0, 0, 7.52845e+06, 1.03301e+07, 0.801208, 0.831268, 0.985204, 1.59017, 0.00445847, 0.00445847, 0, 0.989412, 1.66875, 1.16553, 2.2562e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 rB wg 16x2 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, 
{(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.07221e+06, 859869, 0, 0, 5.8327e+06, 7.83974e+06, 0.766638, 1.10644, 1.03215, 1.48351, 0.00498767, 0.00498767, 0, 0.940961, 1.6724, 1.16768, 2.0947e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@64 aB16x2 rB wg 16x2 cb3 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {997268, 779168, 0, 0, 4.36634e+06, 5.75078e+06, 0.740136, 1.4932, 1.0067, 1.5422, 0.00624117, 0.00624117, 0, 0.962734, 1.45555, 1.07753, 2.97255e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16x2+m32@32 aB16x2 rB wg 8x2 cb4 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.79219e+06, -531031, -361044, 940628, 4.07142e+06, 4.00589e+06, 0.824906, 0.816235, 1.02815, 1.50776, 0.00733025, 0.00114441, 0.00653515, 0.764373, 1.47826, 1.10061, 2.80374e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16+m32@16 aB32 rB wg 8x1 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {8, 1, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.73971e+06, -258868, -158599, 451327, 
3.80928e+06, 3.05316e+06, 0.482671, 0.706304, 1.03317, 1.54125, 0.00920456, 0.00608899, 0.00408001, 0.912767, 1.36748, 1.00539, 5.47259e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@32 aB16x2 rB wg 4x2 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.88588e+06, -200788, -193135, 423330, 3.44883e+06, 2.47398e+06, 0.572193, 0.527027, 0.933872, 1.59818, 0.0129759, 0.0095749, 0.00498263, 0.687385, 1.38427, 0.929972, 5.58992e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 rB wg 4x2 cb4x2 ks64 af vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.63784e+06, -161637, -133730, 320507, 2.49037e+06, 1.72196e+06, 0.553135, 0.77437, 0.93673, 1.73418, 0.0199906, 0.0144326, 0.00833266, 0.7979, 1.32142, 0.974789, 3.83166e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 rB wg 2x2 cb4x2 ks32 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at16x2+m64@16 aB32+m16@32 rB wg 4x2 af vav di nmk sb64 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.28894e+06, -131687, -76296.7, 251879, 2.71155e+06, 2.04882e+06, 0.531564, 1.43928, 1.43076, 2.58493, 0.0282574, 0.0178971, 0.0205543, 1, 1.23116, -0.0703731, 1.35882e-11}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 rB wg 1x4 af vav di sb64 sm dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.19242e+06, -90063.1, -14616.2, 144232, 2.71974e+06, 1.70476e+06, 1.08584, 0.412189, 0.608623, 1.21556, 0.0200475, 0.0220263, 0.0153697, 1, 1.02408, 0.835925, 1.49508e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 rS vav di sys grf256 af wg 8x4 sb256 sm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {705599, 1.02394e+06, 0, 0, 0, 0, 0.719208, 0.662079, 1.08787, 2.05052, 0.00435156, 0.00435156, 0, 0.998842, 1.73144, 1.10326, 2.79532e-12}}}, +{{'F', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 rB sys grf256 cab2 wg 4x4 l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 
32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.2896e+06, 327092, 0, 0, 0, 0, 1.64753, 1.69722, 1.01172, 1.48812, 0.0145767, 0.000763122, 0.0157325, 0.871871, 1.01157, 1.00431, 1.22757e-13}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B8@8 aB8+B8@8 rB nse di wg 4x8 sb256 kc8", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, false}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B8@16 aS8+S1,8@16 rB nse di wg 4x8 kc8 sn sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {929253, 1.19309e+06, 0, 0, 0, 0, 2.3554, 3.13252, 1.035, 2.46244, 0.250009, 0.250009, 0, 0.999916, 1.03944, 1.00017, 6.37469e-13}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tu"}, "aB4+B16@12 aS4x2+S8@8 rB wg 4x8 kc4 nse di sb256 sn xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 8, 4}, {4, 8, 1}, 1, (WGType) 1, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {894066, 580494, 0, 0, 0, 0, 3.14311, 5.3277, 1.15565, 2.28476, 0.248243, 0.248243, 0, 1, 1.10493, 0.235368, 3.16203e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B16@12 aS4x2+S8@8 rB wg 4x4 kc4 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 
512}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.10109e+06, 304248, 0, 0, 0, 0, 3.17806, 5.38312, 1.3854, 2.54779, 0.250663, -0.000217884, 0.252008, 0.451926, 1.19142, 0.993468, 1.24079e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+B4@8 aS4x2+S8@8 rB wg 4x4 kc4 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 4, 4}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.09279e+06, 244142, 0, 0, 0, 0, 3.18086, 8.29599, 2.95538, 4.75936, 0.253869, -0.000232083, 0.254291, 0.713957, 1.31884, 0.837955, 2.12666e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+B8@12 aS8x2+S8@8 rB wg 1x8 kc4 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {64, 2, 8}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.24179e+06, 145120, 0, 0, 0, 0, 2.13335, 5.46734, 1.48722, 2.6079, 0.251106, 0.186294, 0.093908, 0.681672, 1.31523, 0.995679, 2.21367e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+S4@12 aB4+S16@16 rB wg 8x1 kc4 nse di nmk sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 1, 4}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.08999e+06, 124066, 0, 0, 0, 0, 1.58825, 46.3568, 7.22137, 20.802, 0.935054, 0.122417, 0.898668, 0.995842, 1.2952, 0, 0}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, 
"aB4x2+B4@16 aS8x2+S1,16@16 rB wg 2x4 kc4 nse di sb256 sn xa xc grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.25004e+07, 320111, 0, 0, 0, 0, 3.45876, 4.66482, 1.5961, 3.76016, 0.191506, -0.000954248, 0.19046, 0.604334, 1.14555, -0.0681631, 1.66972e-10}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8/4x2+B8@28 aB8x2+B8@24 rB nse di wg 4x8 sb256 kc8 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {973762, 1.17991e+06, 0, 0, 0, 0, 2.58408, 2.9415, 1.07665, 2.43053, 0.249895, 0.249895, 0, 1, 1.03001, 1.00202, 4.70138e-13}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B16@12 aB8x2+S16@4 rB wg 4x8 kc8 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {885788, 754865, 0, 0, 0, 0, 3.05325, 5.35618, 0.91159, 1.96328, 0.25069, 0.25069, 0, 1, 1.07747, 0.350687, 2.47611e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+B8@8 aB4+S8@12 rB wg 4x8 kc4 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 4}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {908617, 569425, 0, 0, 0, 0, 4.77009, 4.96443, 1.49196, 2.61677, 0.251117, 0.251117, 0, 1, 1.14354, 0.314998, 3.6427e-11}}}, 
+{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+B16@12 aB4x2+S4@12 rB wg 4x8 kc4 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 4, 4}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {883654, 523936, 0, 0, 0, 0, 5.53142, 11.1213, 2.82141, 4.63939, 0.256629, 0.256629, 0, 0.98682, 1.24557, 0.849611, 1.42198e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B8@16 aB4+S16@16 rB wg 2x8 kc4 nse di sb256", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 2, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.3708e+06, 167653, 0, 0, 0, 0, 2.86309, 9.95196, 1.56844, 2.32918, 0.254056, 0.195284, 0.0898234, 0.777144, 1.37467, 0.990347, 3.84971e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aB4x2+B8@12 aB8x2+B8@12 rB wg 4x4 kc4 nse di sb256 xa xc grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.36961e+07, 574403, 0, 0, 0, 0, 0, 0, 1.53045, 3.68865, 0.191948, 0.00175386, 0.188172, 0.873487, 1.11353, 0.267016, 1.05692e-10}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS8+S1,8@32 aS16+S1,8@32 rB nse di wg 4x8 kc8 sm sn sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, 
{'E', 17, {1.01796e+06, 1.15558e+06, 0, 0, 0, 0, 3.09639, 2.6085, 1.09003, 2.43754, 0.249854, 0.249854, 0, 1, 1.05678, 0.642066, 1.26073e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS4+S1,4@12 aS8+S1,4@12 rB wg 8x2 kc4 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 16, 8}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.14882e+06, 374482, 0, 0, 0, 0, 2.00654, 5.84482, 0.885802, 2.41542, 0.250434, -0.000239763, 0.251234, 0.621532, 1.15673, -0.190682, 6.76316e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS8+S1,4@12 aS4x2+S16@8 rB wg 4x4 kc4 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.12151e+06, 284242, 0, 0, 0, 0, 3.59691, 4.35772, 1.561, 2.6786, 0.251058, -0.00166923, 0.253801, 0.494685, 1.217, -1.13625, 1.27709e-10}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS4x2+S1,4@8 aS16+S8@12 rB wg 2x4 kc4 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 4, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.22315e+06, 149008, 0, 0, 0, 0, 2.54737, 5.80197, 1.48089, 2.58246, 0.251002, 0.192367, 0.0893035, 0.796591, 1.32193, 0.961266, 2.9227e-11}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS8+S8@12 aB16+S8@12 rB wg 8x1 kc8 nse di nmk sb256 sm grf256", {16, 
(LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 1, 16}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.10684e+06, 113769, 0, 0, 0, 0, 1.76693, 57.3126, 7.23661, 20.821, 1.6365, 0.067267, 1.15215, 0.302842, 1.24431, 0, 0}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aS4x2+S1,4@12 aS8x2+S1,16@12 rB wg 2x4 kc4 nse di sb256 sm sn grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.2826e+07, 264862, 0, 0, 0, 0, 4.46192, 4.29688, 1.78043, 4.14814, 0.191247, -0.00134627, 0.190106, 0.606487, 1.18139, -0.301318, 2.24512e-10}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aS8+S1,8@16 aB8+B8@16 rS nse di wg 8x4 kc8 sm sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {923035, 1.19193e+06, 0, 0, 0, 0, 4.29102, 2.38069, 1.04423, 2.73374, 0.250135, 0.250135, 0, 1, 1.04024, 0.694575, 8.8587e-12}}}, +{{'F', "ugemm", {"C", "C", "C"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aS4x2+S1,8@12 aB8x2+B16@12 rB wg 4x2 kc4 nse di sb256 sm grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 8}, {4, 2, 1}, 1, (WGType) 1, 1, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.31848e+07, 315591, 0, 0, 0, 0, 0, 0, 1.59112, 4.17141, 0.19168, -0.00026114, 0.189622, 0.62094, 1.19794, 0.0237784, 1.76108e-10}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"A", "B", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+B8@16 rB nse di grf256 wg 4x8 kc8 sb256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, false}}, {'W', 1, {1024}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 aS8x2+S8@24 rB wg 4x8 kc8 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {915642, 488057, 0, 0, 0, 0, 2.62682, 4.67056, 1.01353, 1.76192, 0.0687398, 0.0687398, 0, 0.998364, 1.80644, 1.08579, 3.08664e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32+B16@32 aS16+S8@32 rB wg 4x8 kc16 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {887309, 574758, 0, 0, 0, 0, 4.77569, 4.82861, 0.536993, 1.65054, 0.0889844, 0.0889844, 0, 1, 1.65309, 1.06232, 1.19911e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am16+B8@16 am/S8x2+S16@8 rB wg 4x8 kc8 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {874030, 476421, 0, 0, 0, 0, 4.8312, 6.40891, 1.05734, 2.94303, 0.18704, 0.18704, 0, 0.861733, 1.30529, 0.905412, 1.30557e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am16+S16@32 am/S16x2+S32@16 rB wg 4x8 kc16 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {878381, 502381, 0, 0, 0, 0, 3.51216, 4.32155, 2.1606, 5.68106, 0.153809, 0.153809, 0, 0.992787, 1.53401, 1.00819, 5.99333e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am8+B8@32 am16+S32@32 rB wg 2x8 kc8 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 1, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.07599e+06, 227353, 0, 0, 0, 0, 2.32632, 10.6671, 1.95612, 9.18304, 0.317695, 0.0293572, 0.283039, 0.966881, 1.48893, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am8x2+S32@56 am32x2+S8@32 rB wg 2x8 kc8 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 1, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.07715e+06, 250361, 0, 0, 0, 0, 2.59315, 7.24109, 5.29152, 18.9828, 0.384253, 0.0758976, 0.344607, 0.971505, 1.50646, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS16+S1,8@24 rB wg 2x4 kc8 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.18501e+06, -243475, -23032.8, 379112, 3.1703e+06, 0, 
1.97161, 2.22356, 1.44309, 2.66453, 0.0699686, 0.0453516, 0.0318017, 0.918119, 1.74896, 1.09407, 3.48966e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@20 aS8+S8@16 rB wg 2x4 kc8 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 8, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.24656e+06, -164669, -48002.1, 283386, 2.77709e+06, 0, 1.98606, 2.80806, 1.04589, 2.3437, 0.0784483, 0.0455955, 0.0428373, 0.974247, 1.873, 1.30826, 1.26458e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aB16+B32@8 am/S8+S32@8 rB wg 2x4 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 8, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.2169e+06, -138561, -51393, 265750, 2.82624e+06, 0, 3.0771, 2.47131, 0.605585, 1.74042, 0.122303, 0.0704165, 0.0648163, 0.869056, 1.56192, 1.18982, 4.27875e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 63, -1}, {1, 1, 1}, "B"}, "aB8x2+B16@8 am/S32+S8@16 rB wg 8x2 kc8 nse nmk sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 8, 32}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.05699e+06, -428258, 173.365, 680582, 2.47398e+06, 0, 2.18296, 8.55503, 0.998614, 1.95224, 0.116745, 0.0199768, 0.0971294, 0.407718, 1.56937, 1.07565, 2.15851e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "AB"}, "am8x2+S8@16 am/S16x2+S32@8 rB wg 8x1 kc8 nse nmk sb256", 
{16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 4, 16}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.1402e+06, -71205.9, -21903.7, 142433, 2.55017e+06, 0, 1.66009, 9.05773, 1.11937, 2.68092, 0.156119, 0.215721, 0.084158, 0.95334, 1.51236, 1.10163, 1.46309e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "A"}, "am8x2+S8@24 aB16+S16@32 rB wg 8x1 kc8 nse nmk sb256", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 1, 16}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.16128e+06, -65247.7, -22289, 130631, 2.53133e+06, 0, 1.64855, 28.7631, 2.52926, 8.66628, 0.473162, 0.41267, 0.237065, 0.973421, 1.33231, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, "B"}, "aB8x2+B16@16 am/S8+S1,16@16 rB wg 1x16 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 8}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {976429, -406266, 45745.6, 629685, 2.86228e+06, 0, 7.99903, 1.64031, 1.12683, 2.08743, 0.110815, 0.0136219, 0.0976198, 0.772308, 1.44929, 1.07416, 1.34478e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "aB8x2+S16@24 aS16x2+S1,8@16 rS wg 1x8 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {4, 16, 16}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.21325e+06, -117438, -61556.4, 217477, 2.2528e+06, 0, 12.0563, 1.81403, 1.31532, 3.89594, 0.568058, 0.26898, 0.145357, 0, 
1.19505, 0.748462, 6.62819e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, "AB"}, "am32+S32@16 at8+S1,8@16 rS wg 1x4 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {1, 16, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.16219e+06, -57394.2, -25547.5, 110434, 2.43302e+06, 0, 33.3002, 1.74263, 6.19487, 15.3289, 1.83114, 0.655931, 0.435916, 0, 1.20699, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32/16+B32@24 aS8+S8@16 rB wg 1x4 kc8 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 4, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.18047e+06, -71272.1, -20070.7, 138650, 2.71974e+06, 0, 2.17632, 2.94444, 0.603756, 1.71966, 0.167455, 0.134419, 0.0534319, 0.781359, 1.48835, 1.28544, 1.46591e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS8/4x2+S1,8@12 rB wg 8x4 kc8 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {879168, 722368, 0, 0, 4.75054e+06, 0, 2.06787, 4.41992, 1.60573, 2.51717, 0.0641037, 0.0641037, 0, 1, 1.60856, 1.11787, 2.05124e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS16+S1,8@32 rB wg 4x8 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 16}, {4, 
8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {912372, 563212, 0, 0, 3.32595e+06, 0, 2.38855, 3.24194, 1.35317, 2.28502, 0.0657287, 0.0657287, 0, 1, 1.66986, 1.09282, 2.26599e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16/8+B16@24 aS16+S1,8@24 rB wg 4x4 kc16 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.10777e+06, -506066, -12890.7, 799323, 3.24076e+06, 0, 2.01254, 3.23584, 1.4086, 2.31622, 0.0656034, 0.00300752, 0.0661711, 0.737749, 1.7214, 1.10676, 2.39035e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@"}, "aB8/4+B16@16 aB8/4x2+B16@12 rB wg 8x4 kc8 nse sb32 grf256", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {861097, 722621, 0, 0, 4.43187e+06, 0, 2.3239, 3.58269, 1.60841, 2.53024, 0.064521, 0.064521, 0, 0.999741, 1.56629, 1.08295, 1.63858e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16/8+B8@24 aB8x2+B8@8 rB wg 4x8 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {871542, 592164, 0, 0, 3.45702e+06, 0, 2.36848, 3.02961, 1.32902, 2.25867, 0.0662883, 0.0662883, 0, 0.994455, 1.63546, 1.12381, 1.69159e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
"B"}, "aB8+B8@24 am8x2+B16@16 rB wg 8x4 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {880192, 520025, 0, 0, 2.84099e+06, 0, 2.31648, 4.16414, 0.909713, 1.66985, 0.0692239, 0.0692239, 0, 1, 1.71165, 1.07463, 2.09285e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+B16@24 aB32+B8@24 rB wg 4x4 kc16 nse sb64 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.11809e+06, -466011, -16563.9, 764493, 2.61325e+06, 0, 2.84324, 3.20443, 0.85062, 1.78271, 0.0761868, 0.00753119, 0.0759547, 0.644822, 1.67986, 1.12972, 2.44869e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB32+S32@8 rB wg 2x2 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.22853e+06, -82782.2, -23670.7, 148817, 3.12689e+06, 0, 2.3891, 3.33249, 0.600665, 1.68573, 0.163761, 0.082605, 0.0800501, 0.814058, 1.43822, 1.26394, 2.17885e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "A"}, "am8x2+S8@16 aB8x2+S8@16 rB wg 8x1 kc8 nse nmk sb256", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 4, 8}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.11999e+06, -72570.5, -14923.8, 138124, 2.42483e+06, 0, 1.78133, 14.7744, 
1.09287, 2.73154, 0.489395, 0.247154, 0.0835583, 0, 1.20981, 0.834056, 5.52179e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, ""}, "aB8x2+S16@16 aS8x2+S16@16 rB wg 8x1 kc8 nse nmk sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 1, 8}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.18155e+06, -101841, -56144.7, 192951, 2.19546e+06, 0, 1.84095, 49.5117, 4.87015, 18.3577, 2.19278, 0.799335, 0.551365, 0.333333, 1.22289, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, "B"}, "aS8x2+S16@16 am16+S8@24 rS wg 1x8 kc8 nse sb256", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {1, 8, 16}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.10592e+06, -66294.5, -13583.7, 130433, 2.4576e+06, 0, 41.6616, 1.75295, 2.80747, 13.7042, 3.09892, 0.798956, 0.29321, 0, 1.09357, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, ""}, "aB8+S16@8 aB16/8+B32@8 rB wg 2x8 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 32, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.03036e+06, -481494, 30842.2, 727737, 2.94093e+06, 0, 10.2738, 2.35775, 1.05763, 1.93135, 0.117162, 0.0193778, 0.131382, 0.981071, 1.41432, 1.12958, 6.8823e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aB32+B32@24 rB wg 4x2 kc8 nse di grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 
768}, {8192, 8192, 768}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.2646e+06, -196909, -51005.6, 337453, 2.80986e+06, 0, 2.27659, 3.26787, 0.897445, 2.02238, 0.102658, 0.061305, 0.034087, 0.0702678, 1.56451, 1.36193, 2.83441e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32/16+B32@16 aB8x2+B8@8 rB wg 2x2 kc8 nse di grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.23282e+06, -129218, -13345.6, 202286, 3.46522e+06, 0, 2.44022, 2.28397, 0.985247, 2.22005, 0.137039, 0.0856075, 0.0371964, 0.0656057, 1.44605, 1.31939, 1.07507e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aB16x2+B8@16 am8x2+B32@24 rB wg 16x1 kc8 nse di grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 16}, {16, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.08915e+06, -431335, -5997.05, 700679, 2.9737e+06, 0, 2.21771, 7.76121, 0.980592, 1.76723, 0.139327, 0.00941652, 0.0971052, 0.262078, 1.46315, 1.21861, 3.7446e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@40 aS16+S1,8@40 rB wg 4x8 kc8 nse di sb32 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {877644, 599578, 0, 0, 0, 0, 3.15342, 2.59702, 1.3226, 2.25765, 0.0670018, 0.0670018, 0, 1, 1.68766, 1.11233, 2.23688e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aS8+S1,8@40 am/S8x2+S32@24 rB wg 4x8 kc8 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {883553, 504171, 0, 0, 0, 0, 3.77786, 2.16465, 0.612725, 1.79563, 0.0848513, 0.0848513, 0, 0.998512, 1.75631, 1.12347, 1.1711e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at32+S32@64 am/S16+S32@64 rB wg 8x4 kc16 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 8, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {865413, 570645, 0, 0, 0, 0, 2.86633, 4.96099, 1.2902, 3.23687, 0.123001, 0.123001, 0, 0.965442, 1.63164, 1.05986, 1.02296e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at16+S32@64 am/S16x2+S32@48 rB wg 4x8 kc16 nse di sb256 sm grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 2, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {863028, 510360, 0, 0, 0, 0, 3.9041, 7.2614, 3.96106, 10.675, 0.250578, 0.250578, 0, 1, 1.51731, 1.03361, 3.84937e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at16x2+S32@48 am16x2+S32@48 rB wg 2x16 kc16 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 1, 16}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {881189, 505307, 0, 0, 0, 0, 
7.06485, 7.28102, 5.42581, 19.0715, 0.482831, 0.482831, 0, 1, 1.34232, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aS8x2+S1,8@24 am8+S1,8@32 rB wg 8x4 kc8 nse di sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {890085, 527131, 0, 0, 0, 0, 4.69062, 2.67511, 0.899379, 1.66937, 0.0769989, 0.0769989, 0, 1, 1.69862, 1.05048, 2.36119e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@24 aS8+S1,8@24 rB nse wg 4x8 kc8 sm sn sb32 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {859973, 874478, 0, 0, 5.06266e+06, 0, 3.29404, 2.37429, 1.4817, 2.42039, 0.0651324, 0.0651324, 0, 1, 1.62478, 1.08017, 2.64218e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aS8+S1,8@12 at8+S1,8@12 rB wg 4x4 kc8 nse sb32 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {32, 32, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.25951e+06, -1.32422e+06, -91876, 1.6803e+06, 4.04111e+06, 0, 3.66458, 2.28958, 1.60107, 2.58557, 0.0653312, 0.00193227, 0.0644636, 0.609814, 1.66452, 1.13819, 2.10824e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aS16x2+S1,8@16 at8+S1,8@32 rB wg 4x2 kc8 nse sb32 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, 
{8192, 8192, 256}, {8192, 8192, 256}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.2613e+06, -288637, -37005, 436502, 3.16211e+06, 0, 2.96943, 2.18687, 1.4616, 2.61866, 0.0832218, 0.0600046, 0.0297515, 0.685298, 1.63098, 1.03251, 4.21843e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "AB"}, "at8x2+S1,8@16 am/S8+S16@24 rB wg 8x1 kc8 nse nmk sb256 sm grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 4, 8}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.20635e+06, -115723, -66951.5, 223965, 2.56983e+06, 0, 1.66705, 14.0732, 0.141298, 2.83505, 0.169419, 0.184522, 0.10455, 1, 1.54672, 1.272, 2.05682e-13}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "AB"}, "at8x2+S1,8@16 am8+S16@24 rB wg 8x1 kc8 nse nmk sb256 sm grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 1, 8}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.22796e+06, -97461.3, -70391.8, 202283, 2.28884e+06, 0, 1.64601, 50.7867, 1.43536, 8.53536, 0.609792, 0.200389, 0.445621, 0.98454, 1.34486, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, "AB"}, "at16+S1,16@8 am/S8+S1,8@8 rB wg 1x16 kc8 nse sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.06061e+06, -422040, -138.053, 673111, 2.44531e+06, 0, 21.8595, 1.74146, 1.02, 1.99062, 0.125827, 0.0373741, 0.118184, 0.973939, 1.46707, 1.1825, 1.18024e-11}}}, +{{'F', 
"ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, "B"}, "aB8+S8@8 at8+S1,8@8 rS wg 1x4 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {1, 16, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.13093e+06, -53054.3, -19486.3, 101494, 2.50675e+06, 0, 32.1889, 1.67287, 6.89438, 17.2631, 0.697187, 0.183575, 0.653988, 1, 1.26574, 0, 0}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@32 aS16+S1,8@32 rB wg 8x2 kc8 nse di sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 32, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.12406e+06, -567411, -26695.9, 856136, 3.45784e+06, 0, 1.98785, 3.48264, 1.41613, 2.24603, 0.0703441, 0.00331692, 0.0659792, 0.597622, 1.7232, 1.14925, 3.0157e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S8@16 aS8+S1,8@24 rB wg 8x1 kc8 nse di sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 32, 8}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.21584e+06, -238804, -30942.1, 366268, 2.7689e+06, 0, 1.96496, 3.24266, 0.99871, 1.67489, 0.0963645, 0.0221903, 0.0792587, 0.999858, 1.69956, 1.20732, 1.33481e-11}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aS8+S1,8@40 am32+S32@32 rB wg 2x4 kc8 nse di sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {16, 8, 32}, {2, 
4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.30275e+06, -140576, -80892.2, 270400, 2.52314e+06, 0, 2.41886, 1.96183, 0.657308, 1.79007, 0.118095, 0.0805585, 0.0575456, 0.781989, 1.6348, 1.22619, 7.4659e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S16@24 aS16+S16@32 rB wg 4x2 kc8 nse di sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 8, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.2692e+06, -138155, -69946.2, 247166, 2.48218e+06, 0, 2.22535, 2.9076, -0.226323, 2.36118, 0.179437, 0.0955498, 0.0965902, 0.986833, 1.48781, 1.14074, 6.5259e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS16+S8@32 aS8+S1,8@32 rB wg 16x1 kc8 nse di nmk sm sn grf256 sb32", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 16, 16}, {16, 1, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {1.08982e+06, -365235, -57570.3, 563017, 2.44122e+06, 0, 1.97404, 6.45479, 0.729323, 2.0721, 0.109576, 0.0188555, 0.121551, 0.995776, 1.6362, 1.09808, 8.84348e-12}}}, +{{'F', "ugemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@16 aB8+B8@16 rS nse di wg 8x4 kc8 sm sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, false}}, {'E', 17, {863507, 843091, 0, 0, 0, 0, 2.99578, 2.42156, 4.42824, 4.74906, 0.0650645, 0.0650645, 0, 0.993213, 1.59693, 1.16972, 1.1178e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 rB vav di sys grf256 af wg 4x8 sb256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {2048}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 rB vav sb256 wg 4x8 di sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, false}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 rB wg 8x4 cb4x2 ks32 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.00706e+06, 522382, 0, 0, 0, 0, 0.725659, 1.44632, 0.970408, 1.74134, 0.0067111, 0.0067111, 0, 0.90349, 1.42986, 1.13348, 2.91269e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 rB wg 4x8 af vav di sb256 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {872440, 433979, 0, 0, 0, 0, 0.692755, 0.929392, 0.682568, 1.28977, 0.00829318, 0.00829318, 0, 0.933146, 1.44966, 1.06633, 2.17433e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, 
"av16+m32@32 am32+m16@32 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {883175, 391500, 0, 0, 0, 0, 0.858496, 1.69057, 0.878366, 1.49214, 0.0153564, 0.0153564, 0, 0.862765, 1.22512, 0.990538, 1.20433e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {888453, 388680, 0, 0, 0, 0, 1.36981, 1.36706, 1.57389, 2.50026, 0.0230892, 0.0230892, 0, 0.779666, 1.13216, 0.970824, 1.11098e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 rB wg 4x8 af vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {891563, 373077, 0, 0, 0, 0, 1.28395, 2.80375, 2.15367, 3.8278, 0.0375084, 0.0375084, 0, 0.91257, 1.09932, -0.409383, 1.56754e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 rB wg 4x8 cab3x2 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.71378e+06, 809360, 0, 0, 0, 0, 0.986084, 1.01074, 0.832813, 1.74844, 0.00859306, 
0.00859306, 0, 0.738196, 1.53185, 1.01022, 3.30219e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 rB wg 4x8 cab4 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 rB wg 2x16 cab3x2 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 rB wg 8x2 af vav di sb256 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, false}}, {'E', 17, {715095, 469673, 0, 0, 0, 0, 0.454681, 19.058, 0.94253, 2.23273, 0.0531575, 0.0531575, 0, 0.0832933, 1.2383, -1.03685, 3.66291e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@72 am32+m32@64 rB wg 4x8 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 
2048}, {8192, 8192, 2048}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {903365, 697556, 0, 0, 8.2903e+06, 1.21651e+07, 0.724506, 0.722081, 0.92287, 1.55416, 0.00402055, 0.00402055, 0, 0.997691, 1.6726, 1.18622, 5.18793e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 rB wg 4x8 xaf st vav di sb32 sn grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m16@64 am32+m32@72 rB wg 8x4 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {876646, 564122, 0, 0, 6.5151e+06, 7.83974e+06, 0.629669, 0.87362, 0.885543, 1.48097, 0.00440774, 0.00440774, 0, 1, 1.66234, 1.24996, 2.85794e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 rB wg 8x2x2 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.09587e+06, 290213, 0, 0, 7.65133e+06, 9.1177e+06, 0.649959, 1.16004, 0.89878, 1.61737, 0.0055749, -0.00026212, 0.00571368, 0.41686, 1.92252, 1.24755, 2.35066e-12}}}, +{{'F', 
"ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 rB wg 4x4 cb3 ks64 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.39136e+06, -511133, -159173, 776441, 3.49225e+06, 3.79699e+06, 0.616898, 0.829845, 0.950983, 1.59577, 0.00679651, 0.000143485, 0.00679319, 0.412801, 1.47949, 1.21242, 2.01548e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 rB wg 4x4 cb3x2 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.202e+06, 220946, 0, 0, 3.22519e+06, 3.72408e+06, 0.649739, 1.23543, 0.910187, 1.56677, 0.0118941, 0.00175498, 0.01085, 0.587028, 1.3097, 0.977204, 4.11281e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 rB wg 2x4 ca3x2 ks16 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 6144, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 rB wg 2x4 af vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, 
{(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.11866e+06, -145727, -15120.1, 241492, 2.46743e+06, 1.84648e+06, 0.59095, 0.943411, 0.892557, 1.46629, 0.0222611, 0.0141188, 0.0101768, 0.737216, 1.20804, 0.716031, 2.22642e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 rB wg 2x4 xaf vav di sb64 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.1218e+06, -136578, -30749.5, 226811, 2.85082e+06, 1.87597e+06, 0.817567, 0.877924, 1.64935, 2.53899, 0.0298413, 0.00967317, 0.0214053, 0.870582, 1.28882, 0.938989, 4.84082e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 am32 rB wg 2x8 af vav di nmk sb64 sys", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.04087e+06, 107650, 0, 0, 2.52723e+06, 3.24731e+06, 0.657844, 3.81434, 0.838331, 1.35005, 0.038394, 0.0231761, 0.0232745, 0.770273, 1.2243, 0.973306, 4.56074e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16 rB wg 4x8 cab4x2 ks16 xaf vav di sn dm grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04604e+06, 973086, 0, 0, 7.53336e+06, 1.18866e+07, 0.951056, 1.1825, 
1.0066, 1.69808, 0.00473475, 0.00473475, 0, 0.835362, 2.23999, 1.37476, 2.87047e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 rB wg 4x8 cab4 ks32 xaf st vav di sn dm grf256 sys rr l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.04992e+06, 862089, 0, 0, 4.41549e+06, 6.61914e+06, 0.869043, 1.44302, 0.910417, 1.82393, 0.00618576, 0.00618576, 0, 0.990298, 1.63647, 1.07524, 5.899e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 rB wg 4x4 cab3 ks32 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.79824e+06, -569026, -348921, 964550, 3.97312e+06, 4.26803e+06, 0.900273, 0.952519, 0.97609, 1.62415, 0.0075791, 0.00139129, 0.00649277, 0.622224, 1.5355, 1.13493, 2.82312e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 rB wg 4x4 cab4x2 ks32 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.74298e+06, -438399, -331091, 823020, 2.83443e+06, 3.02285e+06, 0.827087, 1.16983, 0.912475, 1.60769, 0.0117213, 0.00142707, 0.0105272, 0.504681, 1.41325, 0.945808, 5.78409e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 rB 
wg 2x8 cab4x2 ks32 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.69805e+06, -386506, -310168, 727551, 2.85082e+06, 2.46579e+06, 0.823076, 0.993902, 0.988626, 1.9249, 0.0161014, 0.00259962, 0.0143172, 0.708518, 1.38784, 0.977495, 3.17493e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 rB wg 1x4 cab3 ks32 xaf vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.63678e+06, -103227, -63581.8, 190716, 3.1785e+06, 2.12173e+06, 0.894659, 0.884893, 0.842745, 1.56296, 0.0283829, 0.00669699, 0.0220137, 0.907502, 1.26828, 0.579024, 2.5883e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "AI"}, "aB16+m32@32 aB32 rB wg 2x8 af vav di nmk ca3 sys dm l4", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 2, 4}, {true, true, false}}, {'E', 17, {1.31212e+06, 162444, 0, 0, 2.20365e+06, 0, 0.717898, 4.15784, 0.784983, 1.4051, 0.0339977, 0.013915, 0.0254973, 0.96039, 1.47669, 1.00421, 5.83703e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AIp"}, "aB16+m32@32 aB64 rB wg 1x4 af vav di sb64 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, 
{1.17788e+06, -94434.5, -16004.3, 154932, 2.83443e+06, 1.63758e+06, 1.82204, 0.60134, 0.592597, 1.15461, 0.0290724, 0.0265558, 0.015314, 1, 1.23184, 0.449695, 1.93026e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32 aB32 rB wg 8x4 cab3 ks32 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 86016, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 rB wg 8x2 af vav di sb256 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 rB wg 4x8 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {883225, 427290, 0, 0, 0, 0, 1.11407, 1.21203, 0.677771, 1.29342, 0.00889462, 0.00889462, 0, 0.963151, 1.53208, 1.10128, 2.13999e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 rB wg 
4x8 xaf vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {894456, 401519, 0, 0, 0, 0, 1.09663, 2.26086, 0.873233, 1.4831, 0.0147437, 0.0147437, 0, 0.972414, 1.38406, 0.982763, 2.2604e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16+m32@48 at32+m32@48 rB wg 8x4 cab4 ks32 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.08227e+06, 776908, 0, 0, 0, 0, 1.04569, 2.05877, 0.996213, 1.67003, 0.00827411, 0.00827411, 0, 0.961616, 1.45528, 1.11502, 2.00546e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32 at32+m32@48 rB wg 4x8 cab4 ks64 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.07062e+06, 676936, 0, 0, 0, 0, 1.46521, 1.49357, 0.913257, 1.83797, 0.0123376, 0.0123376, 0, 0.978477, 1.29593, 1.02611, 1.67976e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16x2 at16x2 rB wg 4x8 cab4x2 ks64 xaf st vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.09253e+06, 617958, 0, 0, 0, 0, 1.24883, 3.48717, 1.05505, 2.27656, 0.0251952, 
0.0251952, 0, 0.980002, 1.05051, 0.285902, 7.08638e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 rB wg 4x16 cab4 ks64 af vav di sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04437e+06, 894474, 0, 0, 0, 0, 1.61942, 5.20006, 2.09495, 4.09093, 0.063329, 0.063329, 0, 0.953584, 1.16518, 0.929804, 1.67951e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 rB wg 4x8 cb4x2 ks32 xaf vav di grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 rB wg 8x4 cb4 ks32 xaf st vav di grf256 sys rr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 rB wg 8x4 cb3 ks64 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, 
(LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 rB wg 4x4 cb3 ks32 xaf vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.34547e+06, -453771, -133582, 707431, 3.85843e+06, 4.096e+06, 0.838397, 0.942262, 0.957325, 1.57766, 0.00740113, 0.0010301, 0.00648511, 0.743417, 1.52474, 1.0944, 4.90834e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 rB wg 4x4 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {979562, -385642, 51711.6, 602420, 2.47235e+06, 2.84426e+06, 0.69937, 1.28105, 0.710825, 1.4275, 0.0108655, 0.00130873, 0.010468, 0.846726, 1.54637, 0.967193, 7.32783e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 rB wg 4x2 af vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.117e+06, -159631, -1790.36, 263388, 3.24403e+06, 2.48627e+06, 0.970315, 0.939915, 0.746986, 1.27856, 
0.016383, 0.00986423, 0.00728157, 0.639606, 1.39693, 0.908242, 1.20384e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 rB wg 4x2 xaf vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.14232e+06, -131424, -30012.5, 227440, 2.74432e+06, 2.18972e+06, 0.756622, 1.40904, 0.883778, 1.49413, 0.0268538, 0.0148448, 0.0126023, 0.415284, 1.26391, 0.590636, 3.31179e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 rB wg 4x2 af vav di nmk sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 8, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {726421, -157896, 326311, 250406, 2.55754e+06, 3.09658e+06, 0.491778, 4.05366, 0.747811, 1.5515, 0.0177586, 0.0187752, 0.0114616, 1, 1.28009, 0.929609, 5.62845e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 rB wg 4x8 cab3 ks32 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06016e+06, 848171, 0, 0, 7.38099e+06, 1.0453e+07, 1.36089, 1.12404, 0.968578, 1.56365, 0.00464109, 0.00464109, 0, 1, 1.59464, 1.11033, 6.56771e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 rB wg 8x2 cab3 ks16 
xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 36864, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.60943e+06, -1.34483e+06, -253630, 1.70607e+06, 5.342e+06, 7.41376e+06, 1.0168, 1.44036, 1.01244, 1.68138, 0.00675092, 0.0008524, 0.00611925, 0.798697, 1.5218, 1.14975, 5.05128e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m32@64 at32+m16@64 rB wg 4x4 cab3 ks32 xaf st vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.63235e+06, -473591, -275698, 840536, 4.04275e+06, 4.34176e+06, 1.32077, 0.982863, 0.96265, 1.60044, 0.00888121, 0.000531032, 0.00822524, 0.585331, 1.48926, 0.995812, 5.75876e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 2x4 cab4 ks16 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.51888e+06, -225905, -111399, 382282, 3.85024e+06, 3.13754e+06, 1.02462, 1.25826, 0.997766, 1.72944, 0.0116731, 0.00921103, 0.00406354, 0.785111, 1.49836, 0.998742, 6.32281e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 2x4 cab3x2 ks32 xaf vav di sn grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 4, 1}, 
1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {2.50658e+06, -199989, -350122, 337287, 3.39968e+06, 2.49856e+06, 0.784682, 0.339369, 0.497498, 1.59625, 0.016336, 0.0120673, 0.0057422, 0.609408, 1.33312, 0.989535, 3.60691e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 rB wg 2x2 cab4x2 ks16 xaf vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.47272e+06, -119971, -42490.2, 189240, 3.69459e+06, 2.31014e+06, 0.811882, 1.39447, 0.646674, 1.39008, 0.025457, 0.0234095, 0.00867396, 0.960377, 1.27279, 0.825982, 8.35402e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 at16 rB wg 1x2 cab3x2 ks16 af vav di grf256 sys sn l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 16}, {1, 2, 1}, 1, (WGType) 1, 257, 6144, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#AI"}, "aB32+m16@48 aS16 rB wg 16x1 cb4x2 ks16 xaf vav di nmk grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {16, 1, 1}, 1, (WGType) 1, 257, 2048, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.42997e+06, -443650, -203140, 794602, 2.88358e+06, 2.22003e+06, 0.794365, 8.5854, 0.944109, 1.77614, 0.0570978, 0.00252013, 0.0532169, 
0.706182, 1.0381, 0.381417, 6.1353e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 rB wg 1x16 xaf st vav di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 rB wg 2x2 xaf cs di sb64 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.13361e+06, -87922.6, -10020.2, 138891, 2.8672e+06, 1.83501e+06, 0.818937, 0.8499, 0.851117, 1.53666, 0.0324244, 0.0285585, 0.0107132, 0.756667, 1.09867, 0.84864, 1.283e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 rB wg 16x2 cb3x2 ks32 af vav di sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 rB wg 8x4 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 
256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {888807, 430310, 0, 0, 0, 0, 0.918491, 0.832983, 0.687151, 1.2534, 0.00870491, 0.00870491, 0, 0.902125, 1.38093, 1.03399, 2.98332e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 rB wg 8x4 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {894052, 395164, 0, 0, 0, 0, 0.74104, 1.32668, 0.846804, 1.46546, 0.0135248, 0.0135248, 0, 0.856275, 1.25895, 0.981331, 2.02221e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 rB wg 4x8 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {887285, 380837, 0, 0, 0, 0, 1.36748, 1.28667, 1.55406, 2.48833, 0.0278373, 0.0278373, 0, 0.840837, 1.07072, 0.944443, 1.41313e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@48 aB32 rB wg 8x4 cb4x2 ks32 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.00127e+06, 774753, 0, 0, 0, 0, 0.947461, 0.934863, 0.97605, 1.72158, 0.00894179, 0.00894179, 0, 0.970326, 1.50724, 1.0217, 2.9666e-12}}}, +{{'F', "ugemm", 
{"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@64 aB32x2 rB wg 8x4 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {972331, 751881, 0, 0, 0, 0, 0.799573, 1.37171, 0.93861, 2.1425, 0.0147472, 0.0147472, 0, 0.928068, 1.27036, 0.985348, 1.75902e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m32@64 aB64 rB wg 4x8 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.00586e+06, 704834, 0, 0, 0, 0, 1.16313, 1.1353, 1.51168, 3.11912, 0.0225951, 0.0225951, 0, 0.923429, 1.15703, 0.85336, 2.75385e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 rB wg 4x1 af vav nmk sb256 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 rB wg 4x8 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 40, 32}, {4, 8, 1}, 1, 
(WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 rB wg 8x4 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m32@32 am32+m64@48 rB wg 8x4 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {879324, 573732, 0, 0, 5.95558e+06, 8.37222e+06, 0.790222, 0.904604, 0.883707, 1.48127, 0.0050314, 0.0050314, 0, 0.985529, 1.4878, 1.23641, 2.41747e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 rB wg 8x2 af vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.03255e+06, -856835, 35930.9, 1.15355e+06, 4.81608e+06, 6.38157e+06, 0.493684, 0.827212, 0.896741, 1.6147, 0.0053978, 6.82453e-05, 0.00570828, 0.483746, 1.57976, 1.26821, 2.39349e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 rB wg 4x4 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 rB wg 4x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.1758e+06, -218428, -13663.7, 334244, 4.15334e+06, 3.1703e+06, 0.488566, 0.645788, 0.915783, 1.73226, 0.00993899, 0.00846533, 0.00271336, 0.453691, 1.48336, 1.05987, 7.98819e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 rB wg 2x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.15992e+06, -146486, -8001.88, 206226, 4.48102e+06, 2.75251e+06, 0.472911, 0.412364, 0.853569, 1.71413, 0.0122185, 0.0134426, 0.0018025, 0.763429, 1.44099, 1.00673, 1.03497e-11}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 rB wg 4x2 xaf st vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, 
{(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 rB wg 2x2 xaf vav di sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.13957e+06, -78439.8, -8785.04, 127858, 3.09821e+06, 2.03325e+06, 0.434987, 0.462575, 0.856765, 1.54622, 0.0239184, 0.0233867, 0.00546205, 0.851204, 1.33767, 0.923893, 9.98065e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 rB wg 8x4 af vav di nmk sb64 sm sys", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.03017e+06, 232218, 0, 0, 2.06356e+06, 2.32817e+06, 0.635122, 3.56777, 1.53227, 2.28154, 0.0372029, 0.00337686, 0.0415229, 0.905521, 1.06254, 0.957223, 9.43647e-13}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 rB wg 8x4 cb4x2 ks32 af vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06632e+06, 970822, 0, 0, 6.83377e+06, 
1.0453e+07, 0.802893, 0.831622, 0.987736, 1.60125, 0.0044336, 0.0044336, 0, 0.99801, 1.7879, 1.21352, 4.70293e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 rB wg 16x2 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06174e+06, 862966, 0, 0, 5.06266e+06, 7.90528e+06, 0.76759, 1.1074, 1.02291, 1.47362, 0.00497143, 0.00497143, 0, 0.968925, 1.63841, 1.28013, 3.04063e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@64 aB16x2 rB wg 16x2 cb3 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {996043, 778753, 0, 0, 4.31555e+06, 5.99654e+06, 0.739151, 1.50326, 1.00767, 1.54244, 0.00616043, 0.00616043, 0, 0.997048, 1.53835, 1.17302, 3.4064e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIp"}, "at16x2+m32@32 aB16x2 rB wg 8x2 cb4 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.77071e+06, -527987, -353300, 939437, 4.14515e+06, 4.096e+06, 0.810631, 0.820929, 1.02923, 1.5067, 0.0071533, 0.000990959, 0.00658984, 0.834339, 1.70826, 1.24022, 4.35712e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {1, 1, 1}, "#AIp"}, "at16+m32@16 aB32 rB wg 8x1 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 32}, {8, 1, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.78479e+06, -261349, -173232, 452033, 4.02227e+06, 3.01466e+06, 0.481975, 0.723918, 1.03793, 1.54262, 0.00927887, 0.00655604, 0.00397641, 0.95291, 1.53912, 1.14817, 5.51161e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@32 aB16x2 rB wg 4x2 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.88533e+06, -200778, -198593, 423600, 3.39149e+06, 2.28557e+06, 0.572852, 0.529126, 0.934694, 1.58997, 0.012793, 0.00968393, 0.00496565, 0.873155, 1.53254, 0.953462, 7.13806e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16x2+m32@48 aB32 rB wg 4x2 cb4x2 ks64 af vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@48 aB32x2 rB wg 2x2 cb4x2 ks32 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 
255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.66288e+06, -98375, -71966.2, 195025, 2.84262e+06, 2.06438e+06, 0.483255, 0.500833, 0.809319, 1.72387, 0.023295, 0.0176286, 0.00890789, 0.991568, 1.447, 0.919946, 9.68796e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at16x2+m64@16 aB32+m16@32 rB wg 4x2 af vav di nmk sb64 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.26276e+06, -130899, -63587.1, 251457, 2.73613e+06, 1.99803e+06, 0.535584, 1.4389, 1.4172, 2.56436, 0.0281714, 0.0174514, 0.0209074, 1, 1.28388, 0.655999, 5.90369e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 rB wg 1x4 af vav di sb64 sm dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at16+m32@24 am32x2+m32@24 rB wg 16x2 af rr vav sb64 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {873551, 497995, 0, 0, 0, 0, 0.456439, 2.57339, 0.681793, 1.162, 0.0083654, 
0.0083654, 0, 0.887217, 1.50373, 1.04074, 7.72477e-12}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 rB sys grf256 cab2 wg 4x4 l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, false}}, {'E', 17, {1.17895e+06, 343529, 0, 0, 0, 0, 1.63411, 1.77325, 1.00531, 1.48275, 0.0145617, 0.000936039, 0.0155971, 0.877282, 1.01034, 1.0048, 9.67486e-14}}}, +{{'F', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 rS vav di sys grf256 af wg 8x4 sb256 sm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 rB vav di sys grf256 af wg 4x8 sb512", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {2048}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m64@64 rB wg 8x4 ca4x2 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.06557e+06, 
554194, 0, 0, 0, 0, 0.351596, 0.90997, 0.947429, 1.57679, 0.00331983, 0.00331983, 0, 0.990955, 1.56412, 1.02425, 2.51352e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am128+m128@64 rB wg 4x8 ca3 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {989932, 503264, 0, 0, 0, 0, 0.521085, 0.615326, 0.759821, 1.5027, 0.00469883, 0.00469883, 0, 0.99602, 1.31748, 0.979255, 1.00069e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am64x2+m64@64 rB wg 4x8 ca4 ks64 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {989597, 475337, 0, 0, 0, 0, 0.454855, 0.942129, 0.617706, 1.68393, 0.00734477, 0.00734477, 0, 0.963871, 1.2358, 0.932592, 1.33399e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@128 am64x2+m64@128 rB wg 2x16 ca4x2 ks64 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {961533, 455478, 0, 0, 0, 0, 0.642337, 0.7679, 0.660867, 2.13711, 0.0105946, 0.0105946, 0, 1, 1.30406, 0.902759, 1.47773e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 rB wg 2x16 ca3x2 ks128 af vav di sn 
grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB64+m64@64 aB32+m32@64 rB wg 4x8 cab4 ks64 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.04335e+06, 903066, 0, 0, 0, 0, 0.573291, 0.742252, 0.728779, 1.625, 0.0049232, 0.0049232, 0, 1, 1.41255, 0.970177, 1.84329e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32x2+m64@128 aB32x2+m64@128 rB wg 4x8 cab3 ks64 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.02689e+06, 960490, 0, 0, 0, 0, 0.510059, 1.04909, 0.673702, 1.67059, 0.00734958, 0.00734958, 0, 0.997915, 1.31262, 0.915348, 2.01189e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIop"}, "av32+m128@96 am64+m64@96 rB wg 4x8 xaf st vav di sb128 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {886295, 679810, 0, 0, 6.14973e+06, 1.05103e+07, 0.387882, 0.34723, 0.844766, 
1.28789, 0.00202312, 0.00202312, 0, 0.99971, 1.60161, 1.05949, 2.70909e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIop"}, "av32+m64@128 am64x2+m64@128 rB wg 4x8 xaf vav di sb128 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {878684, 628439, 0, 0, 5.63446e+06, 8.38042e+06, 0.326785, 0.347891, 0.805503, 1.24882, 0.00204557, 0.00204557, 0, 1, 1.50403, 1.06152, 3.02972e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m128@64 rB wg 4x8 af vav di sb128 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {893468, 542180, 0, 0, 4.62356e+06, 6.62733e+06, 0.391937, 0.505587, 0.774621, 1.23145, 0.00235858, 0.00235858, 0, 0.996539, 1.5023, 1.05328, 2.52696e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIo"}, "av64+m64@128 am32x2 rB wg 4x4 cb4x2 ks32 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.19279e+06, 305937, 0, 0, 5.25107e+06, 8.03635e+06, 0.333064, 0.619053, 0.911588, 1.46748, 0.00298243, 0.000462826, 0.0027584, 0.631652, 1.34002, 1.00593, 2.21481e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@32 am64 rB wg 2x8 cb4x2 ks64 xaf vav di sn 
grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.4092e+06, -554772, -169517, 793747, 2.90734e+06, 3.16867e+06, 0.390299, 0.577147, 0.857166, 1.54399, 0.00361771, 0.000811875, 0.0033029, 0.98501, 1.49402, 0.997618, 1.88797e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m64@64 am64+m32@64 rB wg 4x2 xaf vav di sb128 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.15591e+06, -238679, -20728.1, 351139, 2.5641e+06, 2.48218e+06, 0.273012, 0.460771, 0.775111, 1.33768, 0.00533606, 0.00407793, 0.00182134, 0.639264, 1.2796, 0.665315, 1.2356e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@128 am128+m128@128 rB wg 2x4 xaf vav di sb128 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 128}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.16312e+06, -178711, -20881.1, 280021, 2.00049e+06, 1.8178e+06, 0.329846, 0.416682, 0.587935, 1.13731, 0.00689723, 0.005772, 0.00231223, 0.806174, 1.33228, 0.879709, 5.51613e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@96 am128+m64@96 rB wg 2x4 ca4x2 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 128}, {2, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, 
true, false}}, {'E', 17, {1.41922e+06, -182504, -78300.4, 285831, 1.86778e+06, 1.47046e+06, 0.294783, 0.695972, 0.506752, 1.20455, 0.0108447, 0.00678984, 0.00498837, 0.844314, 1.33655, 0.794731, 4.36829e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@32 rB wg 1x4 ca4x2 ks64 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 8192, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.31936e+06, -96890.5, -30769.9, 145228, 2.34291e+06, 1.81862e+06, 0.412414, 0.30833, 0.409741, 1.09695, 0.0174833, 0.0161637, 0.00408927, 0.766829, 1.28501, 0.936336, 4.72138e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av32+B32@48 am32x2 rB wg 8x2 af vav di sb256 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 4, 32}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {883594, 337014, 0, 0, 0, 0, 0.22229, 8.60539, 0.729527, 2.00832, 0.0271033, 0.0271033, 0, 0.0457007, 1.09409, 0.724113, 2.54861e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am128+m128@96 rB wg 1x4 ca3x2 ks64 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 128}, {1, 4, 1}, 1, (WGType) 1, 257, 3072, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.23048e+06, -125592, -17585.5, 179011, 2.12173e+06, 1.35168e+06, 0.920505, 0.255099, 0.653157, 1.21015, 0.0169894, 0.0198407, 0.00264754, 0.9339, 1.23674, 0.982957, 2.88698e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Iop"}, "aB32 aB32 rB wg 4x8 cab4 ks64 xaf vav di sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05829e+06, 1.03353e+06, 0, 0, 5.30842e+06, 9.18323e+06, 0.47066, 0.531818, 0.917115, 1.45352, 0.00220015, 0.00220015, 0, 0.975008, 1.63094, 0.992603, 5.47213e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AIop"}, "aB32+m32@96 aB32 rB wg 4x8 cab3 ks64 xaf st vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 86016, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.06575e+06, 1.10881e+06, 0, 0, 4.41139e+06, 6.57818e+06, 0.42243, 0.679112, 0.918586, 1.50737, 0.00243376, 0.00243376, 0, 1, 1.95319, 1.10024, 2.294e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 rB wg 4x8 cab3 ks64 xaf vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 73728, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.06135e+06, 822813, 0, 0, 3.62578e+06, 4.59817e+06, 0.415771, 0.818848, 0.865328, 1.58139, 0.00337564, 0.00337564, 0, 1, 1.58683, 1.05039, 3.00347e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 rB wg 2x8 cab4 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 64}, 
{2, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.89554e+06, -498803, -411042, 930415, 2.67387e+06, 3.20799e+06, 0.567677, 0.501271, 0.840676, 1.5063, 0.00427203, 0.000481846, 0.00404699, 0.832467, 1.44137, 0.955203, 3.92029e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32 aB32+m128@64 rB wg 2x4 cab4 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {2, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {2.01352e+06, -185065, -235490, 423906, 2.17907e+06, 2.28065e+06, 0.487223, 0.546183, 0.355732, 0.989325, 0.00871265, 0.00303217, 0.00595475, 0.892248, 1.28589, 0.854172, 3.98616e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB32+m64@32 aB64+m64@32 rB wg 2x4 cab4 ks64 af vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {2, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.88418e+06, -153479, -204520, 376006, 2.17088e+06, 1.88498e+06, 0.367103, 0.802232, -0.0233093, 0.803253, 0.0120542, 0.00674447, 0.00621616, 0.958995, 1.34316, 0.655552, 9.35284e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB32x2 aB32x2+m64@32 rB wg 1x4 cab4 ks64 xaf vav di sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {1, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.789e+06, -104206, -88473.3, 216093, 3.5799e+06, 1.90874e+06, 0.494892, 0.45351, -0.0429483, 
0.861739, 0.0175893, 0.00752501, 0.010522, 0.988366, 1.40054, 0.929823, 4.81054e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#AI"}, "aB32+m64@64 aB64 rB wg 2x8 af vav di nmk ca3 sys dm l4", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 2, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 1, 4}, {true, true, false}}, {'E', 17, {1.33537e+06, 174420, 0, 0, 1.79651e+06, 2.35274e+06, 0.35032, 2.21824, 0.289392, 0.954431, 0.0187001, 0.00782903, 0.0142837, 0.961014, 1.45471, 0.970912, 4.15697e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#AIop"}, "aB32+m64@64 aB128 rB wg 1x4 af vav di sb128 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 128}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.21993e+06, -94688.2, -20621.2, 162272, 2.16269e+06, 679936, 1.63701, 0.42146, 0.328977, 0.880663, 0.0240187, 0.0160351, 0.0106171, 0.824292, 1.25588, 0.789446, 8.45484e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@128 av64+m32@128 rB wg 4x8 cab4x2 ks64 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.04641e+06, 551646, 0, 0, 0, 0, 0.775015, 0.878808, 0.742373, 1.49937, 0.00558315, 0.00558315, 0, 1, 1.29356, 0.970044, 1.0419e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@128 av32+m32@128 rB wg 4x8 cab3x2 ks64 xaf vav 
di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.02633e+06, 512948, 0, 0, 0, 0, 0.616223, 1.4606, 0.843356, 1.74802, 0.0095619, 0.0095619, 0, 1, 1.19005, 0.980175, 5.84833e-13}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@128 av64+m32@128 rB wg 4x8 cab3x2 ks128 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.02641e+06, 481324, 0, 0, 0, 0, 0.93847, 1.32746, 0.702383, 2.19035, 0.0160592, 0.0160592, 0, 1, 1.04875, 0.330175, 3.7195e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 rB wg 4x8 cab4 ks64 af vav di grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.07428e+06, 870501, 0, 0, 0, 0, 0.501416, 1.12211, 0.858073, 1.52799, 0.00684843, 0.00684843, 0, 0.996251, 1.07336, 0.092233, 6.21635e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aS64 rB wg 4x8 cab4 ks128 af vav di grf256 sys l4 dm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.11515e+06, 925681, 0, 0, 0, 0, 0.830567, 3.60606, 0.636814, 1.59255, 0.0203905, 0.0203905, 0, 0.862215, 1.00371, 1.00069, 
-4.07711e-16}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 rB wg 2x16 cab4 ks128 af vav di grf256 sys l4 dm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 4, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.05676e+06, 797092, 0, 0, 0, 0, 1.15513, 3.47579, 0.681838, 2.11327, 0.0385831, 0.0385831, 0, 0.774259, 1.0034, 1.00053, 5.87826e-17}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 rB wg 2x16 cab4 ks128 af vav di grf256 sys l4 dm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.00981e+06, 799187, 0, 0, 0, 0, 1.93343, 4.24673, 2.12932, 5.7688, 0.146616, 0.146616, 0, 0.630819, 1.00163, 1.00055, -1.04571e-15}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIo"}, "av32x2+m32@64 av32+m32@64 rB wg 4x8 cb3 ks32 xaf st vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {968994, 716920, 0, 0, 5.78437e+06, 9.22419e+06, 0.534082, 0.742409, 0.894677, 1.49963, 0.00222583, 0.00222583, 0, 0.887893, 1.57089, 1.08446, 1.91309e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIo"}, "av32x2+m128@64 av64+m32@64 rB wg 4x4 cb3 ks64 xaf vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, 
{8192, 8192, 2048}, {64, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.48013e+06, -879245, -212784, 1.19787e+06, 3.27516e+06, 5.10362e+06, 0.353883, 0.703019, 0.911615, 1.47833, 0.00312427, 0.000484667, 0.00281171, 0.593822, 1.43189, 1.02998, 1.76713e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av32+m32@64 rB wg 4x4 cab3x2 ks64 xaf st vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.44895e+06, -475742, -192864, 753196, 2.79429e+06, 3.21044e+06, 0.543815, 0.583688, 0.893765, 1.54599, 0.00436005, 0.000416302, 0.00416311, 0.659609, 1.31185, 0.99626, 1.25799e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av64+m32@64 rB wg 4x2 cb4x2 ks64 xaf st vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.43195e+06, -220355, -88718.8, 353320, 2.49856e+06, 2.51167e+06, 0.377332, 0.75479, 0.906308, 1.29913, 0.0058945, 0.00410324, 0.00237673, 0.692439, 1.23717, 0.463634, 1.60992e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32 at32+m64@32 rB wg 2x2 ca3x2 ks32 xaf vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 6144, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.33083e+06, -155638, -32206.7, 215344, 2.58048e+06, 2.38387e+06, 
0.560445, 0.453657, 0.73036, 1.3036, 0.00873208, 0.0075434, 0.00252893, 0.878367, 1.27904, 0.492696, 1.21306e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@32 av32+m128@32 rB wg 2x2 cb4x2 ks64 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.34069e+06, -116391, -31493.6, 173677, 2.22003e+06, 1.85139e+06, 0.42459, 0.964629, 0.713108, 1.06445, 0.0165022, 0.0154653, 0.00316435, 0.667006, 1.16044, 0.602135, 7.93972e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 at64+m32@64 rB wg 1x2 ca4x2 ks64 af vav di grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 64}, {1, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.31786e+06, -82626.9, -17117, 112973, 3.34234e+06, 1.65478e+06, 0.813229, 0.748021, 0.644888, 1.29433, 0.0397427, 0.0319944, 0.00448936, 0, 1.14457, 0.953023, 3.67408e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIop"}, "aB32 at32+m64@32 rB wg 4x8 cab3x2 ks64 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06084e+06, 849124, 0, 0, 5.06593e+06, 8.26573e+06, 0.813458, 0.757306, 0.923897, 1.44252, 0.00320502, 0.00320502, 0, 0.998673, 1.38807, 1.02748, 1.29467e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@96 at32+m32@96 rB wg 4x4 cab4x2 ks32 xaf vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.66314e+06, -1.27279e+06, -300802, 1.65863e+06, 3.75931e+06, 5.86547e+06, 0.491219, 0.87616, 0.899685, 1.462, 0.00397407, 0.000371744, 0.00376005, 0.648719, 1.3183, 0.990919, 1.99015e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@96 rB wg 2x8 cab3 ks64 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.59165e+06, -496791, -255103, 821170, 2.73285e+06, 3.18177e+06, 0.790684, 0.690631, 0.850764, 1.52446, 0.00584761, 0.000218975, 0.00563592, 0.617191, 1.54591, 0.999485, 7.93976e-13}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@64 at32+m32@64 rB wg 2x4 cab4 ks32 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 16, 32}, {2, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.53804e+06, -244254, -110255, 404894, 2.415e+06, 3.12934e+06, 0.469656, 0.821322, 0.84661, 1.33465, 0.00730374, 0.00540283, 0.00285574, 0.877009, 1.23985, 0.959553, 2.29643e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@32 at32+m64@32 rB wg 1x4 cab4 ks32 xaf st vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 
8192, 2048}, {8192, 8192, 2048}, {64, 16, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.45404e+06, -177745, -43504.6, 259730, 2.40026e+06, 2.42483e+06, 0.652924, 0.534638, 0.729263, 1.27241, 0.00899012, 0.00717777, 0.00313284, 0.933371, 1.20556, 0.9698, 2.67914e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@96 rB wg 1x4 cab4x2 ks32 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 8, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "AI"}, "aB64+m32@96 aS32 rB wg 16x1 cb4x2 ks32 xaf vav di nmk grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {16, 1, 1}, 1, (WGType) 1, 257, 2048, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.58636e+06, -482491, -299697, 877643, 1.90054e+06, 1.73179e+06, 0.601985, 6.41237, 0.0513263, 1.10568, 0.0409021, 0.00376531, 0.0384593, 0.895641, 1.00114, 1.00052, -4.11446e-16}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB32x2+m32@64 at32x2+m32@64 rB wg 1x16 xaf st vav di sb128 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.03539e+06, 223719, 0, 0, 1.75473e+06, 2.61161e+06, 5.72432, 0.487304, 
0.347712, 0.902737, 0.0217428, 0.0032454, 0.0206116, 0.78847, 1.24817, 0.762289, 3.8041e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@128 am32 rB wg 16x2 cb3x2 ks64 af vav di sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {985303, 529193, 0, 0, 0, 0, 0.334488, 0.597794, 0.849156, 1.3743, 0.00359035, 0.00359035, 0, 0.903609, 1.40249, 1.04911, 2.49355e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m128@128 am64+m32@128 rB wg 8x4 af vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {887443, 420819, 0, 0, 0, 0, 0.443197, 0.408057, 0.55357, 1.07857, 0.00424551, 0.00424551, 0, 1, 1.4159, 1.00616, 1.44951e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am64+m128@96 rB wg 8x4 af vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {883828, 389004, 0, 0, 0, 0, 0.372711, 0.671929, 0.348909, 0.985628, 0.00656641, 0.00656641, 0, 0.979495, 1.22863, 0.98572, 8.6565e-13}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am32+m128@96 rB wg 4x8 xaf st vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, 
(LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {879427, 374769, 0, 0, 0, 0, 0.681995, 0.637561, 0.593202, 1.53194, 0.0120684, 0.0120684, 0, 0.902553, 1.05814, 0.58533, 4.64073e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@96 aB64 rB wg 8x4 cb4x2 ks64 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {979561, 775985, 0, 0, 0, 0, 0.472998, 0.448938, 0.785458, 1.5451, 0.00427182, 0.00427182, 0, 1, 1.48807, 0.994577, 1.6929e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m32@128 aB64x2 rB wg 8x4 cb4x2 ks128 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.00856e+06, 729582, 0, 0, 0, 0, 0.391803, 0.680643, 0.557725, 1.58519, 0.00733339, 0.00733339, 0, 0.974679, 1.32826, 0.97557, 9.11096e-13}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@128 aB128 rB wg 4x8 cb4x2 ks128 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.68462e+06, 614668, 0, 0, 0, 0, 0.528613, 0.557291, -0.992969, 0.00078125, 0.0113836, 0.0113836, 0, 0.96247, 1.20355, 0.897505, 1.16415e-12}}}, 
+{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aS32+S64@96 aB64+S32@96 rB wg 4x1 af vav nmk sb256 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 1, 64}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.18068e+06, 53960.8, 0, 0, 0, 0, 0.213645, 3.70034, 2.54839, 9.45781, 0.067707, 0.0150101, 0.0808417, 1, 1.00383, 0, 0}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIop"}, "at32+m128@96 am64+m64@112 rB wg 4x8 xaf st vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIop"}, "at32x2+m128@96 am64+m64@128 rB wg 8x4 xaf vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {895996, 619115, 0, 0, 5.44113e+06, 8.45414e+06, 0.385983, 0.342777, 0.788529, 1.22228, 0.00199536, 0.00199536, 0, 1, 1.60168, 1.14344, 3.24663e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m64@64 am64+m128@96 rB wg 8x4 xaf vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 
2048}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {873940, 565190, 0, 0, 4.58752e+06, 7.00416e+06, 0.375538, 0.421029, 0.747052, 1.19407, 0.00251587, 0.00251587, 0, 1, 1.69069, 1.13204, 1.62736e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIop"}, "at32+m64@64 am64+m64@96 rB wg 8x2 af vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.11662e+06, -910592, -33046.6, 1.19754e+06, 3.36527e+06, 5.0217e+06, 0.258583, 0.403911, 0.818673, 1.29478, 0.00269142, 0.000381417, 0.00261948, 0.597966, 1.74376, 1.14072, 1.43395e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am32x2+m128@64 rB wg 4x4 xaf st vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.05983e+06, -474100, -2272.62, 713748, 2.81641e+06, 3.33414e+06, 0.403748, 0.373013, 0.772197, 1.18809, 0.00404296, 0.000494942, 0.00376492, 0.513109, 1.48851, 1.00899, 2.18946e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 rB wg 4x2 xaf vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.14066e+06, -229433, -14533.6, 346037, 2.54935e+06, 2.45105e+06, 0.222742, 0.287911, 0.779924, 1.34129, 0.00482324, 
0.00413971, 0.00147387, 0.486442, 1.45855, 0.962427, 5.617e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 rB wg 2x2 xaf vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.14864e+06, -151615, -6901.62, 210708, 2.44695e+06, 2.41664e+06, 0.224008, 0.202288, 0.73347, 1.38945, 0.00597322, 0.00649637, 0.00100094, 0.864243, 1.42077, 0.800446, 1.1311e-11}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32x2+m64@96 am64+m32@128 rB wg 4x2 xaf st vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.12606e+06, -137008, -24369.3, 228077, 2.14139e+06, 1.80224e+06, 0.232306, 0.352856, 0.366483, 1.01866, 0.0096663, 0.00801051, 0.00318484, 0.842293, 1.37613, 0.921541, 4.97764e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@128 am64+m128@128 rB wg 2x2 xaf vav di sb128 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.09423e+06, -83565.2, -8479.72, 132562, 2.41664e+06, 1.75309e+06, 0.217848, 0.232009, 0.349329, 1.02701, 0.0120474, 0.012138, 0.00283781, 0.878271, 1.29462, 0.7804, 9.59757e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, 
"IAB"}, "at32+m128@32 am128+m128@128 rB wg 8x4 af vav di nmk sb128 sm sys", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 4, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.01309e+06, 225543, 0, 0, 1.84812e+06, 1.93577e+06, 0.318973, 1.6272, 0.589134, 1.33561, 0.014082, 0.00175378, 0.0216268, 1, 1.06459, 0.958519, 4.93673e-13}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32+m64@96 aB32 rB wg 8x4 cb4x2 ks64 af vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.05324e+06, 979401, 0, 0, 5.54107e+06, 7.84794e+06, 0.402313, 0.410019, 0.918697, 1.43736, 0.00226438, 0.00226438, 0, 1, 1.62189, 1.08285, 3.19321e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIop"}, "at32+m64@96 aB32 rB wg 16x2 cb4x2 ks128 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 96, 128}, {16, 2, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.04068e+06, 873177, 0, 0, 3.37183e+06, 6.29965e+06, 0.384445, 0.528477, 0.860175, 1.32814, 0.00247393, 0.00247393, 0, 0.990759, 1.57684, 1.14116, 2.03781e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AIopq"}, "at64+m64@128 aB32x2 rB wg 16x2 cb3 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 
4, 4}, {true, true, false}}, {'E', 17, {973856, 781280, 0, 0, 3.47341e+06, 4.52198e+06, 0.376801, 0.745941, 0.834762, 1.36837, 0.00306578, 0.00306578, 0, 1, 1.548, 1.17568, 1.37045e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AI"}, "at32x2+m64@64 aB32x2 rB wg 8x2 cb4 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 4, 4}, {true, true, false}}, {'E', 17, {1.76235e+06, -528635, -346576, 928208, 2.73039e+06, 3.28253e+06, 0.411331, 0.407059, 0.82943, 1.26791, 0.00365219, 0.000634094, 0.00340305, 0.949468, 1.57605, 1.11643, 2.18464e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AI"}, "at32+m64@32 aB64 rB wg 8x1 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 64, 64}, {8, 1, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 4, 4}, {true, true, false}}, {'E', 17, {1.72793e+06, -260474, -161085, 447531, 2.31834e+06, 2.89178e+06, 0.238818, 0.354785, 0.799686, 1.20956, 0.00460907, 0.00312961, 0.00215107, 0.989106, 1.46994, 1.02912, 3.02074e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AI"}, "at64+m64@64 aB32x2 rB wg 4x2 cb4x2 ks64 xaf st vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 32, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 4, 4}, {true, true, false}}, {'E', 17, {1.84304e+06, -196719, -186130, 412502, 2.21921e+06, 2.048e+06, 0.286789, 0.263596, 0.436547, 1.25452, 0.00648641, 0.00488563, 0.00263508, 0.940889, 1.37843, 
0.889564, 5.23147e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at32x2+m64@96 aB64 rB wg 4x2 cb4x2 ks128 af vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 128}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.63724e+06, -117354, -138376, 322423, 1.85139e+06, 1.3271e+06, 0.217482, 0.358374, -0.173415, 0.826585, 0.00954542, 0.00723565, 0.00380699, 0.869435, 1.30653, 0.807528, 6.89227e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "#AI"}, "at64+m64@96 aB64x2 rB wg 2x2 cb4x2 ks64 xaf vav di sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 64}, {2, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {1, 4, 4}, {true, true, false}}, {'E', 17, {1.65053e+06, -99807.1, -68122.5, 193337, 2.53952e+06, 1.70394e+06, 0.244079, 0.25177, 0.0702585, 0.887446, 0.0117307, 0.00844281, 0.00495289, 0.997313, 1.45603, 0.945166, 4.40941e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#ABI"}, "at32x2+m128@32 aB64+m32@64 rB wg 4x2 af vav di nmk sb128 sm sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 8, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.2871e+06, -132026, -84694.7, 251812, 2.17989e+06, 1.66871e+06, 0.26071, 0.702238, 0.035417, 1.12604, 0.0101197, 0.0087237, 0.010893, 1, 1.29244, 0.838716, 2.17924e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#AI"}, "at32+m64@96 
aB128 rB wg 1x4 af vav di sb128 sm dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 128}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABopqI"}, "am64+m64@128 av64+m64@128 rS vav di sys grf256 af wg 8x4 sb512 sm", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}}, +{{'F', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "oI"}, "aS32 aS32 rB sys grf256 cab2 wg 4x4 ek l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {1, 1, 4}, {true, true, false}}, {'E', 17, {1.16401e+06, 348644, 0, 0, 0, 0, 0.807306, 0.892675, 0.990554, 1.4802, 0.00939438, 0.000733543, 0.0109328, 0.899502, 1.01113, 1.00523, 2.16142e-14}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB8+B8@8 rB nse di wg 4x8 sb256 kc8", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, false}}, {'W', 1, {1024}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB16+m8@32 aS32+m16@40 rB wg 4x4 kc16 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.08792e+06, 260070, 0, 0, 0, 0, 1.27159, 2.25336, 0.633711, 1.35704, 0.0632943, 0.00105479, 0.0694168, 0.543903, 1.15915, 0.195161, 2.93818e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aB8x2+B8@24 am/S16+S32@32 rB wg 4x8 kc8 nse di sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {933004, 470490, 0, 0, 0, 0, 2.49562, 3.97982, 0.810184, 1.38841, 0.0630776, 0.0630776, 0, 1, 1.22055, -0.309162, 2.6504e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, "AB"}, "am16+m16@64 am/S32+m32@64 rB wg 4x8 kc16 nse di sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {895520, 534803, 0, 0, 0, 0, 1.20137, 3.44786, 1.88791, 3.67536, 0.0747548, 0.0747548, 0, 1, 1.37174, 0.989936, 2.27372e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, 256, 256}, {8, 8, 1}, "AB"}, "am16+m32@64 am/S32x2+m16@32 rB wg 2x16 kc16 nse di sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {938708, 543839, 0, 
0, 0, 0, 2.39026, 3.22437, 2.56416, 5.86786, 0.101672, 0.101672, 0, 0.999145, 1.33995, 1.00257, 1.83338e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 rB wg 1x4 kc8 nse sb256 sn", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, "AB"}, "am16x2+m32@16 aS32+m16@32 rS wg 1x4 kc16 nse sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {4, 16, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.15385e+06, 54677.4, 0, 0, 0, 0, 4.66453, 0.94861, 2.51588, 5.10353, 0.11684, 0.263033, 0.0520785, 1, 1.35189, 1.13162, 7.09638e-15}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4+B8@32 aS8+S1,16@32 rB nse wg 4x8 kc8 sn sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {877281, 856555, 0, 0, 6.69286e+06, 0, 1.87833, 2.50913, 0.836287, 1.45578, 0.0625082, 0.0625082, 0, 1, 1.00952, 0.859121, 3.28697e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 rB wg 2x8 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 
8192, 1024}, {8192, 8192, 1024}, {64, 16, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@8 aS8x2+S16@8 rB wg 2x4 kc8 nse sb64 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.22775e+06, -292493, -31491.9, 424162, 3.35872e+06, 0, 1.38608, 1.51352, 0.865001, 1.65146, 0.0625896, 0.0512516, 0.0196923, 0.635005, 1.1356, -0.568771, 2.52473e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "B"}, "aB8x2+B8@16 am/S16+S8@24 rB wg 2x2 kc8 nse nmk sb256 sn grf256 l8 k32", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 8, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.20305e+06, -132301, -19617.3, 193485, 2.5387e+06, 0, 0.832904, 3.59755, 0.684047, 1.44642, 0.0626258, 0.0593555, 0.0162245, 1, 1.17502, 0.548457, 1.08697e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "A"}, "am8x2+S8@24 aB16+S8@32 rB wg 8x1 kc8 nse nmk sb256 l8", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 1, 16}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.2083e+06, -79687.9, -19989.3, 150722, 1.64659e+06, 0, 0.822546, 29.7452, 2.30501, 5.26227, 0.521206, 0.152686, 0.118741, 0.333333, 1.13241, 0, 0}}}, 
+{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, "AB"}, "am16x2+m16@16 aS16+m16@16 rB wg 1x16 kc16 nse sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.11666e+06, -375280, -55079.8, 590017, 2.65421e+06, 0, 6.85937, 1.10986, 0.561901, 1.18544, 0.0718347, 0.0118681, 0.0644105, 0.645395, 1.19384, 0.859778, 5.37885e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, "A"}, "am16x2+S32@16 aS32+S16@32 rS wg 1x4 kc16 nse sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {4, 16, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.21263e+06, -101219, -23787.5, 154376, 2.3765e+06, 0, 4.80141, 0.952132, 2.62236, 5.17717, 0.130559, 0.263753, 0.0519091, 1, 1.25383, 1.14072, -8.09176e-14}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+S32@24 aS8+S16@32 rS wg 1x8 kc8 nse sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {1, 16, 8}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.16307e+06, -117224, -49945.9, 211601, 2.21184e+06, 0, 27.752, 0.924823, 7.95811, 19.7907, 0.498836, 0.333805, 0.307337, 1, 1.19347, 0, 0}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB16+m16@8 aS16+m32@8 rB wg 2x4 kc16 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, 
{32, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.22732e+06, -192672, -38121.5, 322384, 3.32595e+06, 0, 2.49763, 1.54907, 0.628927, 1.39358, 0.0627876, 0.0504407, 0.0208002, 0.589308, 1.19034, 0.60621, 4.79866e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB16+m16@32 aS16+m16@32 rB wg 2x4 kc16 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 8, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.24447e+06, -148653, -56383, 266371, 2.85901e+06, 0, 1.66468, 2.34448, 0.802059, 1.33917, 0.0632256, 0.0464224, 0.0241823, 0.804536, 1.32602, 0.945203, 1.61386e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "AB"}, "am8+B8@8 at8x2 rB wg 4x1 kc8 nse di nmk sn sb32 l8", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 4, 8}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {1024, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB16+m8@24 aS16+m16@24 rB wg 1x2 kc16 nse di sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 16, 16}, {1, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.19256e+06, -99702.7, -5250.1, 135470, 4.21888e+06, 0, 0.924905, 1.15646, 0.601147, 1.43162, 0.0628943, 0.0635026, 0.0173549, 1, 1.36155, 1.02238, 5.68694e-12}}}, +{{'F', "ugemm", {"S", 
"S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1023, -1, -1}, {1, 1, 1}, "AB"}, "aB8+m8@16 aS16+m8@24 rB wg 1x4 kc8 nse di sb256 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 8, 16}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.21899e+06, -88481.9, -25272.1, 146166, 3.03923e+06, 0, 1.52455, 1.45603, 0.792316, 1.37923, 0.0633789, 0.0507551, 0.0245627, 1, 1.38276, 0.904775, 1.71039e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8+m8@16 aB16+m16@16 rB wg 4x4 kc8 nse di sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "AB"}, "am16+m16@64 am16+m32@64 rB wg 4x4 kc16 nse di sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.10359e+06, 253001, 0, 0, 0, 0, 1.23463, 4.97057, 1.98081, 3.72207, 0.0797284, 0.00775855, 0.0752955, 0.909553, 1.43847, 0.946385, 6.92721e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am16+m32@32 am16+m16@32 rB wg 2x8 kc16 nse di sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 2, 16}, {2, 8, 1}, 1, 
(WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.09369e+06, 252258, 0, 0, 0, 0, 1.95015, 5.48836, 2.57025, 5.95009, 0.14056, 0.0114998, 0.12796, 0.974789, 1.2964, 0.960056, 4.31831e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am8x2+B8@8 am16x2+S16@8 rB wg 4x8 kc8 nse di sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {869157, 563214, 0, 0, 0, 0, 2.39412, 4.22276, 0.853271, 1.30754, 0.0725087, 0.0725087, 0, 1, 1.25715, 0.942426, 4.35208e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am8+m8@32 am16x2+m16@16 rB wg 2x8 kc8 nse di sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.09546e+06, 254169, 0, 0, 0, 0, 1.63097, 3.84728, 1.50643, 2.37923, 0.0742032, 0.0154716, 0.0594372, 0.827878, 1.42273, 0.913754, 1.12977e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@ABpq"}, "aB8/4x2+m8@28 aB8/4x2+m8@28 rP nse wg 4x8 sb256 kc8 grf256", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {866662, 832973, 0, 0, 6.62733e+06, 0, 2.31399, 2.18462, 0.879335, 1.58093, 0.0624816, 0.0624816, 0, 1, 1.01086, 1.00539, 3.62849e-14}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8x2+m16@8 
aB8+m8@16 rB wg 4x8 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {882165, 581951, 0, 0, 4.12713e+06, 0, 2.40327, 3.47719, 0.766058, 1.43151, 0.062663, 0.062663, 0, 1, 1.01425, 1.00462, 1.16371e-13}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "B"}, "aB16/8x2+B8@24 am8x2+B8@24 rB wg 4x4 kc8 nse sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {-9.04747e+06, 584317, 1.02639e+07, -265146, 4.2025e+06, 0, 1.64736, 1.39786, 0.778015, 1.45273, 0.0625854, -3.41849e-05, 0.0630026, 0.438401, 1.03752, 0.690802, 4.57945e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8x2+m8@8 am8x2+m16@8 rB wg 2x4 kc8 nse sb32 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.30059e+06, -298431, -46736.1, 440498, 3.35053e+06, 0, 1.50565, 1.62089, 0.886099, 1.64882, 0.0625893, 0.0488024, 0.0214838, 0.619159, 1.12794, 0.304184, 1.13643e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8x2+m8@16 aB8x2+m16@16 rB wg 2x2 kc8 nse sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.21807e+06, -209850, -17703.2, 281411, 4.47283e+06, 0, 
1.03886, 1.08671, 0.844744, 1.64697, 0.0626487, 0.0640654, 0.0133332, 0.981443, 1.14832, 0.194753, 1.9432e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "am8x2+m8@16 am8+m8@24 rB wg 1x4 kc8 nse sb16 l8", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18328e+06, -79905.7, -7917.22, 125829, 3.2768e+06, 0, 1.22836, 1.29337, 0.531332, 1.37249, 0.0719463, 0.0581012, 0.0201665, 0.709035, 1.34298, 0.803171, 5.21261e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "AB"}, "aB8x2+m8@16 aB16+m8@24 rB wg 2x2 kc8 nse nmk sb32 grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 8, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.16702e+06, -121012, -8196.35, 178120, 2.6026e+06, 0, 1.04823, 5.51776, 0.693341, 1.46543, 0.0628152, 0.058718, 0.0176573, 0.998417, 1.16326, 0.364617, 1.16605e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "B"}, "aB8x2+S8@24 at8x2+S32@24 rB wg 4x1 kc8 nse nmk sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 1, 8}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.1701e+06, -71179.7, -3197.37, 120149, 2.58458e+06, 0, 0.911003, 23.7279, 3.34366, 10.235, 1.37481, 0.288391, 0.23596, 0.333333, 1.09898, 0, 0}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AB"}, "am8x2+m32@8 aB8x2+m8@8 rB wg 1x4 kc8 nse sb256 l8", {16, (LoopType) 
255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, "AB"}, "am16x2+m16@16 aB8x2+m8@24 rS wg 1x8 kc8 nse sb256 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {4, 16, 16}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.26743e+06, -130651, -75631.7, 238106, 2.415e+06, 0, 13.4604, 0.983422, 2.77777, 5.15729, 0.127127, 0.144409, 0.0797807, 1, 1.29538, 1.06434, 1.41696e-13}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, "AB"}, "at8+m16@8 aB16x2+m16@8 rS wg 1x8 kc8 nse sb256", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {1, 16, 16}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.10613e+06, -79156.7, -1736.09, 153649, 2.00704e+06, 0, 47.8566, 0.896699, 10.8519, 21.2763, 1.082, 0.293987, 0.241183, 0.333333, 1.10378, 0, 0}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8x2+m8@8 am8x2+m16@16 rB wg 2x2 kc8 nse sb64 grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 16, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 
1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "AB"}, "aB8+m8@8 aB32+m32@8 rB wg 8x1 kc8 nse di nmk sb32 grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 4, 32}, {8, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18493e+06, -151099, 5931.1, 262767, 2.36749e+06, 0, 0.843802, 28.5958, 1.39394, 2.57371, 0.107758, 0.0474906, 0.0797103, 0.994029, 1.25885, 0.690983, 1.3586e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at8x2+S1,8@56 am/S8x2+S16@56 rB wg 4x8 kc8 nse di sb32 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {900022, 588464, 0, 0, 0, 0, 2.11582, 2.1833, 0.547862, 1.23354, 0.0626136, 0.0626136, 0, 1, 1.1599, -0.725344, 3.06459e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at8x2+m32@24 am/S16+m8@32 rB wg 8x4 kc8 nse di sb256 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {893971, 565638, 0, 0, 0, 0, 2.10566, 2.60265, 0.8239, 1.43017, 0.072004, 0.072004, 0, 0.497868, 1.18996, 0.90853, 4.46328e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at16+m16@64 am/S16x2+m32@48 rB wg 4x4 kc16 nse di sb256 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 
512}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.30686e+06, 256588, 0, 0, 0, 0, 1.11607, 1.55713, 0.941507, 2.42007, 0.0726522, 0.00111933, 0.0659252, 0.759172, 1.32447, 0.944187, 7.04547e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at8x2+S8@56 am/S16x2+S16@48 rB wg 4x8 kc8 nse di sb32 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {892566, 531408, 0, 0, 0, 0, 1.946, 3.76473, 1.95506, 3.63902, 0.0803044, 0.0803044, 0, 1, 1.40954, 0.984594, 4.35399e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at16x2+S16@48 at32+S16@64 rB wg 2x16 kc16 nse di sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {880883, 514420, 0, 0, 0, 0, 4.6228, 3.84817, 2.57586, 5.80879, 0.169176, 0.169176, 0, 1, 1.15837, 0.943093, 3.07402e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aS16+S1,16@32 rB nse wg 4x8 kc8 sm sn sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {935084, 896033, 0, 0, 6.17677e+06, 0, 2.71042, 2.12983, 0.828082, 1.44356, 0.0625855, 0.0625855, 0, 1, 1.01199, 1.00613, 6.43507e-14}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, 
"aS8x2+S1,8@16 aS16x2+S1,16@40 rB wg 4x4 kc8 nse sb32 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.53273e+06, -601303, -215849, 889641, 4.35814e+06, 0, 2.13947, 1.867, 0.799054, 1.46981, 0.062662, 9.4614e-05, 0.0633035, 0.43207, 1.15029, -0.297015, 2.33734e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Bp"}, "aS8x2+S1,16@32 am/S8x2+S1,8@24 rB wg 4x2 kc8 nse sb32 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 8}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.20785e+06, -377725, 50285.1, 535170, 6.62733e+06, 0, 1.42372, 1.15814, 0.840145, 1.59798, 0.062656, 0.0523573, 0.0192386, 0.636211, 1.17551, -0.657316, 3.00397e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "at8x2+S1,8@24 am/S8x2+S16@24 rB wg 2x4 kc8 nse sb64 sm sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.3151e+06, -199624, -53740.1, 339095, 3.22765e+06, 0, 1.33798, 1.23385, 0.606827, 1.36406, 0.0626292, 0.0504945, 0.0206585, 0.514635, 1.23079, 0.191329, 6.88155e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "A"}, "at8x2+S1,16@24 aS16x2+S32@16 rB wg 2x2 kc8 nse sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, 
false}}, {'E', 17, {1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aS32+m16@24 aS8x2+m8@16 rB wg 2x2 kc8 nse sb256 sm sn", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 32}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.19363e+06, -102074, -12101.3, 144610, 3.60448e+06, 0, 1.02426, 0.988243, 0.546949, 1.14949, 0.116426, 0.0720828, 0.0143077, 0, 1.19811, 0.938108, 2.11398e-11}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "AB"}, "at8x2+m16@24 at8x2+m32@8 rB wg 16x1 kc8 nse nmk sb256 sm sn", {16, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 8}, {16, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18993e+06, -230103, -26635.9, 388995, 2.2528e+06, 0, 0.900793, 5.78162, 0.552809, 1.28255, 0.0627307, 0.0602325, 0.0232779, 1, 1.21284, 0.921396, 2.8065e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, "AB"}, "at16x2+m16@32 at16+m32@32 rB wg 16x1 kc16 nse nmk sb256 sm grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 4, 16}, {16, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, 
{1, 1, 1}, "AB"}, "at8+m16@24 aB8+m32@24 rB wg 4x1 kc8 nse nmk sb256 sm grf256", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 1, 8}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.17696e+06, -69519.2, -25169.2, 118671, 2.77463e+06, 0, 0.853223, 15.5817, 3.31287, 9.86255, 0.335914, 0.160302, 0.315473, 1, 1.14674, 0, 0}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AB"}, "at8x2+m32@8 at8x2+m8@8 rB wg 1x4 kc8 nse sb256 sm sn", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.13247e+06, -102265, -1131.66, 143220, 3.04742e+06, 0, 1.75413, 0.928621, 0.480908, 0.891062, 0.0713067, 0.0781393, 0.013656, 0.976256, 1.18801, 0.74341, 6.89459e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, "AB"}, "at16x2+S32@16 at16+S16@32 rS wg 1x4 kc16 nse sb256 sm sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {4, 16, 16}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.22164e+06, -75649.9, -28431.3, 129898, 2.46579e+06, 0, 4.47812, 0.846859, 2.59041, 5.08414, 0.0900204, 0.188187, 0.0683862, 1, 1.31716, 0.967325, 2.73803e-12}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, "AB"}, "am32+m32@32 at8x2+m16@24 rS wg 1x8 kc8 nse sb256 sn grf256 l8", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {1, 16, 32}, {1, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.26572e+06, -129216, -67284, 
239862, 2.15941e+06, 0, 24.5032, 0.852871, 5.82167, 16.1942, 0.788258, 0.295269, 0.225216, 0.354473, 1.07957, 0, 0}}}, +{{'F', "ugemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aB8/4+B8@32 rU nse di wg 8x4 kc8 sm sb256 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+B8@32 am16 rB wg 4x8 cb4x2 ks32 af vav di sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {991490, 510308, 0, 0, 0, 0, 1.21713, 3.27426, 1.0535, 2.33084, 0.0217767, 0.0217767, 0, 0.980474, 1.39168, 0.919941, 4.31334e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+B16@32 am16+S1,16@32 rB wg 8x4 af vav di sb256 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {887182, 626290, 0, 0, 0, 0, 1.11701, 2.20089, 0.753196, 1.51546, 0.00982405, 0.00982405, 0, 0.986514, 1.51371, 1.10395, 2.7046e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "am16+m16@20 am16+m16@20 rB wg 4x8 xaf st rr vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, 
(LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {870519, 757436, 0, 0, 7.0656e+06, 9.74029e+06, 1.61519, 1.51882, 0.777223, 1.2159, 0.00794099, 0.00794099, 0, 1, 1.52399, 1.13177, 3.83469e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "am16+B8@32 am32+m16@32 rB wg 4x8 af rr vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {881335, 689481, 0, 0, 4.45891e+06, 7.58579e+06, 1.46799, 1.88849, 0.735162, 1.19884, 0.00844968, 0.00844968, 0, 1, 1.56029, 1.15624, 3.29649e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIv"}, "am8+m8@12 am16+m16@12 rB wg 4x4 xaf st rr vav sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.05058e+06, -905176, 36068, 1.20994e+06, 4.27049e+06, 5.44768e+06, 0.832982, 1.70307, 0.823027, 1.31887, 0.00894305, 0.0016304, 0.00826202, 0.641064, 1.55717, 1.18753, 2.39955e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "am8+m8@32 am32x2+m16@32 rB wg 4x4 cb3 ks32 xaf rr vav sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.495e+06, -552515, -223241, 832726, 2.84426e+06, 3.87482e+06, 1.28529, 1.52068, 0.884565, 1.51616, 0.0123021, 0.000807932, 
0.0129924, 0.76811, 1.53199, 1.09418, 3.34525e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16x2+m16@16 am16x2+m16@16 rB wg 4x2 xaf st vav sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.151e+06, -214932, -14362.1, 332485, 3.12934e+06, 2.83443e+06, 0.856861, 1.31774, 0.784444, 1.33614, 0.0178404, 0.015689, 0.00568798, 0.760968, 1.36233, 0.935131, 1.65223e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8x2+m8@16 am16+m16@16 rB wg 4x2 xaf rr vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.12171e+06, -139486, -7826.11, 242937, 2.58048e+06, 2.09469e+06, 0.974006, 1.18793, 0.621158, 1.05288, 0.0236282, 0.0192689, 0.0100126, 0.991025, 1.34534, 0.903336, 1.42014e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+m16@24 am16+m16@24 rB wg 4x2 xaf vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.11883e+06, -113128, -26047.6, 208183, 2.39206e+06, 1.99066e+06, 0.913668, 1.55697, 0.353875, 1.00409, 0.0354029, 0.0247767, 0.0164062, 0.998102, 1.16477, -0.223884, 1.06064e-10}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+m8@24 am16+m16@24 rB wg 2x2 
xaf st rr vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.09769e+06, -57030.9, -6352.25, 105284, 2.80166e+06, 1.9882e+06, 0.893798, 1.03645, 0.350574, 1.01725, 0.0463755, 0.0412808, 0.0144535, 0.996262, 1.24046, 0.915099, 1.40089e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB8 aB8+m16@4 rB wg 8x4 cab4 ks16 xaf fx vav dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 131072, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.06032e+06, 1.25638e+06, 0, 0, 6.01702e+06, 9.45357e+06, 1.82432, 1.57187, 0.889071, 1.36268, 0.00796522, 0.00796522, 0, 1, 1.70167, 1.14548, 4.12059e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8x2+m16@32 aB8+m16@32 rB wg 16x2 cb4x2 ks16 af vav dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {977656, 1.27269e+06, 0, 0, 3.98541e+06, 5.43949e+06, 1.37334, 2.37069, 0.781236, 1.33861, 0.00946734, 0.00946734, 0, 1, 1.54411, 1.01385, 7.50807e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8+m8@28 aB16+m16@28 rB wg 4x4 cab3 ks16 xaf rr vav dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {2.16525e+06, 
-543645, -553184, 1.15734e+06, 2.72302e+06, 3.75194e+06, 1.0933, 1.05267, 0.739246, 1.30453, 0.0123325, 0.00114474, 0.01269, 0.796222, 1.65248, 1.10646, 3.16139e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "aB8+m16@4 aB8 rB wg 4x2 cb3 ks16 af vav sm dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.86546e+06, -231426, -190573, 455448, 2.80986e+06, 2.85082e+06, 1.05281, 1.89266, 0.768987, 1.11533, 0.017914, 0.0104947, 0.00956962, 0.99803, 1.51931, 1.03564, 3.16307e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "aB8+m16@20 aB8 rB wg 4x2 cb4 ks16 af vav dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.95228e+06, -155772, -209098, 393418, 2.48218e+06, 2.2528e+06, 1.4244, 1.53085, 0.312276, 1.12087, 0.0242192, 0.00627943, 0.0193567, 1, 1.40693, 0.960855, 4.73398e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8+m8@16 aB16+m16@16 rB wg 4x2 af rr vav sb32 dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.5179e+06, -146559, -113822, 345656, 2.31834e+06, 2.07258e+06, 0.765268, 2.39027, -0.0267412, 0.770134, 0.0358339, 0.0156392, 0.0233713, 1, 1.33018, 0.794846, 1.92976e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB16 aB16x2+m16@20 rB wg 2x4 ca4x2 ks16 xaf st rr vav dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.58566e+06, -116017, -144663, 284746, 2.40845e+06, 1.7367e+06, 1.41636, 1.39229, -0.840332, 0.397168, 0.0479738, 0.025812, 0.0299484, 1, 1.37335, 0.964517, 6.85737e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am16+B16@32 at16+S16@32 rB wg 4x8 af vav di sb256 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, false}}, {'E', 17, {870199, 642863, 0, 0, 0, 0, 1.58315, 2.95571, 0.763338, 1.48553, 0.0107117, 0.0107117, 0, 0.95453, 1.55818, 1.10822, 3.1594e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am8+B8@32 at8+S16@32 rB wg 4x8 af vav di sb256 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, false}}, {'E', 17, {888363, 535816, 0, 0, 0, 0, 1.88142, 2.12019, 0.619881, 1.23713, 0.01468, 0.01468, 0, 0.990366, 1.42663, 0.981831, 3.45631e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 1, 1}, "ABIp"}, "am8x2+m16@32 at8x2+m8@32 rB wg 4x8 xaf rr vav sb32 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {64, 4, 4}, {true, true, false}}, {'E', 17, {876569, 665985, 0, 
0, 7.66198e+06, 9.6256e+06, 1.95542, 2.14536, 0.79812, 1.24152, 0.00817485, 0.00817485, 0, 0.994621, 1.61371, 1.13434, 6.6789e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+B8@32 am16 rB wg 4x8 cb3x2 ks16 xaf vav grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.0371e+06, 775586, 0, 0, 5.70163e+06, 9.92051e+06, 1.72964, 1.82897, 0.886229, 1.47344, 0.00841793, 0.00841793, 0, 1, 1.49653, 1.11416, 4.04426e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+m16@16 am8x2 rB wg 4x4 cb4x2 ks16 af rr vav sm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.59272e+06, -978725, -272730, 1.30124e+06, 4.09928e+06, 5.79174e+06, 1.05622, 2.23639, 0.894523, 1.42815, 0.0092523, 0.000785124, 0.00895202, 0.63727, 1.5504, 1.17763, 2.80357e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+m8@12 am8 rB wg 4x4 cb3x2 ks16 af rr vav grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.41278e+06, -475207, -184854, 746006, 2.8631e+06, 3.96493e+06, 1.41005, 1.64502, 0.846753, 1.524, 0.0124003, 0.00247552, 0.010916, 0.71204, 1.5023, 1.11529, 2.55063e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, 
"ABI"}, "am8+m8@8 am16x2 rB wg 4x2 cb4x2 ks16 xaf st vav sm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.44977e+06, -229432, -91996.8, 360967, 2.74432e+06, 2.83443e+06, 0.994644, 1.82033, 0.912465, 1.26725, 0.0177885, 0.0113898, 0.00819279, 0.994278, 1.3075, 1.0341, 2.81883e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+m8@28 am16 rB wg 4x2 cb4 ks32 af rr vav grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 32, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.4412e+06, -158604, -93384.8, 269786, 2.49856e+06, 2.16596e+06, 1.33045, 1.56869, 0.614107, 1.13068, 0.0237811, 0.0192942, 0.00773208, 0.891613, 1.28247, 0.989234, 3.90583e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+m16@20 am8 rB wg 4x2 cb3x2 ks16 xaf st rr vav grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 6144, 0, {8, 8, 4}, {true, true, false}}, {'E', 17, {1.32644e+06, -117029, -77022.1, 217651, 2.50675e+06, 2.08077e+06, 1.10942, 1.87029, 0.445393, 1.19202, 0.0356602, 0.0240091, 0.0150978, 0.985953, 1.20643, 0.660809, 3.5073e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+m8@16 at8x2+m16@16 rB wg 2x2 xaf st vav sb32 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, 
{8, 8, 4}, {true, true, false}}, {'E', 17, {1.10873e+06, -58922.2, -6912.85, 107670, 2.74432e+06, 1.9415e+06, 1.25976, 1.28769, 0.338524, 1.00538, 0.0495596, 0.0306422, 0.0206077, 0.892755, 1.22765, 0.843988, 1.99783e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8+m16@28 at8+m8@28 rB wg 8x4 cab3 ks16 xaf fx vav sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.07574e+06, 844468, 0, 0, 5.65658e+06, 9.57645e+06, 2.10088, 1.94561, 0.92383, 1.37803, 0.0081964, 0.0081964, 0, 1, 1.53929, 1.14878, 3.24806e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABIp"}, "aB8+m8@16 at8+m8@16 rB wg 8x2 cb3 ks16 xaf vav sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 64, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.58768e+06, -901185, -260664, 1.27074e+06, 3.35053e+06, 6.40614e+06, 1.37734, 2.09365, 0.9199, 1.36339, 0.00975575, 0.00208833, 0.0089245, 0.67917, 1.64797, 1.17287, 3.36617e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB8 at16+m8@12 rB wg 2x4 ca4x2 ks8 xaf st rr vav sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.57761e+06, -234672, -123306, 399103, 2.5813e+06, 2.71155e+06, 1.27743, 2.07431, 0.659144, 1.18966, 0.01863, 0.0128349, 0.00792373, 0.900376, 1.36382, 1.06382, 2.33814e-12}}}, +{{'F', "ugemm", 
{"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8+m8@8 at8+m8@8 rB wg 1x4 xaf rr fn nmk vav sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 16, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18347e+06, -130993, -1022.14, 203731, 2.62144e+06, 2.65421e+06, 1.14288, 1.05988, 0.674346, 1.45385, 0.0233347, 0.0163703, 0.00886945, 0.989082, 1.38519, 0.90064, 2.11921e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB8+B8@12 at8+m8@12 rB wg 2x2 xaf st vav sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 16, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.22438e+06, -85563.8, -17080.9, 152837, 2.89997e+06, 2.13811e+06, 1.13375, 1.65157, 0.529939, 1.09905, 0.0356927, 0.0236702, 0.0148759, 0.98034, 1.32898, 0.869925, 2.23293e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8x2+m8@16 at8x2+m8@16 rB wg 2x2 xaf st rr vav sb32 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {16, 16, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.18837e+06, -58547.5, -17231.8, 121783, 2.69517e+06, 1.93331e+06, 1.27086, 1.21231, 0.268122, 0.902316, 0.0486346, 0.0404013, 0.0158949, 0.936826, 1.27881, 0.949937, 1.10074e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8+m16@32 am8 rB wg 16x2 cb3x2 ks32 af vav di sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 
0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.02713e+06, 545120, 0, 0, 0, 0, 1.33793, 2.52449, 0.809498, 1.39207, 0.0123313, 0.0123313, 0, 1, 1.35782, 1.10413, 2.51062e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8+m32@32 am16+m8@32 rB wg 8x4 af vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {884492, 432577, 0, 0, 0, 0, 1.76508, 1.6272, 0.54562, 0.981819, 0.0170086, 0.0170086, 0, 1, 1.26826, 0.98644, 2.49628e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8+m16@24 am16+m32@24 rB wg 8x4 af vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {876300, 396941, 0, 0, 0, 0, 1.49136, 2.69242, 0.343495, 0.978292, 0.0258616, 0.0258616, 0, 1, 1.16974, 0.937499, 2.10134e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8+m16@24 am8+m32@24 rB wg 4x8 xaf st vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {887348, 379545, 0, 0, 0, 0, 2.73455, 2.54363, 0.600631, 1.54448, 0.0445874, 0.0445874, 0, 1, 1.14882, -0.188859, 1.45784e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8+m16@24 aB16 rB wg 8x4 cb4x2 ks16 xaf vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {982704, 739459, 0, 0, 0, 0, 1.88943, 1.80702, 0.748656, 1.56147, 0.0163717, 0.0163717, 0, 1, 1.40964, 1.004, 2.5724e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8+m8@32 aB16x2 rB wg 8x4 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {990777, 696161, 0, 0, 0, 0, 1.57625, 2.76566, 0.574062, 1.70951, 0.027591, 0.027591, 0, 1, 1.17736, 0.949533, 2.24305e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8+m16@32 aB32 rB wg 4x8 cb4x2 ks32 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {991067, 667403, 0, 0, 0, 0, 2.32107, 2.27078, 0.600535, 2.15725, 0.0413329, 0.0413329, 0, 1, 1.15505, -0.471853, 2.01057e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABIp"}, "at8+m32@24 am16+m16@28 rB wg 4x8 xaf st vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {64, 40, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {889895, 713074, 0, 0, 
6.02931e+06, 1.12312e+07, 2.11653, 1.73065, 0.848984, 1.28, 0.00816471, 0.00816471, 0, 1, 1.55007, 1.13556, 3.63302e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABIp"}, "at8x2+m32@24 am16+m16@32 rB wg 8x4 xaf vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {898482, 632398, 0, 0, 5.24288e+06, 9.7321e+06, 1.68469, 1.5304, 0.790341, 1.22016, 0.00802145, 0.00802145, 0, 1, 1.75052, 1.14461, 3.6943e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8x2+m16@16 am16+m32@24 rB wg 8x4 xaf vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {891936, 567469, 0, 0, 4.63585e+06, 7.4711e+06, 1.83757, 1.87916, 0.755879, 1.18158, 0.0100832, 0.0100832, 0, 1, 1.42535, 1.09302, 3.53176e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABIp"}, "at8+m16@16 am16+m16@24 rB wg 8x2 af vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 64, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.13949e+06, -897413, -32590.8, 1.18643e+06, 3.76832e+06, 6.70106e+06, 1.00553, 1.66047, 0.814792, 1.30163, 0.0103335, 0.00163025, 0.0103806, 0.833414, 1.44587, 1.08563, 5.68178e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at16+m16@24 am8x2+m32@16 rB 
wg 4x4 xaf st vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.0632e+06, -467639, 10417.6, 710888, 3.35872e+06, 3.57171e+06, 1.81304, 1.61773, 0.776717, 1.18564, 0.015991, 0.00183222, 0.0181941, 0.836065, 1.39175, 0.968008, 4.37129e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at16+m16@24 am16+m16@24 rB wg 4x2 xaf vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.16404e+06, -213766, -23189.3, 331208, 3.1703e+06, 2.95731e+06, 1.03006, 1.3373, 0.782341, 1.33751, 0.0200369, 0.0175885, 0.00684884, 0.845273, 1.31354, 0.766008, 2.32049e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at16+m16@24 am16+m16@24 rB wg 2x2 xaf vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {32, 32, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.12531e+06, -125731, -141.335, 186556, 3.072e+06, 2.61325e+06, 0.937134, 0.827271, 0.726022, 1.38801, 0.0242967, 0.0252145, 0.00563579, 0.936659, 1.28299, 0.871341, 1.76728e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at8x2+m16@24 am16+m8@32 rB wg 4x2 xaf st vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, 
true, false}}, {'E', 17, {1.15244e+06, -115879, -36339.6, 208099, 2.41664e+06, 2.00376e+06, 0.923207, 1.40591, 0.363361, 1.02336, 0.0391334, 0.0349888, 0.0147314, 0.897811, 1.25588, 0.905559, 1.261e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABI"}, "at16+m16@32 am16+m32@32 rB wg 2x2 xaf vav di sb32 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, false}}, {'E', 17, {1.12145e+06, -60602.7, -9691.85, 110113, 2.78528e+06, 2.02342e+06, 0.876923, 0.929169, 0.3496, 1.0308, 0.046609, 0.0524722, 0.0105972, 1, 1.21602, 0.894099, 1.36788e-11}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@32 aB8x2 rB wg 16x2 cb3 ks16 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.00138e+06, 734096, 0, 0, 4.07552e+06, 4.85786e+06, 1.57632, 3.003, 0.818111, 1.43042, 0.0123216, 0.0123216, 0, 1, 1.50818, 1.08647, 2.42674e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8x2+m16@16 aB8x2 rB wg 8x2 cb4 ks16 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 64, 16}, {8, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.73773e+06, -557674, -328650, 945507, 2.60014e+06, 3.66182e+06, 1.6105, 1.63931, 0.840027, 1.38612, 0.0140496, 0.00310004, 0.0135841, 0.899554, 1.50782, 1.06591, 2.89163e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, 
{"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8+m16@8 aB16 rB wg 8x1 cb4x2 ks16 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 64, 16}, {8, 1, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.69161e+06, -246817, -147908, 433263, 2.74432e+06, 3.16211e+06, 0.956798, 1.40211, 0.805897, 1.31444, 0.0181823, 0.0148306, 0.00804353, 1, 1.44649, 0.968708, 7.35847e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@16 aB8x2 rB wg 4x2 cb4x2 ks16 xaf st vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 32, 16}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.8509e+06, -175907, -189196, 392631, 2.74432e+06, 2.18726e+06, 1.12474, 1.04753, 0.43698, 1.42448, 0.0254728, 0.0206988, 0.0107976, 0.953996, 1.38293, 0.912126, 8.89018e-12}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8x2+m16@24 aB16 rB wg 4x2 cb4x2 ks32 af vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 32}, {4, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.64426e+06, -145120, -138637, 309031, 2.31014e+06, 2.00704e+06, 0.918257, 1.44548, 0.262092, 1.02772, 0.0377766, 0.0386483, 0.015333, 0.989041, 1.28006, -1.30164, 1.0788e-10}}}, +{{'F', "ugemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@24 aB16x2 rB wg 2x2 cb4x2 ks16 xaf vav di sm sn dm grf256 sys", {16, (LoopType) 255, 256, 
{(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {16, 16, 16}, {2, 2, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, false}}, {'E', 17, {1.65746e+06, -77290.3, -69015.6, 169032, 2.75251e+06, 1.85958e+06, 0.958255, 0.997806, 0.100098, 0.957911, 0.0468295, 0.0427724, 0.0158824, 1, 1.31247, 0.973533, 8.29974e-12}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B8@8 aB8+B8@8 rB nse di grf256 wg 4x8 kc8 sb256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 16}, {true, true, false}}, {'W', 1, {256}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aB4+B4@16 aS4x2+S1,4@16 rB wg 8x4 kc4 nse di sb256 sn xb grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 4}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.24645e+07, 1.04207e+06, 0, 0, 0, 0, 5.55127, 10.2661, 2.84493, 5.75622, 0.206162, 0.206162, 0, 0.958009, 1.69161, 1.36059, 1.9372e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tBu"}, "aB4+B4@12 aS8+m8@12 rB wg 4x8 kc4 nse sb32 sm sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {874627, 804733, 0, 0, 5.17734e+06, 1.15835e+07, 4.85352, 5.83594, 3.45513, 5.18442, 0.256338, 0.256338, 0, 1, 1.54396, 1.10663, 3.09518e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
"tABu"}, "aB4+m8@6 aS4x2+m8@6 rB wg 8x4 kc4 nse sb32 sm sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 4}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {845107, 656282, 0, 0, 3.94854e+06, 5.0135e+06, 4.07563, 10.3842, 2.58594, 4.59169, 0.258909, 0.258909, 0, 1, 1.60465, 1.09256, 3.54937e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tBu"}, "aB8+B8@4 aS4x2+m4@4 rB wg 4x4 kc4 nse sb32 sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.10457e+06, 305271, 0, 0, 2.83443e+06, 5.41491e+06, 7.68483, 8.83236, 2.82113, 4.61003, 0.259707, 0.0043142, 0.258523, 0.58838, 1.65059, 0.660053, 1.30922e-10}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aB4+m8@6 aS4x2+m16@6 rB wg 4x4 kc4 nse sb32 sm sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 4}, {4, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.16438e+06, -413992, -50164.8, 703477, 2.94011e+06, 3.18669e+06, 4.67834, 8.6051, 1.73433, 3.55631, 0.299264, 0.0126545, 0.297455, 0.723046, 1.64113, 0.988344, 5.11479e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tBu"}, "aB4x2+B4@8 aS4x2+m8@8 rB wg 2x4 kc4 nse sb32 sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 4}, {2, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, 
{1.27901e+06, -171452, -53645.4, 309236, 3.16703e+06, 3.21126e+06, 4.19559, 4.02399, 1.79936, 4.26956, 0.288744, 0.170897, 0.157815, 1, 1.72896, 1.2319, 3.20258e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tBu"}, "aB4x2+B4@4 aS8+m8@4 rB wg 2x2 kc4 nse sb32 xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 8}, {2, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.22119e+06, -106726, -14375.4, 175884, 3.34234e+06, 2.71974e+06, 4.12919, 6.56698, 2.06521, 4.72499, 0.372315, 0.163823, 0.212395, 0.802612, 1.61316, 1.46102, 4.59442e-12}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aB4+m4@4 aS4+m4@4 rB wg 1x4 kc4 nse sb32 sm xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 4, 4}, {1, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.2221e+06, -74414.7, -22228.7, 139670, 3.12115e+06, 2.048e+06, 5.66368, 5.54845, 0.646465, 2.98794, 0.516121, 0.146348, 0.381201, 0.979484, 1.45781, 0.884651, 6.49504e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Bu"}, "aB8+B8@6 aS4x2+m16@6 rB wg 4x2 kc4 nse sb32 sm xb grf256 ska", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 8}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.28379e+06, -177869, -63962.6, 371163, 3.09658e+06, 3.04742e+06, 4.0385, 8.91089, 0.867202, 3.69585, 0.275117, 0.183379, 0.127637, 0.954717, 1.84083, 1.42173, 2.20814e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ABu"}, "aB16+m8@12 aS8+m8@12 rB wg 2x4 kc8 nse sb32 xb grf256 sn", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {8, 16, 16}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.33985e+06, -186686, -65711.1, 389568, 3.32595e+06, 3.1703e+06, 6.01278, 3.7883, 0.701654, 3.40967, 0.268548, 0.179888, 0.128493, 0.995636, 1.89846, 1.52268, 9.01454e-12}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aB4+B4@12 aB4+B16@12 rB wg 8x4 kc4 nse di sb256 xa xc grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 16, 4}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.20396e+07, 1.14542e+06, 0, 0, 0, 0, 7.88184, 11.5693, 2.66146, 5.69254, 0.208472, 0.208472, 0, 0.933156, 1.64788, 1.33269, 1.3569e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Btu"}, "aB4+B8@8 aB4x2+m8@8 rB wg 4x8 kc4 nse sb32 xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {32, 16, 4}, {4, 8, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {871306, 768118, 0, 0, 6.17677e+06, 1.01663e+07, 5.01929, 4.38257, 3.49194, 5.25718, 0.256413, 0.256413, 0, 1, 1.51977, 1.09553, 3.30503e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tu"}, "aB4x2 aB4x2 rB wg 8x4 kc4 nse sb32 xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 16, 4}, {8, 4, 1}, 1, (WGType) 0, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {875588, 562723, 0, 0, 
4.57933e+06, 4.73498e+06, 5.11959, 4.86313, 2.7399, 4.60274, 0.258416, 0.258416, 0, 1, 1.56704, 1.11342, 2.9923e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tu"}, "aB8+B4@8 aB4x2 rB wg 4x2 kc4 nse sb32 xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 16, 8}, {4, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.29804e+06, -270759, -53894, 413523, 3.63151e+06, 4.83328e+06, 4.2098, 8.8199, 2.89335, 5.35417, 0.259844, 0.183329, 0.113036, 0.991824, 1.68151, 1.08541, 6.62013e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB4x2+B8@8 aB4x2+B4@8 rB wg 2x4 kc4 nse sb32 grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 8, 4}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.35923e+06, -167843, -72624.8, 360710, 3.24403e+06, 2.22822e+06, 4.4058, 4.47416, 0.858064, 3.59423, 0.274889, 0.187013, 0.131714, 0.973094, 1.77267, 1.49634, 1.22049e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aB4+m4@8 aB8+m4@8 rB wg 2x2 kc4 nse sb32 sn xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 8, 8}, {2, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.19439e+06, -104738, -7431.37, 176450, 3.58482e+06, 3.03923e+06, 3.45913, 4.79702, 1.99969, 4.67954, 0.296171, 0.149372, 0.165852, 0.973329, 1.84827, 1.49094, 1.53984e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ABu"}, "aB8+m8@8 aB4x2+m8@8 rB wg 
2x2 kc4 nse sb32 sn grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {8, 8, 8}, {2, 2, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.2276e+06, -74239.5, -23444.3, 152508, 3.1703e+06, 2.26918e+06, 4.04019, 3.71597, 1.18927, 3.08229, 0.387924, 0.20049, 0.191732, 0.872448, 1.73431, 1.32225, 2.86689e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Bu"}, "aB8+B8@6 aB4x2+m16@6 rB wg 4x2 kc4 nse sb32 sm grf256 ska", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 8, 8}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.29642e+06, -172481, -65596, 367881, 3.21126e+06, 2.73613e+06, 4.12176, 7.71716, 0.847917, 3.69619, 0.275028, 0.154896, 0.137417, 0.975914, 1.83622, 1.50005, 1.47282e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Au"}, "aB4x2+m16@6 aB8+B8@6 rB wg 2x4 kc4 nse sb32 sn grf256 skb", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {8, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.31419e+06, -194892, -72042.4, 389619, 3.08838e+06, 2.82624e+06, 7.66819, 4.05916, 0.795262, 3.55538, 0.275955, 0.18167, 0.131205, 0.985494, 1.83331, 1.20052, 4.53046e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aS8+S1,4@8 aS8+S1,4@8 rB wg 8x4 kc8 nse di sb256 sm sn xb xa xc grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.21242e+07, 
1.08543e+06, 0, 0, 0, 0, 8.55762, 12.8174, 2.82619, 5.72986, 0.206385, 0.206385, 0, 0.981135, 1.67927, 1.32972, 1.98927e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aS4x2+m4@8 aS4x2+m4@8 rB wg 8x4 kc4 nse sb32 sm sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {16, 32, 4}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {860139, 790959, 0, 0, 5.66067e+06, 1.14852e+07, 9.78408, 9.24219, 3.12994, 5.11847, 0.254414, 0.254414, 0, 1, 1.57074, 1.09716, 5.88263e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aS8x2+m8@14 aS8x2+m16@14 rB wg 8x4 kc8 nse sb32 xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {902076, 640478, 0, 0, 3.59629e+06, 7.45472e+06, 6.52891, 9.10469, 2.63457, 4.5264, 0.254863, 0.254863, 0, 1, 1.61477, 1.1729, 2.56989e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "at4x2+m4@12 aS4x2+m8@12 rB wg 8x2 kc4 nse sb32 sm xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {8, 16, 4}, {8, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.12292e+06, -668264, -32620.8, 971014, 2.82214e+06, 3.84205e+06, 4.04685, 10.0561, 1.61126, 3.71076, 0.329289, 0.0403761, 0.299635, 0.94804, 1.63451, 1.04436, 4.99301e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "at4x2+m8@8 aS4x2+m8@8 rB wg 4x2 kc4 nse 
sb32 sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {8, 16, 4}, {4, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.29594e+06, -299522, -56040, 449802, 2.84262e+06, 4.24346e+06, 3.75494, 5.80831, 1.59711, 4.43442, 0.329283, 0.0718173, 0.272959, 0.995263, 1.67414, 1.05906, 3.94134e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "at4x2+m8@10 aS8+m4@10 rB wg 2x2 kc4 nse sb32 sn xa xb xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 128}, {4096, 4096, 128}, {8, 8, 8}, {2, 2, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.21221e+06, -121137, -22610.7, 187433, 3.08019e+06, 2.27738e+06, 3.79704, 4.96892, 0.676927, 3.05081, 0.409279, 0.0687096, 0.342948, 0.990995, 1.73552, 1.30383, 1.71048e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ft"}, "aS8+S1,4@4 aB8+B16@4 rB wg 8x4 kc8 nse di sb256 sm xa xc grf256 3m", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.37427e+07, 1.19537e+06, 0, 0, 0, 0, 11.0283, 12.5713, 2.75167, 5.60455, 0.266312, 0.266312, 0, 0.967908, 1.44226, 1.30174, 1.59611e-12}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aS4+m8@10 aB4+m4@4 rB wg 8x4 kc4 nse sb32 sm sn xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 32, 4}, {8, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {856663, 756956, 0, 0, 5.47389e+06, 1.00106e+07, 
5.85557, 14.2774, 3.14894, 4.97295, 0.255634, 0.255634, 0, 1, 1.52285, 1.16241, 2.27096e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@ABtu"}, "aS4x2+m8@8 aB8x2+m8@8 rB wg 4x8 kc4 nse sb32 sm sn xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {899813, 661377, 0, 0, 3.53976e+06, 5.87366e+06, 14.3934, 3.96372, 2.55187, 4.35476, 0.257777, 0.257777, 0, 0.923675, 1.61221, 1.11115, 3.61799e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "@Atu"}, "aS8+m8@8 aB8x2+B8@8 rB wg 4x4 kc8 nse sb32 xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.0405e+06, -500235, 28828.6, 819453, 3.47341e+06, 5.09542e+06, 6.73073, 4.52253, 2.75442, 4.57448, 0.257638, 0.00642445, 0.256344, 0.629485, 1.62679, 1.09834, 4.12901e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tAu"}, "aS8+m4@8 aB4x2 rB wg 2x4 kc4 nse sb32 sm xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.2652e+06, -265227, -45997.9, 407538, 3.49635e+06, 4.77594e+06, 7.18552, 5.06579, 2.79626, 5.25854, 0.262051, 0.188468, 0.112739, 0.993569, 1.6976, 1.03364, 6.15783e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tAu"}, "aS8+m4@16 aB4x2+B8@16 rB wg 2x4 kc4 nse sb32 sm xa xc 
grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 8}, {2, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.44272e+06, -173509, -85245.4, 308614, 2.65421e+06, 2.62144e+06, 4.89627, 4.39907, 1.7601, 4.14959, 0.285741, 0.185127, 0.136625, 0.956631, 1.76617, 1.34653, 2.03215e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aS4+m4@4 aB4+m4@4 rB wg 1x4 kc4 nse sb32 sm xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 8, 4}, {1, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.21607e+06, -105747, -9289.82, 176560, 3.4431e+06, 2.23642e+06, 6.95131, 3.18464, 2.01878, 4.70577, 0.3227, 0.141888, 0.209695, 0.979343, 1.75797, 1.359, 2.31506e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "tABu"}, "aS4+m8@8 aB8+m8@8 rB wg 1x4 kc4 nse sb32 sm xa xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {16, 4, 8}, {1, 4, 1}, 1, (WGType) 1, 1, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.25685e+06, -74761.9, -31678.5, 142523, 2.81805e+06, 1.91693e+06, 5.28414, 3.5142, 0.415298, 2.76742, 0.465263, 0.304503, 0.213074, 0.981571, 1.59188, 1.25542, 1.62848e-11}}}, +{{'F', "ugemm", {"Z", "Z", "Z"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Au"}, "aS4x2+m16@6 aB8+B8@6 rB wg 2x4 kc4 nse sb32 sn grf256 skb", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 256}, {4096, 4096, 256}, {8, 16, 8}, {2, 4, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 16}, {true, true, false}}, {'E', 17, {1.34133e+06, -190628, -71300.2, 388604, 3.30138e+06, 
2.94912e+06, 9.80936, 4.02402, 0.72048, 3.48181, 0.290936, 0.196811, 0.135886, 0.963735, 1.82686, 1.36608, 2.21681e-11}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "av32+m16@128 am32+m32@128 rB wg 4x4 sys kc2 grf512 sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B16@128 aB32+B32@128 rB wg 4x4 sys kc2 dm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "av32+m16@128 at32+m16@128 rB wg 4x4 sys kc2 grf512", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B16@64 aB8+B16@64 rB wg 4x8 kc8 grf256 l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "at32+m32@128 am32+m32@128 rB wg 4x4 sys kc2 grf512 sm sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, 
(LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+B32@128 aB32+B32@128 rB wg 4x4 sys kc2 dm sm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIq"}, "am32+m32@128 av32+m16@128 rS wg 4x4 sys kc2 grf512 sm", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@128 aB32+B16@128 rS wg 4x4 sys kc2 dm sm grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"C", "C", "C"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "u"}, "aB8+B8@16 aB8+B8@16 rB kc8", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 16, 8}, {2, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, true}}, {'W', 1, {256}}}, +{{'I', "ugemm", {"D", "D", "D"}, {"A#8,8", "B8", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB8+B8@16 aB8+B8@16 rB sys grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 
1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "av32+m16@128 am32+m32@128 rB wg 4x4 sys kc2 grf512 sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B16@128 aB32+B32@128 rB wg 4x4 sys kc2 dm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "av32+m16@128 at32+m16@128 rB wg 4x4 sys kc2 grf512", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B16@64 aB8+B16@64 rB wg 4x8 kc8 grf256 l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIp"}, "at32+m32@128 am32+m32@128 rB wg 4x4 sys kc2 grf512 sm sn", {16, (LoopType) 255, 
512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+B32@128 aB32+B32@128 rB wg 4x4 sys kc2 dm sm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIq"}, "am32+m32@128 av32+m16@128 rS wg 4x4 sys kc2 grf512 sm", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@128 aB32+B16@128 rS wg 4x4 sys kc2 dm sm grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "IAB"}, "av64+m32@256 am64+m64@256 rB wg 4x4 sys kc2 grf512 sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64+B32@256 aB64+B64@256 rB wg 
4x4 sys kc2 dm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "ABIop"}, "av64+m32@256 at64+m32@256 rB wg 4x4 sys kc2 grf512", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B16@64 aB8+B16@64 rB wg 4x8 grf256 l4", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "IAB"}, "at64+m64@256 am64+m64@256 rB wg 4x4 sys kc2 grf512 sm sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS64+B64@256 aB64+B64@256 rB wg 4x4 sys kc2 dm sm sn grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, 
"IAB"}, "am64+m64@256 av64+m32@256 rS wg 4x4 sys kc2 grf512 sm", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {64, 64, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64+B64@256 aB64+B32@256 rS wg 4x4 sys kc2 dm sm grf512 l4", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+B8@16 rB kc8", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'I', "ugemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+C8@24 aS8+S16@24 rB wg 4x4 kc8 grf512 sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 64, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+C8@24 aB8+C8@24 rB wg 4x4 kc8 grf512", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1536}, {64, 64, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S16@24 
aS8+S16@24 rB wg 4x4 kc8 grf512 sm sn", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {64, 64, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S16@24 aB8+C8@24 rB wg 4x4 kc8 grf512 sm", {16, (LoopType) 255, 512, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {64, 64, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {4096}}}, +{{'I', "ugemm", {"Z", "Z", "Z"}, {"A", "B8", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Itu"}, "aB8+B8@16 aB8+B8@16 rB sys xc grf256", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 384}, {4096, 4096, 384}, {16, 16, 8}, {2, 8, 1}, 1, (WGType) 1, 0, 0, 0, {128, 128, 16}, {true, true, true}}, {'W', 1, {256}}} +}} From 66c3461d1fc15c238635f8ccc44643a430e85ab9 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 9 Apr 2024 10:09:28 -0700 Subject: [PATCH 087/187] gpu: add ip_version to device info --- src/gpu/intel/compute/device_info.hpp | 2 ++ src/gpu/intel/ocl/ocl_gpu_device_info.cpp | 5 +++-- src/gpu/intel/ocl/ocl_gpu_hw_info.cpp | 12 +++++++++++- src/gpu/intel/ocl/ocl_gpu_hw_info.hpp | 2 +- src/sycl/sycl_device_info.cpp | 5 +++-- 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/gpu/intel/compute/device_info.hpp b/src/gpu/intel/compute/device_info.hpp index 4c4ec5a2055..30eb42c4daf 100644 --- a/src/gpu/intel/compute/device_info.hpp +++ b/src/gpu/intel/compute/device_info.hpp @@ -241,6 +241,7 @@ struct device_info_t { bool has_native(native_ext_t ext) const { return native_extensions_ & (uint64_t)ext; } gpu_arch_t gpu_arch() const { return gpu_arch_; } int stepping_id() const { return stepping_id_; } + uint32_t ip_version() const { 
return ip_version_; } int max_eus_per_wg() const { return max_eus_per_wg_; } static int max_eus_per_wg(gpu_arch_t gpu_arch); @@ -315,6 +316,7 @@ struct device_info_t { compute::gpu_arch_t gpu_arch_ = compute::gpu_arch_t::unknown; int stepping_id_ = 0; + uint32_t ip_version_ = 0; bool mayiuse_systolic_ = false; bool mayiuse_ngen_kernels_ = false; bool mayiuse_system_memory_allocators_ = false; diff --git a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp index ff7831843c0..5562c4a09cb 100644 --- a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp @@ -42,8 +42,9 @@ status_t ocl_gpu_device_info_t::init_arch(engine_t *engine) { = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); OCL_CHECK(err); - init_gpu_hw_info(engine, device, context, gpu_arch_, stepping_id_, - native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); + init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_, + stepping_id_, native_extensions_, mayiuse_systolic_, + mayiuse_ngen_kernels_); err = clReleaseContext(context); OCL_CHECK(err); diff --git a/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp b/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp index 70af5fe8c8d..bd402c65402 100644 --- a/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp +++ b/src/gpu/intel/ocl/ocl_gpu_hw_info.cpp @@ -21,6 +21,10 @@ #include "gpu/intel/jit/jit_generator.hpp" #include "gpu/intel/jit/utils/ngen_type_bridge.hpp" +#ifndef CL_DEVICE_IP_VERSION_INTEL +#define CL_DEVICE_IP_VERSION_INTEL 0x4250 +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -28,7 +32,7 @@ namespace intel { namespace ocl { void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context, - compute::gpu_arch_t &gpu_arch, int &stepping_id, + uint32_t &ip_version, compute::gpu_arch_t &gpu_arch, int &stepping_id, uint64_t &native_extensions, bool &mayiuse_systolic, bool &mayiuse_ngen_kernels) { using namespace ngen; @@ -53,6 +57,12 @@ void 
init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context, auto status = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine); if (status != status::success) mayiuse_ngen_kernels = false; + + ip_version = 0; + if (clGetDeviceInfo(device, CL_DEVICE_IP_VERSION_INTEL, sizeof(ip_version), + &ip_version, nullptr) + != CL_SUCCESS) + ip_version = 0; } } // namespace ocl diff --git a/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp b/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp index 8a8f270c72a..d6c3cbda938 100644 --- a/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp +++ b/src/gpu/intel/ocl/ocl_gpu_hw_info.hpp @@ -29,7 +29,7 @@ namespace intel { namespace ocl { void init_gpu_hw_info(engine_t *engine, cl_device_id device, cl_context context, - compute::gpu_arch_t &gpu_arch, int &stepping_id, + uint32_t &ip_version, compute::gpu_arch_t &gpu_arch, int &stepping_id, uint64_t &native_extensions, bool &mayiuse_systolic, bool &mayiuse_ngen_kernels); diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index 22cb503dc60..bbddb68a482 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -51,8 +51,8 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { OCL_CHECK(err); gpu::intel::ocl::init_gpu_hw_info(engine, ocl_dev_wrapper, - ocl_ctx_wrapper, gpu_arch_, stepping_id_, native_extensions_, - mayiuse_systolic_, mayiuse_ngen_kernels_); + ocl_ctx_wrapper, ip_version_, gpu_arch_, stepping_id_, + native_extensions_, mayiuse_systolic_, mayiuse_ngen_kernels_); } else if (be == xpu::sycl::backend_t::level0) { // TODO: add support for L0 binary ngen check // XXX: query from ocl_engine for now @@ -66,6 +66,7 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { gpu::intel::compute::compute_engine_t *>(engine)); auto *dev_info = compute_engine->device_info(); + ip_version_ = dev_info->ip_version(); gpu_arch_ = dev_info->gpu_arch(); stepping_id_ = dev_info->stepping_id(); mayiuse_systolic_ = dev_info->mayiuse_systolic(); From 
2535185775d652110e9bac257fd9e15a90f49de2 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 9 Apr 2024 11:54:33 -0700 Subject: [PATCH 088/187] common: gemm: make get_trans static --- src/common/gemm_types.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/gemm_types.hpp b/src/common/gemm_types.hpp index 295b91451ab..4027d0ff7af 100644 --- a/src/common/gemm_types.hpp +++ b/src/common/gemm_types.hpp @@ -71,7 +71,7 @@ struct gemm_desc_t { inline bool is_batched() const { return c_desc.ndims >= 3; } // Simplified accessors that comply to GEMM API - transpose_t get_trans(const memory_desc_t &md) const { + static transpose_t get_trans(const memory_desc_t &md) { return md.format_desc.blocking.strides[md.ndims - 1] != 1 ? transpose::trans : transpose::notrans; From 0990319bce4f25c4b09567896832978ab5fe6d6e Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 8 Apr 2024 15:17:04 -0700 Subject: [PATCH 089/187] gpu: ocl: microkernel-based SDPA implementation --- src/gpu/gpu_sdpa_list.cpp | 3 + src/gpu/intel/ocl/micro_sdpa.cl | 173 +++++++++++++++++++++++++++++ src/gpu/intel/ocl/micro_sdpa.cpp | 158 +++++++++++++++++++++++++++ src/gpu/intel/ocl/micro_sdpa.hpp | 179 +++++++++++++++++++++++++++++++ src/gpu/intel/ocl/ref_sdpa.cl | 12 +-- src/gpu/intel/ocl/sdpa_utils.h | 31 ++++++ src/gpu/intel/ocl/tile_ops.h | 75 +++++++++++++ 7 files changed, 620 insertions(+), 11 deletions(-) create mode 100644 src/gpu/intel/ocl/micro_sdpa.cl create mode 100644 src/gpu/intel/ocl/micro_sdpa.cpp create mode 100644 src/gpu/intel/ocl/micro_sdpa.hpp create mode 100644 src/gpu/intel/ocl/sdpa_utils.h create mode 100644 src/gpu/intel/ocl/tile_ops.h diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp index 173a58df8e8..c45e6b430e8 100644 --- a/src/gpu/gpu_sdpa_list.cpp +++ b/src/gpu/gpu_sdpa_list.cpp @@ -17,6 +17,8 @@ #include "common/compiler_workarounds.hpp" #include "gpu/gpu_impl_list.hpp" + +#include "gpu/intel/ocl/micro_sdpa.hpp" #include 
"gpu/intel/ocl/ref_sdpa.hpp" namespace dnnl { @@ -27,6 +29,7 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = { + INSTANCE(intel::ocl::micro_sdpa_t) INSTANCE(intel::ocl::ref_sdpa_t) nullptr, }; diff --git a/src/gpu/intel/ocl/micro_sdpa.cl b/src/gpu/intel/ocl/micro_sdpa.cl new file mode 100644 index 00000000000..c23cb4d0751 --- /dev/null +++ b/src/gpu/intel/ocl/micro_sdpa.cl @@ -0,0 +1,173 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/intel/ocl/ocl_types.h" +#include "gpu/intel/ocl/sdpa_utils.h" +#include "gpu/intel/ocl/tile_ops.h" + +/* Microkernel headers -- generated at runtime */ +#include "gemm_kq.h" +#include "gemm_vs.h" + +DECLARE_2D_TILE(ugemm_vs_c_type_half, half, SUBGROUP_SIZE, + ugemm_vs_c_type_block0, ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1) + +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) kernel void +micro_sdpa(const global half *K, const global half *Q, const global half *V, + global half *A, global SCALE_DATA_T *scale_ptr, global void *attn_mask, + int d, int k, int q, local uint *slm) { /* Computes A = V * S for one (b1, b0) batch slice, where S is K^T * Q after scaling, exponentiation and per-column normalization. attn_mask is not referenced anywhere in this body. */ + uint sg_ij = sub_group_broadcast(get_local_id(1), 0); /* local id along dimension 1, taken from lane 0 so it is uniform across the subgroup */ + uint b0 = get_group_id(1); /* b0 and b1 are the group ids along dimensions 1 and 2; they index the batch in the *_OFF() lookups below */ + uint b1 = get_group_id(2); + + uint ldk = KEY_S3; + uint ldq = QRY_S2; + uint ldv = VAL_S2; + uint lda = DST_S2; + + float scale = convert_float(*scale_ptr); + + /* Locate K/Q/V/A matrices within batch */ + K += KEY_OFF(b1, b0, 0, 0); + Q += QRY_OFF(b1, b0, 0, 0); + V += VAL_OFF(b1, b0, 0, 0); + A += DST_OFF(b1, b0, 0, 0, 0); + + /* Calculate S = (K^T) * Q */ + uint wg_i0 = 0; /* every work-group starts at row 0 of S */ + uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; /* first column of S handled by this work-group */ + uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; + uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; + + ugemm_kq_c_type S_tile = ugemm_kq( + K, ldk, Q, ldq, k, q, d, wg_i0, wg_j0, 0, sg_i_kq, sg_j_kq); + + if (ugemm_kq_slm_size > 0) barrier(CLK_LOCAL_MEM_FENCE); + +/* Store tile to SLM */ +#define chunk_m (4 * SUBGROUP_SIZE) + uint slm_stride = max(chunk_m, ugemm_kq_wg_tile_m); + uint sg_i0_kq = sg_i_kq * ugemm_kq_sg_tile_m; + uint sg_j0_kq = sg_j_kq * ugemm_kq_sg_tile_n; + tile_store_full(S_tile, (local float *)slm, slm_stride, sg_i0_kq, sg_j0_kq); + barrier(CLK_LOCAL_MEM_FENCE); + +/* Read back full column(s) */ +#define sg_per_wg (ugemm_kq_sg_per_wg_m * ugemm_kq_sg_per_wg_n) +#define cols_per_sg ((ugemm_kq_wg_tile_n + sg_per_wg - 1) / sg_per_wg)
+#define chunks_per_col ((ugemm_kq_wg_tile_m + chunk_m - 1) / chunk_m) + + float4 sdata[cols_per_sg][chunks_per_col]; +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) { + int j = jj * sg_per_wg + sg_ij; /* column of the S tile read by this subgroup in pass jj */ + +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) { + sdata[jj][ii] = as_float4(intel_sub_group_block_read4( + slm + ii * chunk_m + j * slm_stride)); + } + } + + /* Scale and apply softmax to each column. */ + /* 1) Scale + exp */ +#if INVERT_SCALE + scale = native_recip(scale); +#endif + scale *= 1.442695f; // log2(e) +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) { +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) + sdata[jj][ii] = native_exp2(scale * sdata[jj][ii]); /* exp(scale * s) evaluated as exp2 with log2(e) folded into scale; no per-column maximum is subtracted beforehand */ + } + + /* 2) Mask out-of-bounds elements */ + int mask0 = k - get_sub_group_local_id(); + int4 mask = {mask0, mask0 - SUBGROUP_SIZE, mask0 - 2 * SUBGROUP_SIZE, + mask0 - 3 * SUBGROUP_SIZE}; + +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) { +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) + sdata[jj][ii] = select(sdata[jj][ii], 0, mask); /* NOTE(review): vector select() keys on the sign bit, so an element whose mask is exactly 0 (row index == k) is kept -- confirm the bound is not off by one */ + mask -= chunk_m; + } + + /* 3) Sum columns */ + float ssums[cols_per_sg]; + +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) + ssums[jj] = 0.0f; + +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) { +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) { + ssums[jj] += sdata[jj][ii].s0; + ssums[jj] += sdata[jj][ii].s1; + ssums[jj] += sdata[jj][ii].s2; + ssums[jj] += sdata[jj][ii].s3; + } + } + +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) + ssums[jj] = sub_group_reduce_add(ssums[jj]); + +/* 4) Normalize */ +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) { + ssums[jj] = native_recip(sub_group_broadcast(ssums[jj], 0)); +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) + sdata[jj][ii] *= ssums[jj]; + } + + /* Convert to half precision and write back full column(s) to SLM. + Stride between columns is same as original f32 data.
*/ + uint slm_stride_half = slm_stride * 2; /* slm_stride counted in 2-byte elements instead of 4-byte ones */ + +#pragma unroll + for (int jj = 0; jj < cols_per_sg; jj++) { + int j = jj * sg_per_wg + sg_ij; +#pragma unroll + for (int ii = 0; ii < chunks_per_col; ii++) { + half4 sdata_half = convert_half4(sdata[jj][ii]); + intel_sub_group_block_write_us4( + (local ushort *)slm + ii * chunk_m + j * slm_stride_half, + as_ushort4(sdata_half)); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Calculate A = V*S */ + uint sg_i_vs = sg_ij % ugemm_vs_sg_per_wg_m; + uint sg_j_vs = sg_ij / ugemm_vs_sg_per_wg_m; + ugemm_vs_c_type A_tile = ugemm_vs(V, ldv, (local half *)slm, + slm_stride_half, d, q, k, 0, 0, 0, sg_i_vs, sg_j_vs); + + /* Convert to half precision and store */ + ugemm_vs_c_type_half A_tile_half; + tile_copy(A_tile, A_tile_half); + + uint sg_i0_vs = sg_i_vs * ugemm_vs_sg_tile_m; + uint sg_j0_vs = sg_j_vs * ugemm_vs_sg_tile_n + wg_j0; /* shifted by this work-group's first column */ + + tile_store(A_tile_half, A, lda, d, q, sg_i0_vs, sg_j0_vs); +} diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp new file mode 100644 index 00000000000..5e5ad48c6ab --- /dev/null +++ b/src/gpu/intel/ocl/micro_sdpa.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License.
+*******************************************************************************/ + +#include "gpu/intel/ocl/micro_sdpa.hpp" + +#include "common/c_types_map.hpp" +#include "common/type_helpers.hpp" +#include "gpu/intel/compute/utils.hpp" +#include "gpu/intel/jit/gemm/microkernel_provider.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace ocl { + +status_t micro_sdpa_t::pd_t::init_microkernels(engine_t *engine) { /* Selects the two GEMM microkernels (K^T * Q, then V * S) for this problem. Returns status::unimplemented when the device reports no IP version or when the provider throws. */ + using namespace jit; + + assert(engine->kind() == engine_kind::gpu); + auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine); + auto *dev_info = compute_engine->device_info(); + auto arch = dev_info->gpu_arch(); + auto *d = desc(); + + /* Get device information */ + HWInformation hw_info; + hw_info.euCount = dev_info->eu_count(); + hw_info.gmdid = dev_info->ip_version(); + hw_info.systolicAvailable = compute_engine->mayiuse( + compute::device_ext_t::intel_subgroup_matrix_multiply_accumulate); + + if (hw_info.gmdid == 0) return status::unimplemented; + + auto max_wg_slm = dev_info->max_slm_size_per_tg(arch); + + /* Set up GEMMProblem structure for first GEMM: K^T * Q */ + GEMMProblem problem; + problem.Ta = problem.Ta_ext = Type::f16; + problem.Tb = problem.Tb_ext = Type::f16; + problem.Tc = problem.Tc_ext = Type::f32; + problem.Ts = problem.Tc; + problem.A.layout = MatrixLayout::T; + problem.B.layout = MatrixLayout::N; + problem.C.layout = MatrixLayout::N; + problem.A.setAlignment(alignmentForLD(d->head_size() * problem.Ta)); + problem.B.setAlignment(alignmentForLD(d->head_size() * problem.Tb)); + problem.C.setAlignment(problem.Tc.size()); + + /* Set up problem size information */ + SizeParams sizes; + sizes.m = d->keys(); + sizes.n = d->queries(); + sizes.k = d->head_size(); + sizes.batch = d->batch_size(); + + /* Set up special kernel requirements */ + std::vector<StrategyRequirement> reqs_kq; + reqs_kq.push_back(StrategyRequirement::WGTileMN <= max_wg_slm / 4); /* presumably /4 because the kernel stages the S tile in SLM as 4-byte floats -- confirm */ + reqs_kq.push_back(StrategyRequirement::WGTileM >= sizes.m); /* one work-group tile must span all keys */ + + /* Ask
microkernel provider for microkernel */ + try { + gemm_kq_ = selectGEMMMicrokernel( + micro::GEMMProtocol(), hw_info, sizes, problem, reqs_kq); + } catch (...) { return status::unimplemented; } + + /* Update for second GEMM: V*S */ + problem.A.layout = MatrixLayout::N; + problem.B.setAlignment(64); + sizes.m = d->head_size(); + sizes.n = gemm_kq_.getSetting("wg_tile_n"); + sizes.k = d->keys(); + + /* Set up special kernel requirements */ + int sg_per_wg = gemm_kq_.getSetting("sg_per_wg_m") + * gemm_kq_.getSetting("sg_per_wg_n"); + + std::vector<StrategyRequirement> reqs_vs; + reqs_vs.push_back(StrategyRequirement::WGTileM + >= sizes.m); /* could relax with loop over d */ + reqs_vs.push_back(StrategyRequirement::WGTileN >= sizes.n); + reqs_vs.push_back(StrategyRequirement::WG == sg_per_wg); /* both microkernels run in the same work-group, so the subgroup counts must match */ + + micro::GEMMProtocol::Options opts_vs; + opts_vs.localB = true; + + /* Ask microkernel provider for microkernel */ + try { + gemm_vs_ = selectGEMMMicrokernel( + opts_vs, hw_info, sizes, problem, reqs_vs); + } catch (...)
{ return status::unimplemented; } + + return status::success; +} + +status_t micro_sdpa_t::execute(const exec_ctx_t &ctx) const { /* Binds the kernel arguments and launches one work-group per query tile and per entry of the two leading dst dimensions. */ + const auto &qry = CTX_IN_STORAGE(DNNL_ARG_QUERIES); + const auto &key = CTX_IN_STORAGE(DNNL_ARG_KEYS); + const auto &val = CTX_IN_STORAGE(DNNL_ARG_VALUES); + auto &dst = CTX_OUT_STORAGE(DNNL_ARG_DST); + const auto &scale = CTX_IN_STORAGE(DNNL_ARG_SCALE); + const auto &attn_mask = CTX_IN_STORAGE(DNNL_ARG_ATTN_MASK); + + const dim_t Q = pd()->desc()->queries(); + const dim_t K = pd()->desc()->keys(); + const dim_t D = pd()->desc()->head_size(); + + auto &gemm_kq = pd()->gemm_kq(); + auto wg_tile_k = gemm_kq.getSetting("wg_tile_m"); + auto wg_tile_q = gemm_kq.getSetting("wg_tile_n"); + auto sg_per_wg = gemm_kq.getSetting("sg_per_wg_m") + * gemm_kq.getSetting("sg_per_wg_n"); + auto slm_stride = std::max(wg_tile_k, 4 * sg_size_); + auto slm = std::max<size_t>(gemm_kq.getSetting("slm_size"), + slm_stride * wg_tile_q * sizeof(float)); /* local memory in bytes: the larger of the microkernel's own need and the f32 S tile; explicit size_t because the two operands differ in type */ + + compute::kernel_arg_list_t arg_list; + arg_list.set(0, key); + arg_list.set(1, qry); + arg_list.set(2, val); + arg_list.set(3, dst); + arg_list.set(4, scale); + arg_list.set(5, attn_mask); + arg_list.set(6, (int)D); + arg_list.set(7, (int)K); + arg_list.set(8, (int)Q); + arg_list.set(9, slm, nullptr); + + compute::range_t lws = {(size_t)sg_size_, (size_t)sg_per_wg, 1}; + compute::range_t gws = lws; + + gws[0] *= utils::div_up(Q, wg_tile_q); + gws[1] *= pd()->dst_md()->dims[1]; + gws[2] *= pd()->dst_md()->dims[0]; + + auto nd_range = compute::nd_range_t(gws, lws); + return parallel_for(ctx, nd_range, kernel_, arg_list); +} + +} // namespace ocl +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/ocl/micro_sdpa.hpp b/src/gpu/intel/ocl/micro_sdpa.hpp new file mode 100644 index 00000000000..d86ba55bbe7 --- /dev/null +++ b/src/gpu/intel/ocl/micro_sdpa.hpp @@ -0,0 +1,179 @@ +/******************************************************************************* +*
Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_OCL_MICRO_SDPA_HPP +#define GPU_OCL_MICRO_SDPA_HPP + +#include /* NOTE(review): the header name is missing here, most likely lost together with its angle brackets during extraction -- restore from upstream */ + +#include "common/c_types_map.hpp" +#include "common/gemm_types.hpp" +#include "common/gemm_utils.hpp" +#include "common/primitive.hpp" +#include "common/sdpa_pd.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" +#include "gpu/intel/gpu_primitive.hpp" +#include "gpu/intel/gpu_resource.hpp" +#include "gpu/intel/microkernels/shim.hpp" +#include "gpu/intel/ocl/ocl_utils.hpp" +#include "gpu/intel/primitive_conf.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace ocl { + +struct micro_sdpa_t : public gpu_primitive_t { /* SDPA primitive ("ocl:micro:any") implemented with two generated GEMM microkernels */ + using gpu_primitive_t::gpu_primitive_t; + struct pd_t : public sdpa_pd_t { + using sdpa_pd_t::sdpa_pd_t; + + DECLARE_COMMON_PD_T("ocl:micro:any", micro_sdpa_t); + + status_t init(engine_t *engine) { /* Accepts only 4D f16 q/k/v/dst with default attributes apart from runtime scales, then selects the microkernels. */ + using namespace data_type; + using smask_t = primitive_attr_t::skip_mask_t; + + VDISPATCH_SDPA(attr()->has_default_values(smask_t::scales_runtime), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_SDPA( + utils::everyone_is(4, qry_md()->ndims, key_md()->ndims, + val_md()->ndims, dst_md()->ndims), + VERBOSE_UNSUPPORTED_TAG); + if (with_attn_mask()) { /* NOTE(review): a masked problem passes dispatch here, but the micro_sdpa kernel added by this patch never reads attn_mask -- confirm masked problems should not be rejected */ + VDISPATCH_SDPA( + attn_mask_md()->ndims == 4, VERBOSE_UNSUPPORTED_TAG); + } +
VDISPATCH_SDPA(utils::everyone_is(data_type::f16, + qry_md()->data_type, key_md()->data_type, + val_md()->data_type, dst_md()->data_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_SDPA(set_default_formats() == status::success, + VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_SDPA(desc()->values() == desc()->head_size(), + "values does not match head size"); + + CHECK(init_microkernels(engine)); + return status::success; + } + + status_t init_microkernels(engine_t *engine); + + status_t set_default_format(memory_desc_t &md, bool transposed) { /* Fills in abcd (or abdc when transposed) for format_any; otherwise requires a plain GEMM-compatible layout with the expected transposition. */ + using namespace format_tag; + memory_desc_wrapper mdw(md); + auto exp_trans = transposed ? dnnl_trans : dnnl_notrans; + if (mdw.format_any()) + CHECK(memory_desc_init_by_tag(md, transposed ? abdc : abcd)); + else if (!is_md_gemm_compatible_plain_format(&md) + || gemm_desc_t::get_trans(md) != exp_trans) + return status::unimplemented; + return status::success; + } + + status_t set_default_formats() { /* only the key tensor is expected in transposed form */ + CHECK(set_default_format(desc_.q_desc, false)); + CHECK(set_default_format(desc_.k_desc, true)); + CHECK(set_default_format(desc_.v_desc, false)); + CHECK(set_default_format(desc_.dst_desc, false)); + return status::success; + } + + const micro::Package &gemm_kq() const { return gemm_kq_; } + const micro::Package &gemm_vs() const { return gemm_vs_; } + + private: + micro::Package gemm_kq_, gemm_vs_; + }; + + status_t init(engine_t *engine) override { /* Builds the OpenCL kernel: defines layout offsets and configuration macros, and attaches the generated microkernel shims as custom headers. */ + using namespace micro; + + assert(engine->kind() == engine_kind::gpu); + auto *compute_engine + = utils::downcast<compute::compute_engine_t *>(engine); + sg_size_ = compute_engine->device_info()->min_subgroup_size(); + + compute::kernel_ctx_t kernel_ctx; + + kernel_ctx.set_data_type(pd()->dst_md()->data_type); + + int ndims = 4; + const memory_desc_wrapper qry_mdw(pd()->qry_md()); + const memory_desc_wrapper key_mdw(pd()->key_md()); + const memory_desc_wrapper val_mdw(pd()->val_md()); + const memory_desc_wrapper dst_mdw(pd()->dst_md()); + const memory_desc_wrapper msk_mdw(pd()->attn_mask_md()); + using offset_t =
decltype(offsets_t().src_off); + offset_t qry_off, key_off, val_off, dst_off, msk_off; + set_offsets(qry_mdw, qry_off); + set_offsets(key_mdw, key_off); + set_offsets(val_mdw, val_off); + set_offsets(dst_mdw, dst_off); + set_offsets(msk_mdw, msk_off); + def_offsets(qry_off, kernel_ctx, "QRY", ndims); + def_offsets(key_off, kernel_ctx, "KEY", ndims); + def_offsets(val_off, kernel_ctx, "VAL", ndims); + def_offsets(dst_off, kernel_ctx, "DST", ndims); + def_offsets(msk_off, kernel_ctx, "MSK", ndims); + kernel_ctx.define_int("NDIMS", ndims); + + kernel_ctx.define_int("SUBGROUP_SIZE", sg_size_); + kernel_ctx.define_int("INVERT_SCALE", pd()->desc()->invert_scale); + kernel_ctx.define_int("WITH_ATTN_MASK", pd()->with_attn_mask()); + def_data_type(kernel_ctx, pd()->desc()->scale_dt, "SCALE"); + + /* Generate microkernel shims */ + ShimOptions shimOptions; + shimOptions.subgroupSize = sg_size_; + shimOptions.useTileOps = true; + shimOptions.decorator = "kq"; + + kernel_ctx.add_custom_header("gemm_kq.h", + micro::generateShim( + pd()->gemm_kq(), HostLanguage::OpenCL_C, shimOptions)); + + shimOptions.microkernelID++; + shimOptions.decorator = "vs"; + + kernel_ctx.add_custom_header("gemm_vs.h", + micro::generateShim( + pd()->gemm_vs(), HostLanguage::OpenCL_C, shimOptions)); + + if (pd()->gemm_kq().grfMin > 128 || pd()->gemm_vs().grfMin > 128) + kernel_ctx.add_option("-cl-intel-256-GRF-per-thread"); /* either microkernel needs more than 128 GRFs */ + + CHECK(create_kernel(engine, &kernel_, "micro_sdpa", kernel_ctx)); + if (!kernel_) return status::runtime_error; + return status::success; + } + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + status_t execute(const exec_ctx_t &ctx) const override; + compute::kernel_t kernel_; + int sg_size_ = 0; +}; + +} // namespace ocl +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/intel/ocl/ref_sdpa.cl b/src/gpu/intel/ocl/ref_sdpa.cl index d801f84b901..5458c339e7b 100644 ---
a/src/gpu/intel/ocl/ref_sdpa.cl +++ b/src/gpu/intel/ocl/ref_sdpa.cl @@ -16,17 +16,7 @@ #include "gpu/intel/ocl/ocl_post_ops.h" #include "gpu/intel/ocl/ocl_types.h" - -#define _4D_OFF(tag, x0, x1, x2, x3) \ - (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \ - + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \ - + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \ - + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3) - -#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3) -#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3) -#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3) -#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3) +#include "gpu/intel/ocl/sdpa_utils.h" __kernel void ref_sdpa(const __global QRY_DATA_T *Q, const __global KEY_DATA_T *K, const __global VAL_DATA_T *V, diff --git a/src/gpu/intel/ocl/sdpa_utils.h b/src/gpu/intel/ocl/sdpa_utils.h new file mode 100644 index 00000000000..badf11b8b6a --- /dev/null +++ b/src/gpu/intel/ocl/sdpa_utils.h @@ -0,0 +1,31 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_OCL_SDPA_UTILS_H +#define GPU_OCL_SDPA_UTILS_H + +#define _4D_OFF(tag, x0, x1, x2, x3) \ + (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \ + + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \ + + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \ + + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3) /* per dimension: the index modulo tag##_Bn is multiplied by tag##_SBn and the quotient by tag##_Sn; the four contributions are summed into one linear offset */ + +#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3) /* the four wrappers below only fix the macro-name prefix (QRY/KEY/VAL/MSK) */ +#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3) +#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3) +#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3) + +#endif \ No newline at end of file diff --git a/src/gpu/intel/ocl/tile_ops.h b/src/gpu/intel/ocl/tile_ops.h new file mode 100644 index 00000000000..d56f74aba32 --- /dev/null +++ b/src/gpu/intel/ocl/tile_ops.h @@ -0,0 +1,75 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ *******************************************************************************/ + +#ifndef GPU_OCL_TILE_OPS_H +#define GPU_OCL_TILE_OPS_H + +#define tile_elementwise(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i]); \ + } while (0) + +#define tile_copy(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t_new.x[i] \ + = __builtin_convertvector(t.x[i], __typeof__(t_new.x[i])); \ + } while (0) + +#define DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int ld, int m, int n, int offset_r, \ + int offset_c) { \ + ptr += m * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + ptr[i] = t.x[i0 / br + nbr * (j / bc)] \ + [(i0 % br) / sg + (j % bc) * (br / sg)]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store(t, ptr, m, m, n, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = t.x[i0 / br + nbr * (j / bc)] \ + [(i0 % br) / sg + (j % bc) * (br / sg)]; \ + } \ + } \ + } + +#define DECLARE_2D_TILE(tile_type, element_type, sg, br, bc, nbr, nbc) \ + typedef element_type __attribute__((ext_vector_type(br * bc / sg))) \ + _e_##tile_type; \ + typedef struct { \ + _e_##tile_type x[nbr * 
nbc]; \ + } tile_type; \ + DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) + +#endif \ No newline at end of file From 3111200b05f36b601247ed54793f1c91795d4cfc Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 2 May 2024 16:07:28 -0700 Subject: [PATCH 090/187] gpu: ocl: new k-blocked microkernel SDPA --- src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp | 34 +- src/gpu/intel/ocl/micro_sdpa.cl | 438 +++++++++++++++----- src/gpu/intel/ocl/micro_sdpa.cpp | 198 +++++++-- src/gpu/intel/ocl/micro_sdpa.hpp | 77 +--- src/gpu/intel/ocl/tile_ops.h | 443 ++++++++++++++++++++- 5 files changed, 968 insertions(+), 222 deletions(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp index dc1877820d9..cf6248fc34a 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp @@ -33,6 +33,23 @@ namespace gpu { namespace intel { namespace jit { +static inline Type convert_dnnl_to_kernel_type(data_type_t type) { + switch (type) { + default: assert(!"Unknown type"); + case data_type::f32: return Type::f32; + case data_type::f16: return Type::f16; + case data_type::bf16: return Type::bf16; + case data_type::f8_e5m2: return Type::bf8; + case data_type::f8_e4m3: return Type::hf8; + case data_type::s32: return Type::s32; + case data_type::u8: return Type::u8; + case data_type::s8: return Type::s8; + case data_type::u4: return Type::u4; + case data_type::s4: return Type::s4; + case data_type::undef: return Type::invalid; + } +} + struct gen_gemm_kernel_desc_t { friend struct gen_gemm_kernel_t; @@ -75,23 +92,6 @@ struct gen_gemm_kernel_desc_t { }; protected: - static Type convert_dnnl_to_kernel_type(data_type_t type) { - switch (type) { - default: assert(!"Unknown type"); - case data_type::f32: return Type::f32; - case data_type::f16: return Type::f16; - case data_type::bf16: return Type::bf16; - case data_type::f8_e5m2: return Type::bf8; - case data_type::f8_e4m3: return Type::hf8; - 
case data_type::s32: return Type::s32; - case data_type::u8: return Type::u8; - case data_type::s8: return Type::s8; - case data_type::u4: return Type::u4; - case data_type::s4: return Type::s4; - case data_type::undef: return Type::invalid; - } - } - compute::gpu_arch_t arch_; ngen::HW hw_ = ngen::HW::Unknown; int stepping_ = 0; diff --git a/src/gpu/intel/ocl/micro_sdpa.cl b/src/gpu/intel/ocl/micro_sdpa.cl index c23cb4d0751..f367533e0ff 100644 --- a/src/gpu/intel/ocl/micro_sdpa.cl +++ b/src/gpu/intel/ocl/micro_sdpa.cl @@ -22,24 +22,133 @@ #include "gemm_kq.h" #include "gemm_vs.h" -DECLARE_2D_TILE(ugemm_vs_c_type_half, half, SUBGROUP_SIZE, - ugemm_vs_c_type_block0, ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, - ugemm_vs_c_type_nblock1) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define DIV_UP(x, y) (((x) + (y)-1) / (y)) + +#define sg_per_wg (ugemm_kq_sg_per_wg_m * ugemm_kq_sg_per_wg_n) +#define q_tile_sg_n DIV_UP(ugemm_kq_wg_tile_n, sg_per_wg) + +/* Instantiate tile types and operations */ +typedef ugemm_kq_c_type s_tile_type; +typedef ugemm_vs_c_type a_tile_type; + +DECLARE_2D_TILE(q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) + +#ifdef BLOCK_Q +DECLARE_2D_TILE_BLOCK_OPS( + q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#elif Q_ALIGN < 4 +DECLARE_2D_TILE_LOAD_PACKED_HALF( + q_tile_type, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 1, 1, + ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 8, 1, + ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE(s_tile_type_half2, uint, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1 / 2, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1) + +DECLARE_2D_TILE( + s_sum_tile_type, float, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE( + a_scale_tile_type, float, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, 1, 1) + 
+DECLARE_2D_TILE( + mask_tile_type, half, SUBGROUP_SIZE, ugemm_kq_sg_tile_m, 1, 1, 1) + +DECLARE_2D_TILE( + mask_tile_type_float, float, SUBGROUP_SIZE, ugemm_kq_sg_tile_m, 1, 1, 1) + +DECLARE_2D_TILE_BLOCK_OPS( + mask_tile_type, half, SUBGROUP_SIZE, ugemm_kq_sg_tile_m, 1, 1, 1) + +#ifdef BLOCK_A +DECLARE_2D_TILE_BLOCK_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#endif +#ifdef BLOCK_2D_A +DECLARE_2D_TILE_BLOCK2D_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE_VREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, s_sum_tile_type, SUBGROUP_SIZE, + ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, mask_tile_type_float, SUBGROUP_SIZE, + ugemm_kq_sg_tile_m, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_scale_tile_type, SUBGROUP_SIZE, + ugemm_vs_sg_tile_n, 1, 1, 1) + +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 +DECLARE_2D_TILE_RSELECT(a_scale_tile_type, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, + 1, 1, s_sum_tile_type, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 
1, 1) +#endif __attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) kernel void micro_sdpa(const global half *K, const global half *Q, const global half *V, - global half *A, global SCALE_DATA_T *scale_ptr, global void *attn_mask, - int d, int k, int q, local uint *slm) { + global half *A, global SCALE_DATA_T *scale_ptr, const global half *msk, + int d, int k, int q) { uint sg_ij = sub_group_broadcast(get_local_id(1), 0); uint b0 = get_group_id(1); uint b1 = get_group_id(2); + uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; + + /* Leading dimension for matrices */ uint ldk = KEY_S3; uint ldq = QRY_S2; uint ldv = VAL_S2; uint lda = DST_S2; - float scale = convert_float(*scale_ptr); + /* Subgroup IDs for each GEMM */ + uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; + uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; + + uint sg_i_vs = sg_ij % ugemm_vs_sg_per_wg_m; + uint sg_j_vs = sg_ij / ugemm_vs_sg_per_wg_m; + + /* SLM allocations */ + local half Q_slm[D_MAX * ugemm_kq_wg_tile_n]; + local half S_slm[ugemm_kq_wg_tile_m * ugemm_kq_wg_tile_n]; + local float S_sum_slm[ugemm_kq_wg_tile_n * ugemm_kq_sg_per_wg_m]; + local float S_max_slm[ugemm_kq_wg_tile_n]; + +#if ugemm_kq_slm_size + ugemm_vs_slm_size > 0 + local uint + ugemm_slm[MAX(ugemm_kq_slm_size, ugemm_vs_slm_size) / sizeof(uint)]; +#else + local uint *ugemm_slm = NULL; +#endif + + const bool need_sum_barrier + = (ugemm_kq_barrier_count == 0) && (ugemm_vs_barrier_count == 0); /* Locate K/Q/V/A matrices within batch */ K += KEY_OFF(b1, b0, 0, 0); @@ -47,127 +156,256 @@ micro_sdpa(const global half *K, const global half *Q, const global half *V, V += VAL_OFF(b1, b0, 0, 0); A += DST_OFF(b1, b0, 0, 0, 0); - /* Calculate S = (K^T) * Q */ - uint wg_i0 = 0; - uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; - uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; - uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; + __builtin_assume_aligned(K, K_ALIGN); + __builtin_assume_aligned(Q, Q_ALIGN); + __builtin_assume_aligned(V, V_ALIGN); + 
__builtin_assume_aligned(A, A_ALIGN); - ugemm_kq_c_type S_tile = ugemm_kq( - K, ldk, Q, ldq, k, q, d, wg_i0, wg_j0, 0, sg_i_kq, sg_j_kq); + /* Load Q tile, destined for SLM */ + q_tile_type Q_tile; + uint q0_copy = q_tile_sg_n * sg_ij; +#ifdef BLOCK_Q + tile_load_block(&Q_tile, (global uint *)Q, ldq >> 1, 0, wg_j0 + q0_copy); +#elif Q_ALIGN >= 4 + tile_load(&Q_tile, (global uint *)Q, (d + 1) >> 1, q, ldq >> 1, 0, + wg_j0 + q0_copy); +#else + tile_load_packed_half(&Q_tile, Q, d, q, ldq, 0, wg_j0 + q0_copy); +#endif - if (ugemm_kq_slm_size > 0) barrier(CLK_LOCAL_MEM_FENCE); + /* Load scale */ +#if INVERT_SCALE + float iscale = convert_float(*scale_ptr); + float scale = native_recip(iscale); +#else + float scale = convert_float(*scale_ptr); + float iscale = native_recip(scale); +#endif + scale *= 1.442695f; // log2(e) -/* Store tile to SLM */ -#define chunk_m (4 * SUBGROUP_SIZE) - uint slm_stride = max(chunk_m, ugemm_kq_wg_tile_m); - uint sg_i0_kq = sg_i_kq * ugemm_kq_sg_tile_m; - uint sg_j0_kq = sg_j_kq * ugemm_kq_sg_tile_n; - tile_store_full(S_tile, (local float *)slm, slm_stride, sg_i0_kq, sg_j0_kq); - barrier(CLK_LOCAL_MEM_FENCE); +#ifdef PREFETCH_K0 + /* Prefetch first K tile. No remainder handling yet. 
*/ + cooperative_prefetch_2d(K, D_MAX, ugemm_kq_wg_tile_m, ldk, sg_ij, sg_per_wg, + SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif -/* Read back full column(s) */ -#define sg_per_wg (ugemm_kq_sg_per_wg_m * ugemm_kq_sg_per_wg_n) -#define cols_per_sg ((ugemm_kq_wg_tile_n + sg_per_wg - 1) / sg_per_wg) -#define chunks_per_col ((ugemm_kq_wg_tile_m + chunk_m - 1) / chunk_m) + /* Initialize S column sums in SLM to -inf */ + const uint n_col_sg = DIV_UP(ugemm_kq_wg_tile_n, SUBGROUP_SIZE * sg_per_wg); + const float neg_inf = -INFINITY; - float4 sdata[cols_per_sg][chunks_per_col]; -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) { - int j = jj * sg_per_wg + sg_ij; #pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) { - sdata[jj][ii] = as_float4(intel_sub_group_block_read4( - slm + ii * chunk_m + j * slm_stride)); - } - } + for (int q = 0; q < n_col_sg; q++) + intel_sub_group_block_write( + (local uint *)&S_max_slm[q + sg_ij * n_col_sg * SUBGROUP_SIZE], + as_uint(neg_inf)); - /* Scale and apply softmax to each column. */ - /* 1) Scale + exp */ -#if INVERT_SCALE - scale = native_recip(scale); + /* Clear accumulator */ + a_tile_type A_tile; + tile_fill(A_tile, 0.0f); + + /* Store Q tile to SLM */ + tile_store_t_sys_src1( + Q_tile, (local uint *)&Q_slm[0], D_MAX / 2, q0_copy, 0); + + /* Clear S column sums/maxes */ + s_sum_tile_type S_sum_tile; + s_sum_tile_type S_max_tile, S_max_tile_old; + tile_fill(S_sum_tile, 0.0f); + tile_fill(S_max_tile, -INFINITY); + + /* Wait for Q data to reach SLM */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* Main loop over k blocks */ + for (int k0 = 0; k0 < k; k0 += ugemm_kq_wg_tile_m) { + bool first = (k0 == 0); + bool last = (k0 + ugemm_kq_wg_tile_m >= k); + + uint sg_i0_kq = sg_i_kq * ugemm_kq_sg_tile_m; + uint sg_j0_kq = sg_j_kq * ugemm_kq_sg_tile_n; + +#if WITH_ATTN_MASK + /* Load mask. No remainder handling needed assuming k block size is a power of 2. 
*/ + mask_tile_type mask_tile; + tile_load_block(&mask_tile, msk, 0, k0 + sg_i0_kq, 0); #endif - scale *= 1.442695f; // log2(e) -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) { + + /* Prepare k mask: 0 in bounds, -inf out of bounds */ + mask_tile_type_float k_mask; #pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) - sdata[jj][ii] = native_exp2(scale * sdata[jj][ii]); - } + for (int ii = 0; ii < ugemm_kq_sg_tile_m / SUBGROUP_SIZE; ii++) + k_mask.x[0][ii] = (k0 + sg_i0_kq + ii * SUBGROUP_SIZE + + get_sub_group_local_id() + < k) + ? 0 + : -INFINITY; - /* 2) Mask out-of-bounds elements */ - int mask0 = k - get_sub_group_local_id(); - int4 mask = {mask0, mask0 - SUBGROUP_SIZE, mask0 - 2 * SUBGROUP_SIZE, - mask0 - 3 * SUBGROUP_SIZE}; + /* Calculate S = (K^T) * Q */ + s_tile_type S_tile = ugemm_kq(K, ldk, Q_slm, D_MAX, k, + ugemm_kq_wg_tile_n, d, k0, 0, 0, sg_i_kq, sg_j_kq, ugemm_slm); -#pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) { -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) - sdata[jj][ii] = select(sdata[jj][ii], 0, mask); - mask -= chunk_m; - } +#if WITH_ATTN_MASK +/* Apply mask, manually masking in k dimension */ +#define unscale_mask(x, y) ((x)*iscale + (y)) + mask_tile_type_float mask_tile_float; + tile_copy(mask_tile, mask_tile_float); + tile_binary(mask_tile_float, k_mask, unscale_mask); + tile_hbroadcast_add(&S_tile, mask_tile_float); +#else + tile_hbroadcast_add(&S_tile, k_mask); +#endif - /* 3) Sum columns */ - float ssums[cols_per_sg]; + /* Before softmax, we will need to scale columns by maximum values to avoid overflow. 
*/ -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) - ssums[jj] = 0.0f; + /* Compute our maxima and reduce across SLM */ + tile_vreduce_max(S_tile, &S_max_tile); + tile_atomic_max_full( + S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); -#pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) { -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) { - ssums[jj] += sdata[jj][ii].s0; - ssums[jj] += sdata[jj][ii].s1; - ssums[jj] += sdata[jj][ii].s2; - ssums[jj] += sdata[jj][ii].s3; +#ifdef PREFETCH_V + /* Prefetch V tile. No remainder handling yet. */ + cooperative_prefetch_2d(V, D_MAX, ugemm_kq_wg_tile_m, ldv, sg_ij, + sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif + +#ifndef ALT_MAX + /* Read back WG-wide maxima */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); +#endif + + tile_vbroadcast_sub(&S_tile, S_max_tile); + +/* Scale + exponentiate */ +#define scaled_exp(x) native_exp2(x *scale) + tile_elementwise(S_tile, scaled_exp); + +#ifdef ALT_MAX + /* Read back WG-wide maxima and adjust S to match */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + s_sum_tile_type S_max_tile1; + tile_copy(S_max_tile, S_max_tile1); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + +#define binary_exp_neg(x, y) native_exp2(scale *((x) - (y))) + tile_binary(S_max_tile1, S_max_tile, binary_exp_neg); + tile_vbroadcast_mul(&S_tile, S_max_tile1); +#endif + + /* Accumulate sums. S tile is transposed for easy summation. 
*/ + s_sum_tile_type S_sum_tile1; + tile_fill(S_sum_tile1, 0.0f); + tile_vreduce_add(S_tile, &S_sum_tile1); + + /* Convert to half, VNNI format */ + s_tile_type_half2 S_tile_half2; + tile_copy_to_half2(S_tile, S_tile_half2); + + /* Store to SLM, in packed format */ + tile_store_t_sys_src2(S_tile_half2, (local uint *)S_slm, + ugemm_vs_sg_tile_n, ugemm_kq_wg_tile_m / 2, sg_i0_kq / 2, + sg_j0_kq); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Rescale existing accumulator and sums to match new maxima */ + if (!first) { +#define binary_exp_sub(x, y) native_exp2(scale *((x) - (y))) +#define binary_mul(x, y) ((x) * (y)) + tile_binary(S_max_tile_old, S_max_tile, binary_exp_sub); + tile_binary(S_sum_tile, S_max_tile_old, binary_mul); + + /* Find the subset of sums that applies to the accumulation tile */ + a_scale_tile_type A_scale_tile; +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && ugemm_kq_sg_tile_n == ugemm_vs_sg_tile_n + tile_copy(S_max_tile_old, A_scale_tile); +#elif ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 + tile_rselect(&A_scale_tile, S_max_tile_old, + sg_j_vs % (ugemm_kq_sg_tile_n / ugemm_vs_sg_tile_n)); +#else +#error unimplemented +#endif + tile_hbroadcast_mul(&A_tile, A_scale_tile); } - } -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) - ssums[jj] = sub_group_reduce_add(ssums[jj]); +/* Accumulate sums */ +#define binary_add(x, y) ((x) + (y)) + tile_binary(S_sum_tile, S_sum_tile1, binary_add); -/* 4) Normalize */ -#pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) { - ssums[jj] = native_recip(sub_group_broadcast(ssums[jj], 0)); -#pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) - sdata[jj][ii] *= ssums[jj]; + /* Save maxima */ + tile_copy(S_max_tile, S_max_tile_old); + + /* Last iteration: store column sums in SLM */ + if (last) + tile_store_full(S_sum_tile, S_sum_slm, ugemm_kq_wg_tile_n, sg_j0_kq, + sg_i_kq); + +#ifdef PREFETCH_K + /* Prefetch next K tile. 
No remainder handling yet. */ + if (!last) + cooperative_prefetch_2d(K + (k0 + ugemm_kq_wg_tile_m) * ldk, D_MAX, + ugemm_kq_wg_tile_m, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, + LSC_LDCC_L1C_L3C); +#endif +#ifdef PREFETCH_MASK +#if WITH_ATTN_MASK + /* Prefetch next mask tile. */ + if (!last) + cooperative_prefetch_2d(msk + k0 + ugemm_kq_wg_tile_m + sg_i0_kq, + ugemm_kq_sg_tile_m, 1, 0, 0, 1, SUBGROUP_SIZE, + LSC_LDCC_L1UC_L3C); +#endif +#endif + + /* Wait for S stores */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Last iteration: signal column sums are ready */ + if (last && need_sum_barrier) + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Accumulate A += V * S */ + int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); + a_tile_type A_tile1 = ugemm_vs(V, ldv, S_slm, ugemm_kq_wg_tile_m, d, + ugemm_kq_wg_tile_n, k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, + ugemm_slm); + V += ldv * ugemm_kq_wg_tile_m; + + tile_binary(A_tile, A_tile1, binary_add); } - /* Convert to half precision and write back full column(s) to SLM. - Stride between columns is same as original f32 data. 
*/ - uint slm_stride_half = slm_stride * 2; + /* Wait for column sums to be ready */ + if (need_sum_barrier) intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Load column sums from SLM + reduce in registers */ + a_scale_tile_type A_scale_tile, A_scale_tile_load; + tile_fill(A_scale_tile, 0.0f); #pragma unroll - for (int jj = 0; jj < cols_per_sg; jj++) { - int j = jj * sg_per_wg + sg_ij; -#pragma unroll - for (int ii = 0; ii < chunks_per_col; ii++) { - half4 sdata_half = convert_half4(sdata[jj][ii]); - intel_sub_group_block_write_us4( - (local ushort *)slm + ii * chunk_m + j * slm_stride_half, - as_ushort4(sdata_half)); - } + for (uint sg1 = 0; sg1 < ugemm_kq_sg_per_wg_m; sg1++) { + tile_load_full(&A_scale_tile_load, S_sum_slm, ugemm_kq_wg_tile_n, + ugemm_vs_sg_tile_n * sg_j_vs, sg1); + tile_binary(A_scale_tile, A_scale_tile_load, binary_add); } - barrier(CLK_LOCAL_MEM_FENCE); - /* Calculate A = V*S */ - uint sg_i_vs = sg_ij % ugemm_vs_sg_per_wg_m; - uint sg_j_vs = sg_ij / ugemm_vs_sg_per_wg_m; - ugemm_vs_c_type A_tile = ugemm_vs(V, ldv, (local half *)slm, - slm_stride_half, d, q, k, 0, 0, 0, sg_i_vs, sg_j_vs); + /* Rescale by 1 / (column sums) */ + tile_elementwise_s(A_scale_tile, native_recip); + tile_hbroadcast_mul(&A_tile, A_scale_tile); /* Convert to half precision and store */ - ugemm_vs_c_type_half A_tile_half; - tile_copy(A_tile, A_tile_half); + a_tile_type_half A_tile_half; + tile_copy_reblock(A_tile, &A_tile_half); uint sg_i0_vs = sg_i_vs * ugemm_vs_sg_tile_m; uint sg_j0_vs = sg_j_vs * ugemm_vs_sg_tile_n + wg_j0; - tile_store(A_tile_half, A, lda, d, q, sg_i0_vs, sg_j0_vs); +#ifdef BLOCK_2D_A + tile_store_block2d(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#elif defined(BLOCK_A) + tile_store_block(A_tile_half, A, lda, sg_i0_vs, sg_j0_vs); +#else + tile_store(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#endif } diff --git a/src/gpu/intel/ocl/micro_sdpa.cpp b/src/gpu/intel/ocl/micro_sdpa.cpp index 5e5ad48c6ab..06e6957509d 100644 --- 
a/src/gpu/intel/ocl/micro_sdpa.cpp +++ b/src/gpu/intel/ocl/micro_sdpa.cpp @@ -19,6 +19,7 @@ #include "common/c_types_map.hpp" #include "common/type_helpers.hpp" #include "gpu/intel/compute/utils.hpp" +#include "gpu/intel/jit/gemm/gen_gemm_kernel.hpp" #include "gpu/intel/jit/gemm/microkernel_provider.hpp" namespace dnnl { @@ -29,11 +30,12 @@ namespace ocl { status_t micro_sdpa_t::pd_t::init_microkernels(engine_t *engine) { using namespace jit; + using arch_t = compute::gpu_arch_t; assert(engine->kind() == engine_kind::gpu); auto *compute_engine = utils::downcast(engine); auto *dev_info = compute_engine->device_info(); - auto arch = dev_info->gpu_arch(); + arch_ = dev_info->gpu_arch(); auto *d = desc(); /* Get device information */ @@ -45,20 +47,52 @@ status_t micro_sdpa_t::pd_t::init_microkernels(engine_t *engine) { if (hw_info.gmdid == 0) return status::unimplemented; - auto max_wg_slm = dev_info->max_slm_size_per_tg(arch); + sg_size_ = dev_info->min_subgroup_size(); + + /* Choose kernel configuration */ + std::vector reqs_kq; + int unroll_m_kq = 32, unroll_n_kq = 32; + int unroll_m_vs = 32, unroll_n_vs = 16; + int wg_m_kq = 4, wg_n_kq = 4; + int wg_m_vs = 2, wg_n_vs = 8; + + if (d->head_size() > 64) return status::unimplemented; + + switch (arch_) { + case arch_t::xe_hpg: + unroll_m_kq /= 2; + unroll_m_vs /= 2; + wg_m_kq *= 2; + wg_m_vs *= 2; + break; + case arch_t::xe_hpc: + case arch_t::xe2: break; + default: return status::unimplemented; + } + + auto convert_dnnl_to_kernel_layout = [](const memory_desc_t *md) { + return (gemm_desc_t::get_trans(*md) == dnnl_trans) ? 
MatrixLayout::T + : MatrixLayout::N; + }; /* Set up GEMMProblem structure for first GEMM: K^T * Q */ GEMMProblem problem; - problem.Ta = problem.Ta_ext = Type::f16; - problem.Tb = problem.Tb_ext = Type::f16; + problem.Ta = problem.Ta_ext + = jit::convert_dnnl_to_kernel_type(key_md()->data_type); + problem.Tb = problem.Tb_ext + = jit::convert_dnnl_to_kernel_type(qry_md()->data_type); problem.Tc = problem.Tc_ext = Type::f32; problem.Ts = problem.Tc; - problem.A.layout = MatrixLayout::T; - problem.B.layout = MatrixLayout::N; - problem.C.layout = MatrixLayout::N; - problem.A.setAlignment(alignmentForLD(d->head_size() * problem.Ta)); - problem.B.setAlignment(alignmentForLD(d->head_size() * problem.Tb)); - problem.C.setAlignment(problem.Tc.size()); + + auto problem_kq = problem; + problem_kq.A.layout = convert_dnnl_to_kernel_layout(key_md()); + problem_kq.B.layout = MatrixLayout::Pr; + problem_kq.C.layout = MatrixLayout::T; + problem_kq.A.setAlignment(alignmentForLD(d->head_size() * problem.Ta)); + problem_kq.B.setAlignment(64); // Q is packed in VNNI format in SLM + problem_kq.B.crosspack = 2; + problem_kq.B.tileR = d_max(); + problem_kq.B.tileC = sg_size_; /* Set up problem size information */ SizeParams sizes; @@ -67,46 +101,149 @@ status_t micro_sdpa_t::pd_t::init_microkernels(engine_t *engine) { sizes.k = d->head_size(); sizes.batch = d->batch_size(); - /* Set up special kernel requirements */ - std::vector reqs_kq; - reqs_kq.push_back(StrategyRequirement::WGTileMN <= max_wg_slm / 4); - reqs_kq.push_back(StrategyRequirement::WGTileM >= sizes.m); + /* Set up microkernel requirements */ + reqs_kq.push_back(StrategyRequirement::UnrollM == unroll_m_kq); + reqs_kq.push_back(StrategyRequirement::UnrollN == unroll_n_kq); + reqs_kq.push_back(StrategyRequirement::WGM == wg_m_kq); + reqs_kq.push_back(StrategyRequirement::WGN == wg_n_kq); + + /* Set up microkernel options */ + micro::GEMMProtocol::Options opts_kq; + opts_kq.localB = true; + opts_kq.slmPtr = true; /* Ask 
microkernel provider for microkernel */ try { gemm_kq_ = selectGEMMMicrokernel( - micro::GEMMProtocol(), hw_info, sizes, problem, reqs_kq); + opts_kq, hw_info, sizes, problem_kq, reqs_kq); } catch (...) { return status::unimplemented; } /* Update for second GEMM: V*S */ - problem.A.layout = MatrixLayout::N; - problem.B.setAlignment(64); - sizes.m = d->head_size(); + auto problem_vs = problem; + problem_vs.Ta = problem_vs.Ta_ext + = jit::convert_dnnl_to_kernel_type(val_md()->data_type); + problem_vs.A.layout = convert_dnnl_to_kernel_layout(val_md()); + problem_vs.B.layout = MatrixLayout::Pr; + problem_vs.C.layout = MatrixLayout::N; + problem_vs.A.setAlignment(alignmentForLD(d->head_size() * problem.Ta)); + problem_vs.B.setAlignment(64); // S is packed in SLM + problem_vs.B.crosspack = 16; + sizes.m = d->values(); sizes.n = gemm_kq_.getSetting("wg_tile_n"); - sizes.k = d->keys(); + sizes.k = gemm_kq_.getSetting("wg_tile_m"); /* Set up special kernel requirements */ - int sg_per_wg = gemm_kq_.getSetting("sg_per_wg_m") - * gemm_kq_.getSetting("sg_per_wg_n"); - std::vector reqs_vs; - reqs_vs.push_back(StrategyRequirement::WGTileM - >= sizes.m); /* could relax with loop over d */ - reqs_vs.push_back(StrategyRequirement::WGTileN >= sizes.n); - reqs_vs.push_back(StrategyRequirement::WG == sg_per_wg); + reqs_vs.push_back(StrategyRequirement::UnrollM == unroll_m_vs); + reqs_vs.push_back(StrategyRequirement::UnrollN == unroll_n_vs); + reqs_vs.push_back(StrategyRequirement::WGM == wg_m_vs); + reqs_vs.push_back(StrategyRequirement::WGN == wg_n_vs); micro::GEMMProtocol::Options opts_vs; opts_vs.localB = true; + opts_vs.slmPtr = true; /* Ask microkernel provider for microkernel */ try { gemm_vs_ = selectGEMMMicrokernel( - opts_vs, hw_info, sizes, problem, reqs_vs); + opts_vs, hw_info, sizes, problem_vs, reqs_vs); } catch (...) 
{ return status::unimplemented; } return status::success; } +status_t micro_sdpa_t::init(engine_t *engine) { + using namespace micro; + + compute::kernel_ctx_t kernel_ctx; + + auto *d = pd()->desc(); + + kernel_ctx.set_data_type(pd()->dst_md()->data_type); + + int ndims = 4; + const memory_desc_wrapper qry_mdw(pd()->qry_md()); + const memory_desc_wrapper key_mdw(pd()->key_md()); + const memory_desc_wrapper val_mdw(pd()->val_md()); + const memory_desc_wrapper dst_mdw(pd()->dst_md()); + const memory_desc_wrapper msk_mdw(pd()->attn_mask_md()); + using offset_t = decltype(offsets_t().src_off); + offset_t qry_off, key_off, val_off, dst_off, msk_off; + set_offsets(qry_mdw, qry_off); + set_offsets(key_mdw, key_off); + set_offsets(val_mdw, val_off); + set_offsets(dst_mdw, dst_off); + set_offsets(msk_mdw, msk_off); + def_offsets(qry_off, kernel_ctx, "QRY", ndims); + def_offsets(key_off, kernel_ctx, "KEY", ndims); + def_offsets(val_off, kernel_ctx, "VAL", ndims); + def_offsets(dst_off, kernel_ctx, "DST", ndims); + def_offsets(msk_off, kernel_ctx, "MSK", ndims); + kernel_ctx.define_int("NDIMS", ndims); + + auto ldq = gemm_desc_t::get_ld(*pd()->qry_md()) * qry_mdw.data_type_size(); + auto ldk = gemm_desc_t::get_ld(*pd()->key_md()) * key_mdw.data_type_size(); + auto ldv = gemm_desc_t::get_ld(*pd()->val_md()) * val_mdw.data_type_size(); + auto lda = gemm_desc_t::get_ld(*pd()->dst_md()) * dst_mdw.data_type_size(); + kernel_ctx.define_int("Q_ALIGN", jit::alignmentForLD(int(ldq))); + kernel_ctx.define_int("K_ALIGN", jit::alignmentForLD(int(ldk))); + kernel_ctx.define_int("V_ALIGN", jit::alignmentForLD(int(ldv))); + kernel_ctx.define_int("A_ALIGN", jit::alignmentForLD(int(lda))); + + def_data_type(kernel_ctx, d->scale_dt, "SCALE"); + kernel_ctx.define_int("INVERT_SCALE", d->invert_scale); + + kernel_ctx.define_int("WITH_ATTN_MASK", pd()->with_attn_mask()); + + kernel_ctx.define_int("SUBGROUP_SIZE", pd()->sg_size()); + kernel_ctx.define_int("D_MAX", pd()->d_max()); + + bool d_full = 
(d->head_size() == pd()->d_max()); + int tile_m = pd()->gemm_kq().getSetting("wg_tile_m"); + int tile_n = pd()->gemm_kq().getSetting("wg_tile_n"); + + if (d_full && (d->queries() % tile_n) == 0) { + if (ldq % 4 == 0) kernel_ctx.define_int("BLOCK_Q", 1); + if (lda % 4 == 0) kernel_ctx.define_int("BLOCK_A", 1); + } else if (lda % 16 == 0) + kernel_ctx.define_int("BLOCK_2D_A", 1); + + if (pd()->arch() >= compute::gpu_arch_t::xe_hpc) { + kernel_ctx.define_int("PREFETCH_MASK", 1); + if (d_full) { + if (d->keys() >= tile_m) kernel_ctx.define_int("PREFETCH_K0", 1); + if (d->keys() % tile_m == 0) { + kernel_ctx.define_int("PREFETCH_K", 1); + kernel_ctx.define_int("PREFETCH_V", 1); + } + } + } + + /* Generate microkernel shims */ + ShimOptions shimOptions; + shimOptions.subgroupSize = pd()->sg_size(); + shimOptions.useTileOps = true; + shimOptions.decorator = "kq"; + + kernel_ctx.add_custom_header("gemm_kq.h", + micro::generateShim( + pd()->gemm_kq(), HostLanguage::OpenCL_C, shimOptions)); + + shimOptions.microkernelID++; + shimOptions.decorator = "vs"; + + kernel_ctx.add_custom_header("gemm_vs.h", + micro::generateShim( + pd()->gemm_vs(), HostLanguage::OpenCL_C, shimOptions)); + + if (pd()->gemm_kq().grfMin > 128 || pd()->gemm_vs().grfMin > 128) + kernel_ctx.add_option("-cl-intel-256-GRF-per-thread"); + + CHECK(create_kernel(engine, &kernel_, "micro_sdpa", kernel_ctx)); + if (!kernel_) return status::runtime_error; + return status::success; +} + status_t micro_sdpa_t::execute(const exec_ctx_t &ctx) const { const auto &qry = CTX_IN_STORAGE(DNNL_ARG_QUERIES); const auto &key = CTX_IN_STORAGE(DNNL_ARG_KEYS); @@ -120,13 +257,9 @@ status_t micro_sdpa_t::execute(const exec_ctx_t &ctx) const { const dim_t D = pd()->desc()->head_size(); auto &gemm_kq = pd()->gemm_kq(); - auto wg_tile_k = gemm_kq.getSetting("wg_tile_m"); auto wg_tile_q = gemm_kq.getSetting("wg_tile_n"); auto sg_per_wg = gemm_kq.getSetting("sg_per_wg_m") * gemm_kq.getSetting("sg_per_wg_n"); - auto slm_stride = 
std::max(wg_tile_k, 4 * sg_size_); - auto slm = std::max(gemm_kq.getSetting("slm_size"), - slm_stride * wg_tile_q * sizeof(float)); compute::kernel_arg_list_t arg_list; arg_list.set(0, key); @@ -138,9 +271,8 @@ status_t micro_sdpa_t::execute(const exec_ctx_t &ctx) const { arg_list.set(6, (int)D); arg_list.set(7, (int)K); arg_list.set(8, (int)Q); - arg_list.set(9, slm, nullptr); - compute::range_t lws = {(size_t)sg_size_, (size_t)sg_per_wg, 1}; + compute::range_t lws = {(size_t)pd()->sg_size(), (size_t)sg_per_wg, 1}; compute::range_t gws = lws; gws[0] *= utils::div_up(Q, wg_tile_q); diff --git a/src/gpu/intel/ocl/micro_sdpa.hpp b/src/gpu/intel/ocl/micro_sdpa.hpp index d86ba55bbe7..e47e370dad9 100644 --- a/src/gpu/intel/ocl/micro_sdpa.hpp +++ b/src/gpu/intel/ocl/micro_sdpa.hpp @@ -72,8 +72,6 @@ struct micro_sdpa_t : public gpu_primitive_t { return status::success; } - status_t init_microkernels(engine_t *engine); - status_t set_default_format(memory_desc_t &md, bool transposed) { using namespace format_tag; memory_desc_wrapper mdw(md); @@ -97,77 +95,28 @@ struct micro_sdpa_t : public gpu_primitive_t { const micro::Package &gemm_kq() const { return gemm_kq_; } const micro::Package &gemm_vs() const { return gemm_vs_; } + int sg_size() const { return sg_size_; } + + // Block size for head_size, which must be hard-coded into the kernel. 
+ int d_max() const { return utils::rnd_up(desc()->head_size(), 32); } + + compute::gpu_arch_t arch() const { return arch_; } + private: micro::Package gemm_kq_, gemm_vs_; + int sg_size_ = 0; + compute::gpu_arch_t arch_; + + status_t init_microkernels(engine_t *engine); }; - status_t init(engine_t *engine) override { - using namespace micro; - - assert(engine->kind() == engine_kind::gpu); - auto *compute_engine - = utils::downcast(engine); - sg_size_ = compute_engine->device_info()->min_subgroup_size(); - - compute::kernel_ctx_t kernel_ctx; - - kernel_ctx.set_data_type(pd()->dst_md()->data_type); - - int ndims = 4; - const memory_desc_wrapper qry_mdw(pd()->qry_md()); - const memory_desc_wrapper key_mdw(pd()->key_md()); - const memory_desc_wrapper val_mdw(pd()->val_md()); - const memory_desc_wrapper dst_mdw(pd()->dst_md()); - const memory_desc_wrapper msk_mdw(pd()->attn_mask_md()); - using offset_t = decltype(offsets_t().src_off); - offset_t qry_off, key_off, val_off, dst_off, msk_off; - set_offsets(qry_mdw, qry_off); - set_offsets(key_mdw, key_off); - set_offsets(val_mdw, val_off); - set_offsets(dst_mdw, dst_off); - set_offsets(msk_mdw, msk_off); - def_offsets(qry_off, kernel_ctx, "QRY", ndims); - def_offsets(key_off, kernel_ctx, "KEY", ndims); - def_offsets(val_off, kernel_ctx, "VAL", ndims); - def_offsets(dst_off, kernel_ctx, "DST", ndims); - def_offsets(msk_off, kernel_ctx, "MSK", ndims); - kernel_ctx.define_int("NDIMS", ndims); - - kernel_ctx.define_int("SUBGROUP_SIZE", sg_size_); - kernel_ctx.define_int("INVERT_SCALE", pd()->desc()->invert_scale); - kernel_ctx.define_int("WITH_ATTN_MASK", pd()->with_attn_mask()); - def_data_type(kernel_ctx, pd()->desc()->scale_dt, "SCALE"); - - /* Generate microkernel shims */ - ShimOptions shimOptions; - shimOptions.subgroupSize = sg_size_; - shimOptions.useTileOps = true; - shimOptions.decorator = "kq"; - - kernel_ctx.add_custom_header("gemm_kq.h", - micro::generateShim( - pd()->gemm_kq(), HostLanguage::OpenCL_C, 
shimOptions)); - - shimOptions.microkernelID++; - shimOptions.decorator = "vs"; - - kernel_ctx.add_custom_header("gemm_vs.h", - micro::generateShim( - pd()->gemm_vs(), HostLanguage::OpenCL_C, shimOptions)); - - if (pd()->gemm_kq().grfMin > 128 || pd()->gemm_vs().grfMin > 128) - kernel_ctx.add_option("-cl-intel-256-GRF-per-thread"); - - CHECK(create_kernel(engine, &kernel_, "micro_sdpa", kernel_ctx)); - if (!kernel_) return status::runtime_error; - return status::success; - } + status_t init(engine_t *engine) override; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } status_t execute(const exec_ctx_t &ctx) const override; + compute::kernel_t kernel_; - int sg_size_ = 0; }; } // namespace ocl diff --git a/src/gpu/intel/ocl/tile_ops.h b/src/gpu/intel/ocl/tile_ops.h index d56f74aba32..35a7ea3d80a 100644 --- a/src/gpu/intel/ocl/tile_ops.h +++ b/src/gpu/intel/ocl/tile_ops.h @@ -17,6 +17,85 @@ #ifndef GPU_OCL_TILE_OPS_H #define GPU_OCL_TILE_OPS_H +float __builtin_IB_atomic_max_local_f32(__local float *, float); + +__attribute__((overloadable)) float local_atomic_max(local float *p, float v) { + return __builtin_IB_atomic_max_local_f32(p, v); +} + +__attribute__((overloadable)) half local_atomic_max( + local half *p, half v) { /* not implemented */ + return v; +} + +__attribute__((overloadable)) uint local_atomic_max(local uint *p, uint v) { + return atomic_max(p, v); +} + +__attribute__((overloadable)) int local_atomic_max(local int *p, int v) { + return atomic_max(p, v); +} + +#define DEF_BLOCK_LOAD_STORE(type, itype, suffix, n1, n) \ + __attribute__((overloadable)) type##n block_load( \ + const global type *p, int vlen) \ + __attribute__((enable_if(vlen == n1, "wrong vector length"))) { \ + return as_##type##n( \ + intel_sub_group_block_read##suffix##n((global void *)p)); \ + } \ + __attribute__((overloadable)) void block_store( \ + global type *p, type##n v) { \ + intel_sub_group_block_write##suffix##n( \ + (global itype *)p, 
as_##itype##n(v)); \ + } + +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 1, ) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 2, 2) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 4, 4) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 8, 8) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 16, 16) +DEF_BLOCK_LOAD_STORE(uint, uint, , 1, ) +DEF_BLOCK_LOAD_STORE(uint, uint, , 2, 2) +DEF_BLOCK_LOAD_STORE(uint, uint, , 4, 4) +DEF_BLOCK_LOAD_STORE(uint, uint, , 8, 8) + +#define DEF_BLOCK2D_LOAD_STORE(type, itype, vl, SG, suffix, BR, BC) \ + itype##vl __builtin_IB_subgroup_block_read_flat_##suffix( \ + long, int, int, int, int2); \ + void __builtin_IB_subgroup_block_write_flat_##suffix( \ + long, int, int, int, int2, itype##vl); \ + __attribute__((overloadable)) type##vl block2d_load(const global type *p, \ + int w, int h, int ld, int x, int y, int br, int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + int2 coord = {x, y}; \ + return as_##type##vl(__builtin_IB_subgroup_block_read_flat_##suffix( \ + as_long(p), w - 1, h - 1, ld - 1, coord)); \ + } \ + __attribute__((overloadable)) void block2d_store(type##vl v, \ + global type *p, int w, int h, int ld, int x, int y, int br, \ + int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + int2 coord = {x, y}; \ + __builtin_IB_subgroup_block_write_flat_##suffix( \ + as_long(p), w - 1, h - 1, ld - 1, coord, as_##itype##vl(v)); \ + } + +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m4k32v1, 32, 4) +DEF_BLOCK2D_LOAD_STORE(half, ushort, 16, 16, u16_m8k32v1, 32, 8) + +#define tile_fill(t, v) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = v; \ + } while (0) + #define tile_elementwise(t, f) \ do { \ 
_Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ @@ -24,6 +103,24 @@ = f(t.x[i]); \ } while (0) +#define tile_elementwise_s(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]); \ + s++) t.x[i][s] \ + = f(t.x[i][s]); \ + } \ + } while (0) + +#define tile_binary(t, t2, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i], t2.x[i]); \ + } while (0) + #define tile_copy(t, t_new) \ do { \ _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ @@ -31,18 +128,106 @@ = __builtin_convertvector(t.x[i], __typeof__(t_new.x[i])); \ } while (0) +#define tile_copy_to_half2(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]) / 2; \ + s++) { \ + half2 v = {t.x[i][2 * s], t.x[i][2 * s + 1]}; \ + t_new.x[i][s] = as_uint(v); \ + } \ + } \ + } while (0) + +#define tile_access(t, i0, j, sg, br, bc, nbr) \ + (t).x[(i0) / (br) + (nbr) * ((j) / (bc))] \ + [((i0) % (br)) / (sg) + ((j) % (bc)) * ((br) / (sg))] + +#define xlane_tile_access(t, i, j, sg, br, bc, nbr) \ + sub_group_broadcast(tile_access(t, i, j, sg, br, bc, nbr), i % sg) + #define DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const local element_type *ptr, int 
ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_load_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + 
} \ + } \ + } \ __attribute__((overloadable)) void tile_store(tile_type t, \ - global element_type *ptr, int ld, int m, int n, int offset_r, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ int offset_c) { \ - ptr += m * offset_c + offset_r; \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_store_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ if (offset_c + j < n) { \ _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ int i = i0 + get_sub_group_local_id(); \ if (offset_r + i < m) \ - ptr[i] = t.x[i0 / br + nbr * (j / bc)] \ - [(i0 % br) / sg + (j % bc) * (br / sg)]; \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ } \ } \ } \ @@ -50,16 +235,140 @@ __attribute__((overloadable)) void tile_store(tile_type t, \ global element_type *ptr, int m, int n, int offset_r, \ int offset_c) { \ - tile_store(t, ptr, m, m, n, offset_r, offset_c); \ + tile_store(t, ptr, m, n, m, offset_r, offset_c); \ } \ - __attribute__((overloadable)) void tile_store_full(tile_type t, \ + __attribute__((overloadable)) void tile_store_t_sys_src1(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (sg - 1); \ + int offset_r1 = offset_r & ~(sg - 1); \ + ptr += offset_r0 + sg * offset_c + ld * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, ptr += sg * sg) { \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) ptr[i] \ + = tile_access(t, j0, i, sg, br, bc, nbr); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_t_sys_src2(tile_type t, \ + local element_type *ptr, int tile_n, int ld, int offset_r, \ + int offset_c) { \ + const int cp = 32 / sizeof(element_type); \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (cp - 1); \ + int offset_r1 = offset_r & ~(cp - 
1); \ + ptr += offset_r0 + tile_n * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, offset_c += sg) { \ + int offset_c0 = offset_c & (tile_n - 1); \ + int offset_c1 = offset_c & ~(tile_n - 1); \ + local element_type *ptr_j = ptr + cp * offset_c0 + ld * offset_c1; \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) { \ + *ptr_j = tile_access(t, j0, i, sg, br, bc, nbr); \ + ptr_j++; \ + if ((~i & (cp - 1)) == 0) ptr_j += cp * (tile_n - 1); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_atomic_max_full(tile_type t, \ local element_type *ptr, int ld, int offset_r, int offset_c) { \ ptr += ld * offset_c + offset_r; \ _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ int i = i0 + get_sub_group_local_id(); \ - ptr[i] = t.x[i0 / br + nbr * (j / bc)] \ - [(i0 % br) / sg + (j % bc) * (br / sg)]; \ + local_atomic_max( \ + ptr + i, tile_access(t, i0, j, sg, br, bc, nbr)); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_VREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_vreduce_add( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + += tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vreduce_max( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + = max(tile_access(t, i0, j, sg, br, bc, nbr), \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr)); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_sub( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; 
j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + -= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_HREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_hbroadcast_add( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + += xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_hbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_RSELECT(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_rselect( \ + tile_type0 *t0, tile_type1 t1, int idx) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0, j, sg1, br1, bc1, nbr1); \ + _Pragma("unroll") for (int z = 1; \ + z < (br1 * nbr1 / br0 * nbr0); \ + z++) if (z == idx) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0 + z * br0 * nbr0, j, sg1, \ + br1, bc1, nbr1); \ + } \ + } \ + } \ + } 
+ +#define DECLARE_2D_TILE_COPY_REBLOCK(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_copy_reblock( \ + tile_type0 t0, tile_type1 *t1) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t1, i0, j, sg1, br1, bc1, nbr1) \ + = tile_access(t0, i0, j, sg0, br0, bc0, nbr0); \ } \ } \ } @@ -72,4 +381,122 @@ } tile_type; \ DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) +/* Requires bc == 1 currently */ +#define DECLARE_2D_TILE_BLOCK_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block_load(ptr + ii * br, br / SUBGROUP_SIZE); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) \ + block_store(ptr + ii * br, (t).x[ii + nbr * jj]); \ + } \ + } + +#define DECLARE_2D_TILE_BLOCK2D_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block2d_load(ptr, m * e, n, ld * e, offset_r + ii * br, \ + offset_c + jj * bc, br, bc, sg); \ + } \ + } \ 
+ __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) block2d_store( \ + (t).x[ii + nbr * jj], ptr, m *e, n, ld *e, \ + offset_r + ii * br, offset_c + jj * bc, br, bc, sg); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define DECLARE_2D_TILE_LOAD_PACKED_HALF(tile_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = 2 * (i0 + get_sub_group_local_id()); \ + half2 loaded = 0; \ + if (offset_r + i < m) loaded.s0 = ptr[i]; \ + if (offset_r + i + 1 < m) loaded.s1 = ptr[i + 1]; \ + tile_access(*t, i0, j, sg, br, bc, nbr) = as_uint(loaded); \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_packed_half(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define cooperative_prefetch_2d(ptr, r, c, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_internal(ptr, (r) * sizeof(*(ptr)), c, \ + (ld) * sizeof(*(ptr)), sg_id, n_sg, sg_size, 
caching) + +/* IGC prefetch intrinsics */ +enum LSC_LDCC { + LSC_LDCC_DEFAULT = 0, + LSC_LDCC_L1UC_L3UC = 1, + LSC_LDCC_L1UC_L3C = 2, + LSC_LDCC_L1C_L3UC = 3, + LSC_LDCC_L1C_L3C = 4, + LSC_LDCC_L1S_L3UC = 5, + LSC_LDCC_L1S_L3C = 6, + LSC_LDCC_L1IAR_L3C = 7, +}; + +extern void __builtin_IB_lsc_prefetch_global_uint( + const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); + +void cooperative_prefetch_2d_internal(const global char *ptr, uint rbytes, + uint c, uint ld_bytes, uint sg_id, uint n_sg, uint sg_size, + enum LSC_LDCC caching) { + const uint cl_per_row = (rbytes + 63) >> 6; + const uint cl = cl_per_row * c; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; + const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; +#pragma unroll + for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { + uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint r_cl = i_cl % cl_per_row; + uint c_cl = i_cl / cl_per_row; + if (i_cl < cl) { + __builtin_IB_lsc_prefetch_global_uint( + (const global uint *)(ptr + r_cl * 64 + c_cl * ld_bytes), 0, + caching); + } + } +} + #endif \ No newline at end of file From 58d58208e3d6ac698a5f3e451ddbcab2d9f5d933 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Fri, 3 May 2024 07:20:03 -0700 Subject: [PATCH 091/187] gpu: ocl: enable ref_sdpa only on demand --- src/gpu/gpu_sdpa_list.cpp | 2 ++ src/gpu/intel/ocl/ref_sdpa.hpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp index c45e6b430e8..42ea1d00f66 100644 --- a/src/gpu/gpu_sdpa_list.cpp +++ b/src/gpu/gpu_sdpa_list.cpp @@ -30,7 +30,9 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = { INSTANCE(intel::ocl::micro_sdpa_t) +#ifdef DNNL_DEV_MODE INSTANCE(intel::ocl::ref_sdpa_t) +#endif nullptr, }; // clang-format on diff --git a/src/gpu/intel/ocl/ref_sdpa.hpp b/src/gpu/intel/ocl/ref_sdpa.hpp index 208a3fdf7b1..4ec9477bbd5 100644 --- a/src/gpu/intel/ocl/ref_sdpa.hpp +++ 
b/src/gpu/intel/ocl/ref_sdpa.hpp @@ -46,6 +46,10 @@ struct ref_sdpa_t : public gpu_primitive_t { using namespace data_type; using smask_t = primitive_attr_t::skip_mask_t; + /* Reference SDPA is only enabled on-demand, for testing. */ + bool enable_ref = gpu_utils::dev_getenv("enable_ref_sdpa", false); + VDISPATCH_SDPA(enable_ref, VERBOSE_SKIP_PRIMITIVE_IMPL); + VDISPATCH_SDPA(attr()->has_default_values(smask_t::scales_runtime), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_SDPA( From 10a82850ad2242cb0a995fccd4b9a1362982769f Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Mon, 6 May 2024 14:44:27 -0700 Subject: [PATCH 092/187] gpu: microkernels: compaction support --- src/gpu/intel/microkernels/fuser.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/gpu/intel/microkernels/fuser.cpp b/src/gpu/intel/microkernels/fuser.cpp index 0e7dfbb0381..ec3e74addf3 100644 --- a/src/gpu/intel/microkernels/fuser.cpp +++ b/src/gpu/intel/microkernels/fuser.cpp @@ -89,17 +89,16 @@ void fuseMicrokernel(std::vector &binary, "IGC did not generate a valid zebin program binary"); auto *insn = reinterpret_cast(base + text->offset); - int icount = text->size >> 4; + auto *iend = reinterpret_cast( + base + text->offset + text->size); const uint8_t *spliceStart = nullptr; const uint8_t *spliceEnd = nullptr; - for (int inum = 0; inum < icount; inum++, insn += 4) { + for (; insn < iend; insn += 4) { if (insn[0] & (1u << 29)) - throw std::runtime_error( - "Found a compacted instruction. 
Please run with the " - "environment variable IGC_disableCompaction=1"); - if (insn[3] == (sigilStart ^ id)) + insn -= 2; + else if (insn[3] == (sigilStart ^ id)) spliceStart = reinterpret_cast(insn); else if (insn[3] == (sigilEnd ^ id)) { spliceEnd = reinterpret_cast(insn); @@ -178,10 +177,14 @@ void fuseMicrokernels(std::vector &binary, const char *source) { } static void fixupJumpTargets(uint8_t *start, size_t len, ptrdiff_t adjust) { - auto insn = reinterpret_cast(start); - auto icount = len >> 4; + auto istart = reinterpret_cast(start); + auto iend = reinterpret_cast(start + len); - for (size_t inum = 0; inum < icount; inum++, insn += 4) { + for (auto insn = istart; insn < iend; insn += 4) { + if (insn[0] & (1u << 29)) { + insn -= 2; /* skip compacted instructions */ + continue; + } uint8_t op = insn[0] & 0xFF; if ((op & 0xF0) != 0x20) continue; /* skip non-jumps */ if (op == 0x2B || op == 0x2D) continue; /* skip ret/calla */ @@ -189,12 +192,13 @@ static void fixupJumpTargets(uint8_t *start, size_t len, ptrdiff_t adjust) { || op == 0x2A || op == 0x2E); auto jumpFixup = [=](int32_t &ip) { - auto target = ptrdiff_t(inum << 4) + ip; + auto target = ((insn - istart) << 2) + ip; if (target < 0 || target >= ptrdiff_t(len)) ip += adjust; }; if (hasUIP) jumpFixup(insn[2]); jumpFixup(insn[3]); + insn += 4; } } From 0d6fefdd4b3f48865036e14fd965e9a0a17c6fa9 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 7 May 2024 17:13:31 -0700 Subject: [PATCH 093/187] sdpa: remove any format support --- src/common/sdpa_pd.hpp | 7 +------ src/gpu/intel/ocl/micro_sdpa.hpp | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/common/sdpa_pd.hpp b/src/common/sdpa_pd.hpp index 06e9ed6b2f0..2ada798ee9c 100644 --- a/src/common/sdpa_pd.hpp +++ b/src/common/sdpa_pd.hpp @@ -108,14 +108,9 @@ struct sdpa_pd_t : public primitive_desc_t { const hint_class *hint_fwd_pd) : primitive_desc_t(attr, base_pkind), desc_(*adesc) {} - // By default, we just resolve 'any' with blocked 
layout and trivial strides bool set_default_format(memory_desc_t *md) { memory_desc_wrapper mdw(md); - if (mdw.format_any()) { - if (mdw.has_runtime_dims_or_strides()) return false; - status_t status = memory_desc_init_by_strides(*md, nullptr); - if (status != status::success) return false; - } + if (mdw.format_any()) return false; return true; } diff --git a/src/gpu/intel/ocl/micro_sdpa.hpp b/src/gpu/intel/ocl/micro_sdpa.hpp index e47e370dad9..b075fbf7270 100644 --- a/src/gpu/intel/ocl/micro_sdpa.hpp +++ b/src/gpu/intel/ocl/micro_sdpa.hpp @@ -77,7 +77,7 @@ struct micro_sdpa_t : public gpu_primitive_t { memory_desc_wrapper mdw(md); auto exp_trans = transposed ? dnnl_trans : dnnl_notrans; if (mdw.format_any()) - CHECK(memory_desc_init_by_tag(md, transposed ? abdc : abcd)); + return status::unimplemented; else if (!is_md_gemm_compatible_plain_format(&md) || gemm_desc_t::get_trans(md) != exp_trans) return status::unimplemented; From 7ea91b02c25a7816dd972e79fa8ebcd79861cb64 Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Thu, 16 May 2024 09:09:59 -0700 Subject: [PATCH 094/187] x64: matmul: fix binary po batch bcast --- src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp index 78a994dc0f6..7f4fbdadffb 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp @@ -73,6 +73,7 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { broadcasting_strategy_t::per_mb_spatial, broadcasting_strategy_t::per_mb_w, broadcasting_strategy_t::per_w, + broadcasting_strategy_t::batch, broadcasting_strategy_t::spatial, broadcasting_strategy_t::no_broadcast}; const binary_injector::rhs_arg_static_params_t rhs_sp { From 4baa57cc1003a8d5b52ff8a47eb83d8592e3e51b Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 16 May 2024 10:11:47 -0700 Subject: [PATCH 095/187] x64: brgemm 1x1 conv: update brgemm 
parameters creation for 'reduced_rtus' --- src/cpu/x64/jit_brgemm_1x1_conv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpu/x64/jit_brgemm_1x1_conv.cpp b/src/cpu/x64/jit_brgemm_1x1_conv.cpp index 363482497a4..b94c4a1ecc1 100644 --- a/src/cpu/x64/jit_brgemm_1x1_conv.cpp +++ b/src/cpu/x64/jit_brgemm_1x1_conv.cpp @@ -134,12 +134,12 @@ status_t brgemm_1x1_convolution_fwd_t::pd_t::init(engine_t *engine) { auto vK = is_accum_kernel ? jcp_.rtus_ic_size : jcp_.ic_without_padding - jcp_.rtus_ic_size; + if (vM <= 0 || vK <= 0) continue; const bool use_rtus_LDA = is_accum_kernel; const auto LDA = use_rtus_LDA ? jcp_.rtus_padded_ic_size : jcp_.LDA; constexpr int extra_m_kernel_start_idx = 2; brgemm_init_params_.emplace_front( extra_m_kernel_start_idx + idx, vM, vN, vK, LDA); - assert(vM > 0 && vK > 0); } } From af4e339d5e07400541722f65b0f7615b010c57eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 16 May 2024 10:37:47 +0200 Subject: [PATCH 096/187] cpu: sycl: bugfix engine creation --- src/sycl/sycl_engine_base.hpp | 4 ++-- src/xpu/sycl/utils.cpp | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/sycl/sycl_engine_base.hpp b/src/sycl/sycl_engine_base.hpp index 8a0fd1a4775..6add8b06dd2 100644 --- a/src/sycl/sycl_engine_base.hpp +++ b/src/sycl/sycl_engine_base.hpp @@ -39,8 +39,8 @@ class sycl_engine_base_t : public gpu::intel::compute::compute_engine_t { public: sycl_engine_base_t(engine_kind_t kind, const ::sycl::device &dev, const ::sycl::context &ctx, size_t index) - : gpu::intel::compute::compute_engine_t(new xpu::sycl::engine_impl_t( - engine_kind::gpu, dev, ctx, index)) {} + : gpu::intel::compute::compute_engine_t( + new xpu::sycl::engine_impl_t(kind, dev, ctx, index)) {} status_t init() override { CHECK(init_impl()); diff --git a/src/xpu/sycl/utils.cpp b/src/xpu/sycl/utils.cpp index dd410df8916..044751abf63 100644 --- a/src/xpu/sycl/utils.cpp +++ b/src/xpu/sycl/utils.cpp @@ -155,14 +155,17 
@@ bool are_equal(const ::sycl::device &lhs, const ::sycl::device &rhs) { // Only one host device exists. if (lhs_be == backend_t::host) return true; -#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL \ + || DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL if (lhs_be == backend_t::opencl) { // Use wrapper objects to avoid memory leak. auto lhs_ocl_handle = compat::get_native(lhs); auto rhs_ocl_handle = compat::get_native(rhs); return lhs_ocl_handle == rhs_ocl_handle; } +#endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL if (lhs_be == backend_t::level0) { return gpu::intel::sycl::compare_ze_devices(lhs, rhs); } From 64acc16c5f06485ddee8e289b4b283f3266b6261 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 13 May 2024 13:53:53 -0700 Subject: [PATCH 097/187] gpu: keep impl list getters in a single place --- src/gpu/gpu_concat_list.cpp | 6 +++--- src/gpu/gpu_impl_list.cpp | 13 +++++++++++++ src/gpu/gpu_impl_list.hpp | 13 +++++++++---- src/gpu/gpu_reorder_list.cpp | 6 +++--- src/gpu/gpu_sum_list.cpp | 6 +++--- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/gpu/gpu_concat_list.cpp b/src/gpu/gpu_concat_list.cpp index 128b5310a2e..856ab9ab0bb 100644 --- a/src/gpu/gpu_concat_list.cpp +++ b/src/gpu/gpu_concat_list.cpp @@ -33,7 +33,7 @@ namespace { __VA_ARGS__::pd_t>()), // clang-format off -constexpr impl_list_item_t concat_impl_list[] = REG_CONCAT_P({ +constexpr impl_list_item_t impl_list[] = REG_CONCAT_P({ CONCAT_INSTANCE(intel::ocl::simple_concat_t) CONCAT_INSTANCE(intel::ocl::gen9_concat_t) CONCAT_INSTANCE(intel::ocl::multi_concat_t) @@ -44,8 +44,8 @@ constexpr impl_list_item_t concat_impl_list[] = REG_CONCAT_P({ #undef INSTANCE } // namespace -const impl_list_item_t *gpu_impl_list_t::get_concat_implementation_list() { - return concat_impl_list; +const impl_list_item_t *get_concat_impl_list() { + return impl_list; } } // namespace gpu diff --git a/src/gpu/gpu_impl_list.cpp b/src/gpu/gpu_impl_list.cpp index 
e044e7dbf89..c056be21cec 100644 --- a/src/gpu/gpu_impl_list.cpp +++ b/src/gpu/gpu_impl_list.cpp @@ -55,6 +55,19 @@ const impl_list_item_t *gpu_impl_list_t::get_implementation_list( // clang-format on } +const impl_list_item_t *gpu_impl_list_t::get_concat_implementation_list() { + return get_concat_impl_list(); +} + +const impl_list_item_t *gpu_impl_list_t::get_sum_implementation_list() { + return get_sum_impl_list(); +} + +const impl_list_item_t *gpu_impl_list_t::get_reorder_implementation_list( + const memory_desc_t *src_md, const memory_desc_t *dst_md) { + return get_reorder_impl_list(src_md, dst_md); +} + } // namespace gpu } // namespace impl } // namespace dnnl diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp index 43198956dfd..130a462e74c 100644 --- a/src/gpu/gpu_impl_list.hpp +++ b/src/gpu/gpu_impl_list.hpp @@ -59,14 +59,19 @@ DECLARE_IMPL_LIST(zero_pad); #undef DECLARE_IMPL_LIST +const impl_list_item_t *get_concat_impl_list(); +const impl_list_item_t *get_sum_impl_list(); +const impl_list_item_t *get_reorder_impl_list( + const memory_desc_t *, const memory_desc_t *); + class gpu_impl_list_t { public: - static const impl_list_item_t *get_concat_implementation_list(); - static const impl_list_item_t *get_reorder_implementation_list( - const memory_desc_t *src_md, const memory_desc_t *dst_md); - static const impl_list_item_t *get_sum_implementation_list(); static const impl_list_item_t *get_implementation_list( const op_desc_t *desc); + static const impl_list_item_t *get_concat_implementation_list(); + static const impl_list_item_t *get_sum_implementation_list(); + static const impl_list_item_t *get_reorder_implementation_list( + const memory_desc_t *, const memory_desc_t *); }; } // namespace gpu diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp index e50bd6d0bef..6ec7b675251 100644 --- a/src/gpu/gpu_reorder_list.cpp +++ b/src/gpu/gpu_reorder_list.cpp @@ -36,7 +36,7 @@ using namespace dnnl::impl::data_type; 
impl_list_item_t::reorder_type_deduction_helper_t<__VA_ARGS__>()), // clang-format off -constexpr impl_list_item_t reorder_impl_list[] = REG_REORDER_P({ +constexpr impl_list_item_t impl_list[] = REG_REORDER_P({ REORDER_INSTANCE(intel::ocl::rnn_weights_reorder_t::pd_t) REORDER_INSTANCE(intel::ocl::cross_engine_reorder_t::pd_t) REORDER_INSTANCE(intel::jit::gen_reorder_t::pd_t) @@ -49,9 +49,9 @@ constexpr impl_list_item_t reorder_impl_list[] = REG_REORDER_P({ } // namespace -const impl_list_item_t *gpu_impl_list_t::get_reorder_implementation_list( +const impl_list_item_t *get_reorder_impl_list( const memory_desc_t *, const memory_desc_t *) { - return reorder_impl_list; + return impl_list; } } // namespace gpu diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp index 1ef8307bdbc..4294620ec19 100644 --- a/src/gpu/gpu_sum_list.cpp +++ b/src/gpu/gpu_sum_list.cpp @@ -39,7 +39,7 @@ namespace { __VA_ARGS__::pd_t>()), // clang-format off -constexpr impl_list_item_t sum_impl_list[] = REG_SUM_P({ +constexpr impl_list_item_t impl_list[] = REG_SUM_P({ SUM_INSTANCE(intel::ocl::multi_po_reorder_sum) SUM_INSTANCE(intel::ocl::gen9_sum_t) SUM_INSTANCE(intel::ocl::many_inputs_sum_t) @@ -51,8 +51,8 @@ constexpr impl_list_item_t sum_impl_list[] = REG_SUM_P({ #undef INSTANCE } // namespace -const impl_list_item_t *gpu_impl_list_t::get_sum_implementation_list() { - return sum_impl_list; +const impl_list_item_t *get_sum_impl_list() { + return impl_list; } } // namespace gpu From ddacf6fb2e0c44a47e52120bcae6c2f5ee99df02 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 13 May 2024 15:52:07 -0700 Subject: [PATCH 098/187] gpu: remove inclusion of a redundant header --- src/gpu/gpu_sum_list.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp index 4294620ec19..ef7082f8616 100644 --- a/src/gpu/gpu_sum_list.cpp +++ b/src/gpu/gpu_sum_list.cpp @@ -16,7 +16,6 @@ #include "gpu/gpu_impl_list.hpp" -#include 
"common/impl_list_item.hpp" #include "common/utils.hpp" #include "gpu/gpu_sum_pd.hpp" #include "gpu/intel/jit/gen9_simple_sum.hpp" From e7599d244ff5aa8764be00b5bebdaca38d69a352 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Mon, 13 May 2024 19:10:09 -0700 Subject: [PATCH 099/187] gpu: introduce new instance macros --- src/gpu/amd/miopen_reorder_impl.cpp | 10 +- src/gpu/amd/sycl_hip_engine.cpp | 44 ++++----- src/gpu/gpu_batch_normalization_list.cpp | 42 ++++---- src/gpu/gpu_binary_list.cpp | 8 +- src/gpu/gpu_concat_list.cpp | 17 ++-- src/gpu/gpu_convolution_list.cpp | 37 +++---- src/gpu/gpu_deconvolution_list.cpp | 8 +- src/gpu/gpu_eltwise_list.cpp | 10 +- src/gpu/gpu_gemm_list.cpp | 20 ++-- src/gpu/gpu_group_normalization_list.cpp | 6 +- src/gpu/gpu_impl_list.hpp | 119 ++++++++++++++++++++++- src/gpu/gpu_inner_product_list.cpp | 16 +-- src/gpu/gpu_layer_normalization_list.cpp | 14 +-- src/gpu/gpu_lrn_list.cpp | 6 +- src/gpu/gpu_matmul_list.cpp | 6 +- src/gpu/gpu_pooling_list.cpp | 16 +-- src/gpu/gpu_prelu_list.cpp | 6 +- src/gpu/gpu_reduction_list.cpp | 25 +++-- src/gpu/gpu_reorder_list.cpp | 18 ++-- src/gpu/gpu_resampling_list.cpp | 8 +- src/gpu/gpu_rnn_list.cpp | 6 +- src/gpu/gpu_shuffle_list.cpp | 6 +- src/gpu/gpu_softmax_list.cpp | 12 ++- src/gpu/gpu_sum_list.cpp | 18 ++-- src/gpu/gpu_zero_pad_list.cpp | 4 +- src/gpu/nvidia/cudnn_concat.cpp | 12 ++- src/gpu/nvidia/cudnn_reorder_impl.cpp | 10 +- src/gpu/nvidia/cudnn_sum.cpp | 11 ++- src/gpu/nvidia/sycl_cuda_engine.cpp | 90 ++++++++--------- 29 files changed, 368 insertions(+), 237 deletions(-) diff --git a/src/gpu/amd/miopen_reorder_impl.cpp b/src/gpu/amd/miopen_reorder_impl.cpp index 87c1f0cc55d..03d0418f39f 100644 --- a/src/gpu/amd/miopen_reorder_impl.cpp +++ b/src/gpu/amd/miopen_reorder_impl.cpp @@ -15,9 +15,9 @@ * limitations under the License. 
*******************************************************************************/ #include "common/engine.hpp" -#include "common/impl_list_item.hpp" #include "gpu/amd/miopen_reorder.hpp" #include "gpu/amd/sycl_hip_engine.hpp" +#include "gpu/gpu_impl_list.hpp" #include "gpu/intel/ocl/cross_engine_reorder.hpp" namespace dnnl { @@ -27,14 +27,10 @@ namespace amd { namespace { -#define REORDER_INSTANCE(...) \ - impl_list_item_t( \ - impl_list_item_t::reorder_type_deduction_helper_t<__VA_ARGS__>()), - // clang-format off constexpr impl_list_item_t hip_reorder_impl_list[] = { - REORDER_INSTANCE(gpu::intel::ocl::cross_engine_reorder_t::pd_t) - REORDER_INSTANCE(miopen_reorder_t::pd_t) + GPU_REORDER_INSTANCE_AMD(gpu::intel::ocl::cross_engine_reorder_t::pd_t) + GPU_REORDER_INSTANCE_AMD(gpu::amd::miopen_reorder_t::pd_t) nullptr, }; // clang-format on diff --git a/src/gpu/amd/sycl_hip_engine.cpp b/src/gpu/amd/sycl_hip_engine.cpp index 7e9cef7526e..fc6191a870a 100644 --- a/src/gpu/amd/sycl_hip_engine.cpp +++ b/src/gpu/amd/sycl_hip_engine.cpp @@ -168,38 +168,38 @@ using namespace dnnl::impl::data_type; // clang-format off constexpr dnnl::impl::impl_list_item_t sycl_hip_impl_list[] = { // Binary - INSTANCE(miopen_binary_t) + GPU_INSTANCE_AMD(miopen_binary_t) // Elementwise - INSTANCE(miopen_eltwise_fwd_t) - INSTANCE(miopen_eltwise_bwd_t) + GPU_INSTANCE_AMD(miopen_eltwise_fwd_t) + GPU_INSTANCE_AMD(miopen_eltwise_bwd_t) // Softmax - INSTANCE(miopen_softmax_fwd_t) - INSTANCE(miopen_softmax_bwd_t) + GPU_INSTANCE_AMD(miopen_softmax_fwd_t) + GPU_INSTANCE_AMD(miopen_softmax_bwd_t) // LRN - INSTANCE(miopen_lrn_fwd_t) - INSTANCE(miopen_lrn_bwd_t) + GPU_INSTANCE_AMD(miopen_lrn_fwd_t) + GPU_INSTANCE_AMD(miopen_lrn_bwd_t) // Pooling - INSTANCE(miopen_pooling_fwd_t) - INSTANCE(miopen_pooling_bwd_t) + GPU_INSTANCE_AMD(miopen_pooling_fwd_t) + GPU_INSTANCE_AMD(miopen_pooling_bwd_t) // Reduction - INSTANCE(miopen_reduction_t) + GPU_INSTANCE_AMD(miopen_reduction_t) // MatMul - 
INSTANCE(miopen_matmul_t) + GPU_INSTANCE_AMD(miopen_matmul_t) // Inner Product - INSTANCE(miopen_gemm_inner_product_fwd_t) - INSTANCE(miopen_gemm_inner_product_bwd_data_t) - INSTANCE(miopen_gemm_inner_product_bwd_weights_t) + GPU_INSTANCE_AMD(miopen_gemm_inner_product_fwd_t) + GPU_INSTANCE_AMD(miopen_gemm_inner_product_bwd_data_t) + GPU_INSTANCE_AMD(miopen_gemm_inner_product_bwd_weights_t) // Convolution - INSTANCE(miopen_convolution_fwd_t) - INSTANCE(miopen_convolution_bwd_data_t) - INSTANCE(miopen_convolution_bwd_weights_t) + GPU_INSTANCE_AMD(miopen_convolution_fwd_t) + GPU_INSTANCE_AMD(miopen_convolution_bwd_data_t) + GPU_INSTANCE_AMD(miopen_convolution_bwd_weights_t) // Batch Normalization - INSTANCE(miopen_batch_normalization_fwd_t) - INSTANCE(miopen_batch_normalization_bwd_t) + GPU_INSTANCE_AMD(miopen_batch_normalization_fwd_t) + GPU_INSTANCE_AMD(miopen_batch_normalization_bwd_t) // Deconvolution - INSTANCE(miopen_deconvolution_fwd_t) - INSTANCE(miopen_deconvolution_bwd_data_t) - INSTANCE(miopen_deconvolution_bwd_weights_t) + GPU_INSTANCE_AMD(miopen_deconvolution_fwd_t) + GPU_INSTANCE_AMD(miopen_deconvolution_bwd_data_t) + GPU_INSTANCE_AMD(miopen_deconvolution_bwd_weights_t) nullptr, }; diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp index b3878668d8e..724dbdbb0b3 100644 --- a/src/gpu/gpu_batch_normalization_list.cpp +++ b/src/gpu/gpu_batch_normalization_list.cpp @@ -16,13 +16,19 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/bnorm/gen9_batch_normalization.hpp" #include "gpu/intel/ocl/bnorm/nhwc_batch_normalization.hpp" -#include "gpu/intel/ocl/bnorm/nhwc_reusable.hpp" #include "gpu/intel/ocl/bnorm/ref_batch_normalization.hpp" #include "gpu/intel/ocl/bnorm/reusable_bnorm.hpp" #include "gpu/intel/ocl/bnorm/simple_bnorm.hpp" +#ifdef DNNL_DEV_MODE +#include "gpu/intel/ocl/bnorm/nhwc_reusable.hpp" +#endif + +#endif + namespace dnnl { namespace impl { namespace 
gpu { @@ -30,35 +36,25 @@ namespace gpu { namespace { using namespace dnnl::impl::prop_kind; -#ifdef DNNL_DEV_MODE -#define NHWC_REUSABLE_FWD_INSTANCE \ - INSTANCE(intel::ocl::nhwc_reusable_batch_normalization_fwd_t) -#define NHWC_REUSABLE_BWD_INSTANCE \ - INSTANCE(intel::ocl::nhwc_reusable_batch_normalization_bwd_t) -#else -#define NHWC_REUSABLE_FWD_INSTANCE -#define NHWC_REUSABLE_BWD_INSTANCE -#endif - // clang-format off const std::map> impl_list_map REG_BNORM_P({ {{forward}, { - NHWC_REUSABLE_FWD_INSTANCE - INSTANCE(intel::ocl::nhwc_batch_normalization_fwd_t) - INSTANCE(intel::ocl::gen9_batch_normalization_fwd_t) - INSTANCE(intel::ocl::simple_batch_normalization_fwd_t) - INSTANCE(intel::ocl::reusable_batch_normalization_fwd_t) - INSTANCE(intel::ocl::ref_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::nhwc_reusable_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::nhwc_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - NHWC_REUSABLE_BWD_INSTANCE - INSTANCE(intel::ocl::nhwc_batch_normalization_bwd_t) - INSTANCE(intel::ocl::gen9_batch_normalization_bwd_t) - INSTANCE(intel::ocl::simple_batch_normalization_bwd_t) - INSTANCE(intel::ocl::reusable_batch_normalization_bwd_t) - INSTANCE(intel::ocl::ref_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::nhwc_reusable_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::nhwc_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_bwd_t) nullptr, })}, }); diff --git 
a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp index 8fc3dcb4b50..1c43b9c06b6 100644 --- a/src/gpu/gpu_binary_list.cpp +++ b/src/gpu/gpu_binary_list.cpp @@ -16,9 +16,11 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/gen9_binary.hpp" #include "gpu/intel/ocl/multi_po_reorder_binary.hpp" #include "gpu/intel/ocl/ref_binary.hpp" +#endif namespace dnnl { namespace impl { @@ -28,9 +30,9 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = REG_BINARY_P({ - INSTANCE(intel::ocl::multi_po_reorder_binary) - INSTANCE(intel::ocl::gen9_binary_t) - INSTANCE(intel::ocl::ref_binary_t) + GPU_INSTANCE_INTEL(intel::ocl::multi_po_reorder_binary) + GPU_INSTANCE_INTEL(intel::ocl::gen9_binary_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_binary_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_concat_list.cpp b/src/gpu/gpu_concat_list.cpp index 856ab9ab0bb..ba062736984 100644 --- a/src/gpu/gpu_concat_list.cpp +++ b/src/gpu/gpu_concat_list.cpp @@ -14,34 +14,31 @@ * limitations under the License. *******************************************************************************/ -#include "common/impl_list_item.hpp" - #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/gen9_concat.hpp" #include "gpu/intel/ocl/multi_concat.hpp" #include "gpu/intel/ocl/ref_concat.hpp" #include "gpu/intel/ocl/simple_concat.hpp" +#endif namespace dnnl { namespace impl { namespace gpu { namespace { -#define CONCAT_INSTANCE(...) 
\ - impl_list_item_t(impl_list_item_t::concat_type_deduction_helper_t< \ - __VA_ARGS__::pd_t>()), // clang-format off constexpr impl_list_item_t impl_list[] = REG_CONCAT_P({ - CONCAT_INSTANCE(intel::ocl::simple_concat_t) - CONCAT_INSTANCE(intel::ocl::gen9_concat_t) - CONCAT_INSTANCE(intel::ocl::multi_concat_t) - CONCAT_INSTANCE(intel::ocl::ref_concat_t) + GPU_CONCAT_INSTANCE_INTEL(intel::ocl::simple_concat_t) + GPU_CONCAT_INSTANCE_INTEL(intel::ocl::gen9_concat_t) + GPU_CONCAT_INSTANCE_INTEL(intel::ocl::multi_concat_t) + GPU_CONCAT_INSTANCE_INTEL(intel::ocl::ref_concat_t) nullptr, }); // clang-format on -#undef INSTANCE + } // namespace const impl_list_item_t *get_concat_impl_list() { diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp index 6c0fe857b09..3c3ccbf994b 100644 --- a/src/gpu/gpu_convolution_list.cpp +++ b/src/gpu/gpu_convolution_list.cpp @@ -16,6 +16,7 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/jit/binary_format.hpp" #include "gpu/intel/jit/conv/gen_convolution.hpp" #include "gpu/intel/ocl/gen9_wino_convolution.hpp" @@ -25,6 +26,8 @@ #include "gpu/intel/jit/v2/conv/gen_convolution.hpp" #endif +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -32,45 +35,31 @@ namespace gpu { namespace { using namespace dnnl::impl::prop_kind; -#ifdef DNNL_DEV_MODE -#define V2_CONV_FWD_INSTANCE \ - INSTANCE(intel::jit::v2::conv::gen_convolution_fwd_t) -#define V2_CONV_BWD_D_INSTANCE \ - INSTANCE(intel::jit::v2::conv::gen_convolution_bwd_data_t) -#define V2_CONV_BWD_W_INSTANCE \ - INSTANCE(intel::jit::v2::conv::gen_convolution_bwd_weights_t) -#else -#define V2_CONV_FWD_INSTANCE -#define V2_CONV_BWD_D_INSTANCE -#define V2_CONV_BWD_W_INSTANCE -#endif - // clang-format off const std::map> impl_list_map REG_CONV_P({ {{forward}, { - V2_CONV_FWD_INSTANCE - INSTANCE(intel::jit::gen_convolution_fwd_t) - INSTANCE(intel::ocl::gen9_wino_convolution_fwd_t) - 
INSTANCE(intel::ocl::ref_convolution_fwd_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_fwd_t) + GPU_INSTANCE_INTEL(intel::jit::gen_convolution_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_wino_convolution_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_fwd_t) nullptr, }}, {{backward_data}, REG_BWD_D_PK({ - V2_CONV_BWD_D_INSTANCE - INSTANCE(intel::jit::gen_convolution_bwd_data_t) - INSTANCE(intel::ocl::ref_convolution_bwd_data_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_data_t) + GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_data_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_data_t) nullptr, })}, {{backward_weights}, REG_BWD_PK({ - V2_CONV_BWD_W_INSTANCE - INSTANCE(intel::jit::gen_convolution_bwd_weights_t) - INSTANCE(intel::ocl::ref_convolution_bwd_weights_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_weights_t) + GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_weights_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_weights_t) nullptr, })}, }); // clang-format on -#undef V2_CONV_INSTANCE } // namespace const impl_list_item_t *get_convolution_impl_list( diff --git a/src/gpu/gpu_deconvolution_list.cpp b/src/gpu/gpu_deconvolution_list.cpp index 488c4601874..438f27cd253 100644 --- a/src/gpu/gpu_deconvolution_list.cpp +++ b/src/gpu/gpu_deconvolution_list.cpp @@ -16,7 +16,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_deconvolution.hpp" +#endif namespace dnnl { namespace impl { @@ -29,12 +31,12 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_DECONV_P({ {{forward}, { - INSTANCE(intel::ocl::ref_deconvolution_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::ref_deconvolution_bwd_data_t) - INSTANCE(intel::ocl::ref_deconvolution_bwd_weights_t) + 
GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_bwd_data_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp index f8ff46b986a..1ec18cfb798 100644 --- a/src/gpu/gpu_eltwise_list.cpp +++ b/src/gpu/gpu_eltwise_list.cpp @@ -16,8 +16,10 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/gen9_eltwise.hpp" #include "gpu/intel/ocl/ref_eltwise.hpp" +#endif namespace dnnl { namespace impl { @@ -30,13 +32,13 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_ELTWISE_P({ {{forward}, { - INSTANCE(intel::ocl::gen9_eltwise_fwd_t) - INSTANCE(intel::ocl::ref_eltwise_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::gen9_eltwise_bwd_t) - INSTANCE(intel::ocl::ref_eltwise_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_gemm_list.cpp b/src/gpu/gpu_gemm_list.cpp index 3609fec3507..138f46fcb67 100644 --- a/src/gpu/gpu_gemm_list.cpp +++ b/src/gpu/gpu_gemm_list.cpp @@ -18,14 +18,20 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/jit/binary_format.hpp" #include "gpu/intel/jit/gemm/gen_gemm.hpp" #include "gpu/intel/jit/gemm/xe_hp_systolic_gemm.hpp" -#include "gpu/intel/ocl/gemm/conv_gemm.hpp" #include "gpu/intel/ocl/gemm/gemm_with_post_ops.hpp" #include "gpu/intel/ocl/gemm/ref_gemm.hpp" +#ifdef DNNL_DEV_MODE +#include "gpu/intel/ocl/gemm/conv_gemm.hpp" +#endif + +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -34,13 +40,11 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = { -#ifdef DNNL_DEV_MODE - INSTANCE(intel::ocl::conv_gemm_t) -#endif - INSTANCE(intel::jit::xe_hp_systolic_gemm_t) - 
INSTANCE(intel::ocl::gemm_with_post_ops_t) - INSTANCE(intel::jit::gen_gemm_t) - INSTANCE(intel::ocl::ref_gemm_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::conv_gemm_t) + GPU_INSTANCE_INTEL(intel::jit::xe_hp_systolic_gemm_t) + GPU_INSTANCE_INTEL(intel::ocl::gemm_with_post_ops_t) + GPU_INSTANCE_INTEL(intel::jit::gen_gemm_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_gemm_t) nullptr, }; // clang-format on diff --git a/src/gpu/gpu_group_normalization_list.cpp b/src/gpu/gpu_group_normalization_list.cpp index ff5b5f62312..65e3c4c6e2e 100644 --- a/src/gpu/gpu_group_normalization_list.cpp +++ b/src/gpu/gpu_group_normalization_list.cpp @@ -16,7 +16,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_group_normalization.hpp" +#endif namespace dnnl { namespace impl { @@ -29,12 +31,12 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_GNORM_P({ {{forward}, { - INSTANCE(intel::ocl::ref_group_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_group_normalization_fwd_t) nullptr, } }, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::ref_group_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_group_normalization_bwd_t) nullptr, }) }, diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp index 130a462e74c..d520b5bd299 100644 --- a/src/gpu/gpu_impl_list.hpp +++ b/src/gpu/gpu_impl_list.hpp @@ -20,6 +20,8 @@ #include #include +#include "oneapi/dnnl/dnnl.h" + #include "common/engine.hpp" #include "common/impl_list_item.hpp" #include "common/impl_registration.hpp" @@ -29,10 +31,125 @@ namespace dnnl { namespace impl { namespace gpu { -#define INSTANCE(...) \ +// There is a set of macros to instantiate implementations for different +// vendors and kernel languages to enable using a single implementation list +// (exception: implementation lists for concat, sum and reorder). 
+// +// oneDNN currently supports four GPU vendors: +// - INTEL +// - NVIDIA +// - AMD +// - GENERIC (standalone (not yet supported) or in a combination with the +// other vendors) +// +// The macros for INTEL, NVIDIA and AMD vendors assume that all implementations +// within a single vendor can be enabled at once. +// +// The macros for the GENERIC vendor can be either truly generic or +// runtime specific: +// - GENERIC: truly generic implementation that is not tied to any vendor +// and runtime, e.g. an implementation of the concat primitive +// based on reorders. +// - GENERIC_SYCL: SYCL generic implementations (written in generic SYCL). +// NOTE: these implementations are currently only enabled for +// NVIDIA vendor. +// +// The concat, sum and reorder primitives require specialized versions of the +// macros because their `pd_t::create` functions have unique signatures. + +// Conditional macros for different vendors. +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL +#define DNNL_GPU_INTEL_ONLY(...) __VA_ARGS__ +#else +#define DNNL_GPU_INTEL_ONLY(...) +#endif + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#define DNNL_GPU_NVIDIA_ONLY(...) __VA_ARGS__ +#else +#define DNNL_GPU_NVIDIA_ONLY(...) +#endif + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#define DNNL_GPU_AMD_ONLY(...) __VA_ARGS__ +#else +#define DNNL_GPU_AMD_ONLY(...) +#endif + +// NOTE: Support for the standalone GENERIC vendor has not been added yet. +#if defined(DNNL_WITH_SYCL) \ + && ((DNNL_GPU_VENDOR == DNNL_VENDOR_GENERIC) \ + || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA)) +#define DNNL_GPU_GENERIC_SYCL_ONLY(...) __VA_ARGS__ +#else +#define DNNL_GPU_GENERIC_SYCL_ONLY(...) +#endif + +// Primary instance macro for the GPU primitives. +#define GPU_INSTANCE(...) \ impl_list_item_t( \ impl_list_item_t::type_deduction_helper_t<__VA_ARGS__::pd_t>()), +// Specializations of the primary instance macro for concat, sum and reorder +// primitives. +#define GPU_CONCAT_INSTANCE(...) 
\ + impl_list_item_t(impl_list_item_t::concat_type_deduction_helper_t< \ + __VA_ARGS__::pd_t>()), +#define GPU_SUM_INSTANCE(...) \ + impl_list_item_t(impl_list_item_t::sum_type_deduction_helper_t< \ + __VA_ARGS__::pd_t>()), +#define GPU_REORDER_INSTANCE(...) \ + impl_list_item_t( \ + impl_list_item_t::reorder_type_deduction_helper_t<__VA_ARGS__>()), + +// Vendor specific instance macros. +#define GPU_INSTANCE_INTEL(...) DNNL_GPU_INTEL_ONLY(GPU_INSTANCE(__VA_ARGS__)) +#define GPU_INSTANCE_NVIDIA(...) DNNL_GPU_NVIDIA_ONLY(GPU_INSTANCE(__VA_ARGS__)) +#define GPU_INSTANCE_AMD(...) DNNL_GPU_AMD_ONLY(GPU_INSTANCE(__VA_ARGS__)) +#define GPU_INSTANCE_GENERIC_SYCL(...) \ + DNNL_GPU_GENERIC_SYCL_ONLY(GPU_INSTANCE(__VA_ARGS__)) +#define GPU_INSTANCE_GENERIC(...) GPU_INSTANCE(__VA_ARGS__) + +// Specializations of the vendor specific instance macros for concat, sum +// and reorder primitives. +#define GPU_CONCAT_INSTANCE_INTEL(...) \ + DNNL_GPU_INTEL_ONLY(GPU_CONCAT_INSTANCE(__VA_ARGS__)) +#define GPU_CONCAT_INSTANCE_NVIDIA(...) \ + DNNL_GPU_NVIDIA_ONLY(GPU_CONCAT_INSTANCE(__VA_ARGS__)) +#define GPU_CONCAT_INSTANCE_AMD(...) \ + DNNL_GPU_AMD_ONLY(GPU_CONCAT_INSTANCE(__VA_ARGS__)) +#define GPU_CONCAT_INSTANCE_GENERIC_SYCL(...) \ + DNNL_GPU_GENERIC_SYCL_ONLY(GPU_CONCAT_INSTANCE(__VA_ARGS__)) +#define GPU_CONCAT_INSTANCE_GENERIC(...) GPU_CONCAT_INSTANCE(__VA_ARGS__) + +#define GPU_SUM_INSTANCE_INTEL(...) \ + DNNL_GPU_INTEL_ONLY(GPU_SUM_INSTANCE(__VA_ARGS__)) +#define GPU_SUM_INSTANCE_NVIDIA(...) \ + DNNL_GPU_NVIDIA_ONLY(GPU_SUM_INSTANCE(__VA_ARGS__)) +#define GPU_SUM_INSTANCE_AMD(...) \ + DNNL_GPU_AMD_ONLY(GPU_SUM_INSTANCE(__VA_ARGS__)) +#define GPU_SUM_INSTANCE_GENERIC_SYCL(...) \ + DNNL_GPU_GENERIC_SYCL_ONLY(GPU_SUM_INSTANCE(__VA_ARGS__)) +#define GPU_SUM_INSTANCE_GENERIC(...) GPU_SUM_INSTANCE(__VA_ARGS__) + +#define GPU_REORDER_INSTANCE_INTEL(...) \ + DNNL_GPU_INTEL_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) +#define GPU_REORDER_INSTANCE_NVIDIA(...) 
\ + DNNL_GPU_NVIDIA_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) +#define GPU_REORDER_INSTANCE_AMD(...) \ + DNNL_GPU_AMD_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) +#define GPU_REORDER_INSTANCE_GENERIC_SYCL(...) \ + DNNL_GPU_GENERIC_SYCL_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) +#define GPU_REORDER_INSTANCE_GENERIC(...) GPU_REORDER_INSTANCE(__VA_ARGS__) + +// Instance macros that are enabled only in the DEV_MODE. +#ifdef DNNL_DEV_MODE +#define GPU_INSTANCE_INTEL_DEVMODE(...) \ + DNNL_GPU_INTEL_ONLY(GPU_INSTANCE(__VA_ARGS__)) +#else +#define GPU_INSTANCE_INTEL_DEVMODE(...) +#endif + #define DECLARE_IMPL_LIST(kind) \ const impl_list_item_t *get_##kind##_impl_list(const kind##_desc_t *desc); diff --git a/src/gpu/gpu_inner_product_list.cpp b/src/gpu/gpu_inner_product_list.cpp index cef307cc131..564d2566f09 100644 --- a/src/gpu/gpu_inner_product_list.cpp +++ b/src/gpu/gpu_inner_product_list.cpp @@ -16,10 +16,12 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/convolution_inner_product.hpp" #include "gpu/intel/ocl/gemm_inner_product.hpp" #include "gpu/intel/ocl/gemm_post_ops_inner_product.hpp" #include "gpu/intel/ocl/ref_inner_product.hpp" +#endif namespace dnnl { namespace impl { @@ -32,16 +34,16 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_IP_P({ {{forward}, { - INSTANCE(intel::ocl::gemm_inner_product_fwd_t) - INSTANCE(intel::ocl::convolution_inner_product_fwd_t) - INSTANCE(intel::ocl::ref_inner_product_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::convolution_inner_product_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::gemm_inner_product_bwd_data_t) - INSTANCE(intel::ocl::gemm_inner_product_bwd_weights_t) - INSTANCE(intel::ocl::ref_inner_product_bwd_data_t) - INSTANCE(intel::ocl::ref_inner_product_bwd_weights_t) +
GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_bwd_data_t) + GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_bwd_weights_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_bwd_data_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp index 5605499ef85..b6075ed3a24 100644 --- a/src/gpu/gpu_layer_normalization_list.cpp +++ b/src/gpu/gpu_layer_normalization_list.cpp @@ -16,9 +16,11 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_layer_normalization.hpp" #include "gpu/intel/ocl/reusable_lnorm.hpp" #include "gpu/intel/ocl/vectorized_lnorm.hpp" +#endif namespace dnnl { namespace impl { @@ -31,15 +33,15 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_LNORM_P({ {{forward}, { - INSTANCE(intel::ocl::vectorized_lnorm_fwd_t) - INSTANCE(intel::ocl::ref_layer_normalization_fwd_t) - INSTANCE(intel::ocl::reusable_layer_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::vectorized_lnorm_bwd_t) - INSTANCE(intel::ocl::ref_layer_normalization_bwd_t) - INSTANCE(intel::ocl::reusable_layer_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp index 762bd417b09..70afedb328d 100644 --- a/src/gpu/gpu_lrn_list.cpp +++ b/src/gpu/gpu_lrn_list.cpp @@ -16,7 +16,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_lrn.hpp" +#endif namespace dnnl { namespace impl { @@ 
-29,11 +31,11 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_LRN_P({ {{forward}, { - INSTANCE(intel::ocl::ref_lrn_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::ref_lrn_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_matmul_list.cpp b/src/gpu/gpu_matmul_list.cpp index dbc27e01a81..beccc900068 100644 --- a/src/gpu/gpu_matmul_list.cpp +++ b/src/gpu/gpu_matmul_list.cpp @@ -16,8 +16,10 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/gemm_matmul.hpp" #include "gpu/intel/ocl/ref_matmul.hpp" +#endif namespace dnnl { namespace impl { @@ -27,8 +29,8 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({ - INSTANCE(intel::ocl::gemm_matmul_t) - INSTANCE(intel::ocl::ref_matmul_t) + GPU_INSTANCE_INTEL(intel::ocl::gemm_matmul_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_matmul_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp index efa9f26c848..4505abd6537 100644 --- a/src/gpu/gpu_pooling_list.cpp +++ b/src/gpu/gpu_pooling_list.cpp @@ -18,10 +18,12 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/jit/pooling/gen_pooling.hpp" #include "gpu/intel/ocl/gen9_global_pooling.hpp" #include "gpu/intel/ocl/gen9_pooling.hpp" #include "gpu/intel/ocl/ref_pooling.hpp" +#endif namespace dnnl { namespace impl { @@ -34,16 +36,16 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_POOLING_P({ {{forward}, { - INSTANCE(intel::jit::gen_pooling_fwd_t) - INSTANCE(intel::ocl::gen9_global_pooling_fwd_t) - INSTANCE(intel::ocl::gen9_pooling_fwd_t) - INSTANCE(intel::ocl::ref_pooling_fwd_t) + GPU_INSTANCE_INTEL(intel::jit::gen_pooling_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_fwd_t) + 
GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::gen9_global_pooling_bwd_t) - INSTANCE(intel::ocl::gen9_pooling_bwd_t) - INSTANCE(intel::ocl::ref_pooling_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp index a31713a8548..62791b1c5ad 100644 --- a/src/gpu/gpu_prelu_list.cpp +++ b/src/gpu/gpu_prelu_list.cpp @@ -18,7 +18,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_prelu.hpp" +#endif namespace dnnl { namespace impl { @@ -31,11 +33,11 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_PRELU_P({ {{forward}, { - INSTANCE(intel::ocl::ref_prelu_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::ref_prelu_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp index b3590bf5497..bc8bc9729a6 100644 --- a/src/gpu/gpu_reduction_list.cpp +++ b/src/gpu/gpu_reduction_list.cpp @@ -16,36 +16,35 @@ #include "gpu/gpu_impl_list.hpp" -#include "gpu/intel/jit/jit_reduction.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/reduction/atomic_reduction.hpp" #include "gpu/intel/ocl/reduction/combined_reduction.hpp" #include "gpu/intel/ocl/reduction/ref_reduction.hpp" #include "gpu/intel/ocl/reduction/reusable_ref_reduction.hpp" +#ifdef DNNL_DEV_MODE +#include "gpu/intel/jit/jit_reduction.hpp" +#endif + +#endif + namespace dnnl { namespace impl { namespace gpu { namespace { -#ifdef DNNL_DEV_MODE -#define JIT_REDUCTION_INSTANCE INSTANCE(intel::jit::jit_reduction_t) -#else -#define JIT_REDUCTION_INSTANCE 
-#endif - // clang-format off constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({ - JIT_REDUCTION_INSTANCE - INSTANCE(intel::ocl::atomic_reduction_t) - INSTANCE(intel::ocl::combined_reduction_t) - INSTANCE(intel::ocl::ref_reduction_t) - INSTANCE(intel::ocl::reusable_ref_reduction_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::jit::jit_reduction_t) + GPU_INSTANCE_INTEL(intel::ocl::atomic_reduction_t) + GPU_INSTANCE_INTEL(intel::ocl::combined_reduction_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t) nullptr, }); // clang-format on -#undef JIT_REDUCTION_INSTANCE } // namespace const impl_list_item_t *get_reduction_impl_list(const reduction_desc_t *desc) { diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp index 6ec7b675251..cf9376fb000 100644 --- a/src/gpu/gpu_reorder_list.cpp +++ b/src/gpu/gpu_reorder_list.cpp @@ -16,12 +16,14 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/jit/reorder/gen_reorder.hpp" #include "gpu/intel/ocl/cross_engine_reorder.hpp" #include "gpu/intel/ocl/custom_reorder.hpp" #include "gpu/intel/ocl/generic_reorder.hpp" #include "gpu/intel/ocl/ref_reorder.hpp" #include "gpu/intel/ocl/rnn/rnn_reorders.hpp" +#endif namespace dnnl { namespace impl { @@ -31,18 +33,14 @@ namespace { using namespace dnnl::impl::data_type; -#define REORDER_INSTANCE(...) 
\ - impl_list_item_t( \ - impl_list_item_t::reorder_type_deduction_helper_t<__VA_ARGS__>()), - // clang-format off constexpr impl_list_item_t impl_list[] = REG_REORDER_P({ - REORDER_INSTANCE(intel::ocl::rnn_weights_reorder_t::pd_t) - REORDER_INSTANCE(intel::ocl::cross_engine_reorder_t::pd_t) - REORDER_INSTANCE(intel::jit::gen_reorder_t::pd_t) - REORDER_INSTANCE(intel::ocl::custom_reorder_t::pd_t) // for specific tensor shapes - REORDER_INSTANCE(intel::ocl::generic_reorder_t::pd_t)// fast and quite generic - REORDER_INSTANCE(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case + GPU_REORDER_INSTANCE_INTEL(intel::ocl::rnn_weights_reorder_t::pd_t) + GPU_REORDER_INSTANCE_INTEL(intel::ocl::cross_engine_reorder_t::pd_t) + GPU_REORDER_INSTANCE_INTEL(intel::jit::gen_reorder_t::pd_t) + GPU_REORDER_INSTANCE_INTEL(intel::ocl::custom_reorder_t::pd_t) // for specific tensor shapes + GPU_REORDER_INSTANCE_INTEL(intel::ocl::generic_reorder_t::pd_t)// fast and quite generic + GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case nullptr, }); // clang-format on diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp index 6a074a72d2a..1abcf25737f 100644 --- a/src/gpu/gpu_resampling_list.cpp +++ b/src/gpu/gpu_resampling_list.cpp @@ -16,8 +16,10 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_resampling.hpp" #include "gpu/intel/ocl/vectorized_resampling.hpp" +#endif namespace dnnl { namespace impl { @@ -30,12 +32,12 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_RESAMPLING_P({ {{forward}, { - INSTANCE(intel::ocl::ref_resampling_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::vectorized_resampling_bwd_t) - INSTANCE(intel::ocl::ref_resampling_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::vectorized_resampling_bwd_t) + 
GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_rnn_list.cpp b/src/gpu/gpu_rnn_list.cpp index 043fe9001e8..6dcdbb83734 100644 --- a/src/gpu/gpu_rnn_list.cpp +++ b/src/gpu/gpu_rnn_list.cpp @@ -16,7 +16,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/rnn/ref_rnn.hpp" +#endif namespace dnnl { namespace impl { @@ -29,11 +31,11 @@ using namespace dnnl::impl::prop_kind; const std::map> impl_list_map REG_RNN_P({ {{forward}, { - INSTANCE(intel::ocl::ref_rnn_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_rnn_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::ref_rnn_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_rnn_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp index 27e206dc8ad..9ac53d03d46 100644 --- a/src/gpu/gpu_shuffle_list.cpp +++ b/src/gpu/gpu_shuffle_list.cpp @@ -16,8 +16,10 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_shuffle.hpp" #include "gpu/intel/ocl/shuffle_by_reorder.hpp" +#endif namespace dnnl { namespace impl { @@ -27,8 +29,8 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = REG_SHUFFLE_P({ - INSTANCE(intel::ocl::shuffle_by_reorder_t) - INSTANCE(intel::ocl::ref_shuffle_t) + GPU_INSTANCE_INTEL(intel::ocl::shuffle_by_reorder_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_shuffle_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp index 104684afea2..404383d3a49 100644 --- a/src/gpu/gpu_softmax_list.cpp +++ b/src/gpu/gpu_softmax_list.cpp @@ -16,9 +16,11 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/gen9_softmax.hpp" #include "gpu/intel/ocl/ref_softmax.hpp" #include "gpu/intel/ocl/reusable_softmax.hpp" +#endif namespace dnnl { namespace impl { @@ -31,14 +33,14 @@ using namespace dnnl::impl::prop_kind; const 
std::map> impl_list_map REG_SOFTMAX_P({ {{forward}, { - INSTANCE(intel::ocl::gen9_softmax_fwd_t) - INSTANCE(intel::ocl::ref_softmax_fwd_t) - INSTANCE(intel::ocl::reusable_softmax_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_fwd_t) + GPU_INSTANCE_INTEL(intel::ocl::reusable_softmax_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ - INSTANCE(intel::ocl::gen9_softmax_bwd_t) - INSTANCE(intel::ocl::ref_softmax_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_bwd_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp index ef7082f8616..0f7ca833cf0 100644 --- a/src/gpu/gpu_sum_list.cpp +++ b/src/gpu/gpu_sum_list.cpp @@ -18,12 +18,15 @@ #include "common/utils.hpp" #include "gpu/gpu_sum_pd.hpp" + +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/jit/gen9_simple_sum.hpp" #include "gpu/intel/ocl/gen9_sum.hpp" #include "gpu/intel/ocl/many_inputs_sum.hpp" #include "gpu/intel/ocl/multi_po_reorder_sum.hpp" #include "gpu/intel/ocl/ref_sum.hpp" #include "gpu/intel/ocl/simple_sum.hpp" +#endif namespace dnnl { namespace impl { @@ -33,21 +36,18 @@ namespace { // TODO: Re-enable nGEN-based implementation after architecture // dispatching is implemented. // INSTANCE(jit::gen9_simple_sum_t) -#define SUM_INSTANCE(...) 
\ - impl_list_item_t(impl_list_item_t::sum_type_deduction_helper_t< \ - __VA_ARGS__::pd_t>()), // clang-format off constexpr impl_list_item_t impl_list[] = REG_SUM_P({ - SUM_INSTANCE(intel::ocl::multi_po_reorder_sum) - SUM_INSTANCE(intel::ocl::gen9_sum_t) - SUM_INSTANCE(intel::ocl::many_inputs_sum_t) - SUM_INSTANCE(intel::ocl::simple_sum_t) - SUM_INSTANCE(intel::ocl::ref_sum_t) + GPU_SUM_INSTANCE_INTEL(intel::ocl::multi_po_reorder_sum) + GPU_SUM_INSTANCE_INTEL(intel::ocl::gen9_sum_t) + GPU_SUM_INSTANCE_INTEL(intel::ocl::many_inputs_sum_t) + GPU_SUM_INSTANCE_INTEL(intel::ocl::simple_sum_t) + GPU_SUM_INSTANCE_INTEL(intel::ocl::ref_sum_t) nullptr, }); // clang-format on -#undef INSTANCE + } // namespace const impl_list_item_t *get_sum_impl_list() { diff --git a/src/gpu/gpu_zero_pad_list.cpp b/src/gpu/gpu_zero_pad_list.cpp index 73d92980763..e4dc9c1fa25 100644 --- a/src/gpu/gpu_zero_pad_list.cpp +++ b/src/gpu/gpu_zero_pad_list.cpp @@ -16,7 +16,9 @@ #include "gpu/gpu_impl_list.hpp" +#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL #include "gpu/intel/ocl/ref_zero_pad.hpp" +#endif namespace dnnl { namespace impl { @@ -26,7 +28,7 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = { - INSTANCE(intel::ocl::ref_zero_pad_t) + GPU_INSTANCE_INTEL(intel::ocl::ref_zero_pad_t) nullptr, }; // clang-format on diff --git a/src/gpu/nvidia/cudnn_concat.cpp b/src/gpu/nvidia/cudnn_concat.cpp index c0fd00f4511..99c3f0fc35b 100644 --- a/src/gpu/nvidia/cudnn_concat.cpp +++ b/src/gpu/nvidia/cudnn_concat.cpp @@ -15,6 +15,7 @@ * limitations under the License. 
*******************************************************************************/ +#include "gpu/gpu_impl_list.hpp" #include "gpu/intel/ocl/ref_concat.hpp" #include "gpu/nvidia/sycl_cuda_engine.hpp" @@ -25,10 +26,13 @@ namespace nvidia { namespace { -constexpr impl_list_item_t cuda_concat_impl_list[] - = {impl_list_item_t::concat_type_deduction_helper_t< - gpu::intel::ocl::ref_concat_t::pd_t>(), - nullptr}; +// clang-format off +constexpr impl_list_item_t cuda_concat_impl_list[] = { + GPU_CONCAT_INSTANCE_NVIDIA(gpu::intel::ocl::ref_concat_t) + nullptr +}; +// clang-format on + } // namespace const impl_list_item_t * diff --git a/src/gpu/nvidia/cudnn_reorder_impl.cpp b/src/gpu/nvidia/cudnn_reorder_impl.cpp index d4ed303c007..6211aabe1c5 100644 --- a/src/gpu/nvidia/cudnn_reorder_impl.cpp +++ b/src/gpu/nvidia/cudnn_reorder_impl.cpp @@ -15,7 +15,7 @@ * limitations under the License. *******************************************************************************/ #include "common/engine.hpp" -#include "common/impl_list_item.hpp" +#include "gpu/gpu_impl_list.hpp" #include "gpu/intel/ocl/cross_engine_reorder.hpp" #include "gpu/nvidia/cudnn_reorder.hpp" #include "gpu/nvidia/sycl_cuda_engine.hpp" @@ -27,14 +27,10 @@ namespace nvidia { namespace { -#define REORDER_INSTANCE(...) 
\ - impl_list_item_t( \ - impl_list_item_t::reorder_type_deduction_helper_t<__VA_ARGS__>()), - // clang-format off constexpr impl_list_item_t cuda_reorder_impl_list[] = { - REORDER_INSTANCE(gpu::intel::ocl::cross_engine_reorder_t::pd_t) - REORDER_INSTANCE(cudnn_reorder_t::pd_t) + GPU_REORDER_INSTANCE_NVIDIA(gpu::intel::ocl::cross_engine_reorder_t::pd_t) + GPU_REORDER_INSTANCE_NVIDIA(gpu::nvidia::cudnn_reorder_t::pd_t) nullptr, }; // clang-format on diff --git a/src/gpu/nvidia/cudnn_sum.cpp b/src/gpu/nvidia/cudnn_sum.cpp index b48b3d79cc8..c351f31dec9 100644 --- a/src/gpu/nvidia/cudnn_sum.cpp +++ b/src/gpu/nvidia/cudnn_sum.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,6 +16,7 @@ *******************************************************************************/ #include "gpu/nvidia/cudnn_sum.hpp" +#include "gpu/gpu_impl_list.hpp" #include "gpu/nvidia/sycl_cuda_engine.hpp" namespace dnnl { @@ -25,9 +26,13 @@ namespace nvidia { namespace { +// clang-format off constexpr impl_list_item_t cuda_sum_impl_list[] = { - impl_list_item_t::sum_type_deduction_helper_t(), - nullptr}; + GPU_SUM_INSTANCE_NVIDIA(gpu::nvidia::cudnn_ref_sum_t) + nullptr +}; +// clang-format on + } // namespace const impl_list_item_t * diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp index e515cda7791..d8b256b9991 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.cpp +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -178,79 +178,79 @@ using namespace dnnl::impl::data_type; // clang-format off constexpr dnnl::impl::impl_list_item_t sycl_cuda_impl_list[] = { // Elementwise - INSTANCE(cudnn_eltwise_fwd_t) - INSTANCE(cudnn_eltwise_bwd_t) - INSTANCE(sycl::ref_sycl_eltwise_fwd_t) - INSTANCE(sycl::ref_sycl_eltwise_bwd_t) + 
GPU_INSTANCE_NVIDIA(cudnn_eltwise_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_eltwise_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_bwd_t) // Deconvolution - INSTANCE(cudnn_deconvolution_fwd_t) - INSTANCE(cudnn_deconvolution_bwd_data_t) - INSTANCE(cudnn_deconvolution_bwd_weights_t) + GPU_INSTANCE_NVIDIA(cudnn_deconvolution_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_deconvolution_bwd_data_t) + GPU_INSTANCE_NVIDIA(cudnn_deconvolution_bwd_weights_t) // Convolution - INSTANCE(cudnn_convolution_fwd_t) - INSTANCE(cudnn_convolution_bwd_data_t) - INSTANCE(cudnn_convolution_bwd_weights_t) + GPU_INSTANCE_NVIDIA(cudnn_convolution_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_convolution_bwd_data_t) + GPU_INSTANCE_NVIDIA(cudnn_convolution_bwd_weights_t) // Batch Normalization - INSTANCE(cudnn_batch_normalization_fwd_t) - INSTANCE(cudnn_batch_normalization_bwd_t) - INSTANCE(sycl::ref_batch_normalization_fwd_t) - INSTANCE(sycl::ref_batch_normalization_bwd_t) + GPU_INSTANCE_NVIDIA(cudnn_batch_normalization_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_batch_normalization_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_bwd_t) // Layer Normalization - INSTANCE(sycl::ref_layer_normalization_fwd_t) - INSTANCE(sycl::ref_layer_normalization_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_bwd_t) // PReLU - INSTANCE(sycl::ref_prelu_fwd_t) - INSTANCE(sycl::ref_prelu_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_bwd_t) // Pooling - INSTANCE(cudnn_pooling_fwd_t) - INSTANCE(cudnn_pooling_bwd_t) - INSTANCE(sycl::ref_pooling_fwd_t) - INSTANCE(sycl::ref_pooling_bwd_t) + GPU_INSTANCE_NVIDIA(cudnn_pooling_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_pooling_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_fwd_t) + 
GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_bwd_t) // LRN - INSTANCE(cudnn_lrn_fwd_t) - INSTANCE(cudnn_lrn_bwd_t) - INSTANCE(sycl::ref_sycl_lrn_fwd_t) - INSTANCE(sycl::ref_sycl_lrn_bwd_t) + GPU_INSTANCE_NVIDIA(cudnn_lrn_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_lrn_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_bwd_t) // Inner Product - INSTANCE(cudnn_gemm_inner_product_fwd_t) - INSTANCE(cudnn_conv_inner_product_fwd_t) - INSTANCE(cudnn_gemm_inner_product_bwd_data_t) - INSTANCE(cudnn_conv_inner_product_bwd_data_t) - INSTANCE(cudnn_gemm_inner_product_bwd_weights_t) - INSTANCE(cudnn_conv_inner_product_bwd_weights_t) + GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_bwd_data_t) + GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_bwd_data_t) + GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_bwd_weights_t) + GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_bwd_weights_t) // Softmax - INSTANCE(cudnn_softmax_fwd_t) - INSTANCE(cudnn_softmax_bwd_t) - INSTANCE(sycl::ref_sycl_softmax_fwd_t) - INSTANCE(sycl::ref_sycl_softmax_bwd_t) + GPU_INSTANCE_NVIDIA(cudnn_softmax_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_softmax_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_bwd_t) // Binary - INSTANCE(cudnn_binary_t) - INSTANCE(sycl::ref_binary_t) + GPU_INSTANCE_NVIDIA(cudnn_binary_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_binary_t) // MatMul - INSTANCE(cudnn_matmul_t) + GPU_INSTANCE_NVIDIA(cudnn_matmul_t) // Resampling - INSTANCE(cudnn_resampling_fwd_t) - INSTANCE(cudnn_resampling_bwd_t) - INSTANCE(sycl::ref_resampling_fwd_t) - INSTANCE(sycl::ref_resampling_bwd_t) + GPU_INSTANCE_NVIDIA(cudnn_resampling_fwd_t) + GPU_INSTANCE_NVIDIA(cudnn_resampling_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_bwd_t) // Reduction - 
INSTANCE(cudnn_reduction_t) + GPU_INSTANCE_NVIDIA(cudnn_reduction_t) // Shuffle - INSTANCE(sycl::ref_shuffle_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_shuffle_t) nullptr, }; // clang-format on From c84e419ee907461d17e2a41ca7b3f8720102c815 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 14 May 2024 11:30:27 -0700 Subject: [PATCH 100/187] build: gpu: use warn_unused_result only for intel vendor --- src/gpu/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt index 2e00d2fa4c7..ed471a4ab39 100644 --- a/src/gpu/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -61,5 +61,7 @@ check_cxx_source_compiles("${COMPILER_ALLOWS_ALIAS_ATTRIBUTES_SOURCE}" COMPILER_ unset(CMAKE_REQUIRED_FLAGS) if(${COMPILER_ALLOWS_ALIAS_ATTRIBUTES}) - add_definitions_with_host_compiler(-DDNNL_STATUS_NODISCARD) + if(DNNL_GPU_VENDOR STREQUAL "INTEL") + add_definitions_with_host_compiler(-DDNNL_STATUS_NODISCARD) + endif() endif() From f5b7330231c8541ce06c6ee44b2dc993473d7e1f Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 14 May 2024 11:31:50 -0700 Subject: [PATCH 101/187] gpu: nvidia: move impls to the common list --- src/gpu/gpu_batch_normalization_list.cpp | 9 ++ src/gpu/gpu_binary_list.cpp | 7 ++ src/gpu/gpu_concat_list.cpp | 5 + src/gpu/gpu_convolution_list.cpp | 7 ++ src/gpu/gpu_deconvolution_list.cpp | 7 ++ src/gpu/gpu_eltwise_list.cpp | 9 ++ src/gpu/gpu_inner_product_list.cpp | 11 +++ src/gpu/gpu_layer_normalization_list.cpp | 6 ++ src/gpu/gpu_lrn_list.cpp | 9 ++ src/gpu/gpu_matmul_list.cpp | 5 + src/gpu/gpu_pooling_list.cpp | 9 ++ src/gpu/gpu_prelu_list.cpp | 6 ++ src/gpu/gpu_reduction_list.cpp | 5 + src/gpu/gpu_reorder_list.cpp | 7 ++ src/gpu/gpu_resampling_list.cpp | 9 ++ src/gpu/gpu_shuffle_list.cpp | 5 + src/gpu/gpu_softmax_list.cpp | 9 ++ src/gpu/gpu_sum_list.cpp | 5 + src/gpu/nvidia/cudnn_concat.cpp | 46 ---------- src/gpu/nvidia/cudnn_reorder_impl.cpp | 49 ---------- 
src/gpu/nvidia/cudnn_sum.cpp | 46 ---------- src/gpu/nvidia/sycl_cuda_engine.cpp | 112 ----------------------- src/gpu/nvidia/sycl_cuda_engine.hpp | 36 +++----- 23 files changed, 145 insertions(+), 274 deletions(-) delete mode 100644 src/gpu/nvidia/cudnn_concat.cpp delete mode 100644 src/gpu/nvidia/cudnn_reorder_impl.cpp delete mode 100644 src/gpu/nvidia/cudnn_sum.cpp diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp index 724dbdbb0b3..eb2940bbedf 100644 --- a/src/gpu/gpu_batch_normalization_list.cpp +++ b/src/gpu/gpu_batch_normalization_list.cpp @@ -29,6 +29,11 @@ #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_batch_normalization.hpp" +#include "gpu/sycl/ref_batch_normalization.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -46,6 +51,8 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ @@ -55,6 +62,8 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp index 1c43b9c06b6..ceca5466fcb 100644 --- a/src/gpu/gpu_binary_list.cpp +++ b/src/gpu/gpu_binary_list.cpp @@ -22,6 +22,11 @@ #include "gpu/intel/ocl/ref_binary.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_binary.hpp" +#include "gpu/sycl/ref_binary.hpp" +#endif + namespace dnnl { namespace 
impl { namespace gpu { @@ -33,6 +38,8 @@ constexpr impl_list_item_t impl_list[] = REG_BINARY_P({ GPU_INSTANCE_INTEL(intel::ocl::multi_po_reorder_binary) GPU_INSTANCE_INTEL(intel::ocl::gen9_binary_t) GPU_INSTANCE_INTEL(intel::ocl::ref_binary_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_binary_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_binary_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_concat_list.cpp b/src/gpu/gpu_concat_list.cpp index ba062736984..390db7bcd8f 100644 --- a/src/gpu/gpu_concat_list.cpp +++ b/src/gpu/gpu_concat_list.cpp @@ -23,6 +23,10 @@ #include "gpu/intel/ocl/simple_concat.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/intel/ocl/ref_concat.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -35,6 +39,7 @@ constexpr impl_list_item_t impl_list[] = REG_CONCAT_P({ GPU_CONCAT_INSTANCE_INTEL(intel::ocl::gen9_concat_t) GPU_CONCAT_INSTANCE_INTEL(intel::ocl::multi_concat_t) GPU_CONCAT_INSTANCE_INTEL(intel::ocl::ref_concat_t) + GPU_CONCAT_INSTANCE_NVIDIA(intel::ocl::ref_concat_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp index 3c3ccbf994b..a9ee384a495 100644 --- a/src/gpu/gpu_convolution_list.cpp +++ b/src/gpu/gpu_convolution_list.cpp @@ -28,6 +28,10 @@ #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_convolution.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -43,18 +47,21 @@ const std::map> GPU_INSTANCE_INTEL(intel::jit::gen_convolution_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::gen9_wino_convolution_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_fwd_t) nullptr, }}, {{backward_data}, REG_BWD_D_PK({ GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_data_t) GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_data_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_data_t) + 
GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_data_t) nullptr, })}, {{backward_weights}, REG_BWD_PK({ GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_weights_t) GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_weights_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_weights_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_deconvolution_list.cpp b/src/gpu/gpu_deconvolution_list.cpp index 438f27cd253..488dc9d636f 100644 --- a/src/gpu/gpu_deconvolution_list.cpp +++ b/src/gpu/gpu_deconvolution_list.cpp @@ -20,6 +20,10 @@ #include "gpu/intel/ocl/ref_deconvolution.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_deconvolution.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -32,11 +36,14 @@ const std::map> impl_list_map REG_DECONV_P({ {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_bwd_data_t) GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_bwd_weights_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_data_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp index 1ec18cfb798..51b66628894 100644 --- a/src/gpu/gpu_eltwise_list.cpp +++ b/src/gpu/gpu_eltwise_list.cpp @@ -21,6 +21,11 @@ #include "gpu/intel/ocl/ref_eltwise.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_eltwise.hpp" +#include "gpu/sycl/ref_eltwise.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -34,11 +39,15 @@ const std::map> {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_fwd_t) + 
GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_inner_product_list.cpp b/src/gpu/gpu_inner_product_list.cpp index 564d2566f09..7a2257775fa 100644 --- a/src/gpu/gpu_inner_product_list.cpp +++ b/src/gpu/gpu_inner_product_list.cpp @@ -23,6 +23,11 @@ #include "gpu/intel/ocl/ref_inner_product.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_conv_inner_product.hpp" +#include "gpu/nvidia/cudnn_gemm_inner_product.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -37,6 +42,8 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::convolution_inner_product_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ @@ -44,6 +51,10 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_bwd_weights_t) GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_bwd_data_t) GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_bwd_weights_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_bwd_data_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_bwd_weights_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_bwd_data_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp index b6075ed3a24..8d3711ac593 100644 --- a/src/gpu/gpu_layer_normalization_list.cpp +++ b/src/gpu/gpu_layer_normalization_list.cpp @@ -22,6 +22,10 @@ #include "gpu/intel/ocl/vectorized_lnorm.hpp" #endif 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/sycl/ref_layer_normalizations.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -36,12 +40,14 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp index 70afedb328d..8d993b1bff1 100644 --- a/src/gpu/gpu_lrn_list.cpp +++ b/src/gpu/gpu_lrn_list.cpp @@ -20,6 +20,11 @@ #include "gpu/intel/ocl/ref_lrn.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_lrn.hpp" +#include "gpu/sycl/ref_lrn.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -32,10 +37,14 @@ const std::map> impl_list_map REG_LRN_P({ {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_matmul_list.cpp b/src/gpu/gpu_matmul_list.cpp index beccc900068..af5b46789eb 100644 --- a/src/gpu/gpu_matmul_list.cpp +++ b/src/gpu/gpu_matmul_list.cpp @@ -21,6 +21,10 @@ #include "gpu/intel/ocl/ref_matmul.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_matmul.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -31,6 +35,7 @@ namespace { constexpr 
impl_list_item_t impl_list[] = REG_MATMUL_P({ GPU_INSTANCE_INTEL(intel::ocl::gemm_matmul_t) GPU_INSTANCE_INTEL(intel::ocl::ref_matmul_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_matmul_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp index 4505abd6537..0f79b6c2fcc 100644 --- a/src/gpu/gpu_pooling_list.cpp +++ b/src/gpu/gpu_pooling_list.cpp @@ -25,6 +25,11 @@ #include "gpu/intel/ocl/ref_pooling.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_pooling.hpp" +#include "gpu/sycl/ref_pooling.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -40,12 +45,16 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp index 62791b1c5ad..5ca4128c386 100644 --- a/src/gpu/gpu_prelu_list.cpp +++ b/src/gpu/gpu_prelu_list.cpp @@ -22,6 +22,10 @@ #include "gpu/intel/ocl/ref_prelu.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/sycl/ref_prelu.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -34,10 +38,12 @@ const std::map> impl_list_map REG_PRELU_P({ {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_bwd_t) nullptr, })}, }); diff --git 
a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp index bc8bc9729a6..558feda807d 100644 --- a/src/gpu/gpu_reduction_list.cpp +++ b/src/gpu/gpu_reduction_list.cpp @@ -28,6 +28,10 @@ #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_reduction.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -41,6 +45,7 @@ constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({ GPU_INSTANCE_INTEL(intel::ocl::combined_reduction_t) GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp index cf9376fb000..3a5c1524f6e 100644 --- a/src/gpu/gpu_reorder_list.cpp +++ b/src/gpu/gpu_reorder_list.cpp @@ -25,6 +25,11 @@ #include "gpu/intel/ocl/rnn/rnn_reorders.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/intel/ocl/cross_engine_reorder.hpp" +#include "gpu/nvidia/cudnn_reorder.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -41,6 +46,8 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({ GPU_REORDER_INSTANCE_INTEL(intel::ocl::custom_reorder_t::pd_t) // for specific tensor shapes GPU_REORDER_INSTANCE_INTEL(intel::ocl::generic_reorder_t::pd_t)// fast and quite generic GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case + GPU_REORDER_INSTANCE_NVIDIA(intel::ocl::cross_engine_reorder_t::pd_t) + GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp index 1abcf25737f..0a4836c7a8d 100644 --- a/src/gpu/gpu_resampling_list.cpp +++ b/src/gpu/gpu_resampling_list.cpp @@ -21,6 +21,11 @@ #include "gpu/intel/ocl/vectorized_resampling.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_resampling.hpp" +#include 
"gpu/sycl/ref_resampling.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -33,11 +38,15 @@ const std::map> impl_list_map REG_RESAMPLING_P({ {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_fwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::vectorized_resampling_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp index 9ac53d03d46..db4632b4cbc 100644 --- a/src/gpu/gpu_shuffle_list.cpp +++ b/src/gpu/gpu_shuffle_list.cpp @@ -21,6 +21,10 @@ #include "gpu/intel/ocl/shuffle_by_reorder.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/sycl/ref_shuffle.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -31,6 +35,7 @@ namespace { constexpr impl_list_item_t impl_list[] = REG_SHUFFLE_P({ GPU_INSTANCE_INTEL(intel::ocl::shuffle_by_reorder_t) GPU_INSTANCE_INTEL(intel::ocl::ref_shuffle_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_shuffle_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp index 404383d3a49..f7121ef4e0c 100644 --- a/src/gpu/gpu_softmax_list.cpp +++ b/src/gpu/gpu_softmax_list.cpp @@ -22,6 +22,11 @@ #include "gpu/intel/ocl/reusable_softmax.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_softmax.hpp" +#include "gpu/sycl/ref_softmax.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -36,11 +41,15 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_softmax_fwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_fwd_t) + 
GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_bwd_t) + GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_bwd_t) + GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_bwd_t) nullptr, })}, }); diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp index 0f7ca833cf0..6ab7e6a7083 100644 --- a/src/gpu/gpu_sum_list.cpp +++ b/src/gpu/gpu_sum_list.cpp @@ -28,6 +28,10 @@ #include "gpu/intel/ocl/simple_sum.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA +#include "gpu/nvidia/cudnn_sum.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -44,6 +48,7 @@ constexpr impl_list_item_t impl_list[] = REG_SUM_P({ GPU_SUM_INSTANCE_INTEL(intel::ocl::many_inputs_sum_t) GPU_SUM_INSTANCE_INTEL(intel::ocl::simple_sum_t) GPU_SUM_INSTANCE_INTEL(intel::ocl::ref_sum_t) + GPU_SUM_INSTANCE_NVIDIA(nvidia::cudnn_ref_sum_t) nullptr, }); // clang-format on diff --git a/src/gpu/nvidia/cudnn_concat.cpp b/src/gpu/nvidia/cudnn_concat.cpp deleted file mode 100644 index 99c3f0fc35b..00000000000 --- a/src/gpu/nvidia/cudnn_concat.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Intel Corporation -* Copyright 2020 Codeplay Software Limited -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "gpu/gpu_impl_list.hpp" -#include "gpu/intel/ocl/ref_concat.hpp" -#include "gpu/nvidia/sycl_cuda_engine.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace nvidia { - -namespace { - -// clang-format off -constexpr impl_list_item_t cuda_concat_impl_list[] = { - GPU_CONCAT_INSTANCE_NVIDIA(gpu::intel::ocl::ref_concat_t) - nullptr -}; -// clang-format on - -} // namespace - -const impl_list_item_t * -cuda_gpu_engine_impl_list_t::get_concat_implementation_list() { - return cuda_concat_impl_list; -} - -} // namespace nvidia -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_reorder_impl.cpp b/src/gpu/nvidia/cudnn_reorder_impl.cpp deleted file mode 100644 index 6211aabe1c5..00000000000 --- a/src/gpu/nvidia/cudnn_reorder_impl.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Intel Corporation -* Copyright 2020 Codeplay Software Limited -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ -#include "common/engine.hpp" -#include "gpu/gpu_impl_list.hpp" -#include "gpu/intel/ocl/cross_engine_reorder.hpp" -#include "gpu/nvidia/cudnn_reorder.hpp" -#include "gpu/nvidia/sycl_cuda_engine.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace nvidia { - -namespace { - -// clang-format off -constexpr impl_list_item_t cuda_reorder_impl_list[] = { - GPU_REORDER_INSTANCE_NVIDIA(gpu::intel::ocl::cross_engine_reorder_t::pd_t) - GPU_REORDER_INSTANCE_NVIDIA(gpu::nvidia::cudnn_reorder_t::pd_t) - nullptr, -}; -// clang-format on - -} // namespace - -const impl_list_item_t * -cuda_gpu_engine_impl_list_t::get_reorder_implementation_list( - const memory_desc_t *, const memory_desc_t *) { - return cuda_reorder_impl_list; -} - -} // namespace nvidia -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/nvidia/cudnn_sum.cpp b/src/gpu/nvidia/cudnn_sum.cpp deleted file mode 100644 index c351f31dec9..00000000000 --- a/src/gpu/nvidia/cudnn_sum.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Intel Corporation -* Copyright 2020 Codeplay Software Limited -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "gpu/nvidia/cudnn_sum.hpp" -#include "gpu/gpu_impl_list.hpp" -#include "gpu/nvidia/sycl_cuda_engine.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace nvidia { - -namespace { - -// clang-format off -constexpr impl_list_item_t cuda_sum_impl_list[] = { - GPU_SUM_INSTANCE_NVIDIA(gpu::nvidia::cudnn_ref_sum_t) - nullptr -}; -// clang-format on - -} // namespace - -const impl_list_item_t * -cuda_gpu_engine_impl_list_t::get_sum_implementation_list() { - return cuda_sum_impl_list; -} - -} // namespace nvidia -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp index d8b256b9991..6a2e7b27d8b 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.cpp +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -20,35 +20,11 @@ #include "xpu/sycl/utils.hpp" -#include "gpu/nvidia/cudnn_batch_normalization.hpp" -#include "gpu/nvidia/cudnn_binary.hpp" -#include "gpu/nvidia/cudnn_conv_inner_product.hpp" -#include "gpu/nvidia/cudnn_convolution.hpp" -#include "gpu/nvidia/cudnn_deconvolution.hpp" -#include "gpu/nvidia/cudnn_eltwise.hpp" -#include "gpu/nvidia/cudnn_gemm_inner_product.hpp" -#include "gpu/nvidia/cudnn_lrn.hpp" -#include "gpu/nvidia/cudnn_matmul.hpp" -#include "gpu/nvidia/cudnn_pooling.hpp" -#include "gpu/nvidia/cudnn_reduction.hpp" -#include "gpu/nvidia/cudnn_resampling.hpp" -#include "gpu/nvidia/cudnn_softmax.hpp" #include "gpu/nvidia/sycl_cuda_compat.hpp" #include "gpu/nvidia/sycl_cuda_engine.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" -#include "gpu/sycl/ref_batch_normalization.hpp" -#include "gpu/sycl/ref_binary.hpp" -#include "gpu/sycl/ref_eltwise.hpp" -#include "gpu/sycl/ref_layer_normalizations.hpp" -#include "gpu/sycl/ref_lrn.hpp" -#include "gpu/sycl/ref_pooling.hpp" -#include "gpu/sycl/ref_prelu.hpp" -#include 
"gpu/sycl/ref_resampling.hpp" -#include "gpu/sycl/ref_shuffle.hpp" -#include "gpu/sycl/ref_softmax.hpp" - namespace dnnl { namespace impl { namespace gpu { @@ -172,94 +148,6 @@ void sycl_cuda_engine_t::activate_stream_cudnn(CUstream cuda_stream) { } } -namespace { -using namespace dnnl::impl::data_type; - -// clang-format off -constexpr dnnl::impl::impl_list_item_t sycl_cuda_impl_list[] = { - // Elementwise - GPU_INSTANCE_NVIDIA(cudnn_eltwise_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_eltwise_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_bwd_t) - - // Deconvolution - GPU_INSTANCE_NVIDIA(cudnn_deconvolution_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_deconvolution_bwd_data_t) - GPU_INSTANCE_NVIDIA(cudnn_deconvolution_bwd_weights_t) - - // Convolution - GPU_INSTANCE_NVIDIA(cudnn_convolution_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_convolution_bwd_data_t) - GPU_INSTANCE_NVIDIA(cudnn_convolution_bwd_weights_t) - - // Batch Normalization - GPU_INSTANCE_NVIDIA(cudnn_batch_normalization_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_batch_normalization_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_bwd_t) - - // Layer Normalization - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_layer_normalization_bwd_t) - - // PReLU - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_prelu_bwd_t) - - // Pooling - GPU_INSTANCE_NVIDIA(cudnn_pooling_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_pooling_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_bwd_t) - - // LRN - GPU_INSTANCE_NVIDIA(cudnn_lrn_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_lrn_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_bwd_t) - - // Inner Product - GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_fwd_t) - 
GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_bwd_data_t) - GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_bwd_data_t) - GPU_INSTANCE_NVIDIA(cudnn_gemm_inner_product_bwd_weights_t) - GPU_INSTANCE_NVIDIA(cudnn_conv_inner_product_bwd_weights_t) - - // Softmax - GPU_INSTANCE_NVIDIA(cudnn_softmax_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_softmax_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_bwd_t) - - // Binary - GPU_INSTANCE_NVIDIA(cudnn_binary_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_binary_t) - - // MatMul - GPU_INSTANCE_NVIDIA(cudnn_matmul_t) - - // Resampling - GPU_INSTANCE_NVIDIA(cudnn_resampling_fwd_t) - GPU_INSTANCE_NVIDIA(cudnn_resampling_bwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_fwd_t) - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_resampling_bwd_t) - - // Reduction - GPU_INSTANCE_NVIDIA(cudnn_reduction_t) - - // Shuffle - GPU_INSTANCE_GENERIC_SYCL(sycl::ref_shuffle_t) - nullptr, -}; -// clang-format on -} // namespace -const dnnl::impl::impl_list_item_t *sycl_cuda_engine_t::get_implementation_list( - const op_desc_t *) const { - return sycl_cuda_impl_list; -} - } // namespace nvidia } // namespace gpu } // namespace impl diff --git a/src/gpu/nvidia/sycl_cuda_engine.hpp b/src/gpu/nvidia/sycl_cuda_engine.hpp index 5f5312f78a7..00a1206cd47 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.hpp +++ b/src/gpu/nvidia/sycl_cuda_engine.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,7 @@ #include "common/stream.hpp" #include "common/thread_local_storage.hpp" +#include "gpu/gpu_impl_list.hpp" #include "gpu/nvidia/sycl_cuda_utils.hpp" #include "sycl/sycl_device_info.hpp" #include 
"sycl/sycl_engine_base.hpp" @@ -32,14 +33,6 @@ namespace impl { namespace gpu { namespace nvidia { -class cuda_gpu_engine_impl_list_t { -public: - static const impl_list_item_t *get_reorder_implementation_list( - const memory_desc_t *src_md, const memory_desc_t *dst_md); - static const dnnl::impl::impl_list_item_t *get_concat_implementation_list(); - static const dnnl::impl::impl_list_item_t *get_sum_implementation_list(); -}; - class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t { public: using base_t = dnnl::impl::sycl::sycl_engine_base_t; @@ -52,28 +45,29 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t { status_t create_stream(stream_t **stream, unsigned flags) override; status_t create_stream(stream_t **stream, ::sycl::queue &queue); - const dnnl::impl::impl_list_item_t *get_reorder_implementation_list( + void activate_stream_cudnn(CUstream cuda_stream); + void activate_stream_cublas(CUstream cuda_stream); + + const impl_list_item_t *get_reorder_implementation_list( const memory_desc_t *src_md, const memory_desc_t *dst_md) const override { - return cuda_gpu_engine_impl_list_t::get_reorder_implementation_list( + return gpu::gpu_impl_list_t::get_reorder_implementation_list( src_md, dst_md); } - const dnnl::impl::impl_list_item_t * - get_concat_implementation_list() const override { - return cuda_gpu_engine_impl_list_t::get_concat_implementation_list(); + const impl_list_item_t *get_concat_implementation_list() const override { + return gpu::gpu_impl_list_t::get_concat_implementation_list(); } - const dnnl::impl::impl_list_item_t * - get_sum_implementation_list() const override { - return cuda_gpu_engine_impl_list_t::get_sum_implementation_list(); + const impl_list_item_t *get_sum_implementation_list() const override { + return gpu::gpu_impl_list_t::get_sum_implementation_list(); } - void activate_stream_cudnn(CUstream cuda_stream); - void activate_stream_cublas(CUstream cuda_stream); - const impl_list_item_t 
*get_implementation_list( - const op_desc_t *) const override; + const op_desc_t *desc) const override { + return gpu::gpu_impl_list_t::get_implementation_list(desc); + } + CUcontext get_underlying_context() const; CUdevice get_underlying_device() const; cudnnHandle_t *get_cudnn_handle(); From 19473d4ab5714738f8ef0b509f1a84acc6a74527 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 14 May 2024 12:15:41 -0700 Subject: [PATCH 102/187] gpu: amd: move impls to the common list --- src/gpu/amd/miopen_reorder_impl.cpp | 49 ------------------- src/gpu/amd/sycl_hip_engine.cpp | 60 ------------------------ src/gpu/amd/sycl_hip_engine.hpp | 46 ++++++------------ src/gpu/gpu_batch_normalization_list.cpp | 6 +++ src/gpu/gpu_binary_list.cpp | 5 ++ src/gpu/gpu_convolution_list.cpp | 7 +++ src/gpu/gpu_deconvolution_list.cpp | 7 +++ src/gpu/gpu_eltwise_list.cpp | 6 +++ src/gpu/gpu_inner_product_list.cpp | 7 +++ src/gpu/gpu_lrn_list.cpp | 6 +++ src/gpu/gpu_matmul_list.cpp | 5 ++ src/gpu/gpu_pooling_list.cpp | 6 +++ src/gpu/gpu_reduction_list.cpp | 5 ++ src/gpu/gpu_reorder_list.cpp | 7 +++ src/gpu/gpu_softmax_list.cpp | 6 +++ 15 files changed, 87 insertions(+), 141 deletions(-) delete mode 100644 src/gpu/amd/miopen_reorder_impl.cpp diff --git a/src/gpu/amd/miopen_reorder_impl.cpp b/src/gpu/amd/miopen_reorder_impl.cpp deleted file mode 100644 index 03d0418f39f..00000000000 --- a/src/gpu/amd/miopen_reorder_impl.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Intel Corporation -* Copyright 2020-2022 Codeplay Software Limited -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ -#include "common/engine.hpp" -#include "gpu/amd/miopen_reorder.hpp" -#include "gpu/amd/sycl_hip_engine.hpp" -#include "gpu/gpu_impl_list.hpp" -#include "gpu/intel/ocl/cross_engine_reorder.hpp" - -namespace dnnl { -namespace impl { -namespace gpu { -namespace amd { - -namespace { - -// clang-format off -constexpr impl_list_item_t hip_reorder_impl_list[] = { - GPU_REORDER_INSTANCE_AMD(gpu::intel::ocl::cross_engine_reorder_t::pd_t) - GPU_REORDER_INSTANCE_AMD(gpu::amd::miopen_reorder_t::pd_t) - nullptr, -}; -// clang-format on - -} // namespace - -const impl_list_item_t * -hip_gpu_engine_impl_list_t::get_reorder_implementation_list( - const memory_desc_t *, const memory_desc_t *) { - return hip_reorder_impl_list; -} - -} // namespace amd -} // namespace gpu -} // namespace impl -} // namespace dnnl diff --git a/src/gpu/amd/sycl_hip_engine.cpp b/src/gpu/amd/sycl_hip_engine.cpp index fc6191a870a..ec3e6247aba 100644 --- a/src/gpu/amd/sycl_hip_engine.cpp +++ b/src/gpu/amd/sycl_hip_engine.cpp @@ -22,17 +22,6 @@ #include "miopen/miopen.h" #include "xpu/sycl/utils.hpp" -#include "gpu/amd/miopen_batch_normalization.hpp" -#include "gpu/amd/miopen_binary.hpp" -#include "gpu/amd/miopen_convolution.hpp" -#include "gpu/amd/miopen_deconvolution.hpp" -#include "gpu/amd/miopen_eltwise.hpp" -#include "gpu/amd/miopen_gemm_inner_product.hpp" -#include "gpu/amd/miopen_lrn.hpp" -#include "gpu/amd/miopen_matmul.hpp" -#include "gpu/amd/miopen_pooling.hpp" -#include "gpu/amd/miopen_reduction.hpp" 
-#include "gpu/amd/miopen_softmax.hpp" #include "gpu/amd/sycl_hip_compat.hpp" #include "gpu/amd/sycl_hip_engine.hpp" #include "gpu/amd/sycl_hip_scoped_context.hpp" @@ -162,55 +151,6 @@ void sycl_hip_engine_t::activate_stream_miopen(HIPstream hip_stream) { } } -namespace { -using namespace dnnl::impl::data_type; - -// clang-format off -constexpr dnnl::impl::impl_list_item_t sycl_hip_impl_list[] = { - // Binary - GPU_INSTANCE_AMD(miopen_binary_t) - // Elementwise - GPU_INSTANCE_AMD(miopen_eltwise_fwd_t) - GPU_INSTANCE_AMD(miopen_eltwise_bwd_t) - // Softmax - GPU_INSTANCE_AMD(miopen_softmax_fwd_t) - GPU_INSTANCE_AMD(miopen_softmax_bwd_t) - // LRN - GPU_INSTANCE_AMD(miopen_lrn_fwd_t) - GPU_INSTANCE_AMD(miopen_lrn_bwd_t) - // Pooling - GPU_INSTANCE_AMD(miopen_pooling_fwd_t) - GPU_INSTANCE_AMD(miopen_pooling_bwd_t) - // Reduction - GPU_INSTANCE_AMD(miopen_reduction_t) - // MatMul - GPU_INSTANCE_AMD(miopen_matmul_t) - // Inner Product - GPU_INSTANCE_AMD(miopen_gemm_inner_product_fwd_t) - GPU_INSTANCE_AMD(miopen_gemm_inner_product_bwd_data_t) - GPU_INSTANCE_AMD(miopen_gemm_inner_product_bwd_weights_t) - // Convolution - GPU_INSTANCE_AMD(miopen_convolution_fwd_t) - GPU_INSTANCE_AMD(miopen_convolution_bwd_data_t) - GPU_INSTANCE_AMD(miopen_convolution_bwd_weights_t) - // Batch Normalization - GPU_INSTANCE_AMD(miopen_batch_normalization_fwd_t) - GPU_INSTANCE_AMD(miopen_batch_normalization_bwd_t) - // Deconvolution - GPU_INSTANCE_AMD(miopen_deconvolution_fwd_t) - GPU_INSTANCE_AMD(miopen_deconvolution_bwd_data_t) - GPU_INSTANCE_AMD(miopen_deconvolution_bwd_weights_t) - - nullptr, -}; -// clang-format on -} // namespace - -const dnnl::impl::impl_list_item_t *sycl_hip_engine_t::get_implementation_list( - const op_desc_t *) const { - return sycl_hip_impl_list; -} - } // namespace amd } // namespace gpu } // namespace impl diff --git a/src/gpu/amd/sycl_hip_engine.hpp b/src/gpu/amd/sycl_hip_engine.hpp index 33fc8bc3e81..44352752303 100644 --- a/src/gpu/amd/sycl_hip_engine.hpp +++ 
b/src/gpu/amd/sycl_hip_engine.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2020 Codeplay Software Limited * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,25 +33,6 @@ namespace impl { namespace gpu { namespace amd { -class hip_gpu_engine_impl_list_t { -public: - static const impl_list_item_t *get_reorder_implementation_list( - const memory_desc_t *src_md, const memory_desc_t *dst_md); - static const dnnl::impl::impl_list_item_t * - get_concat_implementation_list() { - static impl_list_item_t hip_concat_impl_list[] = { - nullptr, - }; - return hip_concat_impl_list; - } - static const dnnl::impl::impl_list_item_t *get_sum_implementation_list() { - static impl_list_item_t hip_sum_impl_list[] = { - nullptr, - }; - return hip_sum_impl_list; - } -}; - class sycl_hip_engine_t : public dnnl::impl::sycl::sycl_engine_base_t { public: using base_t = dnnl::impl::sycl::sycl_engine_base_t; @@ -64,28 +45,29 @@ class sycl_hip_engine_t : public dnnl::impl::sycl::sycl_engine_base_t { status_t create_stream(stream_t **stream, unsigned flags) override; status_t create_stream(stream_t **stream, ::sycl::queue &queue); - const dnnl::impl::impl_list_item_t *get_reorder_implementation_list( + void activate_stream_miopen(HIPstream hip_stream); + void activate_stream_rocblas(HIPstream hip_stream); + + const impl_list_item_t *get_reorder_implementation_list( const memory_desc_t *src_md, const memory_desc_t *dst_md) const override { - return hip_gpu_engine_impl_list_t::get_reorder_implementation_list( + return gpu::gpu_impl_list_t::get_reorder_implementation_list( src_md, dst_md); } - const dnnl::impl::impl_list_item_t * - get_concat_implementation_list() const override { - return hip_gpu_engine_impl_list_t::get_concat_implementation_list(); + const impl_list_item_t *get_concat_implementation_list() const override { + 
return gpu::gpu_impl_list_t::get_concat_implementation_list(); } - const dnnl::impl::impl_list_item_t * - get_sum_implementation_list() const override { - return hip_gpu_engine_impl_list_t::get_sum_implementation_list(); + const impl_list_item_t *get_sum_implementation_list() const override { + return gpu::gpu_impl_list_t::get_sum_implementation_list(); } - void activate_stream_miopen(HIPstream hip_stream); - void activate_stream_rocblas(HIPstream hip_stream); - const impl_list_item_t *get_implementation_list( - const op_desc_t *) const override; + const op_desc_t *desc) const override { + return gpu::gpu_impl_list_t::get_implementation_list(desc); + } + hipCtx_t get_underlying_context() const; hipDevice_t get_underlying_device() const; miopenHandle_t *get_miopen_handle(); diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp index eb2940bbedf..305a11b9bf8 100644 --- a/src/gpu/gpu_batch_normalization_list.cpp +++ b/src/gpu/gpu_batch_normalization_list.cpp @@ -34,6 +34,10 @@ #include "gpu/sycl/ref_batch_normalization.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_batch_normalization.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -52,6 +56,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_batch_normalization_fwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_fwd_t) nullptr, }}, @@ -63,6 +68,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_bwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_bwd_t) + GPU_INSTANCE_AMD(amd::miopen_batch_normalization_bwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_batch_normalization_bwd_t) nullptr, })}, diff --git a/src/gpu/gpu_binary_list.cpp 
b/src/gpu/gpu_binary_list.cpp index ceca5466fcb..002e6f8ece4 100644 --- a/src/gpu/gpu_binary_list.cpp +++ b/src/gpu/gpu_binary_list.cpp @@ -27,6 +27,10 @@ #include "gpu/sycl/ref_binary.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_binary.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -39,6 +43,7 @@ constexpr impl_list_item_t impl_list[] = REG_BINARY_P({ GPU_INSTANCE_INTEL(intel::ocl::gen9_binary_t) GPU_INSTANCE_INTEL(intel::ocl::ref_binary_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_binary_t) + GPU_INSTANCE_AMD(amd::miopen_binary_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_binary_t) nullptr, }); diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp index a9ee384a495..c2596f0201e 100644 --- a/src/gpu/gpu_convolution_list.cpp +++ b/src/gpu/gpu_convolution_list.cpp @@ -32,6 +32,10 @@ #include "gpu/nvidia/cudnn_convolution.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_convolution.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -48,6 +52,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_wino_convolution_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_convolution_fwd_t) nullptr, }}, {{backward_data}, REG_BWD_D_PK({ @@ -55,6 +60,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_data_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_data_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_data_t) + GPU_INSTANCE_AMD(amd::miopen_convolution_bwd_data_t) nullptr, })}, {{backward_weights}, REG_BWD_PK({ @@ -62,6 +68,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_weights_t) GPU_INSTANCE_INTEL(intel::ocl::ref_convolution_bwd_weights_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_weights_t) + GPU_INSTANCE_AMD(amd::miopen_convolution_bwd_weights_t) nullptr, })}, }); diff --git 
a/src/gpu/gpu_deconvolution_list.cpp b/src/gpu/gpu_deconvolution_list.cpp index 488dc9d636f..057a1ca62de 100644 --- a/src/gpu/gpu_deconvolution_list.cpp +++ b/src/gpu/gpu_deconvolution_list.cpp @@ -24,6 +24,10 @@ #include "gpu/nvidia/cudnn_deconvolution.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_deconvolution.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -37,6 +41,7 @@ const std::map> {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_deconvolution_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ @@ -44,6 +49,8 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::ref_deconvolution_bwd_weights_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_data_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_weights_t) + GPU_INSTANCE_AMD(amd::miopen_deconvolution_bwd_data_t) + GPU_INSTANCE_AMD(amd::miopen_deconvolution_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp index 51b66628894..15b56b98c27 100644 --- a/src/gpu/gpu_eltwise_list.cpp +++ b/src/gpu/gpu_eltwise_list.cpp @@ -26,6 +26,10 @@ #include "gpu/sycl/ref_eltwise.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_eltwise.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -40,6 +44,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_eltwise_fwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_fwd_t) nullptr, }}, @@ -47,6 +52,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_bwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_bwd_t) + GPU_INSTANCE_AMD(amd::miopen_eltwise_bwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_eltwise_bwd_t) nullptr, })}, 
diff --git a/src/gpu/gpu_inner_product_list.cpp b/src/gpu/gpu_inner_product_list.cpp index 7a2257775fa..3ce83f80509 100644 --- a/src/gpu/gpu_inner_product_list.cpp +++ b/src/gpu/gpu_inner_product_list.cpp @@ -28,6 +28,10 @@ #include "gpu/nvidia/cudnn_gemm_inner_product.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_gemm_inner_product.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -44,6 +48,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::ref_inner_product_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ @@ -55,6 +60,8 @@ const std::map> GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_bwd_weights_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_bwd_data_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_bwd_weights_t) + GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_bwd_data_t) + GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_bwd_weights_t) nullptr, })}, }); diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp index 8d993b1bff1..17abf2a9799 100644 --- a/src/gpu/gpu_lrn_list.cpp +++ b/src/gpu/gpu_lrn_list.cpp @@ -25,6 +25,10 @@ #include "gpu/sycl/ref_lrn.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_lrn.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -38,12 +42,14 @@ const std::map> {{forward}, { GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_lrn_fwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_bwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_bwd_t) + GPU_INSTANCE_AMD(amd::miopen_lrn_bwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_lrn_bwd_t) nullptr, })}, diff --git a/src/gpu/gpu_matmul_list.cpp 
b/src/gpu/gpu_matmul_list.cpp index af5b46789eb..e28c46fda41 100644 --- a/src/gpu/gpu_matmul_list.cpp +++ b/src/gpu/gpu_matmul_list.cpp @@ -25,6 +25,10 @@ #include "gpu/nvidia/cudnn_matmul.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_matmul.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -36,6 +40,7 @@ constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({ GPU_INSTANCE_INTEL(intel::ocl::gemm_matmul_t) GPU_INSTANCE_INTEL(intel::ocl::ref_matmul_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_matmul_t) + GPU_INSTANCE_AMD(amd::miopen_matmul_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp index 0f79b6c2fcc..e827094ef0f 100644 --- a/src/gpu/gpu_pooling_list.cpp +++ b/src/gpu/gpu_pooling_list.cpp @@ -30,6 +30,10 @@ #include "gpu/sycl/ref_pooling.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_pooling.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -46,6 +50,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_pooling_fwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_fwd_t) nullptr, }}, @@ -54,6 +59,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_bwd_t) GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_bwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_bwd_t) + GPU_INSTANCE_AMD(amd::miopen_pooling_bwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_pooling_bwd_t) nullptr, })}, diff --git a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp index 558feda807d..b29c238e04a 100644 --- a/src/gpu/gpu_reduction_list.cpp +++ b/src/gpu/gpu_reduction_list.cpp @@ -32,6 +32,10 @@ #include "gpu/nvidia/cudnn_reduction.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_reduction.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -46,6 
+50,7 @@ constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({ GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t) + GPU_INSTANCE_AMD(amd::miopen_reduction_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp index 3a5c1524f6e..0d868d40035 100644 --- a/src/gpu/gpu_reorder_list.cpp +++ b/src/gpu/gpu_reorder_list.cpp @@ -30,6 +30,11 @@ #include "gpu/nvidia/cudnn_reorder.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_reorder.hpp" +#include "gpu/intel/ocl/cross_engine_reorder.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -48,6 +53,8 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({ GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t) // slow but fits every use case GPU_REORDER_INSTANCE_NVIDIA(intel::ocl::cross_engine_reorder_t::pd_t) GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t) + GPU_REORDER_INSTANCE_AMD(intel::ocl::cross_engine_reorder_t::pd_t) + GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t) nullptr, }); // clang-format on diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp index f7121ef4e0c..91f80e8eee5 100644 --- a/src/gpu/gpu_softmax_list.cpp +++ b/src/gpu/gpu_softmax_list.cpp @@ -27,6 +27,10 @@ #include "gpu/sycl/ref_softmax.hpp" #endif +#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD +#include "gpu/amd/miopen_softmax.hpp" +#endif + namespace dnnl { namespace impl { namespace gpu { @@ -42,6 +46,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_fwd_t) GPU_INSTANCE_INTEL(intel::ocl::reusable_softmax_fwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_fwd_t) + GPU_INSTANCE_AMD(amd::miopen_softmax_fwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_fwd_t) nullptr, }}, @@ -49,6 +54,7 @@ const std::map> GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_bwd_t) 
GPU_INSTANCE_INTEL(intel::ocl::ref_softmax_bwd_t) GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_bwd_t) + GPU_INSTANCE_AMD(amd::miopen_softmax_bwd_t) GPU_INSTANCE_GENERIC_SYCL(sycl::ref_sycl_softmax_bwd_t) nullptr, })}, From b892dba2c28ebeaf697e3f5ed24683cab3f37b8f Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Tue, 14 May 2024 23:28:05 -0700 Subject: [PATCH 103/187] build: integrate nvidia and amd parts properly --- cmake/SYCL.cmake | 48 +++++++++++++++++++++-------------- src/CMakeLists.txt | 2 +- src/gpu/CMakeLists.txt | 3 --- src/gpu/amd/CMakeLists.txt | 23 +++++------------ src/gpu/nvidia/CMakeLists.txt | 27 +++----------------- 5 files changed, 40 insertions(+), 63 deletions(-) diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index f247b78db54..63843a765a4 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -67,6 +67,27 @@ else() list(APPEND EXTRA_SHARED_LIBS OpenCL::OpenCL) endif() +# CUDA and ROCm contain OpenCL headers that conflict with the OpenCL +# headers located in the compiler's directory. +# The workaround is to get interface include directories from all CUDA/ROCm +# import targets and lower their priority via `-idirafter` so that the +# compiler picks up the proper OpenCL headers. 
+macro(adjust_headers_priority targets) + if(NOT WIN32) + set(include_dirs) + foreach(import_target ${targets}) + get_target_property(import_target_include_dirs ${import_target} INTERFACE_INCLUDE_DIRECTORIES) + set_target_properties(${import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "") + list(APPEND include_dirs ${import_target_include_dirs}) + endforeach() + + list(REMOVE_DUPLICATES include_dirs) + foreach(include_dir ${include_dirs}) + append(CMAKE_CXX_FLAGS "-idirafter${include_dir}") + endforeach() + endif() +endmacro() + if(DNNL_SYCL_CUDA) # XXX: Suppress warning coming from SYCL headers: # error: use of function template name with no prior declaration in @@ -80,31 +101,20 @@ if(DNNL_SYCL_CUDA) find_package(cuBLAS REQUIRED) find_package(cuDNN REQUIRED) - if(NOT WIN32) - # XXX: CUDA contains OpenCL headers that conflict with the OpenCL - # headers located in the compiler's directory. - # The workaround is the following: - # Get interface include directories from all CUDA related import - # targets and lower their priority via `-idirafter` so that the - # compiler picks up the proper OpenCL headers. 
- set(cuda_include_dirs) - foreach(cuda_import_target cuBLAS::cuBLAS;cuDNN::cuDNN) - get_target_property(cuda_import_target_include_dirs ${cuda_import_target} INTERFACE_INCLUDE_DIRECTORIES) - set_target_properties(${cuda_import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "") - list(APPEND cuda_include_dirs ${cuda_import_target_include_dirs}) - endforeach() - - list(REMOVE_DUPLICATES cuda_include_dirs) - foreach(cuda_include_dir ${cuda_include_dirs}) - append(CMAKE_CXX_FLAGS "-idirafter${cuda_include_dir}") - endforeach() - endif() + adjust_headers_priority("cuBLAS::cuBLAS;cuDNN::cuDNN") + add_definitions_with_host_compiler("-DCUDA_NO_HALF") + list(APPEND EXTRA_SHARED_LIBS cuBLAS::cuBLAS cuDNN::cuDNN) message(STATUS "DPC++ support is enabled (CUDA)") elseif(DNNL_SYCL_HIP) find_package(HIP REQUIRED) find_package(rocBLAS REQUIRED) find_package(MIOpen REQUIRED) + + adjust_headers_priority("HIP::HIP;rocBLAS::rocBLAS;MIOpen::MIOpen") + add_definitions_with_host_compiler("-D__HIP_PLATFORM_AMD__=1") + + list(APPEND EXTRA_SHARED_LIBS HIP::HIP rocBLAS::rocBLAS MIOpen::MIOpen) message(STATUS "DPC++ support is enabled (HIP)") else() # In order to support large shapes. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b060468f0ad..68efa2742e0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -196,7 +196,7 @@ add_library(${LIB_PACKAGE_NAME} ${DNNL_LIBRARY_TYPE} ${VERSION_RESOURCE_FILE} ${HEADERS_ROOT} ${HEADERS_SUBDIR} ${LIB_DEPS}) # LINK_PRIVATE for cmake 2.8.11 compatibility -target_link_libraries(${LIB_PACKAGE_NAME} LINK_PRIVATE ${${LIB_PACKAGE_NAME}_INTERFACE} ${STATIC_LIB_DEPS} ${SHARED_LIB_DEPS}) +target_link_libraries(${LIB_PACKAGE_NAME} LINK_PRIVATE ${STATIC_LIB_DEPS} ${SHARED_LIB_DEPS}) set_property(TARGET ${LIB_PACKAGE_NAME} PROPERTY OUTPUT_NAME ${DNNL_LIBRARY_NAME}) set_property(TARGET ${LIB_PACKAGE_NAME} PROPERTY VERSION "${DNNL_VERSION_MAJOR}.${DNNL_VERSION_MINOR}") diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt index ed471a4ab39..cd9e338611b 100644 --- a/src/gpu/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -39,13 +39,10 @@ endif() if(DNNL_GPU_VENDOR STREQUAL "NVIDIA") add_subdirectory(nvidia) - # Pass ${LIB_PACKAGE_NAME}_INTERFACE to upper level for proper linking - set(${LIB_PACKAGE_NAME}_INTERFACE "${${LIB_PACKAGE_NAME}_INTERFACE}" PARENT_SCOPE) endif() if(DNNL_GPU_VENDOR STREQUAL "AMD") add_subdirectory(amd) - set(${LIB_PACKAGE_NAME}_INTERFACE "${${LIB_PACKAGE_NAME}_INTERFACE}" PARENT_SCOPE) endif() # check if warn_unused_result can be used in an alias diff --git a/src/gpu/amd/CMakeLists.txt b/src/gpu/amd/CMakeLists.txt index c87b8cc8397..cc11f17563a 100644 --- a/src/gpu/amd/CMakeLists.txt +++ b/src/gpu/amd/CMakeLists.txt @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2022-2023 Intel Corporation +# Copyright 2022-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,21 +14,12 @@ # limitations under the License. 
#=============================================================================== -file(GLOB_RECURSE SOURCES +file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_sycl_amd) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_amd) add_library(${OBJ_LIB} OBJECT ${SOURCES}) - - -set_target_properties(${OBJ_LIB} PROPERTIES COMPILE_DEFINITIONS "$;$" - COMPILE_OPTIONS "$;$") - -target_include_directories(${OBJ_LIB} PRIVATE $ $) - -add_library(${OBJ_LIB}_interface INTERFACE) -target_link_libraries(${OBJ_LIB}_interface INTERFACE rocBLAS::rocBLAS MIOpen::MIOpen) -set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) - -set(${LIB_PACKAGE_NAME}_INTERFACE ${${LIB_PACKAGE_NAME}_INTERFACE} ${OBJ_LIB}_interface PARENT_SCOPE) +set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS + $) diff --git a/src/gpu/nvidia/CMakeLists.txt b/src/gpu/nvidia/CMakeLists.txt index 9f3cd5f8e6a..733b47d4505 100644 --- a/src/gpu/nvidia/CMakeLists.txt +++ b/src/gpu/nvidia/CMakeLists.txt @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2020-2023 Intel Corporation +# Copyright 2020-2024 Intel Corporation # Copyright 2020 Codeplay Software Limited # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,33 +15,12 @@ # limitations under the License. 
#=============================================================================== -file(GLOB_RECURSE SOURCES +file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ) -set(OBJ_LIB ${LIB_PACKAGE_NAME}_sycl_nvidia) +set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_nvidia) add_library(${OBJ_LIB} OBJECT ${SOURCES}) - -set_target_properties( - ${OBJ_LIB} - PROPERTIES - COMPILE_DEFINITIONS - "$;$" - COMPILE_OPTIONS - "$;$" -) -target_include_directories( - ${OBJ_LIB} - PRIVATE $ - $) - -add_library(${OBJ_LIB}_interface INTERFACE) -target_link_libraries(${OBJ_LIB}_interface INTERFACE cuBLAS::cuBLAS - cuDNN::cuDNN) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) - -set(${LIB_PACKAGE_NAME}_INTERFACE - ${${LIB_PACKAGE_NAME}_INTERFACE} ${OBJ_LIB}_interface - PARENT_SCOPE) From da4d5e64a69150d0a0074c070a73bd4db2afa338 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Thu, 16 May 2024 15:24:39 -0700 Subject: [PATCH 104/187] gpu: remove unnecessary header inclusion --- src/gpu/gpu_impl_list.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp index d520b5bd299..7e137a4fbb8 100644 --- a/src/gpu/gpu_impl_list.hpp +++ b/src/gpu/gpu_impl_list.hpp @@ -20,8 +20,6 @@ #include #include -#include "oneapi/dnnl/dnnl.h" - #include "common/engine.hpp" #include "common/impl_list_item.hpp" #include "common/impl_registration.hpp" From 743dcdc70a3e8cced6662bd7db3948f1c37f4e83 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 16 May 2024 12:10:57 -0700 Subject: [PATCH 105/187] gpu: jit: gemm: don't free variables from r0 header --- src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp index dcfcdc93575..7999736bc68 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp @@ -20030,7 +20030,6 @@ 
void gemm_kernel_generator_t::gemmSimpleLinearOrder( state); if (!strategy.persistent) { - state.ra.safeRelease(state.inputs.groupIDMN); state.ra.safeRelease(state.inputs.groupCountM); state.ra.safeRelease(state.inputs.groupCountN); state.ra.safeRelease(state.inputs.gcMNRecip); From 90695a7cddc83119142c96c0aaf4ee0ed48e77f2 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Fri, 17 May 2024 10:58:10 +0400 Subject: [PATCH 106/187] build: fix micro sdpa build errors --- src/gpu/gpu_sdpa_list.cpp | 6 ++---- src/gpu/intel/jit/gemm/microkernel_provider.cpp | 14 +++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp index 42ea1d00f66..8195fbb10d8 100644 --- a/src/gpu/gpu_sdpa_list.cpp +++ b/src/gpu/gpu_sdpa_list.cpp @@ -29,10 +29,8 @@ namespace { // clang-format off constexpr impl_list_item_t impl_list[] = { - INSTANCE(intel::ocl::micro_sdpa_t) -#ifdef DNNL_DEV_MODE - INSTANCE(intel::ocl::ref_sdpa_t) -#endif + GPU_INSTANCE_INTEL(intel::ocl::micro_sdpa_t) + GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::ref_sdpa_t) nullptr, }; // clang-format on diff --git a/src/gpu/intel/jit/gemm/microkernel_provider.cpp b/src/gpu/intel/jit/gemm/microkernel_provider.cpp index dcd3a5f068c..7673151c2f1 100644 --- a/src/gpu/intel/jit/gemm/microkernel_provider.cpp +++ b/src/gpu/intel/jit/gemm/microkernel_provider.cpp @@ -207,13 +207,13 @@ Package selectGEMMMicrokernel(GEMMProtocol protocol, HWInformation hwInfo, } switch (hw) { - ARCH_DISPATCH(Gen9) - ARCH_DISPATCH(Gen11) - ARCH_DISPATCH(XeLP) - ARCH_DISPATCH(XeHP) - ARCH_DISPATCH(XeHPG) - ARCH_DISPATCH(XeHPC) - ARCH_DISPATCH(Xe2) + REG_GEN9_ISA(ARCH_DISPATCH(Gen9)) + REG_GEN11_ISA(ARCH_DISPATCH(Gen11)) + REG_XELP_ISA(ARCH_DISPATCH(XeLP)) + REG_XEHP_ISA(ARCH_DISPATCH(XeHP)) + REG_XEHPG_ISA(ARCH_DISPATCH(XeHPG)) + REG_XEHPC_ISA(ARCH_DISPATCH(XeHPC)) + REG_XE2_ISA(ARCH_DISPATCH(Xe2)) default: throw std::runtime_error("Unsupported architecture"); } #undef ARCH_DISPATCH 
From 802989c43dceeed31ec0ac5bbe9523932de10426 Mon Sep 17 00:00:00 2001 From: Daniel Kuts Date: Fri, 17 May 2024 13:32:42 +0300 Subject: [PATCH 107/187] cpu: reorder: check pointers before dereferencing --- src/cpu/reorder/cpu_reorder_pd.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cpu/reorder/cpu_reorder_pd.hpp b/src/cpu/reorder/cpu_reorder_pd.hpp index 625a9da0905..d1c8499c151 100644 --- a/src/cpu/reorder/cpu_reorder_pd.hpp +++ b/src/cpu/reorder/cpu_reorder_pd.hpp @@ -65,11 +65,13 @@ struct cpu_reorder_pd_t : public reorder_pd_t { if (D_start) *D_start = utils::array_product(input_d.dims(), ndims_start); - if (D_mask) + if (D_mask) { *D_mask = utils::array_product( input_d.dims() + ndims_start, ndims_mask); - assert(*D_mask >= 1); - if (D_rest) *D_rest = input_d.nelems() / (*D_start * *D_mask); + assert(*D_mask >= 1); + } + if (D_rest && D_start && D_mask) + *D_rest = input_d.nelems() / (*D_start * *D_mask); } // The function serves same purpose as `dnnl::impl::cpu::precompute_scales`. From e41e332da61f0142df7e0056819a285c5e03d043 Mon Sep 17 00:00:00 2001 From: Ryo Suzuki Date: Mon, 29 Apr 2024 10:36:48 +0000 Subject: [PATCH 108/187] cpu: reorder: use jit uni reorder for bf16 add restriction for bf16 use when beta == 0.f as it is unsupported use jit uni reorder for aarch64 src=bf16 and dst=bf16 --- src/cpu/aarch64/jit_uni_reorder.cpp | 34 ++++++++++++-------- src/cpu/reorder/cpu_reorder_regular_bf16.cpp | 3 ++ 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp index a28815cedf6..a5b34a02a66 100644 --- a/src/cpu/aarch64/jit_uni_reorder.cpp +++ b/src/cpu/aarch64/jit_uni_reorder.cpp @@ -1,7 +1,7 @@ /******************************************************************************* * Copyright 2018-2023 Intel Corporation * Copyright 2020-2023 FUJITSU LIMITED -* Copyright 2022-2023 Arm Ltd. and affiliates +* Copyright 2022-2024 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,13 +161,20 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { static bool applicable(const prb_t &p) { using namespace data_type; + bool bf16_ok + = (mayiuse_bf16() && (p.itype == bf16) && (p.otype == bf16) + && !interim_f32_needed(p, false) && p.beta == 0.f) + || (p.itype != bf16 && p.otype != bf16) + || (p.itype == f32 && p.otype == bf16 && mayiuse_bf16() + && p.beta == 0.f); + bool ok = true && p.ndims > 0 - && utils::one_of(p.itype, f32, s32, data_type::s8, u8) + && utils::one_of(p.itype, f32, bf16, s32, data_type::s8, u8) && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8) && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */ && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p) - && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16())); + && bf16_ok; return ok; } @@ -701,7 +708,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { const int load_tail_step = !can_load_xmm && can_store_xmm ? ur_step : load_step; - const bool interim_f32 = interim_f32_needed(); + const bool interim_f32 = interim_f32_needed(prb_, compensation_needed_); const bool need_saturation = (utils::one_of(prb_.otype, u8, data_type::s8, s32) @@ -1284,17 +1291,18 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { } } - bool interim_f32_needed() { + static bool interim_f32_needed(const prb_t &prb, bool compensation_needed) { using namespace data_type; - return utils::one_of(f32, prb_.itype, prb_.otype) - || prb_.src_scale_type != scale_type_t::NONE - || prb_.dst_scale_type != scale_type_t::NONE || prb_.beta != 0.f - || ((prb_.req_src_zp || prb_.req_dst_zp) - ? 
!(prb_.itype == s32 && prb_.otype == s32) + bool ret = utils::one_of(f32, prb.itype, prb.otype) + || prb.src_scale_type != scale_type_t::NONE + || prb.dst_scale_type != scale_type_t::NONE || prb.beta != 0.f + || ((prb.req_src_zp || prb.req_dst_zp) + ? !(prb.itype == s32 && prb.otype == s32) : false) - || (prb_.itype != f32 && compensation_needed_) - || prb_.scale_adjust != 1.f; + || (prb.itype != f32 && compensation_needed) + || prb.scale_adjust != 1.f; + return ret; } void process_unroll_generic( @@ -1312,7 +1320,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { int curr = 0; // will switch between 0 and 1 - const bool interim_f32 = interim_f32_needed(); + const bool interim_f32 = interim_f32_needed(prb_, compensation_needed_); if (prb_.req_src_zp) { add_imm(X_DEFAULT_ADDR, PARAM(src_zp), X_TMP_0); diff --git a/src/cpu/reorder/cpu_reorder_regular_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_bf16.cpp index 8384e0b56e9..d00039dad64 100644 --- a/src/cpu/reorder/cpu_reorder_regular_bf16.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_bf16.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020-2023 Intel Corporation +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,6 +53,8 @@ const impl_list_map_t &regular_bf16_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, u8, OIdhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, u8, OIdhw16i16o)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + REG_SR(bf16, any, bf16, any, fmt_order::any, spec::reference) REG_SR(bf16, any, f32, any, fmt_order::any, spec::reference) REG_SR(bf16, any, s8, any, fmt_order::any, spec::reference) From 5e0a8a96018422e7e8911856c81589ebbb6a8c4e Mon Sep 17 00:00:00 2001 From: Kentaro Kawakami Date: Fri, 10 May 2024 19:49:50 +0900 Subject: [PATCH 109/187] aarch64: shuffle: fix segv for bf16 cases --- src/cpu/aarch64/shuffle/jit_uni_shuffle.cpp | 10 ++-- .../shuffle/jit_uni_shuffle_kernel.cpp | 46 ++++++++++++------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/cpu/aarch64/shuffle/jit_uni_shuffle.cpp b/src/cpu/aarch64/shuffle/jit_uni_shuffle.cpp index 4d8d15219eb..4d1cf145692 100644 --- a/src/cpu/aarch64/shuffle/jit_uni_shuffle.cpp +++ b/src/cpu/aarch64/shuffle/jit_uni_shuffle.cpp @@ -1,6 +1,6 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation -* Copyright 2022 FUJITSU LIMITED +* Copyright 2020-2024 Intel Corporation +* Copyright 2022-2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,6 +34,7 @@ template status_t jit_uni_shuffle_t::pd_t::init(engine_t *engine) { using namespace format_tag; using namespace data_type; + using namespace types; const memory_desc_wrapper src_d(is_fwd() ? src_md() : diff_src_md()); const memory_desc_wrapper dst_d(is_fwd() ?
dst_md() : diff_dst_md()); @@ -58,7 +59,10 @@ status_t jit_uni_shuffle_t::pd_t::init(engine_t *engine) { if (blocked_format == format_tag::undef) return status::unimplemented; conf_.blk_size = src_d.blocking_desc().strides[ndims() - 1]; - conf_.simd_w = cpu_isa_traits::vlen / sizeof(float); + /* Because "ST1H { .S }, , [, .S, UXTW #1]" is used + to gather data for bf16, simd_w must be calculated + with sizeof(uint32_t). */ + conf_.simd_w = cpu_isa_traits::vlen / sizeof(uint32_t); const bool has_spatial = utils::one_of(ndims(), 3, 4, 5); const dim_t HW = H() * W(); diff --git a/src/cpu/aarch64/shuffle/jit_uni_shuffle_kernel.cpp b/src/cpu/aarch64/shuffle/jit_uni_shuffle_kernel.cpp index 72272616f5e..3ae88f5ad18 100644 --- a/src/cpu/aarch64/shuffle/jit_uni_shuffle_kernel.cpp +++ b/src/cpu/aarch64/shuffle/jit_uni_shuffle_kernel.cpp @@ -1,6 +1,6 @@ /******************************************************************************* -* Copyright 2021-2022 Intel Corporation -* Copyright 2022 FUJITSU LIMITED +* Copyright 2021-2024 Intel Corporation +* Copyright 2022-2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,9 +47,12 @@ jit_uni_shuffle_kernel_t::jit_uni_shuffle_kernel_t( template void jit_uni_shuffle_kernel_t::prepare_mask() { using namespace data_type; + using namespace types; if (conf_.simd_tail > 0) { - assert(utils::one_of(conf_.data_type, f32, s32)); - assert(conf_.simd_tail < isa_sveLen / sizeof(float)); + /* Because "ST1H { .S }, , [, .S, UXTW #1]" is used + to gather data for bf16, simd_tail must be evaluated + with sizeof(unsigned). 
*/ + assert(conf_.simd_tail < isa_sveLen / sizeof(uint32_t)); index(vmm_tmp_.s, 0, 1); cmplt(k_tail_mask_.s, P_ALL_ONE / T_z, vmm_tmp_.s, conf_.simd_tail); } @@ -68,13 +71,17 @@ void jit_uni_shuffle_kernel_t::prepare_mask() {} template void jit_uni_shuffle_kernel_t::gather_data(const XReg &reg_src_addr, const int indices_idx, const int data_idx, const bool is_tail) { - if (conf_.dt_size == sizeof(float)) { - const PReg &mask = is_tail ? k_tail_mask_ : k_full_mask_; + using namespace data_type; + const PReg &mask = is_tail ? k_tail_mask_ : k_full_mask_; + + if (utils::one_of(conf_.data_type, f32, s32)) { lsr(TRegS(indices_idx), TRegS(indices_idx), 2); ld1w(TRegS(data_idx), mask / T_z, ptr(reg_src_addr, TRegS(indices_idx), UXTW, 2)); - } else { - assert(!"unsupported emu_gather_data"); + } else if (conf_.data_type == bf16) { + lsr(TRegS(indices_idx), TRegS(indices_idx), 1); + ld1h(TRegS(data_idx), mask / T_z, + ptr(reg_src_addr, TRegS(indices_idx), UXTW, 1)); } } @@ -97,21 +104,26 @@ void jit_uni_shuffle_kernel_t::gather_data(const XReg &addr, template void jit_uni_shuffle_kernel_t::store_data(const int data_idx, const XReg &reg_dst_addr, const int offset, const bool is_tail) { + using namespace data_type; const auto extend_for_padding = is_tail && padding_size_ + conf_.simd_tail >= conf_.simd_w; + const PReg &mask = is_tail ?
k_tail_mask_ : P_ALL_ONE; + + add_imm(X_DEFAULT_ADDR, reg_dst_addr, offset, X_TMP_0); + if (extend_for_padding) { sel(vmm_tmp_.s, k_tail_mask_, TRegS(data_idx), vmm_zero_.s); - add_imm(X_DEFAULT_ADDR, reg_dst_addr, offset, X_TMP_0); - st1w(vmm_tmp_.s, P_ALL_ONE, ptr(X_DEFAULT_ADDR)); + if (utils::one_of(conf_.data_type, f32, s32)) + st1w(vmm_tmp_.s, P_ALL_ONE, ptr(X_DEFAULT_ADDR)); + else // bf16 + st1h(vmm_tmp_.s, P_ALL_ONE, ptr(X_DEFAULT_ADDR)); } else { - if (is_tail) { - add_imm(X_DEFAULT_ADDR, reg_dst_addr, offset, X_TMP_0); - st1w(TRegS(data_idx), k_tail_mask_, ptr(X_DEFAULT_ADDR)); - } else { - add_imm(X_DEFAULT_ADDR, reg_dst_addr, offset, X_TMP_0); - st1w(TRegS(data_idx), P_ALL_ONE, ptr(X_DEFAULT_ADDR)); - } + if (utils::one_of(conf_.data_type, f32, s32)) + st1w(TRegS(data_idx), mask, ptr(X_DEFAULT_ADDR)); + else // bf16 + st1h(TRegS(data_idx), mask, ptr(X_DEFAULT_ADDR)); } + append_zero_padding( reg_dst_, isa_sveLen > 128 ? extend_for_padding : false); } From 6aa7dc3ae33228ddd9b35949ba87e7f283745e0c Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 8 May 2024 14:23:43 -0700 Subject: [PATCH 110/187] api: add new plain tags These tags are intended to be replaced with a strided version of memory desc creation since they are coming from the dynamic shapes at runtime.
--- include/oneapi/dnnl/dnnl.hpp | 3 +++ include/oneapi/dnnl/dnnl_types.h | 3 +++ src/common/c_types_map.hpp | 3 +++ src/common/dnnl_debug_autogenerated.cpp | 3 +++ src/common/memory_desc_wrapper.cpp | 3 +++ src/common/tag_traits.hpp | 3 +++ tests/benchdnn/dnnl_debug_autogenerated.cpp | 3 +++ 7 files changed, 21 insertions(+) diff --git a/include/oneapi/dnnl/dnnl.hpp b/include/oneapi/dnnl/dnnl.hpp index 2e967465520..0874fe2a261 100644 --- a/include/oneapi/dnnl/dnnl.hpp +++ b/include/oneapi/dnnl/dnnl.hpp @@ -2694,6 +2694,9 @@ struct memory : public handle { aCB8b16c = dnnl_aCB8b16c, BA8a8b = dnnl_BA8a8b, aCB8b8c = dnnl_aCB8b8c, + bcad = dnnl_bcad, + cabd = dnnl_cabd, + dabc = dnnl_dabc, }; /// A memory descriptor. diff --git a/include/oneapi/dnnl/dnnl_types.h b/include/oneapi/dnnl/dnnl_types.h index 4fecf2d503f..a8c819a5914 100644 --- a/include/oneapi/dnnl/dnnl_types.h +++ b/include/oneapi/dnnl/dnnl_types.h @@ -1032,6 +1032,9 @@ typedef enum { dnnl_aCB8b16c, dnnl_BA8a8b, dnnl_aCB8b8c, + dnnl_bcad, + dnnl_cabd, + dnnl_dabc, /// Just a sentinel, not real memory format tag. Must be changed after new /// format tag is added. 
diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp index d61d0ef6c97..5fe9b98544e 100644 --- a/src/common/c_types_map.hpp +++ b/src/common/c_types_map.hpp @@ -282,13 +282,16 @@ const format_tag_t ba = dnnl_ba; const format_tag_t bac = dnnl_bac; const format_tag_t bacd = dnnl_bacd; const format_tag_t bca = dnnl_bca; +const format_tag_t bcad = dnnl_bcad; const format_tag_t bcda = dnnl_bcda; const format_tag_t bcdea = dnnl_bcdea; const format_tag_t bacde = dnnl_bacde; const format_tag_t cab = dnnl_cab; const format_tag_t cba = dnnl_cba; +const format_tag_t cabd = dnnl_cabd; const format_tag_t cdab = dnnl_cdab; const format_tag_t cdba = dnnl_cdba; +const format_tag_t dabc = dnnl_dabc; const format_tag_t dcab = dnnl_dcab; const format_tag_t cdeab = dnnl_cdeab; const format_tag_t cdeba = dnnl_cdeba; diff --git a/src/common/dnnl_debug_autogenerated.cpp b/src/common/dnnl_debug_autogenerated.cpp index dca7900eb34..af529e99024 100644 --- a/src/common/dnnl_debug_autogenerated.cpp +++ b/src/common/dnnl_debug_autogenerated.cpp @@ -936,6 +936,9 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) { if (v == dnnl_aCB8b16c) return "aCB8b16c"; if (v == dnnl_BA8a8b) return "BA8a8b"; if (v == dnnl_aCB8b8c) return "aCB8b8c"; + if (v == dnnl_bcad) return "bcad"; + if (v == dnnl_cabd) return "cabd"; + if (v == dnnl_dabc) return "dabc"; if (v == dnnl_format_tag_last) return "format_tag_last"; if (v == dnnl_x) return "x"; if (v == dnnl_nc) return "nc"; diff --git a/src/common/memory_desc_wrapper.cpp b/src/common/memory_desc_wrapper.cpp index d69afec6671..4d6cb0b92cc 100644 --- a/src/common/memory_desc_wrapper.cpp +++ b/src/common/memory_desc_wrapper.cpp @@ -172,12 +172,15 @@ status_t memory_desc_wrapper::compute_blocking( C(bacd, {1, 0, 2, 3}, {}, {}); C(bacde, {1, 0, 2, 3, 4}, {}, {}); C(bca, {1, 2, 0}, {}, {}); + C(bcad, {1, 2, 0, 3}, {}, {}); C(bcda, {1, 2, 3, 0}, {}, {}); C(bcdea, {1, 2, 3, 4, 0}, {}, {}); C(cab, {2, 0, 1}, {}, {}); C(cba, {2, 1, 0}, {}, {}); + 
C(cabd, {2, 0, 1, 3}, {}, {}); C(cdab, {2, 3, 0, 1}, {}, {}); C(cdba, {2, 3, 1, 0}, {}, {}); + C(dabc, {3, 0, 1, 2}, {}, {}); C(dcab, {3, 2, 0, 1}, {}, {}); C(cdeab, {2, 3, 4, 0, 1}, {}, {}); C(cdeba, {2, 3, 4, 1, 0}, {}, {}); diff --git a/src/common/tag_traits.hpp b/src/common/tag_traits.hpp index cede4e4eb84..487f4581c9e 100644 --- a/src/common/tag_traits.hpp +++ b/src/common/tag_traits.hpp @@ -346,10 +346,13 @@ DECL_TRAITS(bac, _, _, 3); DECL_TRAITS(bacd, _, _, 4); DECL_TRAITS(bacde, _, _, 5); DECL_TRAITS(bca, _, _, 3); +DECL_TRAITS(bcad, _, _, 4); DECL_TRAITS(bcda, _, _, 4); DECL_TRAITS(bcdea, _, _, 5); DECL_TRAITS(cba, _, _, 3); +DECL_TRAITS(cabd, _, _, 4); DECL_TRAITS(cdba, _, _, 4); +DECL_TRAITS(dabc, _, _, 4); DECL_TRAITS(dcab, _, _, 4); DECL_TRAITS(cdeba, _, _, 5); DECL_TRAITS(decab, _, _, 5); diff --git a/tests/benchdnn/dnnl_debug_autogenerated.cpp b/tests/benchdnn/dnnl_debug_autogenerated.cpp index f5013a99569..959dcfe2588 100644 --- a/tests/benchdnn/dnnl_debug_autogenerated.cpp +++ b/tests/benchdnn/dnnl_debug_autogenerated.cpp @@ -913,6 +913,9 @@ dnnl_format_tag_t str2fmt_tag(const char *str) { CASE(aCB8b16c); CASE(BA8a8b); CASE(aCB8b8c); + CASE(bcad); + CASE(cabd); + CASE(dabc); CASE(x); CASE(nc); CASE(cn); From 684034d602ae7c9983ac7c11df9045fd85264c57 Mon Sep 17 00:00:00 2001 From: "Gu, Yonghao" Date: Wed, 15 May 2024 02:22:40 +0000 Subject: [PATCH 111/187] gtests: graph: unit: dnnl: fix the event list size assert --- tests/gtests/graph/unit/backend/dnnl/test_op_executable.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/gtests/graph/unit/backend/dnnl/test_op_executable.cpp b/tests/gtests/graph/unit/backend/dnnl/test_op_executable.cpp index 1ed0b930d87..27407f826a2 100644 --- a/tests/gtests/graph/unit/backend/dnnl/test_op_executable.cpp +++ b/tests/gtests/graph/unit/backend/dnnl/test_op_executable.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 
Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,7 @@ TEST(test_op_executable_op_executable, DummyImpl) { auto returned_event2 = op_exec->execute_sycl( p_stream, {}, {std::move(input_event0), std::move(input_event1)}); const auto &event_list2 = returned_event2.get_wait_list(); - ASSERT_GT(event_list2.size(), 0U); + ASSERT_LE(event_list2.size(), 2U); returned_event2.wait(); ASSERT_EQ( returned_event2 From eba5fcfbbd8e54bb44c730d67e4bdab333e15f5f Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Thu, 16 May 2024 21:29:25 +0800 Subject: [PATCH 112/187] graph: backend: dnnl: fix include when cpu rt is none --- src/graph/backend/dnnl/platform.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/graph/backend/dnnl/platform.cpp b/src/graph/backend/dnnl/platform.cpp index 9c385fd22de..edec4c44ebd 100644 --- a/src/graph/backend/dnnl/platform.cpp +++ b/src/graph/backend/dnnl/platform.cpp @@ -16,6 +16,7 @@ #include "graph/backend/dnnl/platform.hpp" +#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE #if DNNL_X64 #include "cpu/x64/cpu_isa_traits.hpp" #elif DNNL_AARCH64 @@ -26,6 +27,7 @@ #include "src/common/cpuinfo/CpuInfo.h" #endif #endif +#endif namespace dnnl { namespace impl { From e0889f624872dce9b5a533c7e62b25f7b5f5aa02 Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Tue, 14 May 2024 09:23:05 +0800 Subject: [PATCH 113/187] graph: backend: dnnl: check nullptr before dereference --- src/graph/backend/dnnl/kernels/mqa.hpp | 10 ++++++---- src/graph/backend/dnnl/kernels/sdp.hpp | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/graph/backend/dnnl/kernels/mqa.hpp b/src/graph/backend/dnnl/kernels/mqa.hpp index 6350df9fbfc..59a1e3dd9a4 100644 --- a/src/graph/backend/dnnl/kernels/mqa.hpp +++ b/src/graph/backend/dnnl/kernels/mqa.hpp @@ -187,7 +187,7 @@ struct mqa_decomp_config_t { const std::vector 
&inputs) { // Record the ops inside of MQA pattern in a specific order. - record_mqa_ops(sg, quantized); + record_mqa_ops(sg); // Acquire the data type from input param for later primitive creation. // The src and wei dt of both quantized mqa and float mqa are the same. @@ -441,7 +441,7 @@ struct mqa_decomp_config_t { impl::status_t record_input_offset(const std::shared_ptr &sg, const std::vector &inputs) { auto find_graph_inport = [&](std::shared_ptr val) { - // for quantized mamtul, it has producer such as add_zp,sub_zp,mul_scale. + // for quantized matmul, it has producer such as add_zp,sub_zp,mul_scale. if (val->get_consumers()[0].get_op().get_kind() == graph::op_kind::MatMul) { while (val->has_producer()) { @@ -471,6 +471,9 @@ struct mqa_decomp_config_t { } else mm2 = cur_op; } + if (impl::utils::one_of(nullptr, mm1, mm2, add)) + return status::invalid_graph; + int src1_id = find_graph_inport(mm1->get_input_value(0)); graph_inport.emplace_back(src1_id); int wei1_id = find_graph_inport(mm1->get_input_value(1)); @@ -485,8 +488,7 @@ struct mqa_decomp_config_t { return status::success; } - impl::status_t record_mqa_ops( - std::shared_ptr &sg, bool is_quantize) { + impl::status_t record_mqa_ops(std::shared_ptr &sg) { subgraph_rewriter_t rewriter(sg); op_ptr reorder1, reorder2, matmul1, softmax, matmul2; for (const auto &cur_op : sg->get_ops()) { diff --git a/src/graph/backend/dnnl/kernels/sdp.hpp b/src/graph/backend/dnnl/kernels/sdp.hpp index 722034d69b1..1e7ceadfbe5 100644 --- a/src/graph/backend/dnnl/kernels/sdp.hpp +++ b/src/graph/backend/dnnl/kernels/sdp.hpp @@ -537,7 +537,7 @@ struct sdp_decomp_config_t { impl::status_t record_input_offset(const std::shared_ptr &sg, const std::vector &inputs) { auto find_graph_inport = [&](std::shared_ptr val) { - // for quantized mamtul, it has producer such as add_zp,sub_zp,mul_scale. + // for quantized matmul, it has producer such as add_zp,sub_zp,mul_scale. 
if (val->get_consumers()[0].get_op().get_kind() == graph::op_kind::MatMul) { while (val->has_producer()) { @@ -575,6 +575,9 @@ struct sdp_decomp_config_t { } else mm2 = cur_op; } + if (impl::utils::one_of(nullptr, mm1, mm2)) + return status::invalid_graph; + int src1_id = find_graph_inport(mm1->get_input_value(0)); graph_inport.emplace_back(src1_id); int wei1_id = find_graph_inport(mm1->get_input_value(1)); From e93576dfad348ae6e9719b3c80388c6a69381c4a Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Tue, 14 May 2024 09:23:54 +0800 Subject: [PATCH 114/187] gtests: graph: unit: use dim_t to avoid overflow --- tests/gtests/graph/unit/utils.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/gtests/graph/unit/utils.hpp b/tests/gtests/graph/unit/utils.hpp index 6e15e8dbbea..9feab434b38 100644 --- a/tests/gtests/graph/unit/utils.hpp +++ b/tests/gtests/graph/unit/utils.hpp @@ -1277,8 +1277,10 @@ inline void construct_dnnl_float_JAX_MHA(dnnl::impl::graph::graph_t *agraph, } inline void construct_dnnl_float_JAX_MQA(dnnl::impl::graph::graph_t *agraph, - impl::data_type_t dtype = impl::data_type::f32, int batch_size = 1, - int seq_len = 384, int num_head = 16, int size_per_head = 64) { + impl::data_type_t dtype = impl::data_type::f32, + impl::graph::dim_t batch_size = 1, impl::graph::dim_t seq_len = 384, + impl::graph::dim_t num_head = 16, + impl::graph::dim_t size_per_head = 64) { using namespace dnnl::impl::graph; using namespace dnnl::graph::tests; From b8e440841c15c8030290aab65eb4506d5a01f24e Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 15 May 2024 15:03:28 -0700 Subject: [PATCH 115/187] x64: post-ops injectors: support different fp8 types in binary post-ops --- src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp | 52 ++++++++------ src/cpu/x64/brgemm/jit_brgemm_kernel.cpp | 63 ++++++++++------- .../x64/injectors/jit_uni_binary_injector.cpp | 68 +++++++++++++------ .../x64/injectors/jit_uni_binary_injector.hpp | 11 ++- 
src/cpu/x64/jit_brgemm_post_ops.hpp | 64 +++++++++++------ 5 files changed, 171 insertions(+), 87 deletions(-) diff --git a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp index 7f4fbdadffb..856401c3013 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp @@ -45,16 +45,33 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { , brg(abrg) , postops_injector_(nullptr) { - if (brg.is_fp8_via_convert() - && one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d)) - f8_e5m2_emulator_ = utils::make_unique(this, - fp8_emu_xmm_1(), fp8_emu_xmm_2(), fp8_emu_xmm_3(), - fp8_tmp_mask, fp8_tmp_reg); - if (brg.is_fp8_via_convert() - && one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d)) - f8_e4m3_emulator_ = utils::make_unique(this, - fp8_emu_xmm_1(), fp8_emu_xmm_2(), fp8_emu_xmm_3(), - fp8_emu_xmm_4(), fp8_emu_xmm_5(), fp8_tmp_reg); + bool has_f8_e5m2_binary_postops = false; + bool has_f8_e4m3_binary_postops = false; + if (brg.with_binary) { + const auto &post_ops = brg.attr()->post_ops_; + for (int i = 0; i < post_ops.len(); i++) { + const auto &entry = post_ops.entry_[i]; + if (!entry.is_binary()) continue; + has_f8_e5m2_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e5m2; + has_f8_e4m3_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e4m3; + } + } + + if (brg.is_fp8_via_convert() || has_f8_e5m2_binary_postops + || has_f8_e4m3_binary_postops) { + if (one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d) + || has_f8_e5m2_binary_postops) + f8_e5m2_emulator_ = utils::make_unique( + this, fp8_emu_xmm_1(), fp8_emu_xmm_2(), fp8_emu_xmm_3(), + fp8_tmp_mask, fp8_tmp_reg); + if (one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d) + || has_f8_e4m3_binary_postops) + f8_e4m3_emulator_ = utils::make_unique( + this, fp8_emu_xmm_1(), fp8_emu_xmm_2(), fp8_emu_xmm_3(), + fp8_emu_xmm_4(), fp8_emu_xmm_5(), fp8_tmp_reg); + } if 
(brg.with_eltwise || brg.with_binary || brg.with_sum) { @@ -83,14 +100,9 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { dst_md_wrapper, static_cast(brg.ldb_tail), ld_tail_mask, use_exact_tail_scalar_bcast}; - fp8_emulation_base_t *f8_emu = nullptr; - if (brg.dt_d == data_type::f8_e5m2) - f8_emu = f8_e5m2_emulator_.get(); - else if (brg.dt_d == data_type::f8_e4m3) - f8_emu = f8_e4m3_emulator_.get(); - - const binary_injector::static_params_t bsp( - this->param1, enabled_bcast_strategy, rhs_sp, f8_emu); + const binary_injector::static_params_t bsp(this->param1, + enabled_bcast_strategy, rhs_sp, f8_e5m2_emulator_.get(), + f8_e4m3_emulator_.get()); eltwise_injector::static_params_t esp; esp.preserve_vmm = preserve_vmm; @@ -136,8 +148,8 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { using po_injector_t = injector::jit_uni_postops_injector_base_t; std::unique_ptr postops_injector_; - std::unique_ptr f8_e5m2_emulator_; - std::unique_ptr f8_e4m3_emulator_; + std::unique_ptr f8_e5m2_emulator_; + std::unique_ptr f8_e4m3_emulator_; using reg64_t = const Xbyak::Reg64; enum { diff --git a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp index 24efbdbd02a..20f250fa06d 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp @@ -57,21 +57,39 @@ struct jit_brgemm_kernel_t : public jit_generator { const int is_ldb_tail = brg.ldb_tail ? 
1 : 0; is_ldb_loop_ = brg.ldb2 + is_ldb2_tail + is_ldb_tail > 1; - if (brg.is_fp8_via_convert() - && one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_c, - brg.dt_d)) - // Note: avoid using 'vmm0' since it is used as - // 'fp8_to_f16_upconvert()' param and would collision with these - // emulation vmms - f8_e5m2_emulator_ = utils::make_unique(this, - xmm_fp8_emu_aux2, xmm_fp8_emu_aux3, xmm_fp8_emu_aux4, - kmask_fp8_aux, reg64_fp8_aux); - if (brg.is_fp8_via_convert() - && one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_c, - brg.dt_d)) - f8_e4m3_emulator_ = utils::make_unique(this, - xmm_fp8_emu_aux1, xmm_fp8_emu_aux2, xmm_fp8_emu_aux3, - xmm_fp8_emu_aux4, xmm_fp8_emu_aux5, reg64_fp8_aux); + bool has_f8_e5m2_binary_postops = false; + bool has_f8_e4m3_binary_postops = false; + if (brg.with_binary) { + const auto &post_ops = brg.attr()->post_ops_; + for (int i = 0; i < post_ops.len(); i++) { + const auto &entry = post_ops.entry_[i]; + if (!entry.is_binary()) continue; + has_f8_e5m2_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e5m2; + has_f8_e4m3_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e4m3; + } + } + + if (brg.is_fp8_via_convert() || has_f8_e5m2_binary_postops + || has_f8_e4m3_binary_postops) { + if (one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_c, + brg.dt_d) + || has_f8_e5m2_binary_postops) + // Note: avoid using 'vmm0' since it is used as + // 'fp8_to_f16_upconvert()' param and would collision with these + // emulation vmms + f8_e5m2_emulator_ = utils::make_unique( + this, xmm_fp8_emu_aux2, xmm_fp8_emu_aux3, + xmm_fp8_emu_aux4, kmask_fp8_aux, reg64_fp8_aux); + if (one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_c, + brg.dt_d) + || has_f8_e4m3_binary_postops) + f8_e4m3_emulator_ = utils::make_unique( + this, xmm_fp8_emu_aux1, xmm_fp8_emu_aux2, + xmm_fp8_emu_aux3, xmm_fp8_emu_aux4, xmm_fp8_emu_aux5, + reg64_fp8_aux); + } if (brg.with_eltwise || brg.with_binary || brg.with_sum) { @@ -98,14 
+116,9 @@ struct jit_brgemm_kernel_t : public jit_generator { dst_md_wrapper, static_cast(brg.ldb_tail), ld_tail_mask, use_exact_tail_scalar_bcast}; - fp8_emulation_base_t *f8_emu = nullptr; - if (brg.dt_d == data_type::f8_e5m2) - f8_emu = f8_e5m2_emulator_.get(); - else if (brg.dt_d == data_type::f8_e4m3) - f8_emu = f8_e4m3_emulator_.get(); - - const binary_injector::static_params_t bsp { - this->param1, enabled_bcast_strategy, rhs_sp, f8_emu}; + const binary_injector::static_params_t bsp {this->param1, + enabled_bcast_strategy, rhs_sp, f8_e5m2_emulator_.get(), + f8_e4m3_emulator_.get()}; auto st = safe_ptr_assign(postops_injector_, po_injector_t::create( @@ -140,8 +153,8 @@ struct jit_brgemm_kernel_t : public jit_generator { using po_injector_t = injector::jit_uni_postops_injector_base_t; std::unique_ptr postops_injector_; std::unique_ptr bf16_emu_; - std::unique_ptr f8_e5m2_emulator_; - std::unique_ptr f8_e4m3_emulator_; + std::unique_ptr f8_e5m2_emulator_; + std::unique_ptr f8_e4m3_emulator_; Xbyak::Label avx_tail_mask_; Xbyak::Label sum_zp_scale_data_; diff --git a/src/cpu/x64/injectors/jit_uni_binary_injector.cpp b/src/cpu/x64/injectors/jit_uni_binary_injector.cpp index ee3ab9a88ab..e105aca31eb 100644 --- a/src/cpu/x64/injectors/jit_uni_binary_injector.cpp +++ b/src/cpu/x64/injectors/jit_uni_binary_injector.cpp @@ -172,17 +172,18 @@ bool all_binary_postop_rhs_per_oc_broadcast(const post_ops_t &post_ops, static_params_t::static_params_t(const Xbyak::Reg64 ¶m1, const bcast_set_t &supported_strategy_set, const rhs_arg_static_params_t &rhs_arg_static_params, - fp8_emulation_base_t *f8_emu) + fp8_emulation_e5m2_t *f8_e5m2_emu, fp8_emulation_e4m3_t *f8_e4m3_emu) : param1(param1) , supported_strategy_set(supported_strategy_set) , rhs_arg_static_params(rhs_arg_static_params) - , f8_emu_(f8_emu) {} + , f8_e5m2_emu_(f8_e5m2_emu) + , f8_e4m3_emu_(f8_e4m3_emu) {} static_params_t::static_params_t(const Xbyak::Reg64 ¶m1, const bcast_set_t &supported_strategy_set, const 
rhs_arg_static_params_t &rhs_arg_static_params) - : static_params_t( - param1, supported_strategy_set, rhs_arg_static_params, nullptr) {} + : static_params_t(param1, supported_strategy_set, rhs_arg_static_params, + nullptr, nullptr) {} static_params_t::static_params_t(const Xbyak::Reg64 ¶m1, const rhs_arg_static_params_t &rhs_arg_static_params) @@ -259,7 +260,8 @@ template jit_uni_binary_injector_t::jit_uni_binary_injector_t( jit_generator *host, const static_params_t &static_params) : host_(host) - , f8_emu_(static_params.f8_emu_) + , f8_e5m2_emu_(static_params.f8_e5m2_emu_) + , f8_e4m3_emu_(static_params.f8_e4m3_emu_) , rhs_arg_static_params_(static_params.rhs_arg_static_params) , param1_(static_params.param1) , supported_strategy_set_(static_params.supported_strategy_set) {} @@ -2426,10 +2428,16 @@ void jit_uni_binary_injector_t::execute_broadcast_no_tail( assert(!"unsupported ISA for given data type"); break; case data_type::f8_e5m2: + if (is_superset(isa, avx512_core_fp16)) { + assert(f8_e5m2_emu_); + f8_e5m2_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); + } else + assert(!"unsupported ISA for given data type"); + break; case data_type::f8_e4m3: if (is_superset(isa, avx512_core_fp16)) { - assert(f8_emu_); - f8_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); + assert(f8_e4m3_emu_); + f8_e4m3_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); } else assert(!"unsupported ISA for given data type"); break; @@ -2580,10 +2588,17 @@ void jit_uni_binary_injector_t::execute_broadcast_tail_with_opmask( assert(!"unsupported masked tail processing"); break; case data_type::f8_e5m2: + if (is_superset(isa, avx512_core_fp16)) { + assert(f8_e5m2_emu_); + f8_e5m2_emu_->vcvt_f8_to_f32( + tmp_vmm | tail_opmask | host_->T_z, rhs_addr); + } else + assert(!"unsupported ISA for given data type"); + break; case data_type::f8_e4m3: if (is_superset(isa, avx512_core_fp16)) { - assert(f8_emu_); - f8_emu_->vcvt_f8_to_f32( + assert(f8_e4m3_emu_); + f8_e4m3_emu_->vcvt_f8_to_f32( tmp_vmm | tail_opmask | 
host_->T_z, rhs_addr); } else assert(!"unsupported ISA for given data type"); @@ -2711,7 +2726,8 @@ struct helper_bcast_tail_t { static void execute_broadcast_tail_statically(jit_generator *host, const size_t tail_size, const data_type_t &data_type, const Vmm &tmp_vmm, const Xbyak::Address &rhs_addr, - fp8_emulation_base_t *f8_emu) { + fp8_emulation_e5m2_t *f8_e5m2_emu, + fp8_emulation_e4m3_t *f8_e4m3_emu) { if (utils::one_of(data_type, data_type::bf16, data_type::f16, data_type::f8_e5m2, data_type::f8_e4m3)) { const auto tmp_lower_vmm = @@ -2723,9 +2739,10 @@ struct helper_bcast_tail_t { host->vpslld(tmp_vmm, tmp_vmm, 16); } else if (data_type == data_type::f16) { host->vcvtph2ps(tmp_vmm, tmp_lower_vmm); - } else if (utils::one_of(data_type, data_type::f8_e5m2, - data_type::f8_e4m3)) { - f8_emu->vcvt_f8_to_f32(tmp_vmm, tmp_lower_vmm); + } else if (data_type == data_type::f8_e5m2) { + f8_e5m2_emu->vcvt_f8_to_f32(tmp_vmm, tmp_lower_vmm); + } else if (data_type == data_type::f8_e4m3) { + f8_e4m3_emu->vcvt_f8_to_f32(tmp_vmm, tmp_lower_vmm); } else assert(!"Unsupported data type"); @@ -2751,7 +2768,7 @@ void jit_uni_binary_injector_t::execute_broadcast_tail_statically(host_, tail_size, - data_type, tmp_vmm, rhs_addr, f8_emu_); + data_type, tmp_vmm, rhs_addr, f8_e5m2_emu_, f8_e4m3_emu_); } template <> @@ -2762,7 +2779,7 @@ void jit_uni_binary_injector_t::execute_broadcast_tail_statically(host_, tail_size, - data_type, tmp_vmm, rhs_addr, f8_emu_); + data_type, tmp_vmm, rhs_addr, f8_e5m2_emu_, f8_e4m3_emu_); } template <> @@ -2920,10 +2937,16 @@ void jit_uni_binary_injector_t::load_rhs_no_tail( assert(!"unsupported ISA for given data type"); break; case data_type::f8_e5m2: + if (is_superset(isa, avx512_core_fp16)) { + assert(f8_e5m2_emu_); + f8_e5m2_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); + } else + assert(!"unsupported ISA for given data type"); + break; case data_type::f8_e4m3: if (is_superset(isa, avx512_core_fp16)) { - assert(f8_emu_); - 
f8_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); + assert(f8_e4m3_emu_); + f8_e4m3_emu_->vcvt_f8_to_f32(tmp_vmm, rhs_addr); } else assert(!"unsupported ISA for given data type"); break; @@ -3002,10 +3025,17 @@ void jit_uni_binary_injector_t::load_rhs_tail_dynamically_with_opmask( assert(!"unsupported masked tail processing"); break; case data_type::f8_e5m2: + if (is_superset(isa, avx512_core_fp16)) { + assert(f8_e5m2_emu_); + f8_e5m2_emu_->vcvt_f8_to_f32( + tmp_vmm | tail_opmask | host_->T_z, rhs_addr); + } else + assert(!"unsupported ISA for given data type"); + break; case data_type::f8_e4m3: if (is_superset(isa, avx512_core_fp16)) { - assert(f8_emu_); - f8_emu_->vcvt_f8_to_f32( + assert(f8_e4m3_emu_); + f8_e4m3_emu_->vcvt_f8_to_f32( tmp_vmm | tail_opmask | host_->T_z, rhs_addr); } else assert(!"unsupported ISA for given data type"); diff --git a/src/cpu/x64/injectors/jit_uni_binary_injector.hpp b/src/cpu/x64/injectors/jit_uni_binary_injector.hpp index da51c2cd474..c13c5e07158 100644 --- a/src/cpu/x64/injectors/jit_uni_binary_injector.hpp +++ b/src/cpu/x64/injectors/jit_uni_binary_injector.hpp @@ -170,7 +170,8 @@ struct static_params_t { static_params_t(const Xbyak::Reg64 ¶m1, const bcast_set_t &supported_strategy_set, const rhs_arg_static_params_t &rhs_arg_static_params, - fp8_emulation_base_t *f8_emu); + fp8_emulation_e5m2_t *f8_e5m2_emu, + fp8_emulation_e4m3_t *f8_e4m3_emu); static_params_t(const Xbyak::Reg64 ¶m1, const bcast_set_t &supported_strategy_set, const rhs_arg_static_params_t &rhs_arg_static_params); @@ -180,7 +181,10 @@ struct static_params_t { Xbyak::Reg64 param1; const bcast_set_t supported_strategy_set; rhs_arg_static_params_t rhs_arg_static_params; - fp8_emulation_base_t *f8_emu_ {nullptr}; + // Both fp8 (e5m2 and e4m3) binary post-ops data types are possible. + // Therefore, we need both fp8 emulators. 
+ fp8_emulation_e5m2_t *f8_e5m2_emu_ {nullptr}; + fp8_emulation_e4m3_t *f8_e4m3_emu_ {nullptr}; }; /* @@ -579,7 +583,8 @@ class jit_uni_binary_injector_t { Xbyak::Opmask get_aux_kmask() const; jit_generator *host_; - fp8_emulation_base_t *f8_emu_ {nullptr}; + fp8_emulation_e5m2_t *f8_e5m2_emu_ {nullptr}; + fp8_emulation_e4m3_t *f8_e4m3_emu_ {nullptr}; const rhs_arg_static_params_t rhs_arg_static_params_; const Xbyak::Reg64 param1_; const bcast_set_t supported_strategy_set_; diff --git a/src/cpu/x64/jit_brgemm_post_ops.hpp b/src/cpu/x64/jit_brgemm_post_ops.hpp index 3e7a8a0e9b5..e741d232519 100644 --- a/src/cpu/x64/jit_brgemm_post_ops.hpp +++ b/src/cpu/x64/jit_brgemm_post_ops.hpp @@ -377,6 +377,38 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { brg.attr()->post_ops_, memory_desc_wrapper(brg.dst_md()))) { + bool has_f8_e5m2_binary_postops = false; + bool has_f8_e4m3_binary_postops = false; + if (brg.with_binary) { + const auto &post_ops = attr.post_ops_; + for (int i = 0; i < post_ops.len(); i++) { + const auto &entry = post_ops.entry_[i]; + if (!entry.is_binary()) continue; + has_f8_e5m2_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e5m2; + has_f8_e4m3_binary_postops = entry.binary.src1_desc.data_type + == data_type::f8_e4m3; + } + } + + if (brg.is_bf16_emu) + bf16_emu_ = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_scratch, emu_reserv_4, + emu_reserv_4); + if (brg.is_fp8_via_convert() || has_f8_e5m2_binary_postops + || has_f8_e4m3_binary_postops) { + if (utils::one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d) + || has_f8_e5m2_binary_postops) + f8_e5m2_emulator_ = utils::make_unique( + this, emu_reserv_1, emu_reserv_2, emu_reserv_3, + emu_mask, emu_scratch); + if (utils::one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d) + || has_f8_e4m3_binary_postops) + f8_e4m3_emulator_ = utils::make_unique( + this, emu_reserv_1, emu_reserv_2, emu_reserv_3, + emu_reserv_4, emu_reserv_5, emu_scratch); + 
} + if (brg.beta != 0) { static constexpr bool preserve_gpr = true; static constexpr bool preserve_vmm = true; @@ -389,7 +421,14 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { memory_desc_wrapper(brg.dst_md()), static_cast(brg.load_dim % brg.ld_block), k_tail_mask, use_exact_tail_scalar_bcast}; - const binary_injector::static_params_t bsp {this->param1, rhs_sp}; + const binary_injector::static_params_t bsp(this->param1, + bcast_set_t {broadcasting_strategy_t::scalar, + broadcasting_strategy_t::per_oc, + broadcasting_strategy_t::per_oc_spatial, + broadcasting_strategy_t::per_mb_w, + broadcasting_strategy_t::per_w, + broadcasting_strategy_t::no_broadcast}, + rhs_sp, f8_e5m2_emulator_.get(), f8_e4m3_emulator_.get()); const bool save_state = jcp.with_eltwise; const auto &reserved_eltwise_gpr = reg_reserved_eltwise; @@ -402,22 +441,6 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { injector::jit_uni_postops_injector_t>( this, attr.post_ops_, bsp, esp); } - if (brg.is_bf16_emu) - bf16_emu_ = utils::make_unique(this, emu_reserv_1, - emu_reserv_2, emu_reserv_3, emu_scratch, emu_reserv_4, - emu_reserv_4); - if (brg.is_fp8_via_convert() - && utils::one_of( - data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d)) - f8_e5m2_emulator_ = utils::make_unique(this, - emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_mask, - emu_scratch); - if (brg.is_fp8_via_convert() - && utils::one_of( - data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d)) - f8_e4m3_emulator_ = utils::make_unique(this, - emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_reserv_4, - emu_reserv_5, emu_scratch); const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); // per_oc: conv: 1 << 0, (1 << 1) + (1 << 0) (with groups) @@ -448,14 +471,15 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { data_type_t inp_dt_; data_type_t out_dt_; data_type_t bia_dt_; + // TODO: get rid of this map because it requires updates with every new isa static constexpr cpu_isa_t po_isa_t = utils::map(isa, 
avx512_core, avx2, avx2, avx2_vnni, avx2, avx2_vnni_2, avx2_vnni_2, avx512_core_fp16, - avx512_core_fp16); + avx512_core_fp16, avx10_1_512_amx_fp16, avx512_core_fp16); std::unique_ptr> postops_injector_; std::unique_ptr bf16_emu_; - std::unique_ptr f8_e5m2_emulator_; - std::unique_ptr f8_e4m3_emulator_; + std::unique_ptr f8_e5m2_emulator_; + std::unique_ptr f8_e4m3_emulator_; const bool with_binary_non_scalar_bcast_; From 21420dd94a122bf10b88d275413caa99f0188611 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 2 May 2024 15:54:34 -0700 Subject: [PATCH 116/187] common: add arrays conversion functions for fp8 --- src/common/float8.cpp | 57 +++++++++++++++++++++++++++++++++++++ src/common/float8.hpp | 13 +++++++++ src/common/type_helpers.hpp | 36 +++++++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/src/common/float8.cpp b/src/common/float8.cpp index 9945817870c..43147cbc4d6 100644 --- a/src/common/float8.cpp +++ b/src/common/float8.cpp @@ -17,6 +17,7 @@ #include #include "common/bit_cast.hpp" +#include "common/dnnl_thread.hpp" #include "common/float16.hpp" #include "common/float8.hpp" #include "common/utils.hpp" @@ -169,5 +170,61 @@ float8_e4m3_t::operator float16_t() const { return utils::bit_cast(u16); } +void cvt_f8_e5m2_to_float(float *out, const float8_e5m2_t *inp, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = inp[i]; +} + +void cvt_f8_e4m3_to_float(float *out, const float8_e4m3_t *inp, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = inp[i]; +} + +void cvt_float_to_f8_e5m2(float8_e5m2_t *out, const float *inp, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = static_cast(inp[i]); +} + +void cvt_float_to_f8_e4m3(float8_e4m3_t 
*out, const float *inp, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = static_cast(inp[i]); +} + +void add_floats_and_cvt_to_f8_e5m2(float8_e5m2_t *out, const float *inp0, + const float *inp1, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = static_cast(inp0[i] + inp1[i]); +} + +void add_floats_and_cvt_to_f8_e4m3(float8_e4m3_t *out, const float *inp0, + const float *inp1, size_t nelems) { + + // TODO: implement and use jit conversion kernel for DNNL_X64 + + PRAGMA_OMP_SIMD() + for (size_t i = 0; i < nelems; ++i) + out[i] = static_cast(inp0[i] + inp1[i]); +} + } // namespace impl } // namespace dnnl diff --git a/src/common/float8.hpp b/src/common/float8.hpp index eb2217887ec..754dcb89520 100644 --- a/src/common/float8.hpp +++ b/src/common/float8.hpp @@ -62,6 +62,19 @@ struct float8_e4m3_t { return *this; } }; + +void cvt_f8_e5m2_to_float(float *out, const float8_e5m2_t *inp, size_t nelems); +void cvt_f8_e4m3_to_float(float *out, const float8_e4m3_t *inp, size_t nelems); +void cvt_float_to_f8_e5m2(float8_e5m2_t *out, const float *inp, size_t nelems); +void cvt_float_to_f8_e4m3(float8_e4m3_t *out, const float *inp, size_t nelems); + +// performs element-by-element sum of inp and add float arrays and stores +// result to f8 out array with down-conversion +void add_floats_and_cvt_to_f8_e5m2(float8_e5m2_t *out, const float *inp0, + const float *inp1, size_t nelems); +void add_floats_and_cvt_to_f8_e4m3(float8_e4m3_t *out, const float *inp0, + const float *inp1, size_t nelems); + static_assert(sizeof(float8_e5m2_t) == 1, "float8_e4m3_t must be 1 byte"); #if DNNL_X64 diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index 4779c331890..5473d02c2bd 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -393,6 +393,30 @@ 
inline void cvt_to_float( cvt_float16_to_float(out, inp, nelems); } +template <> +inline void cvt_to_float( + float *out, const float8_e5m2_t *inp, size_t nelems) { + cvt_f8_e5m2_to_float(out, inp, nelems); +} + +template <> +inline void cvt_from_float( + float8_e5m2_t *out, const float *inp, size_t nelems) { + cvt_float_to_f8_e5m2(out, inp, nelems); +} + +template <> +inline void cvt_to_float( + float *out, const float8_e4m3_t *inp, size_t nelems) { + cvt_f8_e4m3_to_float(out, inp, nelems); +} + +template <> +inline void cvt_from_float( + float8_e4m3_t *out, const float *inp, size_t nelems) { + cvt_float_to_f8_e4m3(out, inp, nelems); +} + inline void cvt_from_float( data_type_t dt, void *out, const float *inp, size_t nelems) { switch (dt) { @@ -402,6 +426,12 @@ inline void cvt_from_float( case data_type::f16: cvt_from_float((float16_t *)out, inp, nelems); break; + case data_type::f8_e5m2: + cvt_from_float((float8_e5m2_t *)out, inp, nelems); + break; + case data_type::f8_e4m3: + cvt_from_float((float8_e4m3_t *)out, inp, nelems); + break; default: assert(!"unimplemented"); } } @@ -415,6 +445,12 @@ inline void cvt_to_float( case data_type::f16: cvt_to_float(out, (const float16_t *)inp, nelems); break; + case data_type::f8_e5m2: + cvt_to_float(out, (const float8_e5m2_t *)inp, nelems); + break; + case data_type::f8_e4m3: + cvt_to_float(out, (const float8_e4m3_t *)inp, nelems); + break; default: assert(!"unimplemented"); } } From 34de8bfb1514f0239a5c6966d1fda1939019be9a Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 15 May 2024 13:46:43 -0700 Subject: [PATCH 117/187] x64: jit_diff_wei_trans_to_vnni_t: updates to support fp8 --- .../x64/jit_avx512_core_amx_convolution.cpp | 4 +- src/cpu/x64/jit_brgemm_conv_bwd_w.cpp | 2 +- src/cpu/x64/jit_transpose_utils.cpp | 270 +++++++++++------- src/cpu/x64/jit_transpose_utils.hpp | 9 +- 4 files changed, 180 insertions(+), 105 deletions(-) diff --git a/src/cpu/x64/jit_avx512_core_amx_convolution.cpp 
b/src/cpu/x64/jit_avx512_core_amx_convolution.cpp index 889eac6cc58..2377ca8bf47 100644 --- a/src/cpu/x64/jit_avx512_core_amx_convolution.cpp +++ b/src/cpu/x64/jit_avx512_core_amx_convolution.cpp @@ -869,8 +869,8 @@ status_t jit_avx512_core_amx_convolution_bwd_weights_t::init(engine_t *engine) { } if (j.transform_to_vnni) { CHECK(safe_ptr_assign(diff_wei_trans_kernel_, - new jit_diff_wei_trans_to_vnni_t( - j.wei_dt, j.kd, j.kh, j.kw, j.ic_block, j.oc_block))); + new jit_diff_wei_trans_to_vnni_t(j.wei_dt, j.kd, j.kh, j.kw, + j.ic_block, j.oc_block, j.nb_ic))); CHECK(diff_wei_trans_kernel_->create_kernel()); } return status::success; diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp index 9b937fde0b2..1fbd599121b 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp @@ -258,7 +258,7 @@ status_t brgemm_convolution_bwd_weights_t::init(engine_t *engine) { if (jcp.transform_to_vnni) { CHECK(safe_ptr_assign(diff_wei_trans_kernel_, new jit_diff_wei_trans_to_vnni_t(jcp.wei_dt, jcp.kd, jcp.kh, - jcp.kw, jcp.ic_block, jcp.oc_block))); + jcp.kw, jcp.ic_block, jcp.oc_block, jcp.nb_ic))); CHECK(diff_wei_trans_kernel_->create_kernel()); } diff --git a/src/cpu/x64/jit_transpose_utils.cpp b/src/cpu/x64/jit_transpose_utils.cpp index 63bcc9cf91e..c21fdfa6bb2 100644 --- a/src/cpu/x64/jit_transpose_utils.cpp +++ b/src/cpu/x64/jit_transpose_utils.cpp @@ -21,6 +21,7 @@ #include "cpu/x64/cpu_barrier.hpp" #include "cpu/x64/jit_generator.hpp" +#include "cpu/x64/jit_avx512_core_fp8cvt.hpp" #include "cpu/x64/jit_transpose_utils.hpp" namespace dnnl { @@ -944,29 +945,45 @@ void jit_transpose4x16_src::generate() { void jit_diff_wei_trans_to_vnni_t::generate() { /* Reorder part of F32 weights tensor - from [2I][kd][kh][kw][16i][16o] to VNNI format [kd][kh][kw][16i][16o][2i] - and downconvert it to Bfloat16. 
*/ - const int typesize_out = 2; - const int typesize_acc = 4; + from [VNNI_GRANULARITY][I][kd][kh][kw][16i][16o] to VNNI format [kd][kh][kw][16i][16o][VNNI_GRANULARITY][i] + and down-convert it to required float. */ + const int ts_out = types::data_type_size(out_dt_); + const int ts_inp = 4; const int simd_w = 16; - using reg64_t = const Xbyak::Reg64; - const reg64_t ®_output = r15; - const reg64_t &org_reg_output = r14; - const reg64_t ®_input = r13; - const reg64_t ®_input_1 = r12; - const reg64_t &org_reg_input_1 = r11; - const reg64_t ®_input_2 = r10; - const reg64_t ®_prm_table = r9; - const reg64_t ®_last_ic_block = rax; - const reg64_t ®_kd = rsi; - const reg64_t ®_kh = abi_not_param1; - const reg64_t ®_tmp = rdx; - - const Xbyak::Zmm &zmm_idx = Xbyak::Zmm(31); - auto get_zmm_src_0 = [&](int ic) { return Xbyak::Zmm(ic); }; - auto get_zmm_src_1 = [&](int ic) { return Xbyak::Zmm(4 + ic); }; - auto get_zmm_bf16 = [&](int ic) { return Xbyak::Zmm(8 + ic); }; + const Reg64 ®_output = r15; + const Reg64 ®_output_kd = r14; + const Reg64 ®_input_kw = r13; + const Reg64 ®_input_kh = r12; + const Reg64 ®_input_kd = r11; + const Reg64 ®_prm_table = r9; + const Reg64 ®_last_ic_block = rax; + const Reg64 ®_kd = rsi; + const Reg64 ®_kh = abi_not_param1; + const Reg64 ®_tmp = rdx; + + Zmm emu_reserv_1 = Zmm(30); + Zmm emu_reserv_2 = Zmm(29); + Zmm emu_reserv_3 = Zmm(28); + Zmm emu_reserv_4 = Zmm(27); + Zmm emu_reserv_5 = Zmm(26); + Reg64 emu_scratch = reg_tmp; + Xbyak::Opmask emu_mask = Xbyak::Opmask(4); + + std::unique_ptr f8_emu; + if (out_dt_ == data_type::f8_e5m2) + f8_emu = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_mask, emu_scratch); + else if (out_dt_ == data_type::f8_e4m3) + f8_emu = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_reserv_4, emu_reserv_5, + emu_scratch); + + const Zmm &zmm_idx = Zmm(31); + auto get_zmm_src = [&](int idx, int ic) { return Zmm(4 * idx + ic); }; + auto get_zmm_bf16 = [&](int ic) { 
return Zmm(16 + ic); }; + + const int vnni_granularity = data_type_vnni_granularity(out_dt_); Xbyak::Label prm_table, zero_buffer; Xbyak::Label kd_loop_label, kh_loop_label; @@ -974,105 +991,145 @@ void jit_diff_wei_trans_to_vnni_t::generate() { preamble(); mov(reg_last_ic_block, ptr[abi_param1 + GET_OFF(last_ic_block)]); - mov(org_reg_input_1, ptr[abi_param1 + GET_OFF(src)]); - mov(org_reg_output, ptr[abi_param1 + GET_OFF(dst)]); + mov(reg_input_kd, ptr[abi_param1 + GET_OFF(src)]); + mov(reg_output_kd, ptr[abi_param1 + GET_OFF(dst)]); mov(reg_prm_table, prm_table); vmovups(zmm_idx, ptr[reg_prm_table]); + dim_t inp_kw_offset = (dim_t)ts_inp * ic_block_ * oc_block_; + dim_t inp_bc_offset = inp_kw_offset * kd_ * kh_ * kw_; + dim_t out_kw_offset + = (dim_t)ts_out * ic_block_ * oc_block_ * vnni_granularity; + xor_(reg_kd, reg_kd); L(kd_loop_label); { - mov(reg_output, org_reg_output); - mov(reg_input_1, org_reg_input_1); + mov(reg_output, reg_output_kd); + mov(reg_input_kh, reg_input_kd); xor_(reg_kh, reg_kh); L(kh_loop_label); { for (int kw = 0; kw < kw_; kw++) { - Xbyak::Label last_ic_label, done_ic_label; - - dim_t out_offset - = (dim_t)typesize_out * kw * ic_block_ * oc_block_ * 2; - dim_t inp_1_offset - = (dim_t)typesize_acc * kw * ic_block_ * oc_block_; - dim_t inp_2_offset = (dim_t)typesize_acc - * (kd_ * kh_ * kw_ * ic_block_ * oc_block_ - + kw * ic_block_ * oc_block_); - - cmp(reg_last_ic_block, 0); - jne(last_ic_label, T_NEAR); - - mov(reg_input_2, reg_input_1); - safe_add(reg_input_2, inp_2_offset, reg_tmp); - jmp(done_ic_label, T_NEAR); - - L(last_ic_label); - mov(reg_input_2, zero_buffer); - - L(done_ic_label); - - for (int ocb = 0; ocb < oc_block_; ocb += simd_w) { - int ic_count = 0; - for (int bc = 0; bc < 2; bc++) { - if (!bc) { - mov(reg_input, reg_input_1); - safe_add(reg_input, inp_1_offset, reg_tmp); + for (int bc = 0; bc < vnni_granularity; bc++) { + Xbyak::Label last_ic_label, done_ic_label; + + cmp(reg_last_ic_block, 0); + jne(last_ic_label, 
T_NEAR); + { + mov(reg_input_kw, reg_input_kh); + safe_add(reg_input_kw, + bc * inp_bc_offset + kw * inp_kw_offset, + reg_tmp); + jmp(done_ic_label, T_NEAR); + } + L(last_ic_label); + { + if (bc < (nb_ic_ % vnni_granularity)) { + mov(reg_input_kw, reg_input_kh); + safe_add(reg_input_kw, + bc * inp_bc_offset + kw * inp_kw_offset, + reg_tmp); } else - mov(reg_input, reg_input_2); - - for (int ic = 0; ic < ic_block_ / 2; ic++) { - auto zmm_src_0 = get_zmm_src_0(ic); - auto zmm_src_1 = get_zmm_src_1(ic); - auto zmm_out = get_zmm_bf16(ic); - - vmovups(zmm_src_0, - ptr[reg_input - + typesize_acc - * ((2 * ic + 0) * oc_block_ - + ocb)]); - vmovups(zmm_src_1, - ptr[reg_input - + typesize_acc - * ((2 * ic + 1) * oc_block_ - + ocb)]); + mov(reg_input_kw, zero_buffer); + } + L(done_ic_label); + + for_(int ocb = 0; ocb < oc_block_; ocb += simd_w) + for (int icc = 0; icc < ic_block_ / vnni_granularity; + icc++) { + int ic_count + = bc * (ic_block_ / vnni_granularity) + icc; + + auto zmm_out = get_zmm_bf16(icc); + + for (int idx = 0; idx < vnni_granularity; idx++) { + auto zmm_src = get_zmm_src(idx, icc); + const auto src_offset = ts_inp + * ((vnni_granularity * icc + idx) + * oc_block_ + + ocb); + vmovups(zmm_src, ptr[reg_input_kw + src_offset]); + } + const auto src_offset = ts_inp + * ((vnni_granularity * icc) * oc_block_ + ocb); + + if (one_of(out_dt_, data_type::bf16, data_type::f16)) { + const auto zmm_src_0 = get_zmm_src(0, icc); + const auto zmm_src_1 = get_zmm_src(1, icc); + const auto src_off0 = src_offset; + const auto src_off1 = src_off0 + ts_inp * oc_block_; + vmovups(zmm_src_0, ptr[reg_input_kw + src_off0]); + vmovups(zmm_src_1, ptr[reg_input_kw + src_off1]); if (out_dt_ == data_type::bf16) { vcvtne2ps2bf16(zmm_out, zmm_src_1, zmm_src_0); } else if (out_dt_ == data_type::f16) { - vcvtps2phx(Ymm(zmm_src_0.getIdx()), zmm_src_0); - vcvtps2phx(Ymm(zmm_src_1.getIdx()), zmm_src_1); - vinsertf32x8(zmm_out, zmm_src_0, - Ymm(zmm_src_1.getIdx()), 1); - } else { - 
assert(!"unsupported data type"); + Ymm ymm_src_0(zmm_src_0.getIdx()); + Ymm ymm_src_1(zmm_src_1.getIdx()); + vcvtps2phx(ymm_src_0, zmm_src_0); + vcvtps2phx(ymm_src_1, zmm_src_1); + vinsertf32x8(zmm_out, zmm_src_0, ymm_src_1, 1); } vpermw(zmm_out, zmm_idx, zmm_out); - - vmovups(ptr[reg_output + out_offset - + typesize_out - * (ic_count * oc_block_ * 2 - + ocb * 2)], - zmm_out); - ic_count++; + } else if (one_of(out_dt_, data_type::f8_e5m2, + data_type::f8_e4m3)) { + const auto zmm_src_0 = get_zmm_src(0, icc); + const auto zmm_src_1 = get_zmm_src(1, icc); + const auto zmm_src_2 = get_zmm_src(2, icc); + const auto zmm_src_3 = get_zmm_src(3, icc); + Xmm xmm_src_0(zmm_src_0.getIdx()); + Xmm xmm_src_1(zmm_src_1.getIdx()); + Xmm xmm_src_2(zmm_src_2.getIdx()); + Xmm xmm_src_3(zmm_src_3.getIdx()); + + const auto src_off0 = src_offset; + const auto src_off1 = src_off0 + ts_inp * oc_block_; + const auto src_off2 = src_off1 + ts_inp * oc_block_; + const auto src_off3 = src_off2 + ts_inp * oc_block_; + + f8_emu->vcvt_f32_to_f8( + xmm_src_0, ptr[reg_input_kw + src_off0]); + f8_emu->vcvt_f32_to_f8( + xmm_src_1, ptr[reg_input_kw + src_off1]); + f8_emu->vcvt_f32_to_f8( + xmm_src_2, ptr[reg_input_kw + src_off2]); + f8_emu->vcvt_f32_to_f8( + xmm_src_3, ptr[reg_input_kw + src_off3]); + vinserti64x2(zmm_out, zmm_out, xmm_src_0, 0); + vinserti64x2(zmm_out, zmm_out, xmm_src_1, 1); + vinserti64x2(zmm_out, zmm_out, xmm_src_2, 2); + vinserti64x2(zmm_out, zmm_out, xmm_src_3, 3); + vpermb(zmm_out, zmm_idx, zmm_out); + } else { + assert(!"unsupported data type"); } + + vmovups(ptr[reg_output + kw * out_kw_offset + + ts_out + * (ic_count * oc_block_ + * vnni_granularity + + ocb * vnni_granularity)], + zmm_out); } } } safe_add(reg_output, - (dim_t)typesize_out * kw_ * 2 * ic_block_ * oc_block_, + (dim_t)ts_out * kw_ * vnni_granularity * ic_block_ + * oc_block_, + reg_tmp); + safe_add(reg_input_kh, (dim_t)ts_inp * kw_ * ic_block_ * oc_block_, reg_tmp); - safe_add(reg_input_1, - 
(dim_t)typesize_acc * kw_ * ic_block_ * oc_block_, reg_tmp); add(reg_kh, 1); cmp(reg_kh, kh_); jl(kh_loop_label, T_NEAR); } - safe_add(org_reg_output, - (dim_t)typesize_out * kh_ * kw_ * 2 * ic_block_ * oc_block_, - reg_tmp); - safe_add(org_reg_input_1, - (dim_t)typesize_acc * kh_ * kw_ * ic_block_ * oc_block_, + safe_add(reg_output_kd, + (dim_t)ts_out * kh_ * kw_ * vnni_granularity * ic_block_ + * oc_block_, reg_tmp); + safe_add(reg_input_kd, + (dim_t)ts_inp * kh_ * kw_ * ic_block_ * oc_block_, reg_tmp); add(reg_kd, 1); cmp(reg_kd, kd_); @@ -1082,18 +1139,35 @@ void jit_diff_wei_trans_to_vnni_t::generate() { postamble(); align(64); - L(prm_table); - const uint16_t prm_array[32] - = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, - 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}; - for (size_t i = 0; i < 32; ++i) - dw(prm_array[i]); + if (one_of(out_dt_, data_type::f8_e5m2, data_type::f8_e4m3)) { + + L(prm_table); + uint8_t prm_array[64]; + for (size_t i = 0; i < 16; i++) { + prm_array[4 * i] = i; + prm_array[4 * i + 1] = i + 16; + prm_array[4 * i + 2] = i + 32; + prm_array[4 * i + 3] = i + 48; + } + + for (size_t i = 0; i < 64; ++i) + db(prm_array[i]); + } else { + L(prm_table); + const uint16_t prm_array[32] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, + 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, + 14, 30, 15, 31}; + for (size_t i = 0; i < 32; ++i) + dw(prm_array[i]); + } align(64); L(zero_buffer); const uint16_t zero = 0; - for (int i = 0; i < typesize_acc * oc_block_ * ic_block_; ++i) + for (int i = 0; i < ts_inp * oc_block_ * ic_block_; ++i) db(zero); + + if (f8_emu) f8_emu->prepare_table(); } #undef GET_OFF diff --git a/src/cpu/x64/jit_transpose_utils.hpp b/src/cpu/x64/jit_transpose_utils.hpp index 899e4051d99..d3c99a79609 100644 --- a/src/cpu/x64/jit_transpose_utils.hpp +++ b/src/cpu/x64/jit_transpose_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 
2017-2022 Intel Corporation +* Copyright 2017-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,14 +122,15 @@ struct jit_diff_wei_trans_to_vnni_t : public jit_generator { jit_diff_wei_trans_to_vnni_t(const data_type_t dt, const int &kd, const int &kh, const int &kw, const int &ic_block, - const int &oc_block) + const int &oc_block, const int nb_ic) : jit_generator(jit_name()) , out_dt_(dt) , kd_(kd) , kh_(kh) , kw_(kw) , ic_block_(ic_block) - , oc_block_(oc_block) {} + , oc_block_(oc_block) + , nb_ic_(nb_ic) {} ~jit_diff_wei_trans_to_vnni_t() {} @@ -137,7 +138,7 @@ struct jit_diff_wei_trans_to_vnni_t : public jit_generator { const data_type_t out_dt_; const int kd_, kh_, kw_; - const int ic_block_, oc_block_; + const int ic_block_, oc_block_, nb_ic_; private: void generate() override; From a955fd98f3b49ddfe569aadd89e48146f3e20f5a Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 2 May 2024 10:57:49 -0700 Subject: [PATCH 118/187] x64: jit_trans_iw_ic_t and jit_trans_ow_oc_t: update for fp8 --- src/cpu/x64/jit_brgemm_conv_bwd_w.cpp | 2 + src/cpu/x64/jit_transpose_utils.cpp | 917 +++++++++++++++++--------- 2 files changed, 620 insertions(+), 299 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp index 1fbd599121b..93b084e76c9 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp @@ -158,6 +158,8 @@ status_t brgemm_convolution_bwd_weights_t::pd_t::init(engine_t *engine) { void brgemm_convolution_bwd_weights_t::pd_t::copy2jit_jcp() { jit_jcp_ = zero(); jit_jcp_.prop_kind = jcp_.prop_kind; + jit_jcp_.src_dt = jcp_.src_dt; + jit_jcp_.dst_dt = jcp_.dst_dt; jit_jcp_.has_vnni = true; // Needed for transpose routines jit_jcp_.harness = jcp_.harness; jit_jcp_.simd_w = jcp_.simd_w; diff --git a/src/cpu/x64/jit_transpose_utils.cpp 
b/src/cpu/x64/jit_transpose_utils.cpp index c21fdfa6bb2..c4234f02736 100644 --- a/src/cpu/x64/jit_transpose_utils.cpp +++ b/src/cpu/x64/jit_transpose_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2023 Intel Corporation +* Copyright 2017-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,172 +34,206 @@ using namespace Xbyak; #define GET_OFF(x) offsetof(ctx_t, x) -struct jit_trans_iw_ic_int16_t : public jit_trans_src_t, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_trans_iw_ic_int16_t) - jit_trans_iw_ic_int16_t(const jit_conv_conf_t *conf) - : jit_trans_src_t(conf), jit_generator(jit_name()) {} +struct jit_trans_iw_ic_t : public jit_trans_src_t, public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_trans_iw_ic_t) + jit_trans_iw_ic_t(const jit_conv_conf_t *conf) + : jit_trans_src_t(conf) + , jit_generator(jit_name()) + , typesize(conf->src_dt == data_type::undef + ? 
2 + : types::data_type_size(conf->src_dt)) + , is_layout_nxc(utils::one_of(conf_->src_tag, format_tag::ndhwc, + format_tag::nhwc, format_tag::nwc)) {} void operator()(ctx_t *ctx) override { jit_generator::operator()(ctx); } status_t create_kernel() override { return jit_generator::create_kernel(); } private: - using reg64_t = const Xbyak::Reg64; - using reg32_t = const Xbyak::Reg32; - using opmask_t = const Xbyak::Opmask; - - enum { - typesize = sizeof(int16_t), - transpose_size = 16, - small_spatial = 14 - }; + int typesize = 0; + bool is_layout_nxc = false; + static constexpr int transpose_size = 16; size_t src_stride = 0, tr_src_stride = 0; - int tail = 0; - bool enable_prefetch = false; - - opmask_t kFFFF = k1; - opmask_t k5555 = k2; - opmask_t kAAAA = k3; - opmask_t kAA = k4; - opmask_t k55 = k5; - opmask_t kCC = k6; - opmask_t k33 = k7; - opmask_t kTail = k1; - - reg64_t reg_src = r8; - reg64_t reg_tr_src = r9; - reg64_t reg_src_prf = r10; - reg64_t reg_tr_src_prf = r11; - reg64_t reg_loop = r12; - reg64_t reg_tr_src_tmp = r13; - reg32_t regw_tmp = r14d; - reg64_t imm_addr64 = rbx; - - Xbyak::Zmm vidx1 = zmm31; - Xbyak::Zmm vidx2 = zmm30; - Xbyak::Zmm vidx3 = zmm29; - Xbyak::Zmm vidx4 = zmm28; - Xbyak::Zmm vidx5 = zmm27; - Xbyak::Zmm zmm_tmp = zmm26; + + Opmask kLoadMask1 = k1; + Opmask kLoadMask2 = k2; + Opmask kPerm1 = k3; + Opmask kPerm2 = k4; + Opmask kTail = k5; + Opmask kLPad = k6; + Opmask kRPad = k7; + + Reg64 reg_src = r8; + Reg64 reg_tr_src = r9; + Reg64 reg_src_prf = r10; + Reg64 reg_tr_src_prf = r11; + Reg64 reg_loop = r12; + Reg64 reg_tr_src_tmp = r13; + Reg32 regw_tmp = r14d; + Reg64 imm_addr64 = rbx; + + Zmm vidx1 = zmm31; + Zmm vidx2 = zmm30; + Zmm vidx3 = zmm29; + Zmm vidx4 = zmm28; + Zmm vidx5 = zmm27; + Zmm zmm_tmp = zmm26; + Zmm zmm_zero = zmm25; + + void kmovw(Opmask k, unsigned w) { + mov(regw_tmp, w); + jit_generator::kmovw(k, regw_tmp); + } + void kmovd(Opmask k, unsigned w) { + mov(regw_tmp, w); + jit_generator::kmovd(k, regw_tmp); + 
} + Zmm src_zmm(int i) { return Zmm(i); } + Ymm src_ymm(int i) { + assert(i >= 0 && i < 16); + return Ymm(i); + } + Xmm src_xmm(int i) { + assert(i >= 0 && i < 16); + return Xmm(i); + } + void vmovdqa64(Zmm z, const int64_t *addr) { + mov(imm_addr64, reinterpret_cast(addr)); + jit_generator::vmovdqa64(z, ptr[imm_addr64]); + } + + void vmovdqa32(Zmm z, const int32_t *addr) { + mov(imm_addr64, reinterpret_cast(addr)); + jit_generator::vmovdqa32(z, ptr[imm_addr64]); + } void transpose(int nrows, int l_pad, int r_pad, bool nontemporal_stores); + void transpose_2b(int nrows, int l_pad, int r_pad, bool nontemporal_stores); + void transpose_1b(int nrows, int l_pad, int r_pad, bool nontemporal_stores); void generate() override; }; -void jit_trans_iw_ic_int16_t::transpose( +void jit_trans_iw_ic_t::transpose( int nrows, int l_pad, int r_pad, bool nontemporal_stores) { assert(nrows >= 0 && nrows <= transpose_size); static_assert(transpose_size == 16, "Unsupported transpose size"); if (!nrows) return; - auto src_zmm = [](int i) { return Zmm(i); }; - - auto src_ymm = [](int i) { - assert(i >= 0 && i < 16); - return Ymm(i); - }; + if (typesize == 2) + transpose_2b(nrows, l_pad, r_pad, nontemporal_stores); + else if (typesize == 1) + transpose_1b(nrows, l_pad, r_pad, nontemporal_stores); + else + assert(!"unsupported data type"); +} - auto load_ymm = [this, src_ymm](int i) { +void jit_trans_iw_ic_t::transpose_2b( + int nrows, int l_pad, int r_pad, bool nontemporal_stores) { + auto load_ymm = [this](int i) { vmovups(src_ymm(i), EVEX_compress_addr(reg_src, i * src_stride)); }; - auto kmovw = [this](Opmask k, unsigned w) { - mov(regw_tmp, w); - jit_generator::kmovw(k, regw_tmp); - }; - auto kmovd = [this](Opmask k, unsigned w) { mov(regw_tmp, w); jit_generator::kmovd(k, regw_tmp); }; + int l_pad_tail {0}, l_pad_rows {0}; + int r_pad_tail {0}, r_pad_rows {0}; + + if (l_pad > 0) { + int store_pad = 2 * transpose_size; + l_pad_rows = l_pad / store_pad; + l_pad_tail = div_up(l_pad % 
store_pad, 2); + kmovw(kLPad, (1 << l_pad_tail) - 1); + } + if (r_pad > 0) { + int store_pad = div_up(r_pad, 2); + r_pad_rows = store_pad / transpose_size; + r_pad_tail = store_pad % transpose_size; + kmovw(kRPad, (1 << r_pad_tail) - 1); + } - auto store = [&](Zmm r, int i) { - auto padding - = [this, i, kmovw](Reg64 base, int pad_rows, int pad_tail) { - // note: pad can be bigger than 16 because of dilation - const size_t row_offset = 2 * transpose_size * typesize; - auto zmm_zero = zmm_tmp; - vpxord(zmm_zero, zmm_zero, zmm_zero); - for (int i_row = 0; i_row < pad_rows; i_row++) { - auto addr = EVEX_compress_addr( - base, i * tr_src_stride + i_row * row_offset); - vmovups(addr, zmm_zero); - } - if (pad_tail > 0) { - kmovw(kTail, (1 << pad_tail) - 1); - base.setOpmaskIdx(kTail.getIdx(), true); - auto addr = EVEX_compress_addr(base, - i * tr_src_stride + pad_rows * row_offset); - vmovups(addr, zmm_zero); - } - }; + auto padding = [this](Reg64 base, int addr_shift, int pad_rows, + int pad_tail, const Opmask &mask, int i) { + // note: pad can be bigger than 16 because of dilation + const size_t row_offset = 2 * transpose_size * typesize; + const auto pshift = addr_shift * typesize + i * tr_src_stride; + for (int i_row = 0; i_row < pad_rows; i_row++) { + auto addr = EVEX_compress_addr(base, pshift + i_row * row_offset); + vmovups(addr, zmm_zero); + } + if (pad_tail > 0) { + base.setOpmaskIdx(mask.getIdx(), true); + auto addr + = EVEX_compress_addr(base, pshift + pad_rows * row_offset); + vmovups(addr, zmm_zero); + } + }; + auto store = [&](Zmm r, int i) { mov(reg_tr_src_tmp, reg_tr_src); if (l_pad > 0) { - int store_pad = 2 * transpose_size; - int pad_rows = l_pad / store_pad; - int tail = l_pad % store_pad; - padding(reg_tr_src_tmp, pad_rows, div_up(tail, 2)); - add(reg_tr_src_tmp, (pad_rows * store_pad + tail) * typesize); + padding(reg_tr_src_tmp, 0, l_pad_rows, l_pad_tail, kLPad, i); + add(reg_tr_src_tmp, l_pad * typesize); } if (r_pad > 0) { int addr_shift = nrows 
- r_pad % 2; - int store_pad = div_up(r_pad, 2); - int pad_rows = store_pad / transpose_size; - add(reg_tr_src_tmp, addr_shift * typesize); - padding(reg_tr_src_tmp, pad_rows, store_pad % transpose_size); - sub(reg_tr_src_tmp, addr_shift * typesize); + padding(reg_tr_src_tmp, addr_shift, r_pad_rows, r_pad_tail, kRPad, + i); } - int store_tail = rnd_up(nrows, 2); - kmovw(kTail, (1 << store_tail / 2) - 1); - auto k = kTail; auto base = reg_tr_src_tmp; - base.setOpmaskIdx(k.getIdx(), true); + base.setOpmaskIdx(kTail.getIdx(), true); auto addr = EVEX_compress_addr(base, i * tr_src_stride); vmovups(addr, r); }; - const bool is_layout_nxc = utils::one_of(conf_->src_tag, format_tag::ndhwc, - format_tag::nhwc, format_tag::nwc); + if (l_pad > 0 || r_pad > 0) vpxord(zmm_zero, zmm_zero, zmm_zero); + int store_tail = rnd_up(nrows, 2); + kmovw(kTail, (1 << store_tail / 2) - 1); + const int ic_block = conf_->ic_block; - const bool is_tail_block = ic_block != 16; + const bool is_short_block = ic_block != 16; const int ic_tail = conf_->ic_tail; // Assertion below as we need vmovdqu16 for ic_tails. // If needed, can be extended by using load_bytes() helper. 
assert(IMPLICATION(ic_tail, mayiuse(avx512_core))); + // load by two rows of src into each even register and permute it if (mayiuse(avx512_core)) { if (conf_->stride_w > 1 || nrows % 2 || is_layout_nxc) - kmovd(kFFFF, (1 << ic_block) - 1); - if (conf_->stride_w > 1 || is_layout_nxc) kmovd(k33, 0xffff0000); - if (is_layout_nxc && conf_->ic_tail) { + kmovd(kLoadMask1, (1 << ic_block) - 1); + if (conf_->stride_w > 1 || is_layout_nxc) kmovd(kLoadMask2, 0xffff0000); + + if (is_layout_nxc && ic_tail) { Label done; cmp(dword[param1 + GET_OFF(ch_work)], ic_block); je(done, T_NEAR); - kmovd(kFFFF, (1 << conf_->ic_tail) - 1); - kshiftld(k33, kFFFF, 16); + kmovd(kLoadMask1, (1 << ic_tail) - 1); + kshiftld(kLoadMask2, kLoadMask1, 16); L(done); } - for (int i = 0; i < nrows / 2; i++) { - auto zmm_src0 = src_zmm(2 * i); + for (int i = 0; i < rnd_dn(nrows, 2); i += 2) { + auto zmm_src0 = src_zmm(i); if (conf_->stride_w == 1 && !is_layout_nxc) { - vmovdqu16(zmm_src0, - EVEX_compress_addr(reg_src, 2 * i * src_stride)); + // load two rows at a time + vmovdqu16( + zmm_src0, EVEX_compress_addr(reg_src, i * src_stride)); } else { - vmovdqu16(zmm_src0 | kFFFF | T_z, - EVEX_compress_addr(reg_src, 2 * i * src_stride)); - if (is_tail_block || ic_tail) { - auto zmm_tmp = src_zmm(2 * i + 1); - vmovdqu16(zmm_tmp | kFFFF | T_z, - EVEX_compress_addr( - reg_src, (2 * i + 1) * src_stride)); - vinsertf64x4(zmm_src0, zmm_src0, src_ymm(2 * i + 1), 1); + // load even row + vmovdqu16(zmm_src0 | kLoadMask1 | T_z, + EVEX_compress_addr(reg_src, i * src_stride)); + // load odd row to the second half of register + if (is_short_block || ic_tail) { + auto zmm_src_tmp = src_zmm(i + 1); + vmovdqu16(zmm_src_tmp | kLoadMask1 | T_z, + EVEX_compress_addr(reg_src, (i + 1) * src_stride)); + vinsertf64x4(zmm_src0, zmm_src0, src_ymm(i + 1), 1); } else { - vmovdqu16(zmm_src0 | k33, + vmovdqu16(zmm_src0 | kLoadMask2, EVEX_compress_addr( - reg_src, (2 * i + 1) * src_stride - 32)); + reg_src, (i + 1) * src_stride - 32)); 
} } vpermw(zmm_src0, vidx5, zmm_src0); @@ -207,37 +241,34 @@ void jit_trans_iw_ic_int16_t::transpose( // for odd numbers we need to mix row with zeroes if (nrows % 2) { - int i = nrows / 2; - auto zmm_src0 = src_zmm(2 * i); - vmovdqu16(zmm_src0 | kFFFF | T_z, - EVEX_compress_addr(reg_src, 2 * i * src_stride)); + int i = nrows - 1; + auto zmm_src0 = src_zmm(i); + vmovdqu16(zmm_src0 | kLoadMask1 | T_z, + EVEX_compress_addr(reg_src, i * src_stride)); vpermw(zmm_src0, vidx5, zmm_src0); } - if (conf_->stride_w > 1 || is_layout_nxc) kmovw(k33, 0x33); - for (int i = rnd_up(nrows, 2); i < 16; i += 2) { vpxord(src_zmm(i), src_zmm(i), src_zmm(i)); } } else { - kmovw(kFFFF, 0xffff); // all loads for (int i = 0; i < 16; i++) { vpxord(src_zmm(i), src_zmm(i), src_zmm(i)); } - for (int i = 0; i < nrows / 2; i++) { - auto src0 = src_ymm(2 * i); - auto src1 = src_ymm(2 * i + 1); - auto zmm_src0 = src_zmm(2 * i); - load_ymm(2 * i); + for (int i = 0; i < rnd_dn(nrows, 2); i += 2) { + auto src0 = src_ymm(i); + auto src1 = src_ymm(i + 1); + auto zmm_src0 = src_zmm(i); + load_ymm(i); vpunpcklwd(src1, src0, - EVEX_compress_addr(reg_src, (2 * i + 1) * src_stride)); + EVEX_compress_addr(reg_src, (i + 1) * src_stride)); vpunpckhwd(src0, src0, - EVEX_compress_addr(reg_src, (2 * i + 1) * src_stride)); + EVEX_compress_addr(reg_src, (i + 1) * src_stride)); vinserti64x4(zmm_src0, zmm_src0, src1, 1); - vpermps(zmm_src0 | kFFFF, vidx4, zmm_src0); + vpermps(zmm_src0 | kLoadMask1, vidx4, zmm_src0); } // for odd numbers we need to mix row with zeroes @@ -258,72 +289,81 @@ void jit_trans_iw_ic_int16_t::transpose( vinserti64x4(zmm_tmp, zmm_tmp, src1, 1); vpxord(zmm_src0, zmm_src0, zmm_src0); vmovups(zmm_src0, zmm_tmp); - vpermps(zmm_src0 | kFFFF, vidx4, zmm_src0); + vpermps(zmm_src0 | kLoadMask1, vidx4, zmm_src0); } } + kmovw(kPerm1, 0x5555); + kmovw(kPerm2, 0xaaaa); // swap 1 - for (int i = 0; i < 4; i++) { - auto zmm0 = src_zmm(4 * i); - auto zmm1 = src_zmm(4 * i + 2); - auto tmp0 = src_zmm(4 * i + 
1); - auto tmp1 = src_zmm(4 * i + 3); + for (int i = 0; i < 16; i += 4) { + auto zmm0 = src_zmm(i); + auto zmm1 = src_zmm(i + 2); + auto tmp0 = src_zmm(i + 1); + auto tmp1 = src_zmm(i + 3); vmovups(tmp0, zmm0); vmovups(tmp1, zmm1); - vpermps(tmp0 | kAAAA, vidx3, zmm1); - vpermps(tmp1 | k5555, vidx3, zmm0); + vpermps(tmp0 | kPerm2, vidx3, zmm1); + vpermps(tmp1 | kPerm1, vidx3, zmm0); } // swap 2 int base_idx; base_idx = 0; - for (int i = 0; i < 2; i++) { - auto zmm0 = src_zmm(base_idx + 2 * i + 1); - auto zmm1 = src_zmm(base_idx + 2 * i + 5); - auto tmp0 = src_zmm(base_idx + 2 * i); - auto tmp1 = src_zmm(base_idx + 2 * i + 4); + kmovw(kPerm1, 0xaa); + kmovw(kPerm2, 0x55); + + for (int i = 0; i < 4; i += 2) { + auto zmm0 = src_zmm(base_idx + i + 1); + auto zmm1 = src_zmm(base_idx + i + 5); + + auto tmp0 = src_zmm(base_idx + i); + auto tmp1 = src_zmm(base_idx + i + 4); vmovupd(tmp0, zmm0); vmovupd(tmp1, zmm1); - vpermpd(tmp0 | kAA, vidx2, zmm1); - vpermpd(tmp1 | k55, vidx2, zmm0); + vpermpd(tmp0 | kPerm1, vidx2, zmm1); + vpermpd(tmp1 | kPerm2, vidx2, zmm0); } base_idx = 8; - for (int i = 0; i < 2; i++) { - auto zmm0 = src_zmm(base_idx + 2 * i + 1); - auto zmm1 = src_zmm(base_idx + 2 * i + 5); + for (int i = 0; i < 4; i += 2) { + auto zmm0 = src_zmm(base_idx + i + 1); + auto zmm1 = src_zmm(base_idx + i + 5); - auto tmp0 = src_zmm(base_idx + 2 * i); - auto tmp1 = src_zmm(base_idx + 2 * i + 4); + auto tmp0 = src_zmm(base_idx + i); + auto tmp1 = src_zmm(base_idx + i + 4); vmovupd(tmp0, zmm0); vmovupd(tmp1, zmm1); - vpermpd(tmp0 | kAA, vidx2, zmm1); - vpermpd(tmp1 | k55, vidx2, zmm0); + vpermpd(tmp0 | kPerm1, vidx2, zmm1); + vpermpd(tmp1 | kPerm2, vidx2, zmm0); } + kmovw(kPerm1, 0xcc); + kmovw(kPerm2, 0x33); + // swap 3 - for (int i = 0; i < 4; i++) { - auto zmm0 = src_zmm(2 * i); - auto zmm1 = src_zmm(2 * i + 8); + for (int i = 0; i < 8; i += 2) { + auto zmm0 = src_zmm(i); + auto zmm1 = src_zmm(i + 8); - auto tmp0 = src_zmm(2 * i + 1); - auto tmp1 = src_zmm(2 * i + 9); + 
auto tmp0 = src_zmm(i + 1); + auto tmp1 = src_zmm(i + 9); vmovupd(tmp0, zmm0); vmovupd(tmp1, zmm1); - vpermpd(tmp0 | kCC, vidx1, zmm1); - vpermpd(tmp1 | k33, vidx1, zmm0); + vpermpd(tmp0 | kPerm1, vidx1, zmm1); + vpermpd(tmp1 | kPerm2, vidx1, zmm0); } // all stores - for (int i = 0; i < 8; i++) - vextracti64x4(src_ymm(2 * i), src_zmm(2 * i + 1), 1); + for (int i = 0; i < 16; i += 2) + vextracti64x4(src_ymm(i), src_zmm(i + 1), 1); auto get_vec_idx = [](int ic_idx) { assert(ic_idx < 16 && ic_idx >= 0); @@ -351,24 +391,219 @@ void jit_trans_iw_ic_int16_t::transpose( store(src_zmm(get_vec_idx(ic)), ic); } -void jit_trans_iw_ic_int16_t::generate() { +void jit_trans_iw_ic_t::transpose_1b( + int nrows, int l_pad, int r_pad, bool nontemporal_stores) { + + auto load = [this, nrows](int i) { + auto zmm_src = src_zmm(i); + if (i < nrows) { + auto addr = EVEX_compress_addr(reg_src, i * src_stride); + vmovdqu8(zmm_src | kLoadMask1 | T_z, addr); + } else + vpxord(zmm_src, zmm_src, zmm_src); + }; + + int l_pad_tail {0}, r_pad_tail {0}, l_pad_rows {0}, r_pad_rows {0}; + + if (l_pad > 0) { + l_pad_rows = l_pad / transpose_size; + l_pad_tail = l_pad % transpose_size; + kmovw(kLPad, (1 << l_pad_tail) - 1); + } + if (r_pad > 0) { + r_pad_rows = r_pad / transpose_size; + r_pad_tail = r_pad % transpose_size; + kmovw(kRPad, (1 << r_pad_tail) - 1); + } + + auto padding = [this](Reg64 base, int addr_shift, int pad_rows, + int pad_tail, const Opmask &mask, int i) { + // note: pad can be bigger than 16 because of dilation + const size_t row_off = transpose_size; + auto xmm_zero = Xmm(zmm_zero.getIdx()); + const auto pshift = addr_shift * typesize + i * tr_src_stride; + for (int i_row = 0; i_row < pad_rows; i_row++) { + auto addr = EVEX_compress_addr(base, pshift + i_row * row_off); + vmovups(addr, xmm_zero); + } + if (pad_tail > 0) { + base.setOpmaskIdx(mask.getIdx(), true); + auto addr = EVEX_compress_addr(base, pshift + pad_rows * row_off); + vmovdqu8(addr, xmm_zero); + } + }; + + auto 
store = [&](Zmm r, int i) { + mov(reg_tr_src_tmp, reg_tr_src); + if (l_pad > 0) { + padding(reg_tr_src_tmp, 0, l_pad_rows, l_pad_tail, kLPad, i); + add(reg_tr_src_tmp, l_pad); + } + if (r_pad > 0) { + padding(reg_tr_src_tmp, nrows, r_pad_rows, r_pad_tail, kRPad, i); + } + + auto base = reg_tr_src_tmp; + base.setOpmaskIdx(kTail.getIdx(), true); + + auto addr = EVEX_compress_addr(base, i * tr_src_stride); + vmovdqu8(addr, r); + }; + + if (l_pad > 0 || r_pad > 0) vpxord(zmm_zero, zmm_zero, zmm_zero); + int store_tail = rnd_up(nrows, 4); + kmovw(kTail, (1 << store_tail) - 1); + + // load rows and swap bytes + for (int i = 0; i < nrows; i += 4) { + load(i); + load(i + 1); + load(i + 2); + load(i + 3); + + // concatenate 4 rows + auto zmm_src0 = src_zmm(i); + auto ymm_src0 = src_ymm(i); + auto ymm_src2 = src_ymm(i + 2); + auto xmm_src1 = src_xmm(i + 1); + auto xmm_src3 = src_xmm(i + 3); + vinserti64x2(ymm_src0, ymm_src0, xmm_src1, 1); + vinserti64x2(ymm_src2, ymm_src2, xmm_src3, 1); + vinserti64x4(zmm_src0, zmm_src0, ymm_src2, 1); + + // swap bytes + vpermb(zmm_src0, vidx1, zmm_src0); + } + // zero rest zmm_src + for (int i = rnd_up(nrows, 4); i < transpose_size; i += 4) { + auto zmm_src0 = src_zmm(i); + vpxord(zmm_src0, zmm_src0, zmm_src0); + } + // At this point every fourth zmm contains four transposed lines from src + + // swap doubles + for (int i = 0; i < 2; i++) { + auto idx0 = 8 * i; + auto idx1 = idx0 + 4; + + auto zmm_src0 = src_zmm(idx0); + auto zmm_src1 = src_zmm(idx1); + + auto zmm_tmp0 = src_zmm(idx0 + 1); + auto zmm_tmp1 = src_zmm(idx1 + 1); + + vmovups(zmm_tmp0, vidx2); + vmovups(zmm_tmp1, vidx3); + + vpermi2d(zmm_tmp0, zmm_src0, zmm_src1); + vpermi2d(zmm_tmp1, zmm_src0, zmm_src1); + } + + // swap quads + for (int i = 0; i < 2; i++) { + auto idx0 = 4 * i; + auto idx1 = idx0 + 8; + + auto zmm_src0 = src_zmm(idx0 + 1); + auto zmm_src1 = src_zmm(idx1 + 1); + + auto zmm_tmp0 = src_zmm(idx0); + auto zmm_tmp1 = src_zmm(idx1); + + vmovups(zmm_tmp0, vidx4); + 
vmovups(zmm_tmp1, vidx5); + + vpermi2q(zmm_tmp0, zmm_src0, zmm_src1); + vpermi2q(zmm_tmp1, zmm_src0, zmm_src1); + } + + // extract columns + for (int i = 0; i < 16; i += 4) { + vextracti64x4(src_ymm(i + 2) | T_z, src_zmm(i), 1); + vextracti32x4(src_xmm(i + 1) | T_z, src_zmm(i), 1); + vextracti32x4(src_xmm(i + 3) | T_z, src_ymm(i + 2), 1); + } + + auto get_vec_idx = [](int col_idx) { + assert(col_idx < transpose_size && col_idx >= 0); + + const auto div = col_idx / 4; + const auto mod = col_idx % 4; + + return mod * 4 + div; + }; + + const int ic_block = conf_->ic_block; + for (int col_idx = 0; col_idx < ic_block; col_idx++) { + store(src_zmm(get_vec_idx(col_idx)), col_idx); + } +} + +void jit_trans_iw_ic_t::generate() { preamble(); - alignas(64) static constexpr const int64_t idx1[8] - = {2, 3, 0, 1, 6, 7, 4, 5}; - alignas(64) static constexpr const int64_t idx2[8] - = {1, 0, 3, 2, 5, 4, 7, 6}; - alignas(64) static constexpr const int32_t idx3[16] - = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - alignas(64) static constexpr const int32_t idx4[16] - = {8, 10, 12, 14, 0, 2, 4, 6, 9, 11, 13, 15, 1, 3, 5, 7}; - alignas(64) static constexpr const uint16_t idx5[32] - = {0, 16, 2, 18, 8, 24, 10, 26, 4, 20, 6, 22, 12, 28, 14, 30, 1, 17, - 3, 19, 9, 25, 11, 27, 5, 21, 7, 23, 13, 29, 15, 31}; + if (mayiuse(avx512_core)) { + const int ic_block = conf_->ic_block; + const int ic_tail = conf_->ic_tail; + if (conf_->stride_w > 1 || is_layout_nxc) { + kmovd(kLoadMask1, (1 << ic_block) - 1); + kmovd(kLoadMask2, 0xffff0000); + } + + if (is_layout_nxc && ic_tail) { + Label done; + cmp(dword[param1 + GET_OFF(ch_work)], ic_block); + je(done, T_NEAR); + kmovd(kLoadMask1, (1 << ic_tail) - 1); + kshiftld(kLoadMask2, kLoadMask1, 16); + L(done); + } + } else { + kmovw(kLoadMask1, 0xffff); + } + + if (typesize == 2) { + alignas(64) static constexpr const int64_t idx1[8] + = {2, 3, 0, 1, 6, 7, 4, 5}; + alignas(64) static constexpr const int64_t idx2[8] + = {1, 0, 3, 2, 5, 4, 7, 
6}; + alignas(64) static constexpr const int32_t idx3[16] + = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + alignas(64) static constexpr const int32_t idx4[16] + = {8, 10, 12, 14, 0, 2, 4, 6, 9, 11, 13, 15, 1, 3, 5, 7}; + alignas(64) static constexpr const uint16_t idx5[32] + = {0, 16, 2, 18, 8, 24, 10, 26, 4, 20, 6, 22, 12, 28, 14, 30, 1, + 17, 3, 19, 9, 25, 11, 27, 5, 21, 7, 23, 13, 29, 15, 31}; + + vmovdqa64(vidx1, idx1); + vmovdqa64(vidx2, idx2); + vmovdqa32(vidx3, idx3); + vmovdqa32(vidx4, idx4); + vmovdqa32(vidx5, (const int32_t *)idx5); + } else if (typesize == 1) { + alignas(64) static constexpr const uint8_t idx1[64] = {0, 16, 32, 48, 1, + 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, + 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, + 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, + 61, 14, 30, 46, 62, 15, 31, 47, 63}; + alignas(64) static constexpr const uint32_t idx2[16] + = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}; + alignas(64) static constexpr const uint32_t idx3[16] + = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}; + alignas(64) static constexpr const uint64_t idx4[8] + = {0, 8, 2, 10, 4, 12, 6, 14}; + alignas(64) static constexpr const uint64_t idx5[8] + = {1, 9, 3, 11, 5, 13, 7, 15}; + + vmovdqa64(vidx1, (const int64_t *)idx1); + vmovdqa64(vidx2, (const int64_t *)idx2); + vmovdqa64(vidx3, (const int64_t *)idx3); + vmovdqa64(vidx4, (const int64_t *)idx4); + vmovdqa64(vidx5, (const int64_t *)idx5); + } else + assert(!"unsupported data type"); const int ic_block = conf_->ic_block; - const bool is_layout_nxc = utils::one_of(conf_->src_tag, format_tag::ndhwc, - format_tag::nhwc, format_tag::nwc); const size_t src_mult = is_layout_nxc ? 
conf_->ngroups * conf_->ic : ic_block; const int iw = conf_->iw; @@ -378,35 +613,6 @@ void jit_trans_iw_ic_int16_t::generate() { const int tr_iw_s = tr_iw / str_w; assert(transpose_size >= ic_block); - auto kmovw = [this](Opmask k, unsigned w) { - mov(regw_tmp, w); - jit_generator::kmovw(k, regw_tmp); - }; - - kmovw(kFFFF, 0xffff); - kmovw(k5555, 0x5555); - kmovw(kAAAA, 0xaaaa); - kmovw(kAA, 0xaa); - kmovw(k55, 0x55); - kmovw(kCC, 0xcc); - kmovw(k33, 0x33); - - auto vmovdqa64 = [this](Zmm z, const int64_t *addr) { - mov(imm_addr64, reinterpret_cast(addr)); - jit_generator::vmovdqa64(z, ptr[imm_addr64]); - }; - - auto vmovdqa32 = [this](Zmm z, const int32_t *addr) { - mov(imm_addr64, reinterpret_cast(addr)); - jit_generator::vmovdqa32(z, ptr[imm_addr64]); - }; - - vmovdqa64(vidx1, idx1); - vmovdqa64(vidx2, idx2); - vmovdqa32(vidx3, idx3); - vmovdqa32(vidx4, idx4); - vmovdqa32(vidx5, (const int32_t *)idx5); - // Data for every strided case is placed consecutively // For 1x1 convolutions with strides we transpose only needed elements const auto str_w_end = (conf_->kw == 1) ? 1 : str_w; @@ -419,13 +625,12 @@ void jit_trans_iw_ic_int16_t::generate() { const int transposes = utils::div_up(iw_s, transpose_size); int loop_iters = nstl::max(0, transposes - 1); - tail = iw_s - loop_iters * transpose_size; + int tail = iw_s - loop_iters * transpose_size; src_stride = src_mult * typesize * str_w; tr_src_stride = tr_iw * typesize; bool nontemporal_stores = false; - enable_prefetch = iw > small_spatial ? 
true : false; const size_t src_step = src_mult * transpose_size * str_w * typesize; const size_t tr_src_step = transpose_size * typesize; @@ -478,64 +683,95 @@ void jit_trans_iw_ic_int16_t::generate() { struct jit_trans_ow_oc_t : public jit_trans_dst_t, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_trans_ow_oc_t) jit_trans_ow_oc_t(const jit_conv_conf_t *conf) - : jit_trans_dst_t(conf), jit_generator(jit_name()) {} + : jit_trans_dst_t(conf) + , jit_generator(jit_name()) + , typesize(conf->dst_dt == data_type::undef + ? 2 + : types::data_type_size(conf->dst_dt)) + , is_layout_nxc(utils::one_of(conf_->dst_tag, format_tag::ndhwc, + format_tag::nhwc, format_tag::nwc)) + , vnni_block(conf->dst_dt == data_type::undef + ? 2 + : data_type_vnni_granularity(conf->dst_dt)) {} void operator()(ctx_t *ctx) override { jit_generator::operator()(ctx); } status_t create_kernel() override { return jit_generator::create_kernel(); } private: - using reg64_t = const Xbyak::Reg64; - using reg32_t = const Xbyak::Reg32; - using opmask_t = const Xbyak::Opmask; - using zmm = const Xbyak::Zmm; - - enum { - typesize = sizeof(int16_t), - transpose_size = 16, - small_spatial = 14 - }; + int typesize = 0; + bool is_layout_nxc = false; + int vnni_block = 0; + static constexpr int transpose_size = 16; size_t src_stride = 0, tr_src_stride = 0; int tail = 0; - bool enable_prefetch = false; - - opmask_t kFF = k1; - opmask_t mask_lo = k2; - opmask_t k_oc_tail = k3; - - zmm vidx1 = zmm31; - zmm vidx2 = zmm30; - - reg64_t reg_src = r8; - reg64_t reg_tr_src = r9; - reg64_t reg_src_prf = r10; - reg64_t reg_tr_src_prf = r11; - reg64_t reg_loop = r12; - reg64_t reg_tr_src_tmp = r13; - reg32_t regw_tmp = r14d; - reg64_t imm_addr64 = rbx; - - void transpose(int nrows, int l_pad, int r_pad, bool nontemporal_stores, - bool do_convert = true); + + Opmask kFF = k1; + Opmask mask_lo = k2; + Opmask k_oc_tail = k3; + + Zmm vidx1 = zmm31; + Zmm vidx2 = zmm30; + Zmm vidx3 = zmm29; + Zmm vidx4 = zmm28; + + 
Reg64 reg_src = r8; + Reg64 reg_tr_src = r9; + Reg64 reg_src_prf = r10; + Reg64 reg_loop = r12; + Reg64 reg_tr_src_tmp = r13; + Reg32 regw_tmp = r14d; + Reg64 imm_addr64 = rbx; + + void vmovdqa64(Zmm z, const int64_t *addr) { + mov(imm_addr64, reinterpret_cast(addr)); + jit_generator::vmovdqa64(z, ptr[imm_addr64]); + } + void kmovw(Opmask k, unsigned w) { + mov(regw_tmp, w); + jit_generator::kmovw(k, regw_tmp); + } + void kmovd(Opmask k, unsigned w) { + mov(regw_tmp, w); + jit_generator::kmovd(k, regw_tmp); + } + Zmm src_zmm(int i) { return Zmm(i); } + Ymm src_ymm(int i) { + assert(i >= 0 && i < 16); + return Ymm(i); + } + Xmm src_xmm(int i) { + assert(i >= 0 && i < 16); + return Xmm(i); + } + + void transpose(int nrows, bool nontemporal_stores, bool do_convert = true); + void transpose_2b( + int nrows, bool nontemporal_stores, bool do_convert = true); + void transpose_1b( + int nrows, bool nontemporal_stores, bool do_convert = true); void generate() override; }; // do_convert (default is 'true') is a flag that determines when to do the // transformation of the input data and when to simply zero out the output data -void jit_trans_ow_oc_t::transpose(int nrows, int l_pad, int r_pad, - bool nontemporal_stores, bool do_convert) { +void jit_trans_ow_oc_t::transpose( + int nrows, bool nontemporal_stores, bool do_convert) { assert(nrows >= 0 && nrows <= transpose_size); static_assert(transpose_size == 16, "Unsupported transpose size"); if (!nrows) return; + if (typesize == 2) + transpose_2b(nrows, nontemporal_stores, do_convert); + else if (typesize == 1) + transpose_1b(nrows, nontemporal_stores, do_convert); + else + assert(!"unsupported data type"); +} - auto src_zmm = [](int i) { return Zmm(i); }; +void jit_trans_ow_oc_t::transpose_2b( + int nrows, bool nontemporal_stores, bool do_convert) { - auto src_ymm = [](int i) { - assert(i >= 0 && i < 16); - return Ymm(i); - }; - - auto load_ymm = [this, src_ymm](int i) { + auto load_ymm = [this](int i) { auto ymm_reg = 
src_ymm(i); auto addr = EVEX_compress_addr(reg_src, i * src_stride); if (conf_->oc_tail) { @@ -556,23 +792,23 @@ void jit_trans_ow_oc_t::transpose(int nrows, int l_pad, int r_pad, else vmovups(addr, r); }; - const bool is_layout_nxc = utils::one_of(conf_->dst_tag, format_tag::ndhwc, - format_tag::nhwc, format_tag::nwc); + + const auto row_pad = nrows % 2; if (mayiuse(avx512_core) && !is_layout_nxc) { // TODO: adopt for nhwc? - for (int i = 0; i < nrows / 2; i++) { + for (int i = 0; i < rnd_dn(nrows, 2); i += 2) { auto zmm_src0 = src_zmm(i); if (do_convert) { - vmovdqu16(zmm_src0, - EVEX_compress_addr(reg_src, 2 * i * src_stride)); + vmovdqu16( + zmm_src0, EVEX_compress_addr(reg_src, i * src_stride)); vpermw(zmm_src0, vidx2, zmm_src0); } else { vpxord(zmm_src0, zmm_src0, zmm_src0); } - store(zmm_src0, 2 * i); + store(zmm_src0, i); } - if (r_pad > 0) { + if (row_pad > 0) { auto zmm_src0 = src_zmm(29); if (do_convert) { vmovdqu16(zmm_src0 | mask_lo | T_z, @@ -584,34 +820,32 @@ void jit_trans_ow_oc_t::transpose(int nrows, int l_pad, int r_pad, store(zmm_src0, nrows - 1); } } else { - for (int i = 0; i < nrows / 2; i++) { - auto src0 = src_ymm(2 * i); - auto src1 = src_ymm(2 * i + 1); - auto zmm_src0 = src_zmm(2 * i); + for (int i = 0; i < rnd_dn(nrows, 2); i += 2) { + auto src0 = src_ymm(i); + auto src1 = src_ymm(i + 1); + auto zmm_src0 = src_zmm(i); if (do_convert) { - load_ymm(2 * i); + load_ymm(i); if (is_layout_nxc && conf_->oc_tail) { - load_ymm(2 * i + 1); + load_ymm(i + 1); auto ymm_tmp = Ymm(30); vpunpcklwd(ymm_tmp, src0, src1); vpunpckhwd(src0, src0, src1); vinserti64x4(zmm_src0, zmm_src0, ymm_tmp, 1); } else { vpunpcklwd(src1, src0, - EVEX_compress_addr( - reg_src, (2 * i + 1) * src_stride)); + EVEX_compress_addr(reg_src, (i + 1) * src_stride)); vpunpckhwd(src0, src0, - EVEX_compress_addr( - reg_src, (2 * i + 1) * src_stride)); + EVEX_compress_addr(reg_src, (i + 1) * src_stride)); vinserti64x4(zmm_src0, zmm_src0, src1, 1); } vpermpd(zmm_src0 | kFF, vidx1, 
zmm_src0); } else { vpxord(zmm_src0, zmm_src0, zmm_src0); } - store(zmm_src0, 2 * i); + store(zmm_src0, i); } - if (r_pad > 0) { + if (row_pad > 0) { auto src0 = src_ymm(nrows - 1); auto src1 = src_ymm(nrows); auto zmm_src0 = src_zmm(30); @@ -633,18 +867,127 @@ void jit_trans_ow_oc_t::transpose(int nrows, int l_pad, int r_pad, } } +void jit_trans_ow_oc_t::transpose_1b( + int nrows, bool nontemporal_stores, bool do_convert) { + auto load_xmm = [this](int i) { + auto xmm_reg = src_xmm(i); + auto addr = EVEX_compress_addr(reg_src, i * src_stride); + if (conf_->oc_tail) { + xmm_reg = xmm_reg | k_oc_tail | T_z; + // Assertion below as we need vmovdqu16 for tails. + // If needed, can be removed by using load_bytes() helper. + assert(mayiuse(avx512_core)); + vmovdqu8(xmm_reg, addr); + } else { + vmovups(xmm_reg, addr); + } + }; + + auto store = [this, nontemporal_stores](Zmm r, int i) { + auto addr = EVEX_compress_addr(reg_tr_src, i * tr_src_stride); + if (nontemporal_stores) + vmovntps(addr, r); + else + vmovups(addr, r); + }; + assert(is_layout_nxc); + assert(vnni_block == 4); + + for (int i = 0; i < rnd_up(nrows, vnni_block); i += vnni_block) { + const auto idx0 = i; + const auto idx1 = i + 1; + const auto idx2 = i + 2; + const auto idx3 = i + 3; + auto src0 = src_xmm(idx0); + auto src1 = src_xmm(idx1); + auto src2 = src_xmm(idx2); + auto src3 = src_xmm(idx3); + // two registers from next iteration used as temporal + auto src4 = src_xmm((i + 4) % 16); + auto src5 = src_xmm((i + 5) % 16); + auto zmm_src0 = src_zmm(i); + if (do_convert) { + load_xmm(idx0); + if (idx1 < nrows) + load_xmm(idx1); + else + vpxord(src1, src1, src1); + if (idx2 < nrows) + load_xmm(idx2); + else + vpxord(src2, src2, src2); + if (idx3 < nrows) + load_xmm(idx3); + else + vpxord(src3, src3, src3); + + vpunpcklbw(src4, src0, src1); + vpunpckhbw(src5, src0, src1); + vpunpcklbw(src0, src2, src3); + vpunpckhbw(src1, src2, src3); + + vpunpcklwd(src2, src4, src0); + vpunpckhwd(src3, src4, src0); + 
vpunpcklwd(src4, src5, src1); + vpunpckhwd(src5, src5, src1); + + vinserti64x2(zmm_src0, zmm_src0, src2, 0); + vinserti64x2(zmm_src0, zmm_src0, src3, 1); + vinserti64x2(zmm_src0, zmm_src0, src4, 2); + vinserti64x2(zmm_src0, zmm_src0, src5, 3); + } else { + vpxord(zmm_src0, zmm_src0, zmm_src0); + } + store(zmm_src0, i); + } +} + void jit_trans_ow_oc_t::generate() { preamble(); - alignas(64) static constexpr const int64_t idx1[8] - = {4, 5, 0, 1, 6, 7, 2, 3}; - alignas(64) static constexpr const int16_t idx2[32] - = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, - 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}; + if (typesize == 2) { + alignas(64) static constexpr const int64_t idx1[8] + = {4, 5, 0, 1, 6, 7, 2, 3}; + alignas(64) static constexpr const int16_t idx2[32] = {0, 16, 1, 17, 2, + 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, + 27, 12, 28, 13, 29, 14, 30, 15, 31}; + vmovdqa64(vidx1, idx1); + vmovdqa64(vidx2, (const int64_t *)idx2); + } else if (typesize == 1) { + + alignas(64) static constexpr const uint8_t idx_lo_16[64] = {0, 1, 64, + 65, 4, 5, 68, 69, 2, 3, 66, 67, 6, 7, 70, 71, 8, 9, 72, 73, 12, + 13, 76, 77, 10, 11, 74, 75, 14, 15, 78, 79, 16, 17, 80, 81, 20, + 21, 84, 85, 18, 19, 82, 83, 22, 23, 86, 87, 24, 25, 88, 89, 28, + 29, 92, 93, 26, 27, 90, 91, 30, 31, 94, 95}; + + alignas(64) static constexpr const uint8_t idx_hi_16[64] = {32, 33, 96, + 97, 36, 37, 100, 101, 34, 35, 98, 99, 38, 39, 102, 103, 40, 41, + 104, 105, 44, 45, 108, 109, 42, 43, 106, 107, 46, 47, 110, 111, + 48, 49, 112, 113, 52, 53, 116, 117, 50, 51, 114, 115, 54, 55, + 118, 119, 56, 57, 120, 121, 60, 61, 124, 125, 58, 59, 122, 123, + 62, 63, 126, 127}; + + alignas(64) static constexpr const uint8_t idx_lo_8[64] = {0, 64, 2, 66, + 1, 65, 3, 67, 8, 72, 10, 74, 9, 73, 11, 75, 4, 68, 6, 70, 5, 69, + 7, 71, 12, 76, 14, 78, 13, 77, 15, 79, 16, 80, 18, 82, 17, 81, + 19, 83, 24, 88, 26, 90, 25, 89, 27, 91, 20, 84, 22, 86, 21, 85, + 23, 87, 28, 92, 
30, 94, 29, 93, 31, 95}; + + alignas(64) static constexpr const uint8_t idx_hi_8[64] = {32, 96, 34, + 98, 33, 97, 35, 99, 40, 104, 42, 106, 41, 105, 43, 107, 36, 100, + 38, 102, 37, 101, 39, 103, 44, 108, 46, 110, 45, 109, 47, 111, + 48, 112, 50, 114, 49, 113, 51, 115, 56, 120, 58, 122, 57, 121, + 59, 123, 52, 116, 54, 118, 53, 117, 55, 119, 60, 124, 62, 126, + 61, 125, 63, 127}; + + vmovdqa64(vidx1 /*vreg_idx_lo_256*/, (const int64_t *)idx_lo_16); + vmovdqa64(vidx2 /*vreg_idx_hi_256*/, (const int64_t *)idx_hi_16); + vmovdqa64(vidx3 /*vreg_idx_lo_128*/, (const int64_t *)idx_lo_8); + vmovdqa64(vidx4 /*vreg_idx_hi_128*/, (const int64_t *)idx_hi_8); + } const int oc_block = conf_->oc_block; - const bool is_layout_nxc = utils::one_of(conf_->dst_tag, format_tag::ndhwc, - format_tag::nhwc, format_tag::nwc); const size_t src_mult = is_layout_nxc ? conf_->ngroups * conf_->oc : oc_block; const int ow = conf_->ow; @@ -656,27 +999,15 @@ void jit_trans_ow_oc_t::generate() { tr_src_stride = oc_block * typesize; bool nontemporal_stores = conf_->use_nt_stores_ddst; - enable_prefetch = ow > small_spatial; const size_t src_step = src_mult * transpose_size * typesize; const size_t tr_src_step = (size_t)oc_block * transpose_size * typesize; - const int right_pad = ow % 2; - const auto zero_tr_ow = nstl::max(0, conf_->tr_ow - ow - right_pad); + const auto zero_tr_ow = nstl::max(0, conf_->tr_ow - rnd_up(ow, vnni_block)); mov(reg_src, ptr[param1 + GET_OFF(src)]); mov(reg_tr_src, ptr[param1 + GET_OFF(tr_src)]); mov(reg_src_prf, ptr[param1 + GET_OFF(src_prf)]); - mov(reg_tr_src_prf, ptr[param1 + GET_OFF(tr_src_prf)]); - - auto kmovw = [this](Opmask k, unsigned w) { - mov(regw_tmp, w); - jit_generator::kmovw(k, regw_tmp); - }; - auto kmovd = [this](Opmask k, unsigned w) { - mov(regw_tmp, w); - jit_generator::kmovd(k, regw_tmp); - }; kmovw(kFF, 0xFF); kmovd(mask_lo, 0x0000ffff); @@ -690,39 +1021,27 @@ void jit_trans_ow_oc_t::generate() { L(done); } - auto vmovdqa64 = [this](Zmm z, const 
int64_t *addr) { - mov(imm_addr64, reinterpret_cast(addr)); - jit_generator::vmovdqa64(z, ptr[imm_addr64]); - }; - - vmovdqa64(vidx1, idx1); - vmovdqa64(vidx2, (const int64_t *)idx2); if (loop_iters) { mov(reg_loop, loop_iters); Label loop; L(loop); { - transpose(transpose_size, 0, 0, nontemporal_stores); + transpose(transpose_size, nontemporal_stores); add(reg_src, src_step); add(reg_tr_src, tr_src_step); add(reg_src_prf, src_step); - add(reg_tr_src_prf, tr_src_step); sub(reg_loop, 1); jnz(loop); } } - transpose(tail, 0, right_pad, nontemporal_stores); + transpose(tail, nontemporal_stores); if (zero_tr_ow) { const auto zero_transposes = utils::div_up(zero_tr_ow, transpose_size); const auto zero_loop_iters = nstl::max(0, zero_transposes - 1); const auto zero_tail = zero_tr_ow - zero_loop_iters * transpose_size; - const auto zero_right_pad = zero_tr_ow % 2; // shift over tail - auto tr_src_tail_step - = (size_t)oc_block * (tail + right_pad) * typesize; - add(reg_tr_src, tr_src_tail_step); - add(reg_tr_src_prf, tr_src_tail_step); + add(reg_tr_src, (size_t)oc_block * rnd_up(tail, vnni_block) * typesize); // zero the tr_ow - ow if (zero_loop_iters) { @@ -730,14 +1049,13 @@ void jit_trans_ow_oc_t::generate() { Label zero_loop; L(zero_loop); { - transpose(transpose_size, 0, 0, nontemporal_stores, false); + transpose(transpose_size, nontemporal_stores, false); add(reg_tr_src, tr_src_step); - add(reg_tr_src_prf, tr_src_step); sub(reg_loop, 1); jnz(zero_loop); } } - transpose(zero_tail, 0, zero_right_pad, nontemporal_stores, false); + transpose(zero_tail, nontemporal_stores, false); } postamble(); @@ -1174,12 +1492,13 @@ void jit_diff_wei_trans_to_vnni_t::generate() { jit_trans_src_t *create_trans_src(const jit_conv_conf_t *conf) { if (conf->has_vnni && IMPLICATION(conf->is_1stconv, conf->transpose_src)) - return new jit_trans_iw_ic_int16_t(conf); + return new jit_trans_iw_ic_t(conf); assert(!"unsupported configuration"); return nullptr; } jit_trans_dst_t 
*create_trans_dst(const jit_conv_conf_t *conf) { + if (conf->has_vnni) return new jit_trans_ow_oc_t(conf); assert(!"unsupported configuration"); return nullptr; From d9cd4efd3853e806b3600e023dd1d96320166bb3 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Fri, 26 Apr 2024 12:00:40 -0700 Subject: [PATCH 119/187] x64: jit_brgemm_conv_bwd_copy_kernel.cpp: update for fp8 --- .../x64/jit_avx512_core_amx_conv_kernel.cpp | 140 ++++++++++++++++-- .../x64/jit_avx512_core_amx_conv_kernel.hpp | 23 ++- 2 files changed, 144 insertions(+), 19 deletions(-) diff --git a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp index 5cf314ad117..eeaed407f5d 100644 --- a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp +++ b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp @@ -5606,13 +5606,21 @@ void jit_avx512_core_amx_bwd_weights_kernel_t::balance(const jit_conv_conf_t &j, } // start of diff_bias kernel +dim_t jit_avx512_core_amx_bwd_bias_kernel_t::get_ddst_offset( + dim_t w_idx, dim_t hd_idx) const { + int ow_per_oc = data_type_vnni_granularity(jcp.ddst_dt); + dim_t w_off = utils::rnd_dn(w_idx, ow_per_oc) * jcp.oc_block + + w_idx % ow_per_oc; + return jcp.typesize_in * (w_off + jcp.tr_ow * jcp.oc_block * hd_idx); +} + void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias_row(int ocb) { auto compute_step = [&]() { - if (jcp.ddst_dt == data_type::bf16) { + if (jcp.ddst_dt == bf16) { vmovups(vreg_bias_ddst, ptr[reg_ddst]); vdpbf16ps(vreg_bias_acc, vreg_bias_ddst, vreg_bias_unit); - } else if (jcp.ddst_dt == data_type::f16) { + } else if (jcp.ddst_dt == f16) { // The ddst_dt is in vnni format, (S16c2s) which needs to be // reduced along S dimension. Since, we do not have f16_vnni // instruction, we try to emulate it. 
@@ -5633,22 +5641,69 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias_row(int ocb) { // [p+p, o+o, l+l, k+k, n+n, m+m, j+j, i+i] i.e., [P, O, L, K, N, M, J, I] vhaddps(yreg_bias_ddst0, yreg_bias_ddst0, yreg_bias_ddst1); vaddps(yreg_bias_acc1, yreg_bias_acc1, yreg_bias_ddst0); + } else if (one_of(jcp.ddst_dt, f8_e5m2, f8_e4m3)) { + // The ddst_dt is in vnni format, (S16c4s) which needs to be + // reduced along S dimension. Since, we do not have f8_vnni and f16_vnni + // instruction, we try to emulate f16_vnni. + // A, B, C,.. - corresponds to output channels + // ddst_data: [p,p,p,p, ...b,b,b,b, a,a,a,a] in fp8 + // req_output = [p+p+p+p, ...b+b+b+b, a+a+a+a] in f32 i.e., [H, F, D, B, G, E, C, A] + + const Xbyak::Zmm zmm_load(yreg_bias_ddst0.getIdx()); + + // process A,B,C,D,E,F,G,H channels + // load and process for + f8_emu->vcvt_f8_to_f32(zmm_load, ptr[reg_ddst]); + // copy upper bytes to second ymm + vextractf64x4(yreg_bias_ddst1, zmm_load, 1); + // each yreg_bias_ddst contains 8 float values in vnni layout for and for correspondingly + + // [d2+d3, d0+d1, b2+b3, b0+b1, c2+c3, c0+c1, a2+a3, a0+a1] + vhaddps(yreg_bias_ddst00, yreg_bias_ddst0, yreg_bias_ddst1); + + // load and process for + f8_emu->vcvt_f8_to_f32(zmm_load, ptr[reg_ddst + 16]); + // copy upper bytes to second ymm + vextractf64x4(yreg_bias_ddst1, zmm_load, 1); + // each yreg_bias_ddst contains 8 float values in vnni layout for and for correspondingly + + // [h2+h3, h0+h1, f2+f3, f0+f1, g2+g3, g0+g1, e2+e3, e0+e1] + vhaddps(yreg_bias_ddst01, yreg_bias_ddst0, yreg_bias_ddst1); + + // final summation for a,b,c,d,e,f, g,h + // [h0+h1+h2+h3, f0+f1+f2+f3, d0+d1+d2+d3, b0+b1+b2+b3, g0+g1+g2+g3, e0+e1+e2+e3, c0+c1+c2+c3, a0+a1+a2+a3] in f32 i.e., [H, F, D, B, G, E, C, A] + vhaddps(yreg_bias_ddst00, yreg_bias_ddst00, yreg_bias_ddst01); + vaddps(yreg_bias_acc0, yreg_bias_acc0, yreg_bias_ddst00); + + // process I,J,K,L,M,N,O,P channels in same way as A,B,C,D,E,F,G,H + 
f8_emu->vcvt_f8_to_f32(zmm_load, ptr[reg_ddst + 32]); + vextractf64x4(yreg_bias_ddst1, zmm_load, 1); + + vhaddps(yreg_bias_ddst00, yreg_bias_ddst0, yreg_bias_ddst1); + + f8_emu->vcvt_f8_to_f32(zmm_load, ptr[reg_ddst + 48]); + vextractf64x4(yreg_bias_ddst1, zmm_load, 1); + vhaddps(yreg_bias_ddst01, yreg_bias_ddst0, yreg_bias_ddst1); + + vhaddps(yreg_bias_ddst00, yreg_bias_ddst00, yreg_bias_ddst01); + vaddps(yreg_bias_acc1, yreg_bias_acc1, yreg_bias_ddst00); } }; Label ow_loop; - int niters = jcp.tr_ow / 2; + const int sp_substep = data_type_vnni_granularity(jcp.ddst_dt); + const int niters = jcp.tr_ow / sp_substep; if (niters > 0) { - mov(reg_tmp, jcp.tr_ow / 2); + mov(reg_tmp, niters); L(ow_loop); compute_step(); - add(reg_ddst, get_ddst_offset(2)); + add(reg_ddst, get_ddst_offset(sp_substep)); sub(reg_tmp, 1); jnz(ow_loop, T_NEAR); } - if (jcp.tr_ow % 2) compute_step(); + if (jcp.tr_ow % jcp.typesize_in) compute_step(); - if (niters > 0) sub(reg_ddst, get_ddst_offset(2 * niters)); + if (niters > 0) sub(reg_ddst, get_ddst_offset(sp_substep * niters)); } void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias( @@ -5665,7 +5720,7 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias( mov(reg_oj, reg_nrows); // accumulator initialization - if (jcp.ddst_dt == data_type::f16) { + if (one_of(jcp.ddst_dt, f16, f8_e5m2, f8_e4m3)) { vpxord(yreg_bias_acc0, yreg_bias_acc0, yreg_bias_acc0); vpxord(yreg_bias_acc1, yreg_bias_acc1, yreg_bias_acc1); } else { @@ -5674,15 +5729,28 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias( cmp(reg_initial, 0); jnz(bias_loop, T_NEAR); const size_t offset = sizeof(float) * ocb * jcp.oc_block; - if (jcp.ddst_dt == data_type::f16) { + if (jcp.ddst_dt == bf16) { + vmovups(vreg_bias_acc, ptr[reg_bias + offset]); + } else if (jcp.ddst_dt == f16) { // the data is in plain format, transform while loading. 
// i.e.,[H, G, F, E, D, C, B, A] -> [H, G, D, C, F, E, B, A] // and [P, O, N, M, L, K, J, I] -> [P, O, L, K, N, M, J, I] vpermq(yreg_bias_acc0, ptr[reg_bias + offset], 0xd8); vpermq(yreg_bias_acc1, ptr[reg_bias + offset + vreg_traits::vlen], 0xd8); + } else if (one_of(jcp.ddst_dt, f8_e5m2, f8_e4m3)) { + // the data is in plain format, transform while loading to + // pseudo-vnni layout to 2 ymm registers conforming to calculations + // by vhaddps. + // i.e.,[H, G, F, E, D, C, B, A] -> [H, F, D, B, G, E, C, A] + // and [P, O, N, M, L, K, J, I] -> [P, N, L, J, O, M, K, I] + + vpermd(yreg_bias_acc0, yreg_permute_to_vnni, + ptr[reg_bias + offset]); + vpermd(yreg_bias_acc1, yreg_permute_to_vnni, + ptr[reg_bias + offset + vreg_traits::vlen]); } else { - vmovups(vreg_bias_acc, ptr[reg_bias + offset]); + assert(!"non-supported type"); } // loop by rows L(bias_loop); @@ -5695,9 +5763,9 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias( } // store accumulator - if (jcp.ddst_dt == data_type::bf16) { + if (jcp.ddst_dt == bf16) { vmovups(ptr[reg_bias + offset], vreg_bias_acc); - } else if (jcp.ddst_dt == data_type::f16) { + } else if (jcp.ddst_dt == f16) { // transform to plain before storing. // i.e., [H, G, D, C, F, E, B, A] -> [H, G, F, E, D, C, B, A] // and [P, O, L, K, N, M, J, I] -> [P, O, N, M, L, K, J, I] @@ -5706,11 +5774,24 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::compute_diff_bias( vmovups(ptr[reg_bias + offset], yreg_bias_acc0); vmovups(ptr[reg_bias + offset + vreg_traits::vlen], yreg_bias_acc1); + } else if (one_of(jcp.ddst_dt, f8_e5m2, f8_e4m3)) { + // transform to plain before storing. 
+ // i.e., [H, F, D, B, G, E, C, A] -> [H, G, F, E, D, C, B, A] + // and [P, N, L, J, O, M, K, I] -> [P, O, N, M, L, K, J, I] + vpermd(yreg_bias_acc0, yreg_permute_to_plain, yreg_bias_acc0); + vpermd(yreg_bias_acc1, yreg_permute_to_plain, yreg_bias_acc1); + vmovups(ptr[reg_bias + offset], yreg_bias_acc0); + vmovups(ptr[reg_bias + offset + vreg_traits::vlen], + yreg_bias_acc1); + } else { + assert(!"non-supported type"); } } } void jit_avx512_core_amx_bwd_bias_kernel_t::generate() { + Label f8_permute_to_vnni_table, f8_permute_to_plain_table; + preamble(); Label end_label; @@ -5720,11 +5801,26 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::generate() { cmp(reg_nrows, 0); jle(end_label, T_NEAR); // nothing to do - if (jcp.ddst_dt == data_type::bf16) { + if (jcp.ddst_dt == bf16) { auto reg_unit_val = reg_tmp.cvt16(); mov(reg_unit_val, 0x3f80); // bf16 value of 1. vpbroadcastw(vreg_bias_unit, reg_unit_val); } + if (jcp.ddst_dt == f8_e5m2) + f8_emu = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_mask, emu_scratch); + else if (jcp.ddst_dt == f8_e4m3) + f8_emu = utils::make_unique(this, emu_reserv_1, + emu_reserv_2, emu_reserv_3, emu_reserv_4, emu_reserv_5, + emu_scratch); + + if (one_of(jcp.ddst_dt, f8_e5m2, f8_e4m3)) { + mov(reg_tmp, f8_permute_to_vnni_table); + vmovdqu32(yreg_permute_to_vnni, ptr[reg_tmp]); + mov(reg_tmp, f8_permute_to_plain_table); + vmovdqu32(yreg_permute_to_plain, ptr[reg_tmp]); + } + mov(reg_bias, ptr[param + GET_OFF(bias)]); mov(reg_initial, ptr[param + GET_OFF(channel)]); @@ -5745,6 +5841,24 @@ void jit_avx512_core_amx_bwd_bias_kernel_t::generate() { L(end_label); postamble(); + + if (f8_emu) f8_emu->prepare_table(); + + if (one_of(jcp.ddst_dt, f8_e5m2, f8_e4m3)) { + align(64); + L(f8_permute_to_vnni_table); + { + const uint32_t _idx[] = {0, 2, 4, 6, 1, 3, 5, 7}; + for (size_t i = 0; i < sizeof(_idx) / sizeof(_idx[0]); ++i) + dd(_idx[i]); + } + L(f8_permute_to_plain_table); + { + const uint32_t _idx[] = {0, 4, 1, 5, 2, 
6, 3, 7}; + for (size_t i = 0; i < sizeof(_idx) / sizeof(_idx[0]); ++i) + dd(_idx[i]); + } + } } // end of diff_bias kernel diff --git a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.hpp b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.hpp index 5c2e4073c96..37a0ab3a632 100644 --- a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.hpp +++ b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.hpp @@ -24,6 +24,7 @@ #include "cpu/x64/injectors/jit_uni_postops_injector.hpp" #include "cpu/x64/jit_avx512_core_bf16cvt.hpp" +#include "cpu/x64/jit_avx512_core_fp8cvt.hpp" #include "cpu/x64/jit_generator.hpp" #include "cpu/x64/jit_primitive_conf.hpp" @@ -771,18 +772,28 @@ struct jit_avx512_core_amx_bwd_bias_kernel_t : public jit_generator { Xbyak::Ymm yreg_bias_acc1 = Xbyak::Ymm(3); Xbyak::Ymm yreg_bias_ddst0 = Xbyak::Ymm(2); Xbyak::Ymm yreg_bias_ddst1 = Xbyak::Ymm(4); + Xbyak::Ymm yreg_bias_ddst00 = Xbyak::Ymm(5); + Xbyak::Ymm yreg_bias_ddst01 = Xbyak::Ymm(6); + + Xbyak::Ymm yreg_permute_to_vnni = Xbyak::Ymm(14); + Xbyak::Ymm yreg_permute_to_plain = Xbyak::Ymm(15); + + Xbyak::Zmm emu_reserv_1 = Xbyak::Zmm(30); + Xbyak::Zmm emu_reserv_2 = Xbyak::Zmm(29); + Xbyak::Zmm emu_reserv_3 = Xbyak::Zmm(28); + Xbyak::Zmm emu_reserv_4 = Xbyak::Zmm(27); + Xbyak::Zmm emu_reserv_5 = Xbyak::Zmm(26); + Xbyak::Reg64 emu_scratch = r10; + Xbyak::Opmask emu_mask = Xbyak::Opmask(4); + + std::unique_ptr f8_emu; void compute_diff_bias_row(int ocb); void compute_diff_bias(int nb_oc_blocking); void generate() override; - inline dim_t get_ddst_offset(dim_t w_idx, dim_t hd_idx = 0) { - int ow_per_oc = 2; - dim_t w_off = w_idx / ow_per_oc * ow_per_oc * jcp.oc_block - + w_idx % ow_per_oc; - return jcp.typesize_in * (w_off + jcp.tr_ow * jcp.oc_block * hd_idx); - } + dim_t get_ddst_offset(dim_t w_idx, dim_t hd_idx = 0) const; }; } // namespace x64 From ede0bc0bcbb3e8c0f675f815ee2737191427da36 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 17 Apr 2024 15:25:52 -0700 Subject: [PATCH 120/187] x64: brgemm uker: enable 
for fp8_via_convert --- src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp index 856401c3013..f5da5ef627e 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp @@ -2572,7 +2572,8 @@ void jit_brgemm_amx_uker_base_t::generate() { // if beta == 1 and C datatype is f32 it is better to perform addition by // reading tiles directly from C instead of by reading/writing by vectors may_load_accumulators_ = one_of(brg.alpha, 0, 1) && brg.beta == 1.f - && brg.dt_c == brg.dt_d && !brg.is_input_convert() + && brg.dt_c == brg.dt_d + && IMPLICATION(brg.is_input_convert(), brg.is_fp8_via_convert()) && IMPLICATION( brg.is_f32 || brg.is_bf16, brg.dt_c == data_type::f32) && IMPLICATION(brg.is_int8, brg.dt_c == data_type::s32) @@ -2589,7 +2590,8 @@ void jit_brgemm_amx_uker_base_t::generate() { assert(IMPLICATION(are_post_ops_applicable_ || need_to_apply_alpha_beta_ || brg.brgattr.bd_mask_level, !brg.is_blocked && !brg.brgattr.var_bs)); - assert(IMPLICATION(brg.brgattr.var_bs, !brg.is_input_convert())); + assert(IMPLICATION(brg.brgattr.var_bs, + IMPLICATION(brg.is_input_convert(), brg.is_fp8_via_convert()))); read_params(); prepare_bd_mask(); From 69d3135e9eb151aab9c423688760ef077d983c51 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 3 Apr 2024 14:46:59 -0700 Subject: [PATCH 121/187] x64: brgemm bwd_w conv: update init_conf_bwd_w for fp8 support --- src/cpu/x64/jit_brgemm_conv_utils.cpp | 39 ++++++++++++++++++--------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_utils.cpp b/src/cpu/x64/jit_brgemm_conv_utils.cpp index 713061b85cb..ab0b30d2f94 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.cpp @@ -16,7 +16,6 @@ #include "dnnl_types.h" -#include "common/bfloat16.hpp" #include 
"common/c_types_map.hpp" #include "common/convolution_pd.hpp" #include "common/dnnl_thread.hpp" @@ -54,6 +53,7 @@ bool allow_perf_heuristics(const jit_brgemm_conv_conf_t &jcp) { // Disable performance heuristics for f16 as there are no other // optimized implementations. if (jcp.wei_dt == f16) return false; + if (one_of(jcp.wei_dt, f8_e5m2, f8_e4m3)) return false; return true; } } // namespace @@ -1682,8 +1682,11 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, if (jcp.wei_plain) CHECK(pick_tags(jcp, src_md, weights_md, dst_md, bias_md)); + const auto vnni_dt = jcp.prop_kind == prop_kind::backward_weights + ? jcp.dst_dt + : jcp.wei_dt; const data_type_t vnni_block_dt = get_mac_emu_data_type( - jcp.wei_dt, isa, isa == avx10_1_512 && !jcp.is_fp8_convert); + vnni_dt, isa, isa == avx10_1_512 && !jcp.is_fp8_convert); jcp.vnni_block = data_type_vnni_granularity(vnni_block_dt); if (one_of(jcp.prop_kind, prop_kind::forward_training, @@ -2996,7 +2999,11 @@ status_t init_conf_bwd_w(jit_brgemm_conv_conf_t &jcp, const bool is_f16 = src_d.data_type() == data_type::f16; - jcp.isa = is_f16 ? avx512_core_amx_fp16 : avx512_core_amx; + const auto is_fp8 = one_of(src_d.data_type(), f8_e5m2, f8_e4m3) + && one_of(diff_weights_d.data_type(), f32, f16, f8_e5m2, f8_e4m3) + && one_of(diff_dst_d.data_type(), f8_e5m2, f8_e4m3); + + jcp.isa = is_f16 || is_fp8 ? 
avx512_core_amx_fp16 : avx512_core_amx; // disabling verbose dispatch messages for unsupported isa for better readability if (!mayiuse(jcp.isa)) return status::unimplemented; @@ -3031,7 +3038,7 @@ status_t init_conf_bwd_w(jit_brgemm_conv_conf_t &jcp, && everyone_is(0, jcp.f_pad, jcp.back_pad, jcp.t_pad, jcp.b_pad)) jcp.var_bs = false; - jcp.typesize_in = sizeof(bfloat16_t); + jcp.typesize_in = jcp.src_dsz; jcp.typesize_out = sizeof(float); bool ok = true @@ -3080,18 +3087,27 @@ status_t init_conf_bwd_w(jit_brgemm_conv_conf_t &jcp, jcp.dst_tag = diff_dst_d.matches_one_of_tag(jcp.src_tag); VDISPATCH_CONV_IC(jcp.dst_tag == jcp.src_tag, VERBOSE_UNSUPPORTED_TAG); + jcp.wei_dt = diff_weights_d.data_type(); + const int wei_format_tag = 2 * ndims - 6 + with_groups; format_tag_t wei_tag; - if (jcp.transform_to_vnni) - wei_tag = pick(wei_format_tag, format_tag::OIw16i16o2i, - format_tag::gOIw16i16o2i, format_tag::OIhw16i16o2i, - format_tag::gOIhw16i16o2i, format_tag::OIdhw16i16o2i, - format_tag::gOIdhw16i16o2i); - else + if (jcp.transform_to_vnni) { + if (one_of(jcp.wei_dt, f8_e5m2, f8_e4m3)) + wei_tag = pick(wei_format_tag, format_tag::OIw16i16o4i, + format_tag::gOIw16i16o4i, format_tag::OIhw16i16o4i, + format_tag::gOIhw16i16o4i, format_tag::OIdhw16i16o4i, + format_tag::gOIdhw16i16o4i); + else + wei_tag = pick(wei_format_tag, format_tag::OIw16i16o2i, + format_tag::gOIw16i16o2i, format_tag::OIhw16i16o2i, + format_tag::gOIhw16i16o2i, format_tag::OIdhw16i16o2i, + format_tag::gOIdhw16i16o2i); + } else { wei_tag = pick(wei_format_tag, format_tag::OIw16i16o, format_tag::gOIw16i16o, format_tag::OIhw16i16o, format_tag::gOIhw16i16o, format_tag::OIdhw16i16o, format_tag::gOIdhw16i16o); + } if (diff_weights_md.format_kind == format_kind::any) { CHECK(memory_desc_init_by_tag(diff_weights_md, wei_tag)); jcp.wei_tag = wei_tag; @@ -3100,7 +3116,6 @@ status_t init_conf_bwd_w(jit_brgemm_conv_conf_t &jcp, VDISPATCH_CONV_IC( jcp.wei_tag == wei_tag, VERBOSE_UNSUPPORTED_TAG_S, "weights"); } - 
jcp.wei_dt = diff_weights_d.data_type(); /* kernel applicability check wrt boundaries * the conditions are quite general across the kernels we have, @@ -3142,7 +3157,7 @@ status_t init_conf_bwd_w(jit_brgemm_conv_conf_t &jcp, tr_round) * jcp.stride_w; - // TODO: xf16 training is supported only + // TODO: xf16 or fp8 training is supported only const auto rnd_val = jcp.vnni_block; jcp.tr_src_num_guard_elems = tr_pad; // upper bound jcp.tr_ow = rnd_up(jcp.ow, rnd_val); From 49125a0b14dd4acc8af96f7c805f2e5aa2f84ce8 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Thu, 2 May 2024 15:56:33 -0700 Subject: [PATCH 122/187] x64: brgemm bwd_w conv: update harness for fp8 --- src/cpu/x64/jit_brgemm_conv_bwd_w.cpp | 253 +++++++++++++++----------- src/cpu/x64/jit_brgemm_conv_bwd_w.hpp | 46 ++--- 2 files changed, 171 insertions(+), 128 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp index 93b084e76c9..5eec6cb651c 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_w.cpp @@ -45,12 +45,14 @@ status_t brgemm_convolution_bwd_weights_t::pd_t::init(engine_t *engine) { const auto diff_bia_type = diff_weights_md(1)->data_type; const auto diff_dst_type = diff_dst_md(0)->data_type; VDISPATCH_CONV(is_bwd_w(), VERBOSE_BAD_PROPKIND); - VDISPATCH_CONV(utils::one_of(src_type, bf16, f16), VERBOSE_UNSUPPORTED_DT); - VDISPATCH_CONV(diff_dst_type == src_type, VERBOSE_UNSUPPORTED_DT); - VDISPATCH_CONV(utils::one_of(diff_wei_type, f32, src_type), + VDISPATCH_CONV(utils::one_of(src_type, bf16, f16, f8_e5m2, f8_e4m3), VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(diff_dst_type == src_type, VERBOSE_UNSUPPORTED_DT); VDISPATCH_CONV( - utils::one_of(diff_bia_type, data_type::undef, f32, src_type), + utils::one_of(diff_wei_type, f32, f8_e5m2, f8_e4m3, src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_bia_type, data_type::undef, f32, f8_e5m2, + f8_e4m3, src_type), VERBOSE_UNSUPPORTED_BIAS_CFG); 
VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), VERBOSE_BAD_ALGORITHM); @@ -294,16 +296,16 @@ status_t brgemm_convolution_bwd_weights_t::init(engine_t *engine) { } struct brgemm_convolution_bwd_weights_t::thread_info_t { - const src_data_t *src = nullptr; - const diff_dst_data_t *diff_dst = nullptr; - const void *diff_weights = nullptr; - const void *diff_bias = nullptr; + const char *const __restrict src = nullptr; + const char *const __restrict diff_dst = nullptr; + const char *diff_weights = nullptr; + const char *diff_bias = nullptr; const brgemm_convolution_bwd_weights_t *self; const memory_tracking::grantor_t scratchpad; - src_data_t *tr_src = nullptr; - diff_dst_data_t *tr_diff_dst = nullptr; + char *tr_src = nullptr; + char *tr_diff_dst = nullptr; simple_barrier::ctx_t *tr_src_bctx = nullptr; simple_barrier::ctx_t *tr_diff_dst_bctx = nullptr; @@ -336,9 +338,9 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { thread_info_t(const brgemm_convolution_bwd_weights_t *pcnv, const exec_ctx_t &ctx, int ithr) - : src(CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC)) - , diff_dst(CTX_IN_MEM(const diff_dst_data_t *, DNNL_ARG_DIFF_DST)) - , diff_weights(CTX_OUT_MEM(void *, DNNL_ARG_DIFF_WEIGHTS)) + : src(CTX_IN_MEM(const char *, DNNL_ARG_SRC)) + , diff_dst(CTX_IN_MEM(const char *, DNNL_ARG_DIFF_DST)) + , diff_weights(CTX_OUT_MEM(char *, DNNL_ARG_DIFF_WEIGHTS)) , self(pcnv) , scratchpad(ctx.get_scratchpad_grantor()) , ithr(ithr) @@ -350,16 +352,15 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { diff_bias = self->pd()->with_bias() && (jcp.oc % jcp.oc_block != 0) && self->pd()->jcp_.bia_dt == data_type::f32 - ? (void *)scratchpad.template get(key_conv_padded_bias) - : CTX_OUT_MEM(void *, DNNL_ARG_DIFF_BIAS); + ? 
(char *)scratchpad.template get(key_conv_padded_bias) + : CTX_OUT_MEM(char *, DNNL_ARG_DIFF_BIAS); - tr_src = scratchpad.template get(key_conv_tr_src); + tr_src = scratchpad.template get(key_conv_tr_src); if (jcp.global_transpose) tr_src_bctx = scratchpad.template get( key_conv_tr_src_bctx); - tr_diff_dst = scratchpad.template get( - key_conv_tr_diff_dst); + tr_diff_dst = scratchpad.template get(key_conv_tr_diff_dst); if (jcp.global_transpose) tr_diff_dst_bctx = scratchpad.template get( key_conv_tr_diff_dst_bctx); @@ -401,11 +402,18 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { balance211(jcp.nb_oc, jcp.nthr_oc_b, ithr_oc_b, oc_b_start, oc_b_end); oc_b_work = oc_b_end - oc_b_start; - balance211(jcp.nb_ic, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, ic_b_end); if (jcp.transform_to_vnni) { - if (ic_b_start % 2 != 0) ic_b_start++; - if (ic_b_end != jcp.nb_ic && ic_b_end % 2 != 0) ic_b_end++; + const int vnni_granularity = data_type_vnni_granularity(jcp.wei_dt); + const auto icb_work = div_up(jcp.nb_ic, vnni_granularity); + balance211( + icb_work, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, ic_b_end); + ic_b_start = nstl::min(jcp.nb_ic, ic_b_start * vnni_granularity); + ic_b_end = nstl::min(jcp.nb_ic, ic_b_end * vnni_granularity); + } else { + balance211( + jcp.nb_ic, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, ic_b_end); } + ic_b_work = ic_b_end - ic_b_start; brgemm_batch_element_t *const __restrict brg_batch_global @@ -470,30 +478,33 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { const size_t tr_row_size = jcp.tr_iw * jcp.ic_block; const size_t tr_3d_size = tr_row_size * jcp.ih_block; // Aligned to buffer end to use guard elements - return tr_src_buf_number(g, icb) * jcp.tr_src_buf_size + id * tr_3d_size - + ih * tr_row_size; + return (tr_src_buf_number(g, icb) * jcp.tr_src_buf_size + + id * tr_3d_size + ih * tr_row_size) + * jcp.src_dsz; } inline size_t tr_ic_block_src_off(int g, int tr_icb, int id, int ih) const { const int nb_tr_icb = jcp.ic_block / 
jcp.tr_ic_block; return tr_src_off(g, tr_icb / nb_tr_icb, id, ih) - + (tr_icb % nb_tr_icb) * jcp.tr_ic_block * jcp.tr_iw; + + (tr_icb % nb_tr_icb) * jcp.tr_ic_block * jcp.tr_iw + * jcp.src_dsz; } inline size_t tr_diff_dst_off(int g, int ocb, int od, int oh) const { const size_t tr_row_size = jcp.tr_ow * jcp.oc_block; const size_t tr_3d_size = tr_row_size * jcp.oh_block; - return tr_diff_dst_buf_number(g, ocb) * jcp.tr_diff_dst_buf_size - + od * tr_3d_size + oh * tr_row_size; + return (tr_diff_dst_buf_number(g, ocb) * jcp.tr_diff_dst_buf_size + + od * tr_3d_size + oh * tr_row_size) + * jcp.dst_dsz; } - void trans_src_nxc(src_data_t *tr_src, const src_data_t *src_base, - int tr_icb, int row_count, int ih_s) const { - const int src_stride = jcp.iw * jcp.ngroups * jcp.ic; - const int tr_src_stride = jcp.tr_iw * jcp.ic_block; + void trans_src_nxc(char *tr_src, const char *src_base, int tr_icb, + int row_count, int ih_s) const { + const int src_stride = jcp.iw * jcp.ngroups * jcp.ic * jcp.src_dsz; + const int tr_src_stride = jcp.tr_iw * jcp.ic_block * jcp.src_dsz; int sp_work = row_count; - const src_data_t *src = src_base; + const char *src = src_base; const int tr_ic_tail_work = jcp.tr_ic_tail ? 
jcp.tr_ic_tail : jcp.tr_ic_block; for (int iwork = 0; iwork < sp_work; iwork++) { @@ -514,16 +525,16 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { } } - void trans_dst_nxc(diff_dst_data_t *tr_diff_dst, - const diff_dst_data_t *diff_dst_base, int spatial_start, - dim_t spatial_start_offset, int ocb_start, dim_t chb_stride, - int row_count) const { - const int diff_dst_stride = jcp.ow * jcp.ngroups * jcp.oc; - const int tr_diff_dst_stride = jcp.tr_ow * jcp.oc_block; + void trans_dst_nxc(char *tr_diff_dst, const char *diff_dst_base, + int spatial_start, dim_t spatial_start_offset, int ocb_start, + dim_t chb_stride, int row_count) const { + const int diff_dst_stride = jcp.ow * jcp.ngroups * jcp.oc * jcp.dst_dsz; + const int tr_diff_dst_stride = jcp.tr_ow * jcp.oc_block * jcp.dst_dsz; int work_rest = row_count; int max_spatial_work = jcp.od * jcp.oh; int sp_work = nstl::min(work_rest, max_spatial_work - spatial_start); - const src_data_t *diff_dst = diff_dst_base + spatial_start_offset; + const char *diff_dst + = diff_dst_base + spatial_start_offset * jcp.dst_dsz; int ocb = 0; const int oc_tail_work = jcp.oc_tail ? 
jcp.oc_tail : jcp.oc_block; while (work_rest > 0) { @@ -596,15 +607,17 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { const int ic_off_idx = g_ * jcp.ic + tr_ic_b_ * jcp.tr_ic_block; - const src_data_t *p_src {nullptr}; + const char *p_src {nullptr}; if (jcp.harness == harness_2d_reduction) { - p_src = &src[src_d.blk_off(img, ic_off_idx, jh_s)]; + p_src = &src[src_d.blk_off(img, ic_off_idx, jh_s) + * jcp.src_dsz]; } else if (jcp.harness == harness_3d_reduction) { - p_src = &src[src_d.blk_off(img, ic_off_idx, jd_s, jh_s)]; + p_src = &src[src_d.blk_off(img, ic_off_idx, jd_s, jh_s) + * jcp.src_dsz]; } else assert(!"Invalid harness type"); - src_data_t *p_tr_src = &tr_src[tr_ic_block_src_off( + char *p_tr_src = &tr_src[tr_ic_block_src_off( g_, tr_ic_b_, jd_s - id_s, jh_s - ih_s)]; trans_src_nxc(p_tr_src, p_src, tr_ic_b_, jh_e - jh_s, jh_s); @@ -642,17 +655,19 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { int jh_e = jh_s + nstl::min(tr_end - tr_start, ohb_e - jh_s); const int oc_off_idx = g_ * jcp.oc + oc_b_ * jcp.oc_block; - const diff_dst_data_t *p_diff_dst {nullptr}; + const char *p_diff_dst {nullptr}; if (jcp.harness == harness_2d_reduction) { p_diff_dst = &diff_dst[diff_dst_d.blk_off( - img, oc_off_idx, jh_s)]; + img, oc_off_idx, jh_s) + * jcp.dst_dsz]; } else if (jcp.harness == harness_3d_reduction) { p_diff_dst = &diff_dst[diff_dst_d.blk_off( - img, oc_off_idx, jd_s, jh_s)]; + img, oc_off_idx, jd_s, jh_s) + * jcp.dst_dsz]; } else assert(!"Invalid harness type"); - diff_dst_data_t *p_tr_diff_dst = &tr_diff_dst[tr_diff_dst_off( + char *p_tr_diff_dst = &tr_diff_dst[tr_diff_dst_off( g_, oc_b_, jd_s - od_s, jh_s - oh_s)]; trans_dst_nxc( p_tr_diff_dst, p_diff_dst, 0, 0, oc_b_, 0, jh_e - jh_s); @@ -665,7 +680,7 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { barrier(&tr_diff_dst_bctx[ithr_but_ic], jcp.nthr_ic_b); } - void maybe_local_traspose(void *&p_src, void *&p_dst, int img, int g, + void maybe_local_traspose(char *&p_src, char 
*&p_dst, int img, int g, int ic_b, int oc_b, int od_s, int odb_s, int odb_e, int oh_s, int ohb_s, int ohb_e) const { @@ -695,16 +710,18 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { for_(int idb = idb_s; idb < idb_e; idb++) for (int icb = 0; icb < nb_ic_blocks; icb++) { const int ic_off_idx = g * jcp.ic + (ic_b + icb) * jcp.ic_block; - src_data_t *p_tr_src + char *p_tr_src = &tr_src[tr_src_off(0, 0, idb - id_s, ihb_s - ih_s)]; - src_data_t *tr_src_local = p_tr_src + icb * jcp.tr_src_block_size; - const src_data_t *p_raw_src {nullptr}; + char *tr_src_local + = p_tr_src + icb * jcp.tr_src_block_size * jcp.src_dsz; + const char *p_raw_src {nullptr}; if (jcp.harness == harness_2d_reduction) { - p_raw_src = (src_data_t - *)&src[src_d.blk_off(img, ic_off_idx, ihb_s)]; + p_raw_src = (char *)&src[src_d.blk_off(img, ic_off_idx, ihb_s) + * jcp.src_dsz]; } else if (jcp.harness == harness_3d_reduction) { - p_raw_src = (src_data_t *)&src[src_d.blk_off( - img, ic_off_idx, idb, ihb_s)]; + p_raw_src = (char *)&src[src_d.blk_off( + img, ic_off_idx, idb, ihb_s) + * jcp.src_dsz]; } else assert(!"Invalid harness type"); trans_src_nxc(tr_src_local, p_raw_src, @@ -717,19 +734,21 @@ struct brgemm_convolution_bwd_weights_t::thread_info_t { for_(int odb = odb_s; odb < odb_e; odb++) for (int ocb = 0; ocb < nb_oc_blocks; ocb++) { const int oc_off_idx = g * jcp.oc + (oc_b + ocb) * jcp.oc_block; - const diff_dst_data_t *p_raw_diff_dst {nullptr}; + const char *p_raw_diff_dst {nullptr}; if (jcp.harness == harness_2d_reduction) { p_raw_diff_dst - = &diff_dst[diff_dst_d.blk_off(img, oc_off_idx, ohb_s)]; + = &diff_dst[diff_dst_d.blk_off(img, oc_off_idx, ohb_s) + * jcp.dst_dsz]; } else if (jcp.harness == harness_3d_reduction) { p_raw_diff_dst = &diff_dst[diff_dst_d.blk_off( - img, oc_off_idx, odb, ohb_s)]; + img, oc_off_idx, odb, ohb_s) + * jcp.dst_dsz]; } else assert(!"Invalid harness type"); - diff_dst_data_t *p_tr_diff_dst = &tr_diff_dst[tr_diff_dst_off( + char *p_tr_diff_dst = 
&tr_diff_dst[tr_diff_dst_off( 0, 0, odb - od_s, ohb_s - oh_s)]; - diff_dst_data_t *tr_diff_dst_local - = p_tr_diff_dst + ocb * jcp.tr_diff_dst_block_size; + char *tr_diff_dst_local = p_tr_diff_dst + + ocb * jcp.tr_diff_dst_block_size * jcp.dst_dsz; trans_dst_nxc(tr_diff_dst_local, p_raw_diff_dst, 0, 0, (oc_b + ocb), 0, (ohb_e - ohb_s)); } @@ -855,8 +874,8 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_2d( img = img_s; auto do_brgemm_call = [&](int g, int bs, int ic_b, int oc_b, int ohb_s, - int bs_ih_s, const void *p_src, - const void *p_dst, int kh, int kw, + int bs_ih_s, const char *p_src, + const char *p_dst, int kh, int kw, bool do_init) { const int ihb_s = ti->get_ih_start(ohb_s); @@ -866,12 +885,12 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_2d( auto ocb_end = get_end(oc_b, jcp.nb_oc_blocking, ti->oc_b_end); auto icb_end = get_end(ic_b, jcp.nb_ic_blocking, ti->ic_b_end); const int src_stride_w_shift = jcp.tr_iw / jcp.stride_w; - const void *ptr_A = ((src_data_t *)p_src) - + _pd->filter_w_to_src(kw) / jcp.stride_w + const auto a_off = _pd->filter_w_to_src(kw) / jcp.stride_w + (kw % jcp.stride_w) * src_stride_w_shift + (bs_ih_s - ihb_s) * jcp.tr_iw * jcp.ic_block; - const void *ptr_B = ((diff_dst_data_t *)p_dst) - + (bs_oh_s - ohb_s) * jcp.tr_ow * jcp.oc_block; + const char *ptr_A = p_src + a_off * jcp.src_dsz; + const auto b_off = (bs_oh_s - ohb_s) * jcp.tr_ow * jcp.oc_block; + const char *ptr_B = p_dst + b_off * jcp.dst_dsz; void *ptr_C = (jcp.transform_to_vnni) ? 
diff_wei + wei_offset_int(g, oc_b, ic_b, 0, kh, kw) @@ -888,10 +907,10 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_2d( for (int ohb = 0; ohb < bs; ohb++) { ti->brg_batch[ohb].ptr.A = (char *)ptr_A - + ohb * jcp.typesize_in * jcp.tr_iw * jcp.ic_block - * jcp.stride_h; + + ohb * jcp.tr_iw * jcp.ic_block * jcp.stride_h + * jcp.src_dsz; ti->brg_batch[ohb].ptr.B = (char *)ptr_B - + ohb * jcp.typesize_in * jcp.tr_ow * jcp.oc_block; + + ohb * jcp.tr_ow * jcp.oc_block * jcp.dst_dsz; } call_brgemm_kernel(*ti, brg_idx, bs, ptr_C); @@ -937,8 +956,8 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_2d( ti->maybe_global_transpose(img, 0, 0, ic_b, ic_b_e, 0, 0, 1, oh_s, ohb_s, ohb_e); - void *p_src {nullptr}; - void *p_dst {nullptr}; + char *p_src {nullptr}; + char *p_dst {nullptr}; ti->maybe_local_traspose(p_src, p_dst, img, g, ic_b, oc_b, 0, 0, 1, oh_s, ohb_s, ohb_e); @@ -1029,7 +1048,7 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( auto do_brgemm_call = [&](int g, int bs_d, int bs_h, int ic_b, int oc_b, int od_s, int oh_s, int bs_id_s, int bs_ih_s, - const void *p_src, const void *p_dst, int kd, + const char *p_src, const char *p_dst, int kd, int kh, int kw, bool do_init) { const int id_s = ti->get_id_start(od_s); const int ih_s = ti->get_ih_start(oh_s); @@ -1043,14 +1062,14 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( auto ocb_end = get_end(oc_b, jcp.nb_oc_blocking, ti->oc_b_end); auto icb_end = get_end(ic_b, jcp.nb_ic_blocking, ti->ic_b_end); const int src_stride_w_shift = jcp.tr_iw / jcp.stride_w; - const void *ptr_A = ((src_data_t *)p_src) - + _pd->filter_w_to_src(kw) / jcp.stride_w + const auto a_off = _pd->filter_w_to_src(kw) / jcp.stride_w + (kw % jcp.stride_w) * src_stride_w_shift + (bs_ih_s - ih_s) * jcp.tr_iw * jcp.ic_block + (bs_id_s - id_s) * jcp.ih_block * jcp.tr_iw * jcp.ic_block; - const void *ptr_B = ((diff_dst_data_t *)p_dst) - + (bs_oh_s - oh_s) * jcp.tr_ow * jcp.oc_block + const char 
*ptr_A = p_src + a_off * jcp.src_dsz; + const auto b_off = (bs_oh_s - oh_s) * jcp.tr_ow * jcp.oc_block + (bs_od_s - od_s) * jcp.oh_block * jcp.tr_ow * jcp.oc_block; + const char *ptr_B = p_dst + b_off * jcp.dst_dsz; void *ptr_C = (jcp.transform_to_vnni) ? diff_wei + wei_offset_int(g, oc_b, ic_b, kd, kh, kw) : diff_wei @@ -1065,15 +1084,15 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( for (int odb = 0; odb < bs_d; odb++) { for (int ohb = 0; ohb < bs_h; ohb++) { - ti->brg_batch[odb * bs_h + ohb].ptr.A = (char *)ptr_A - + ohb * jcp.typesize_in * jcp.tr_iw * jcp.ic_block - * jcp.stride_h - + odb * jcp.typesize_in * jcp.ih_block * jcp.tr_iw - * jcp.ic_block * jcp.stride_d; - ti->brg_batch[odb * bs_h + ohb].ptr.B = (char *)ptr_B - + ohb * jcp.typesize_in * jcp.tr_ow * jcp.oc_block - + odb * jcp.typesize_in * jcp.oh_block * jcp.tr_ow - * jcp.oc_block; + const auto a_off_batch = (odb * jcp.ih_block * jcp.stride_d + + ohb * jcp.stride_h) + * jcp.tr_iw * jcp.ic_block * jcp.src_dsz; + ti->brg_batch[odb * bs_h + ohb].ptr.A + = (char *)ptr_A + a_off_batch; + const auto b_off_batch = (odb * jcp.oh_block + ohb) * jcp.tr_ow + * jcp.oc_block * jcp.dst_dsz; + ti->brg_batch[odb * bs_h + ohb].ptr.B + = (char *)ptr_B + b_off_batch; } } @@ -1125,8 +1144,8 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( ti->maybe_global_transpose(img, 0, 0, ic_b, ic_b_e, od_s, odb_s, odb_e, oh_s, ohb_s, ohb_e); - void *p_src {nullptr}; - void *p_dst {nullptr}; + char *p_src {nullptr}; + char *p_dst {nullptr}; ti->maybe_local_traspose(p_src, p_dst, img, g, ic_b, oc_b, od_s, odb_s, odb_e, oh_s, ohb_s, ohb_e); @@ -1149,11 +1168,11 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( bp.channel = (start == ti->img_start) && (odb_s == od_s) && (iodb == odb_s) && (ohb_s == oh_s); - bp.dst = ((diff_dst_data_t *)p_dst) - + (iodb - od_s) * jcp.oh_block - * jcp.tr_ow * jcp.oc_block - + (ohb_s - oh_s) * jcp.tr_ow - * jcp.oc_block; + const auto dst_idx + = ((iodb 
- od_s) * jcp.oh_block + + (ohb_s - oh_s)) + * jcp.tr_ow * jcp.oc_block; + bp.dst = p_dst + dst_idx * jcp.dst_dsz; (*diff_bias_kernel_)(&bp); } } @@ -1205,8 +1224,11 @@ void brgemm_convolution_bwd_weights_t::compute_diff_weights_3d( void brgemm_convolution_bwd_weights_t::store_in_vnni_format( thread_info_t *ti) const { const auto &jcp = pd()->jcp_; + if (one_of(0, ti->g_work, ti->oc_b_work, ti->ic_b_work)) return; + + const int vnni_granularity = data_type_vnni_granularity(jcp.wei_dt); - const auto icb2_work = div_up(ti->ic_b_work, 2); + const auto icb2_work = div_up(ti->ic_b_work, vnni_granularity); const auto work = ti->g_work * ti->oc_b_work * icb2_work; int start {0}, end {0}; @@ -1217,16 +1239,16 @@ void brgemm_convolution_bwd_weights_t::store_in_vnni_format( for (int w = start; w < end; w++) { const int g = ti->g_start + sub_g_start; const int oc_b = ti->oc_b_start + sub_oc_b_start; - const int ic_b = ti->ic_b_start + 2 * sub_icb2_start; + const int ic_b = ti->ic_b_start + vnni_granularity * sub_icb2_start; jit_conv_call_s p = jit_conv_call_s(); - bfloat16_t *output = (bfloat16_t *)ti->diff_weights - + wei_offset_ext(g, oc_b, (ic_b / 2), 0); float *input = ti->wei_bia_reduction + wei_offset_int(g, oc_b, ic_b, 0); + char *output = (char *)ti->diff_weights + + wei_offset_ext(g, oc_b, ic_b) * jcp.wei_dsz; p.src = (void *)input; p.dst = (void *)output; - p.last_ic_block = ((ic_b + 1) >= jcp.nb_ic) ? 1 : 0; + p.last_ic_block = ((ic_b + vnni_granularity) > jcp.nb_ic) ? 
1 : 0; (*diff_wei_trans_kernel_)(&p); nd_iterator_step(sub_g_start, ti->g_work, sub_oc_b_start, ti->oc_b_work, sub_icb2_start, icb2_work); @@ -1261,8 +1283,7 @@ void brgemm_convolution_bwd_weights_t::reduce_and_convert_diff_weights_and_bias( const size_t off = wht_blk_off( diff_weights_d, g, oc_b, ti->ic_b_start); types::cvt_from_float(wei_dt, - (void *)((char *)ti->diff_weights - + off * types::data_type_size(wei_dt)), + (void *)(ti->diff_weights + off * jcp.wei_dsz), (ti->wei_bia_reduction + off), acc_size); } } @@ -1278,7 +1299,7 @@ void brgemm_convolution_bwd_weights_t::reduce_and_convert_diff_weights_and_bias( = nstl::min(jcp.oc, ti->oc_b_end * jcp.oc_block) - ti->oc_b_start * jcp.oc_block; void *diff_bias = (char *)ti->diff_bias - + result_start_idx * types::data_type_size(bia_dt); + + result_start_idx * jcp.bia_dsz; float *buffer = ti->bia_reduction + buffer_start_idx; types::cvt_from_float( bia_dt, diff_bias, (const float *)buffer, acc_size); @@ -1351,6 +1372,14 @@ void brgemm_convolution_bwd_weights_t::reduce_and_convert_diff_weights_and_bias( add_floats_and_cvt_to_float16( (float16_t *)(ti->diff_weights) + off_ext, wei_reduced, wei_to_reduce, acc_size); + else if (wei_dt == f8_e5m2) + add_floats_and_cvt_to_f8_e5m2( + (float8_e5m2_t *)(ti->diff_weights) + off_ext, + wei_reduced, wei_to_reduce, acc_size); + else if (wei_dt == f8_e4m3) + add_floats_and_cvt_to_f8_e4m3( + (float8_e4m3_t *)(ti->diff_weights) + off_ext, + wei_reduced, wei_to_reduce, acc_size); } else acc_ker_->accumulate(wei_reduced, wei_to_reduce, acc_size); @@ -1386,6 +1415,18 @@ void brgemm_convolution_bwd_weights_t::reduce_and_convert_diff_weights_and_bias( (float16_t *)(ti->diff_bias) + diff_bias_idx, &bias_reduced[idx], &bias_to_reduce[idx], acc_size); + else if (bia_dt == f8_e5m2) + add_floats_and_cvt_to_f8_e5m2( + (float8_e5m2_t *)(ti->diff_bias) + + diff_bias_idx, + &bias_reduced[idx], &bias_to_reduce[idx], + acc_size); + else if (bia_dt == f8_e4m3) + add_floats_and_cvt_to_f8_e4m3( + 
(float8_e4m3_t *)(ti->diff_bias) + + diff_bias_idx, + &bias_reduced[idx], &bias_to_reduce[idx], + acc_size); } else { acc_ker_->accumulate( &bias_reduced[idx], &bias_to_reduce[idx], acc_size); @@ -1406,7 +1447,7 @@ void brgemm_convolution_bwd_weights_t::prepare_scratchpad_data( const auto &jcp = pd()->jcp_; - auto tr_src = scratchpad.template get(key_conv_tr_src); + auto tr_src = scratchpad.template get(key_conv_tr_src); const auto bytes_to_zero = jcp.src_dsz * jcp.tr_src_num_guard_elems; if (jcp.oh_block < jcp.oh || jcp.id > 1) { // if (oh_block < oh) or (id > 1) then we zero end of each row because @@ -1414,19 +1455,21 @@ void brgemm_convolution_bwd_weights_t::prepare_scratchpad_data( // oh_block, padding and kh parallel_nd(jcp.tr_src_buf_count, jcp.ih_block * jcp.id, [&](size_t isb, size_t is) { - src_data_t *ts = &tr_src[isb * jcp.tr_src_buf_size - + (is + 1) * jcp.tr_iw * jcp.ic_block]; + const auto tr_src_idx = isb * jcp.tr_src_buf_size + + (is + 1) * jcp.tr_iw * jcp.ic_block; + char *ts = &tr_src[tr_src_idx * jcp.src_dsz]; std::memset(ts, 0, bytes_to_zero); }); // Zero out last guard elements - src_data_t *ts = &tr_src[jcp.tr_src_buf_count * jcp.tr_src_buf_size]; + char *ts = &tr_src[jcp.tr_src_buf_count * jcp.tr_src_buf_size + * jcp.src_dsz]; std::memset(ts, 0, bytes_to_zero); } else { // Zero out guard elements that cross a buffer boundary to prevent a // race condition due to buffer overflows from memory optimization where // buffers sharing padding parallel_nd(jcp.tr_src_buf_count, [&](size_t isb) { - src_data_t *ts = &tr_src[(isb + 1) * jcp.tr_src_buf_size]; + char *ts = &tr_src[(isb + 1) * jcp.tr_src_buf_size * jcp.src_dsz]; std::memset(ts, 0, bytes_to_zero); }); } diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_w.hpp b/src/cpu/x64/jit_brgemm_conv_bwd_w.hpp index 2c96c77d0e9..74234712512 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_w.hpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_w.hpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -128,9 +128,6 @@ struct brgemm_convolution_bwd_weights_t : public primitive_t { brgemm_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type diff_dst_data_t; - status_t init(engine_t *engine) override; status_t execute(const exec_ctx_t &ctx) const override { @@ -163,37 +160,40 @@ struct brgemm_convolution_bwd_weights_t : public primitive_t { status_t add_brg_kernel(int bs, int M, int i_N, int i_K, int i_init); void call_brgemm_kernel( thread_info_t &btc, int brg_idx, int batch_size, void *ptr_C) const; + inline dim_t wei_offset_int( int g, int oc_b, int ic_b, int kd, int kh, int kw) const { const auto &jcp = pd()->jcp_; - const dim_t const_extra_offset = jcp.ic_block * jcp.oc_block; - dim_t extra_offset - = ((kd * jcp.kh + kh) * jcp.kw + kw) * const_extra_offset; + const dim_t kw_offset = jcp.ic_block * jcp.oc_block; + dim_t extra_offset = ((kd * jcp.kh + kh) * jcp.kw + kw) * kw_offset; return (dim_t)((g * jcp.nb_oc + oc_b) * jcp.nb_ic + ic_b) * jcp.kd - * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block + * jcp.kh * jcp.kw * kw_offset + extra_offset; } inline dim_t wei_offset_int(int g, int oc_b, int ic_b, int kX) const { const auto &jcp = pd()->jcp_; - const dim_t const_extra_offset = jcp.kw * jcp.ic_block * jcp.oc_block; - dim_t extra_offset = (jcp.ndims == 5) ? kX * jcp.kh * const_extra_offset - : kX * const_extra_offset; - return (dim_t)((g * jcp.nb_oc + oc_b) * jcp.nb_ic + ic_b) * jcp.kd - * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block - + extra_offset; + const dim_t kh_offset = jcp.kw * jcp.ic_block * jcp.oc_block; + dim_t extra_offset = (jcp.ndims == 5) ? 
kX * jcp.kh : kX; + const auto res = ((dim_t)((g * jcp.nb_oc + oc_b) * jcp.nb_ic + ic_b) + * jcp.kd * jcp.kh + + extra_offset) + * kh_offset; + return res; } - inline dim_t wei_offset_ext(int g, int oc_b, int ic_b, int kX) const { + inline dim_t wei_offset_ext(int g, int oc_b, int ic_b) const { const auto &jcp = pd()->jcp_; - const int nb_ic = utils::div_up(jcp.ic, 2 * jcp.ic_block); - const dim_t const_extra_offset - = jcp.kw * jcp.ic_block * jcp.oc_block * 2; - dim_t extra_offset = (jcp.ndims == 5) ? kX * jcp.kh * const_extra_offset - : kX * const_extra_offset; - return (dim_t)((g * jcp.nb_oc + oc_b) * nb_ic + ic_b) * jcp.kd * jcp.kh - * jcp.kw * jcp.ic_block * jcp.oc_block * 2 - + extra_offset; + const int vnni_granularity = data_type_vnni_granularity(jcp.wei_dt); + + const int vnni_ic_b = ic_b / vnni_granularity; + const int vnni_ic_block = vnni_granularity * jcp.ic_block; + const int vnni_nb_ic = utils::div_up(jcp.ic, vnni_ic_block); + const dim_t kh_offset = jcp.kw * jcp.oc_block * vnni_ic_block; + const auto res + = (dim_t)((g * jcp.nb_oc + oc_b) * vnni_nb_ic + vnni_ic_b) + * jcp.kd * jcp.kh * kh_offset; + return res; } inline int get_end(int start, int step, int limit) const { From 49effc7609365726aac277adef4e9d009141c9f5 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 15 May 2024 13:41:55 -0700 Subject: [PATCH 123/187] cpu: conv list: updates for fp8 brgemm implementations --- src/cpu/cpu_convolution_list.cpp | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 5f44cbe2d6e..dcdddedfac8 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -101,6 +101,16 @@ using namespace dnnl::impl::prop_kind; }) \ } +#define BRGEMM_FP8_BWD_W_CONVS(dtsrc, dtwei, dtdst) \ + { \ + {backward_weights, dtsrc, dtwei, dtdst}, \ + REG_BWD_PK({ \ + CPU_INSTANCE_AMX(brgemm_convolution_bwd_weights_t) \ + CPU_INSTANCE( \ + 
ref_convolution_bwd_weights_t) nullptr, \ + }) \ + } + // clang-format off const std::map> &impl_list_map() { static const std::map> the_map = REG_CONV_P({ @@ -299,10 +309,18 @@ const std::map> &impl_list_map() BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e5m2, f8_e4m3), BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e4m3, f8_e5m2), BRGEMM_FP8_BWD_D_CONVS(f8_e5m2, f8_e4m3, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f32, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f32, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f16, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f16, f8_e4m3, f8_e4m3), BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e5m2, f8_e5m2), BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e5m2, f8_e4m3), BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e4m3, f8_e5m2), BRGEMM_FP8_BWD_D_CONVS(f8_e4m3, f8_e4m3, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f32, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f32, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_D_CONVS(f16, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_D_CONVS(f16, f8_e4m3, f8_e4m3), // BWD_W fp {{backward_weights, f32, f32, f32}, REG_BWD_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) @@ -354,6 +372,22 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_convolution_bwd_weights_t) nullptr, })}, + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f8_e4m3, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f32, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f32, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f16, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e5m2, f16, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f8_e5m2, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f8_e5m2, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f8_e4m3, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f8_e4m3, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f32, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f32, f8_e4m3), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f16, f8_e5m2), + BRGEMM_FP8_BWD_W_CONVS(f8_e4m3, f16, f8_e4m3), // 
FWD int8 (src:s8) {{forward, s8, s8, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) From aa81371478fd2a65abbbfc1e78db8db7a501b423 Mon Sep 17 00:00:00 2001 From: Andrey Kalinin Date: Wed, 15 May 2024 15:05:04 -0700 Subject: [PATCH 124/187] benchdnn: fp8 testing for brgemm, conv, deconv --- tests/benchdnn/deconv/cfg.cpp | 10 +- tests/benchdnn/dnnl_common.cpp | 2 +- tests/benchdnn/inputs/brgemm/test_brgemm_all | 2 - tests/benchdnn/inputs/brgemm/test_brgemm_ci | 4 + tests/benchdnn/inputs/brgemm/test_brgemm_fp8 | 19 ++++ .../inputs/conv/harness_conv_dw_fp8_nxc | 53 +++++++++ tests/benchdnn/inputs/conv/test_conv_dt_nxc | 3 + tests/benchdnn/inputs/conv/test_conv_fp8_nxc | 105 ++++++++++++++++++ .../inputs/deconv/test_deconv_fp8_nxc | 29 +++++ 9 files changed, 223 insertions(+), 4 deletions(-) create mode 100644 tests/benchdnn/inputs/brgemm/test_brgemm_fp8 create mode 100644 tests/benchdnn/inputs/conv/harness_conv_dw_fp8_nxc create mode 100644 tests/benchdnn/inputs/conv/test_conv_fp8_nxc create mode 100644 tests/benchdnn/inputs/deconv/test_deconv_fp8_nxc diff --git a/tests/benchdnn/deconv/cfg.cpp b/tests/benchdnn/deconv/cfg.cpp index 118c0ec5e26..919a5081c9d 100644 --- a/tests/benchdnn/deconv/cfg.cpp +++ b/tests/benchdnn/deconv/cfg.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2023 Intel Corporation +* Copyright 2017-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,6 +89,8 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { {{dnnl_f32}, {-32, 32}}, {{dnnl_bf16}, {-4, 4}}, {{dnnl_f16}, {-4, 4}}, + {{dnnl_f8_e5m2}, {-4, 4}}, + {{dnnl_f8_e4m3}, {-4, 4}}, {{dnnl_s8}, {-4, 4}}, {{dnnl_u8}, {0, 8}}, }; @@ -98,6 +100,8 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { {{dnnl_f32}, {-32, 32}}, {{dnnl_bf16}, {-8, 8}}, {{dnnl_f16}, {-2, 2}}, + {{dnnl_f8_e5m2}, {-2, 2}}, + {{dnnl_f8_e4m3}, {-2, 2}}, {{dnnl_s8}, {-4, 4}}, }; @@ -106,6 +110,8 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { {{dnnl_f32}, {-8, 8}}, {{dnnl_bf16}, {-8, 8}}, {{dnnl_f16}, {-8, 8}}, + {{dnnl_f8_e5m2}, {-8, 8}}, + {{dnnl_f8_e4m3}, {-8, 8}}, {{dnnl_s8}, {-8, 8}}, {{dnnl_u8}, {0, 8}}, {{dnnl_s32}, {-8, 8}}, @@ -116,6 +122,8 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { {{dnnl_f32}, {-8, 8}}, {{dnnl_bf16}, {-4, 4}}, {{dnnl_f16}, {-4, 4}}, + {{dnnl_f8_e5m2}, {-4, 4}}, + {{dnnl_f8_e4m3}, {-4, 4}}, {{dnnl_s8}, {-4, 4}}, {{dnnl_u8}, {0, 160}}, {{dnnl_s32}, {-128, 128}}, diff --git a/tests/benchdnn/dnnl_common.cpp b/tests/benchdnn/dnnl_common.cpp index d8fba8af444..2ab0cafcd33 100644 --- a/tests/benchdnn/dnnl_common.cpp +++ b/tests/benchdnn/dnnl_common.cpp @@ -640,7 +640,7 @@ void skip_unimplemented_data_type( && (dir & FLAG_INF)); const bool has_f8_e4m3_support = is_gpu() || (is_cpu() && has_data_type_support(dnnl_f8_e4m3) - && (dir & FLAG_FWD)); + && (dir & FLAG_INF)); #else const bool has_bf16_support = is_gpu(); // f16 is supported on GPU for inference only. 
diff --git a/tests/benchdnn/inputs/brgemm/test_brgemm_all b/tests/benchdnn/inputs/brgemm/test_brgemm_all index 19b01b4074a..ce4f7782d4f 100644 --- a/tests/benchdnn/inputs/brgemm/test_brgemm_all +++ b/tests/benchdnn/inputs/brgemm/test_brgemm_all @@ -11,5 +11,3 @@ --batch=harness_brgemm_skip_acc --batch=harness_brgemm_fpmath - ---batch=harness_brgemm_fp8 diff --git a/tests/benchdnn/inputs/brgemm/test_brgemm_ci b/tests/benchdnn/inputs/brgemm/test_brgemm_ci index feeda04e8b0..3b74e652136 100644 --- a/tests/benchdnn/inputs/brgemm/test_brgemm_ci +++ b/tests/benchdnn/inputs/brgemm/test_brgemm_ci @@ -27,6 +27,10 @@ --batch=shapes_2d_no_tail_bf16 --beta=0,1 +--dt=f8_e5m2,f8_e5m2:f8_e5m2:f32 +--bia_dt=undef,f32,f8_e5m2 +--batch=shapes_2d_no_tail_int8 + --attr-post-ops= --dt=u8:s8:f32,s8:s8:bf16,u8:s8:u8,s8:s8:s8 --brgemm-attr=,use_uker:1 diff --git a/tests/benchdnn/inputs/brgemm/test_brgemm_fp8 b/tests/benchdnn/inputs/brgemm/test_brgemm_fp8 new file mode 100644 index 00000000000..44f46d2606d --- /dev/null +++ b/tests/benchdnn/inputs/brgemm/test_brgemm_fp8 @@ -0,0 +1,19 @@ +--reset + +--dt=f8_e5m2,f8_e5m2:f8_e5m2:f32 +--bia_dt=undef,f32,f8_e5m2 +--alpha=1,2 +--beta=0 +--attr-post-ops=,sum:2,relu +--brgemm-attr=,use_uker:1 +--batch=option_set_int8 + +--dt=f8_e4m3,f8_e4m3:f8_e4m3:f32 +--bia_dt=undef,f32,f8_e4m3 +--alpha=1,2 +--beta=0 +--attr-post-ops=,sum:2,relu +--brgemm-attr=,use_uker:1 +--batch=option_set_int8 + +--batch=harness_brgemm_fp8 diff --git a/tests/benchdnn/inputs/conv/harness_conv_dw_fp8_nxc b/tests/benchdnn/inputs/conv/harness_conv_dw_fp8_nxc new file mode 100644 index 00000000000..9f61867b596 --- /dev/null +++ b/tests/benchdnn/inputs/conv/harness_conv_dw_fp8_nxc @@ -0,0 +1,53 @@ +#f8_e5m2 +--reset +--skip-impl=ref +--mb=2 +--stag=axb --dtag=axb +--dir=FWD_B +--dt=f8_e5m2:f8_e5m2:f32,f8_e5m2 +--batch=shapes_mobilenet_dw --batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--dir=BWD_D +--dt=f32:f8_e5m2:f8_e5m2,f8_e5m2 +--batch=shapes_mobilenet_dw 
--batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--dir=BWD_WB +--dt=f8_e5m2:f32:f8_e5m2,f8_e5m2 +--batch=shapes_mobilenet_dw --batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--reset +--mb=2 +--skip-impl=ref +--stag=axb --dtag=axb +--dir=FWD_D +--dt=f8_e5m2:f8_e5m2:f32,f8_e5m2 +--attr-post-ops=relu,sum,sum+relu,prelu,add:f8_e5m2:per_oc +--batch=shapes_mobilenet_dw +--batch=shapes_regression_dw + +#f8_e4m3 +--reset +--skip-impl=ref +--mb=2 +--stag=axb --dtag=axb +--dir=FWD_B +--dt=f8_e4m3:f8_e4m3:f32,f8_e4m3 +--batch=shapes_mobilenet_dw --batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--dir=BWD_D +--dt=f32:f8_e4m3:f8_e4m3,f8_e4m3 +--batch=shapes_mobilenet_dw --batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--dir=BWD_WB +--dt=f8_e4m3:f32:f8_e4m3,f8_e4m3 +--batch=shapes_mobilenet_dw --batch=shapes_ssd_mobilenet --batch=shapes_regression_dw + +--reset +--mb=2 +--skip-impl=ref +--stag=axb --dtag=axb +--dir=FWD_D +--dt=f8_e4m3:f8_e4m3:f32,f8_e4m3 +--attr-post-ops=relu,sum,sum+relu,prelu,add:f8_e4m3:per_oc +--batch=shapes_mobilenet_dw +--batch=shapes_regression_dw diff --git a/tests/benchdnn/inputs/conv/test_conv_dt_nxc b/tests/benchdnn/inputs/conv/test_conv_dt_nxc index 9794f7cb7ee..bd746632ccc 100644 --- a/tests/benchdnn/inputs/conv/test_conv_dt_nxc +++ b/tests/benchdnn/inputs/conv/test_conv_dt_nxc @@ -14,3 +14,6 @@ # f16 --batch=test_conv_float16_nxc + +# fp8 +--batch=test_conv_fp8_nxc diff --git a/tests/benchdnn/inputs/conv/test_conv_fp8_nxc b/tests/benchdnn/inputs/conv/test_conv_fp8_nxc new file mode 100644 index 00000000000..e68f38dc172 --- /dev/null +++ b/tests/benchdnn/inputs/conv/test_conv_fp8_nxc @@ -0,0 +1,105 @@ +# f8_e5m2 +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref +--dir=FWD_B +--dt=f8_e5m2:f8_e5m2:f32 --batch=shapes_resnet_50 +--dt=f8_e5m2:f8_e5m2:f8_e5m2 --batch=set_conv_all + +--dir=FWD_D +--dt=f8_e5m2:f8_e5m2:f8_e5m2 --batch=shapes_resnet_50 + +--dir=BWD_D +--dt=f32:f8_e5m2:f8_e5m2 
--batch=shapes_resnet_50 +--dt=f8_e5m2:f8_e5m2:f8_e5m2 --batch=set_conv_all + +--dir=BWD_WB +--dt=f8_e5m2:f32:f8_e5m2 --batch=set_conv_all --batch=set_dilated-conv +--dt=f8_e5m2:f8_e5m2:f8_e5m2 --batch=shapes_resnet_50 --batch=set_dilated-conv + +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref,x64:gemm + +--dir=FWD_D +--dt=f8_e5m2:f8_e5m2:f8_e5m2 +--batch=shapes_3d_2d_strided_padding --batch=shapes_dilated_3d_strided_padding + +--dir=BWD_D +--dt=f32:f8_e5m2:f8_e5m2 +--batch=shapes_3d_2d_strided_padding + +--dir=BWD_WB +--dt=f8_e5m2:f32:f8_e5m2 +--batch=set_conv_3d --batch=shapes_dilated_3d_unit-stride_no-padding + +# Attributes +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref,x64:gemm +--dir=FWD_B +--dt=f8_e5m2:f8_e5m2:f8_e5m2 +## PostOps +--attr-post-ops=sum --batch=shapes_resnet_50 +--attr-post-ops=relu --batch=shapes_googlenet_v3 +--attr-post-ops=add:f8_e5m2:per_oc --batch=shapes_tails +--attr-post-ops=add:f8_e5m2:per_tensor --batch=shapes_tails +--attr-post-ops=mul:f32+sum+tanh:1:1:2.5 --batch=shapes_tails +--attr-post-ops=mul:f32+sum+tanh:1:1:2.5+prelu --batch=shapes_tails + +# f8_e4m3 +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref +--dir=FWD_B +--dt=f8_e4m3:f8_e4m3:f32 --batch=shapes_resnet_50 +--dt=f8_e4m3:f8_e4m3:f8_e4m3 --batch=set_conv_all + +--dir=FWD_D +--dt=f8_e4m3:f8_e4m3:f8_e4m3 --batch=shapes_resnet_50 + +--dir=BWD_D +--dt=f32:f8_e4m3:f8_e4m3 --batch=shapes_resnet_50 +--dt=f8_e4m3:f8_e4m3:f8_e4m3 --batch=set_conv_all + +--dir=BWD_WB +--dt=f8_e4m3:f32:f8_e4m3 --batch=set_conv_all --batch=set_dilated-conv +--dt=f8_e4m3:f8_e4m3:f8_e4m3 --batch=shapes_resnet_50 --batch=set_dilated-conv + +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref,x64:gemm + +--dir=FWD_D +--dt=f8_e4m3:f8_e4m3:f8_e4m3 +--batch=shapes_3d_2d_strided_padding --batch=shapes_dilated_3d_strided_padding + +--dir=BWD_D +--dt=f32:f8_e4m3:f8_e4m3 +--batch=shapes_3d_2d_strided_padding + +--dir=BWD_WB +--dt=f8_e4m3:f32:f8_e4m3 +--batch=set_conv_3d 
--batch=shapes_dilated_3d_unit-stride_no-padding + +# Attributes +--reset +--mb=2 +--stag=axb --dtag=axb +--skip-impl=ref,x64:gemm +--dir=FWD_B +--dt=f8_e4m3:f8_e4m3:f8_e4m3 +## PostOps +--attr-post-ops=sum --batch=shapes_resnet_50 +--attr-post-ops=relu --batch=shapes_googlenet_v3 +--attr-post-ops=add:f8_e4m3:per_oc --batch=shapes_tails +--attr-post-ops=add:f8_e4m3:per_tensor --batch=shapes_tails +--attr-post-ops=mul:f32+sum+tanh:1:1:2.5 --batch=shapes_tails +--attr-post-ops=mul:f32+sum+tanh:1:1:2.5+prelu --batch=shapes_tails + +--batch=harness_conv_dw_fp8_nxc diff --git a/tests/benchdnn/inputs/deconv/test_deconv_fp8_nxc b/tests/benchdnn/inputs/deconv/test_deconv_fp8_nxc new file mode 100644 index 00000000000..a278cc81047 --- /dev/null +++ b/tests/benchdnn/inputs/deconv/test_deconv_fp8_nxc @@ -0,0 +1,29 @@ +# fp8 nxc +--reset +--mb=2 +--stag=axb --dtag=axb + +--dt=f8_e5m2,f8_e4m3 +--dir=FWD_B +--attr-post-ops=,sum+prelu:per_oc,linear:2:1,sum:1.5+add:f32:per_oc+relu +--batch=set_all + +--dir=BWD_D,BWD_W,BWD_WB +--attr-post-ops= +--batch=set_all + +--dt=f8_e5m2:f8_e5m2:f32 --dir=FWD_B --batch=set_all +--dt=f32:f8_e5m2:f8_e5m2 --dir=BWD_D --batch=set_all +--dt=f8_e5m2:f32:f8_e5m2 --dir=BWD_WB --batch=set_all + +--dt=f8_e4m3:f8_e4m3:f32 --dir=FWD_B --batch=set_all +--dt=f32:f8_e4m3:f8_e4m3 --dir=BWD_D --batch=set_all +--dt=f8_e4m3:f32:f8_e4m3 --dir=BWD_WB --batch=set_all + +# Test Deconv w/bias through GeMM +--reset +--mb=2 +--stag=axb --dtag=axb + +--dt=f8_e5m2,f8_e4m3 +--dir=FWD_B,BWD_WB g16_ic32ih4iw8_oc64oh3ow8_kh3kw3sh1sw1ph0pw0n"gemm_shape" From 4586a952cfd19cca90bf83773c2e3b1a74524d74 Mon Sep 17 00:00:00 2001 From: Alexey Makarevich Date: Fri, 10 May 2024 06:01:11 -0700 Subject: [PATCH 125/187] cpu: x64: jit_generator: replace implicit capture this with explicit --- src/cpu/x64/jit_generator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpu/x64/jit_generator.cpp b/src/cpu/x64/jit_generator.cpp index 81b2bccc8db..17f88c68389 
100644 --- a/src/cpu/x64/jit_generator.cpp +++ b/src/cpu/x64/jit_generator.cpp @@ -44,7 +44,7 @@ void jit_generator::transpose(const Xbyak::Reg64 &reg_src, if (transpose_size > nrows) uni_vxorps(ymm_tmp, ymm_tmp, ymm_tmp); - auto load_src = [=](Xbyak::Xmm vmm, int r, int c) { + auto load_src = [= WA_THIS_COPY_CAPTURE](Xbyak::Xmm vmm, int r, int c) { const int simd_w = vmm.getBit() / (types::data_type_size(dt) * 8); const auto addr = ptr[reg_src + r * src_stride + c * types::data_type_size(dt)]; @@ -61,7 +61,7 @@ void jit_generator::transpose(const Xbyak::Reg64 &reg_src, } }; - auto vinsert = [=](Xbyak::Ymm ymm, int r, int c) { + auto vinsert = [= WA_THIS_COPY_CAPTURE](Xbyak::Ymm ymm, int r, int c) { const int xmm_simd_w = 4; const auto addr = ptr[reg_src + r * src_stride + c * sizeof(float)]; if (r >= nrows) { @@ -78,7 +78,7 @@ void jit_generator::transpose(const Xbyak::Reg64 &reg_src, // Intel(R) Software Optimization manual // Example 15-20. 8x8 Matrix Transpose Using VINSERTPS - auto transpose_8x4 = [=](int col) { + auto transpose_8x4 = [= WA_THIS_COPY_CAPTURE](int col) { load_src(xmm0, 0, col); vinsert(ymm0, 4, col); load_src(xmm1, 1, col); From 0a7fcdf7981a458a8e9d63f9232d4d1cd9094950 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Mon, 20 May 2024 10:24:11 -0700 Subject: [PATCH 126/187] gpu: fixup devmode typo --- src/gpu/gpu_impl_list.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp index 7e137a4fbb8..066647b83f2 100644 --- a/src/gpu/gpu_impl_list.hpp +++ b/src/gpu/gpu_impl_list.hpp @@ -138,12 +138,12 @@ namespace gpu { DNNL_GPU_AMD_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) #define GPU_REORDER_INSTANCE_GENERIC_SYCL(...) \ DNNL_GPU_GENERIC_SYCL_ONLY(GPU_REORDER_INSTANCE(__VA_ARGS__)) -#define GPU_REORDER_INSTANCE_GENERIC(...) GPU_REORDER_INSTANCE(__VA_ARGS_) +#define GPU_REORDER_INSTANCE_GENERIC(...) GPU_REORDER_INSTANCE(__VA_ARGS__) // Instance macros that are enabled only in the DEV_MODE.
#ifdef DNNL_DEV_MODE #define GPU_INSTANCE_INTEL_DEVMODE(...) \ - DNNL_GPU_INTEL_ONLY(GPU_INSTANCE(__VA_ARGS_)) + DNNL_GPU_INTEL_ONLY(GPU_INSTANCE(__VA_ARGS__)) #else #define GPU_INSTANCE_INTEL_DEVMODE(...) #endif From 40ae09468323797a2c805a3dadb51486e3b0e54e Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Tue, 7 May 2024 13:40:13 -0700 Subject: [PATCH 127/187] cpu: ref_eltwise: enable fp8 --- src/cpu/cpu_eltwise_list.cpp | 4 ++++ src/cpu/ref_eltwise.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/cpu/cpu_eltwise_list.cpp b/src/cpu/cpu_eltwise_list.cpp index 2885db7fa1f..084bfdc34c8 100644 --- a/src/cpu/cpu_eltwise_list.cpp +++ b/src/cpu/cpu_eltwise_list.cpp @@ -75,6 +75,8 @@ const std::map> &impl_list_map() { CPU_INSTANCE(ref_eltwise_fwd_t) CPU_INSTANCE(ref_eltwise_fwd_t) CPU_INSTANCE(ref_eltwise_fwd_t) + CPU_INSTANCE(ref_eltwise_fwd_t) + CPU_INSTANCE(ref_eltwise_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ @@ -90,6 +92,8 @@ const std::map> &impl_list_map() { CPU_INSTANCE(ref_eltwise_bwd_t) CPU_INSTANCE(ref_eltwise_bwd_t) CPU_INSTANCE(ref_eltwise_bwd_t) + CPU_INSTANCE(ref_eltwise_bwd_t) + CPU_INSTANCE(ref_eltwise_bwd_t) nullptr, })}, }); diff --git a/src/cpu/ref_eltwise.cpp b/src/cpu/ref_eltwise.cpp index cb1054b933e..ad64b88068e 100644 --- a/src/cpu/ref_eltwise.cpp +++ b/src/cpu/ref_eltwise.cpp @@ -272,10 +272,14 @@ template struct ref_eltwise_fwd_t; template struct ref_eltwise_fwd_t; template struct ref_eltwise_fwd_t; template struct ref_eltwise_fwd_t; +template struct ref_eltwise_fwd_t; +template struct ref_eltwise_fwd_t; template struct ref_eltwise_bwd_t; template struct ref_eltwise_bwd_t; template struct ref_eltwise_bwd_t; +template struct ref_eltwise_bwd_t; +template struct ref_eltwise_bwd_t; } // namespace cpu } // namespace impl From 85bfafd753f79ab86d7c59fc89bcf3315e0c76bb Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Mon, 6 May 2024 13:22:42 -0700 Subject: [PATCH 128/187] x64: eltwise: enable fp8 --- src/cpu/cpu_eltwise_list.cpp 
| 6 +++- src/cpu/x64/cpu_isa_traits.hpp | 1 + src/cpu/x64/jit_uni_eltwise.cpp | 57 ++++++++++++++++++++++++--------- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/cpu/cpu_eltwise_list.cpp b/src/cpu/cpu_eltwise_list.cpp index 084bfdc34c8..03d4f107449 100644 --- a/src/cpu/cpu_eltwise_list.cpp +++ b/src/cpu/cpu_eltwise_list.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * Copyright 2021 FUJITSU LIMITED * Copyright 2021-2022 Arm Ltd. and affiliates * @@ -45,6 +45,8 @@ using namespace dnnl::impl::prop_kind; const std::map> &impl_list_map() { static const std::map> the_map = REG_ELTWISE_P({ {{forward}, { + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) @@ -80,6 +82,8 @@ const std::map> &impl_list_map() { nullptr, }}, {{backward}, REG_BWD_PK({ + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) diff --git a/src/cpu/x64/cpu_isa_traits.hpp b/src/cpu/x64/cpu_isa_traits.hpp index 984edd095e5..49144d183e8 100644 --- a/src/cpu/x64/cpu_isa_traits.hpp +++ b/src/cpu/x64/cpu_isa_traits.hpp @@ -312,6 +312,7 @@ struct cpu_isa_traits : public cpu_isa_traits { template <> struct cpu_isa_traits { typedef Xbyak::Zmm Vmm; + static constexpr int vlen = vreg_traits::vlen; static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx10_1_512_amx; static constexpr const char *user_option_env = "avx10_1_512_amx"; diff --git a/src/cpu/x64/jit_uni_eltwise.cpp b/src/cpu/x64/jit_uni_eltwise.cpp index e7cbd35b59d..9871f16eb5e 100644 --- a/src/cpu/x64/jit_uni_eltwise.cpp +++ b/src/cpu/x64/jit_uni_eltwise.cpp @@ -58,6 +58,10 @@ struct 
jit_uni_eltwise_kernel : public jit_generator { } bool is_bf16() const { return data_type() == data_type::bf16; } bool is_f16() const { return data_type() == data_type::f16; } + bool is_f8() const { + return utils::one_of( + data_type(), data_type::f8_e5m2, data_type::f8_e4m3); + } int dtype_size() const { return types::data_type_size(data_type()); } cpu_isa_t get_io_isa(cpu_isa_t isa) const { // reusing avx512_core instantiation for bf16 @@ -77,6 +81,7 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel { jit_uni_kernel_t(const eltwise_pd_t *pd) : jit_uni_eltwise_kernel(pd, jit_name()) , vlen_(is_bf16() || is_f16() ? cpu_isa_traits::vlen / 2 + : is_f8() ? cpu_isa_traits::vlen / 4 : cpu_isa_traits::vlen) , simd_w_(vlen_ / dtype_size()) , is_fwd_(pd_->is_fwd()) { @@ -86,17 +91,20 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel { // using the first 7 vregs can be considered volatile during the call // to eltwise injector const bool save_state = is_fwd_ ? false : true; - eltwise_injector_.reset(new jit_uni_eltwise_injector_f32(this, - desc.alg_kind, desc.alpha, desc.beta, 1.f, save_state, + eltwise_injector_.reset(new jit_uni_eltwise_injector_f32( + this, desc.alg_kind, desc.alpha, desc.beta, 1.f, save_state, reg_injector_table, injector_mask, is_fwd_, pd_->use_dst())); io::io_conf_t io_conf; io::io_tail_conf_t io_tail_conf(simd_w_, tail_size_, tail_opmask_idx_, vmm_tail_mask.getIdx(), reg_tmp); - io::io_emu_bf16_conf_t io_bf16_conf(bf16_emu_zmm_1_idx_, - bf16_emu_zmm_2_idx_, bf16_emu_zmm_3_idx_, reg_tmp, - bf16_emu_zmm_4_idx_); + io::io_emu_bf16_conf_t io_bf16_conf(emu_zmm_1_idx_, emu_zmm_2_idx_, + emu_zmm_3_idx_, reg_tmp, emu_zmm_4_idx_); + io::io_emu_fp8_conf_t io_fp8_conf(emu_zmm_1_idx_, emu_zmm_2_idx_, + emu_zmm_3_idx_, emu_zmm_4_idx_, emu_zmm_5_idx_, + emu_kmask_aux_idx_, reg_tmp); io_ = io::jit_io_multi_dt_helper_t(this, get_io_isa(isa), - {data_type()}, io_conf, io_tail_conf, io_bf16_conf); + {data_type()}, io_conf, io_tail_conf, 
io_bf16_conf, {}, + utils::nullopt, io_fp8_conf); } void compute_dst(const bool tail) { @@ -217,10 +225,15 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel { postamble(); eltwise_injector_->prepare_table(); + if (is_superset(isa, avx512_core_amx) && is_f8() + && io_.at(data_type()) != nullptr) + io_[data_type()]->prepare_table_fp8(); } private: using Vmm = typename cpu_isa_traits::Vmm; + static constexpr cpu_isa_t injector_isa + = isa == avx512_core_amx ? avx512_core : isa; const int vlen_; const int simd_w_; @@ -248,15 +261,18 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel { Vmm vmm_src_odd = Vmm(8); Vmm vmm_diff_dst_even = vmm_diff_dst; Vmm vmm_diff_dst_odd = Vmm(9); - std::unique_ptr> eltwise_injector_; + std::unique_ptr> + eltwise_injector_; io::jit_io_multi_dt_helper_t io_; - /* bf16 support */ - const int bf16_emu_zmm_1_idx_ = 26; - const int bf16_emu_zmm_2_idx_ = 27; - const int bf16_emu_zmm_3_idx_ = 28; - const int bf16_emu_zmm_4_idx_ = 29; + /* bf16 and fp8 support */ + const int emu_zmm_1_idx_ = 25; + const int emu_zmm_2_idx_ = 26; + const int emu_zmm_3_idx_ = 27; + const int emu_zmm_4_idx_ = 28; + const int emu_zmm_5_idx_ = 29; const int tail_opmask_idx_ = 6; + const int emu_kmask_aux_idx_ = 2; }; } // namespace @@ -270,6 +286,9 @@ status_t jit_uni_eltwise_fwd_t::pd_t::init(engine_t *engine) { // disabling verbose dispatch messages for unsupported isa for better readability if (!mayiuse(isa)) return status::unimplemented; + static constexpr cpu_isa_t injector_isa + = isa == avx512_core_amx ? 
avx512_core : isa; + VDISPATCH_ELTWISE(is_fwd(), VERBOSE_BAD_PROPKIND); VDISPATCH_ELTWISE(utils::everyone_is( d_type, src_md()->data_type, dst_md()->data_type), @@ -283,7 +302,8 @@ status_t jit_uni_eltwise_fwd_t::pd_t::init(engine_t *engine) { VERBOSE_ISA_DT_MISMATCH); VDISPATCH_ELTWISE(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); VDISPATCH_ELTWISE(src_d.is_dense(true), VERBOSE_UNSUPPORTED_SPARSE_CFG); - VDISPATCH_ELTWISE(eltwise_injector::is_supported(isa, desc_.alg_kind), + VDISPATCH_ELTWISE( + eltwise_injector::is_supported(injector_isa, desc_.alg_kind), VERBOSE_BAD_ALGORITHM); // refer to a comment in jit_uni_kernel why this is needed VDISPATCH_ELTWISE(IMPLICATION(!src_d.is_dense(), is_zero_preserved()), @@ -350,6 +370,9 @@ status_t jit_uni_eltwise_bwd_t::pd_t::init(engine_t *engine) { // disabling verbose dispatch messages for unsupported isa for better readability if (!mayiuse(isa)) return status::unimplemented; + static constexpr cpu_isa_t injector_isa + = isa == avx512_core_amx ? 
avx512_core : isa; + VDISPATCH_ELTWISE(!is_fwd(), VERBOSE_BAD_PROPKIND); VDISPATCH_ELTWISE( utils::everyone_is(d_type, data_md()->data_type, @@ -364,8 +387,8 @@ status_t jit_uni_eltwise_bwd_t::pd_t::init(engine_t *engine) { VDISPATCH_ELTWISE(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); VDISPATCH_ELTWISE(set_default_formats_common(), VERBOSE_UNSUPPORTED_TAG); VDISPATCH_ELTWISE(data_d.is_dense(true), VERBOSE_UNSUPPORTED_SPARSE_CFG); - VDISPATCH_ELTWISE( - eltwise_injector::is_isa_supported(isa), VERBOSE_UNSUPPORTED_ISA); + VDISPATCH_ELTWISE(eltwise_injector::is_isa_supported(injector_isa), + VERBOSE_UNSUPPORTED_ISA); VDISPATCH_ELTWISE(eltwise_injector::is_alg_supported(desc_.alg_kind), VERBOSE_BAD_ALGORITHM); // refer to a comment in jit_uni_kernel why this is needed @@ -438,6 +461,8 @@ template struct jit_uni_eltwise_fwd_t; template struct jit_uni_eltwise_fwd_t; template struct jit_uni_eltwise_fwd_t; template struct jit_uni_eltwise_fwd_t; +template struct jit_uni_eltwise_fwd_t; +template struct jit_uni_eltwise_fwd_t; template struct jit_uni_eltwise_bwd_t; template struct jit_uni_eltwise_bwd_t; @@ -445,6 +470,8 @@ template struct jit_uni_eltwise_bwd_t; template struct jit_uni_eltwise_bwd_t; template struct jit_uni_eltwise_bwd_t; template struct jit_uni_eltwise_bwd_t; +template struct jit_uni_eltwise_bwd_t; +template struct jit_uni_eltwise_bwd_t; } // namespace x64 } // namespace cpu From 976f45162bf8b1110174305ad26bf7c478eb451f Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Mon, 6 May 2024 13:47:35 -0700 Subject: [PATCH 129/187] tests: benchdnn: inputs: eltwise: add fp8 coverage --- tests/benchdnn/inputs/eltwise/test_eltwise_all | 3 +++ tests/benchdnn/inputs/eltwise/test_eltwise_ci | 2 +- tests/benchdnn/inputs/eltwise/test_eltwise_float8 | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 tests/benchdnn/inputs/eltwise/test_eltwise_float8 diff --git a/tests/benchdnn/inputs/eltwise/test_eltwise_all 
b/tests/benchdnn/inputs/eltwise/test_eltwise_all index 99f0276bb4b..a36895387e2 100644 --- a/tests/benchdnn/inputs/eltwise/test_eltwise_all +++ b/tests/benchdnn/inputs/eltwise/test_eltwise_all @@ -31,5 +31,8 @@ # f16 --batch=test_eltwise_float16 +# f8 +--batch=test_eltwise_float8 + # regression check --batch=harness_eltwise_regression diff --git a/tests/benchdnn/inputs/eltwise/test_eltwise_ci b/tests/benchdnn/inputs/eltwise/test_eltwise_ci index eb9537ef7a8..d9fbb6da81f 100644 --- a/tests/benchdnn/inputs/eltwise/test_eltwise_ci +++ b/tests/benchdnn/inputs/eltwise/test_eltwise_ci @@ -2,7 +2,7 @@ --inplace=true,false ---dt=f32,bf16,f16 +--dt=f32,bf16,f16,f8_e5m2,f8_e4m3 --tag=abx,axb --dir=FWD_D --attr-post-ops=,mul:s8:per_oc diff --git a/tests/benchdnn/inputs/eltwise/test_eltwise_float8 b/tests/benchdnn/inputs/eltwise/test_eltwise_float8 new file mode 100644 index 00000000000..206a9e18424 --- /dev/null +++ b/tests/benchdnn/inputs/eltwise/test_eltwise_float8 @@ -0,0 +1,12 @@ +--reset + +--inplace=true,false +--dt=f8_e5m2,f8_e4m3 +--tag=abx,axb + +--dir=FWD_D,BWD_D +--batch=option_set_all_algs + +--dir=FWD_D +--attr-post-ops=add:f32+mul:f32:per_oc +--batch=option_set_all_algs From 6f806c481007690debf0bae3e844f70b87e4448a Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Fri, 17 May 2024 12:35:48 -0700 Subject: [PATCH 130/187] examples: ukernels: brgemm: allow unimplemeneted and skip gpu engine --- examples/CMakeLists.txt | 2 +- .../ukernels/{brgemm.cpp => cpu_brgemm.cpp} | 40 ++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) rename examples/ukernels/{brgemm.cpp => cpu_brgemm.cpp} (91%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3e5ebfc4fd8..3bc12cf073b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,7 +50,7 @@ if(NOT DNNL_EXPERIMENTAL_SPARSE) endif() if(NOT DNNL_EXPERIMENTAL_UKERNEL) - list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/ukernels/brgemm.cpp) + list(REMOVE_ITEM sources 
${CMAKE_CURRENT_SOURCE_DIR}/ukernels/cpu_brgemm.cpp) endif() # Remove tests for CUDA which use unimplemented primitives diff --git a/examples/ukernels/brgemm.cpp b/examples/ukernels/cpu_brgemm.cpp similarity index 91% rename from examples/ukernels/brgemm.cpp rename to examples/ukernels/cpu_brgemm.cpp index d0486e63f4f..9ae799b530d 100644 --- a/examples/ukernels/brgemm.cpp +++ b/examples/ukernels/cpu_brgemm.cpp @@ -43,11 +43,11 @@ using namespace dnnl::ukernel; using tag = memory::format_tag; using dt = memory::data_type; -void brgemm_example(dnnl::engine::kind engine_kind) { +void brgemm_example() { // Create execution dnnl::engine. Needed for reorders to operate over input // data. - dnnl::engine engine(engine_kind, 0); + dnnl::engine engine(engine::kind::cpu, 0); // Create dnnl::stream. Needed for reorders for the same reason. dnnl::stream engine_stream(engine); @@ -177,16 +177,34 @@ void brgemm_example(dnnl::engine::kind engine_kind) { // zeroing the correspondent piece of accumulation buffer. brgemm brg, brg_po; if (batch_size > 0) { - brg = brgemm(M, N, K_k, batch_size, lda, ldb, ldc, a_dt, b_dt, c_dt, - /* alpha = */ 1.f, /* beta = */ 1.f); - // Generate the executable JIT code for the objects. - brg.generate(); + try { + brg = brgemm(M, N, K_k, batch_size, lda, ldb, ldc, a_dt, b_dt, c_dt, + /* alpha = */ 1.f, /* beta = */ 1.f); + // Generate the executable JIT code for the objects. + brg.generate(); + } catch (error &e) { + if (e.status == dnnl_unimplemented) + throw example_allows_unimplemented { + "Kernel is not supported on this platform.\n"}; + + // on any other error just re-throw + throw; + } } - brg_po = brgemm(M, N, K_k, 1, lda, ldb, ldc, ldd, a_dt, b_dt, c_dt, d_dt, - 1.f, 1.f, brgemm_attr); - // Generate the executable JIT code for the objects. - brg_po.generate(); + try { + brg_po = brgemm(M, N, K_k, 1, lda, ldb, ldc, ldd, a_dt, b_dt, c_dt, + d_dt, 1.f, 1.f, brgemm_attr); + // Generate the executable JIT code for the objects. 
+ brg_po.generate(); + } catch (error &e) { + if (e.status == dnnl_unimplemented) + throw example_allows_unimplemented { + "Kernel is not supported on this platform.\n"}; + + // on any other error just re-throw + throw; + } // Query a scratchpad size and initialize a scratchpad buffer if the ukernel // is expecting it. This is a service space needed, has nothing in common @@ -310,5 +328,5 @@ void brgemm_example(dnnl::engine::kind engine_kind) { } int main(int argc, char **argv) { - return handle_example_errors(brgemm_example, dnnl::engine::kind::cpu); + return handle_example_errors({dnnl::engine::kind::cpu}, brgemm_example); } From 02404d6fc699ec30d8e778517f424d429c445dea Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Wed, 15 May 2024 16:33:34 +0800 Subject: [PATCH 131/187] doc: graph: ops: clarify the optional zps for (de)quantize --- doc/graph/operations/Dequantize.md | 2 +- doc/graph/operations/DynamicDequantize.md | 3 +-- doc/graph/operations/DynamicQuantize.md | 9 ++++----- doc/graph/operations/Quantize.md | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/graph/operations/Dequantize.md b/doc/graph/operations/Dequantize.md index e12048e6815..8c38495fcbc 100644 --- a/doc/graph/operations/Dequantize.md +++ b/doc/graph/operations/Dequantize.md @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels. | [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which de-quantization type is used. |string | `per_tensor` (default), `per_channel` | Optional | | [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel de-quantization is applied. |s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional | | [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. |f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required | -| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. 
|s64 | A s64 list (only contain one element if qtype is `per_tensor`) | Optional | +| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. |s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional | ## Execution arguments diff --git a/doc/graph/operations/DynamicDequantize.md b/doc/graph/operations/DynamicDequantize.md index d551c8700e0..9e730f1fb54 100644 --- a/doc/graph/operations/DynamicDequantize.md +++ b/doc/graph/operations/DynamicDequantize.md @@ -44,8 +44,7 @@ element number of src tensor along the dimension axis. @note `zps` is a 1D tensor with offset values that map to zero. For `qtype` = `per-tensor`, there should be only one element in the zps tensor. For `qtype` = `per-channel`, the element number should be equal to the element number of input -tensor along the dimension axis. If not specified, the library can assume the -operator is symmetric de-quantization and perform kernel optimization accordingly. +tensor along the dimension axis. If omitted, zps values are assumed to be zero. ### Outputs diff --git a/doc/graph/operations/DynamicQuantize.md b/doc/graph/operations/DynamicQuantize.md index 56a66a71c9e..9a40a33c230 100644 --- a/doc/graph/operations/DynamicQuantize.md +++ b/doc/graph/operations/DynamicQuantize.md @@ -41,11 +41,10 @@ constructing an operation. For `qtype` = `per-channel`, the element number should be equal to the element number of src tensor along the dimension axis. -@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` = `per-tensor`, there should be only one -element in the zps tensor. For `qtype` = `per-channel`, the element number should be -equal to the element number of input tensor along the dimension axis. If not -specified, the library can assume the operator is symmetric quantization and -perform kernel optimization accordingly. +@note `zps` is a 1D tensor with offset values that map to zero. 
For `qtype` = +`per-tensor`, there should be only one element in the zps tensor. For `qtype` = +`per-channel`, the element number should be equal to the element number of input +tensor along the dimension axis. If omitted, zps values are assumed to be zero. ### Outputs diff --git a/doc/graph/operations/Quantize.md b/doc/graph/operations/Quantize.md index fbc42a587f8..7c1165e8e3d 100644 --- a/doc/graph/operations/Quantize.md +++ b/doc/graph/operations/Quantize.md @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels. | [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which quantization type is used. | string | `per_tensor` (default), `per_channel` | Optional | | [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel quantization is applied. | s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional | | [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. | f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required | -| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`) | Optional | +| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. 
| Optional | ## Execution arguments From 0c1f185ec2448b62158ab77586f4c23d54db3161 Mon Sep 17 00:00:00 2001 From: xiang1guo Date: Thu, 9 May 2024 03:58:20 +0000 Subject: [PATCH 132/187] graph: backend: dnnl: extend sdpa to support optional trailing transpose/reshape --- src/graph/backend/dnnl/kernels/sdp.hpp | 9 ++++++--- src/graph/backend/dnnl/patterns/sdp.cpp | 9 ++------- src/graph/backend/dnnl/patterns/utils.hpp | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/graph/backend/dnnl/kernels/sdp.hpp b/src/graph/backend/dnnl/kernels/sdp.hpp index 1e7ceadfbe5..2499254f022 100644 --- a/src/graph/backend/dnnl/kernels/sdp.hpp +++ b/src/graph/backend/dnnl/kernels/sdp.hpp @@ -555,8 +555,10 @@ struct sdp_decomp_config_t { if (mm1 != nullptr && mm2 != nullptr) break; if (cur_op->get_kind() != graph::op_kind::MatMul) continue; auto post_op = get_post_op(cur_op); - if (post_op->get_kind() == graph::op_kind::Divide - || post_op->get_kind() == graph::op_kind::Multiply) { + if (post_op + && (post_op->get_kind() == graph::op_kind::Divide + || post_op->get_kind() + == graph::op_kind::Multiply)) { mm1 = cur_op; scale = post_op; const auto pop = get_post_op(post_op); @@ -570,7 +572,8 @@ struct sdp_decomp_config_t { add = nullptr; select = nullptr; } - } else if (post_op->get_kind() == graph::op_kind::Select) { + } else if (post_op + && (post_op->get_kind() == graph::op_kind::Select)) { return status::unimplemented; } else mm2 = cur_op; diff --git a/src/graph/backend/dnnl/patterns/sdp.cpp b/src/graph/backend/dnnl/patterns/sdp.cpp index 37a0f1a90a8..25ec122b17b 100644 --- a/src/graph/backend/dnnl/patterns/sdp.cpp +++ b/src/graph/backend/dnnl/patterns/sdp.cpp @@ -126,13 +126,8 @@ DNNL_BACKEND_REGISTER_PATTERN_MATCHER_PASS(dnnl, float_sdp_fusion) {in_edge(0, p_select2, 0)}); auto matmul_v = pgraph->append_op( graph::op_kind::MatMul, {in_edge(0, softmax, 0)}); - auto transpose_output - = pgraph->append_op(graph::op_kind::StaticTranspose, - 
{in_edge(0, matmul_v, 0)}); - pgraph->append_alternation( - {graph::op_kind::Reorder, - graph::op_kind::StaticReshape}, - {in_edge(0, transpose_output, 0)}); + // Optional transpose + reshape/reorder + optional_transpose_reshape(pgraph, matmul_v, 0); }) .set_attr("FCreateKernel", []() -> kernel_ptr { return std::make_shared>(); diff --git a/src/graph/backend/dnnl/patterns/utils.hpp b/src/graph/backend/dnnl/patterns/utils.hpp index 6ba9845c9f5..2c04c401fca 100644 --- a/src/graph/backend/dnnl/patterns/utils.hpp +++ b/src/graph/backend/dnnl/patterns/utils.hpp @@ -349,6 +349,24 @@ inline graph::utils::pm::repetition_t *optional_select( return pselect; } +// Optional (transpose + reorder/staticReshape) +inline graph::utils::pm::repetition_t *optional_transpose_reshape( + const std::shared_ptr &pgraph, + graph::utils::pm::pb_node_t *input, int input_index) { + auto popt_graph = std::make_shared(); + + graph::utils::pm::pb_op_t *transpose + = popt_graph->append_op(graph::op_kind::StaticTranspose); + graph::utils::pm::pb_op_t *reshape_out = popt_graph->append_alternation( + {graph::op_kind::Reorder, graph::op_kind::StaticReshape}, + {in_edge(0, transpose, 0)}); + popt_graph->create_input_port(0, transpose, 0); + popt_graph->create_output_port(0, reshape_out, 0); + auto popt_transpose_reshape = pgraph->append_optional(popt_graph, + graph::utils::pm::in_edges_t {in_edge(input_index, input, 0)}); + return popt_transpose_reshape; +} + inline graph::utils::pm::pb_node_t *create_dequant_matmul( const std::shared_ptr &pgraph, graph::utils::pm::pb_node_t *input, bool is_bf16 = false, From bfee86e3e6b62e89f743cc4436ab10114984d317 Mon Sep 17 00:00:00 2001 From: xiang1guo Date: Tue, 14 May 2024 11:09:31 +0000 Subject: [PATCH 133/187] benchdnn: inputs: graph: add sdpa case w/o trailing transpose/reshape --- .../graph/complex_fusion/harness_mha_all | 4 + .../graph/complex_fusion/harness_mha_ci | 2 + .../MHA-stable_diffusion-inf-bf16-bs1.json | 276 ++++++++++++++++++ 
.../MHA-stable_diffusion-inf-fp32-bs1.json | 276 ++++++++++++++++++ 4 files changed, 558 insertions(+) create mode 100644 tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json create mode 100644 tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all index 4e5bab34696..3317af140a0 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all @@ -15,6 +15,8 @@ --reset --case=complex_fusion/mha/MHA-distill_bert-inf-bf16-bs1.json --reset --case=complex_fusion/mha/MHA-distill_bert-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-distill_bert-inf-int8-bs1.json +--reset --case=complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json +--reset --case=complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-starcoder-inf-bf16-bs1.json --reset --case=complex_fusion/mha/MHA-starcoder-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-starcoder-inf-int8-bs1.json @@ -35,6 +37,8 @@ --reset --in-shapes=4:56x12x128x64+5:56x12x64x128+0:56x12x128x64+1:56x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-fp32-bs1.json --reset --in-shapes=4:56x12x128x64+5:56x12x64x128+0:56x12x128x64+1:56x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-bf16-bs1.json --reset --in-shapes=5:56x12x128x64+4:56x12x64x128+0:56x12x128x64+1:56x1x1x128 --case=complex_fusion/mha/MHA-distill_bert-inf-int8-bs1.json +--reset --in-shapes=0:56x8x1024x80+1:56x8x77x80+2:56x8x77x80 --case=complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json +--reset --in-shapes=0:56x8x1024x80+1:56x8x77x80+2:56x8x77x80 --case=complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json --reset --in-shapes=5:20x117x48x128+6:20x1x128x117+19:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-fp32-bs1.json 
--reset --in-shapes=5:20x117x48x128+6:20x1x128x117+19:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-bf16-bs1.json --reset --in-shapes=4:20x117x48x128+3:20x1x128x117+0:20x1x117x128 --case=complex_fusion/mha/MHA-starcoder-inf-int8-bs1.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci index 42d0d529b17..add28c3b2f5 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci @@ -7,6 +7,8 @@ --reset --case=complex_fusion/mha/MHA-bert_large-inf-bf16-bs1.json --reset --case=complex_fusion/mha/MHA-bert_large-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-bert_large-inf-int8-bs1.json +--reset --case=complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json +--reset --case=complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-distill_bert-inf-bf16-bs1.json --reset --case=complex_fusion/mha/MHA-distill_bert-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-distill_bert-inf-int8-bs1.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json b/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json new file mode 100644 index 00000000000..afc8b32a0b6 --- /dev/null +++ b/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-bf16-bs1.json @@ -0,0 +1,276 @@ +{ + "version": "3.6.0", + "engine_kind": "gpu", + "fpmath_mode": "strict", + "input_ports": [ + 0, + 1, + 3, + 2 + ], + "output_ports": [ + 8 + ], + "graph": [ + { + "id": 0, + "name": "matmul_qk", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 1 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 80 + ], + "stride": [ + 655360, + 81920, + 80, + 1 + ], + "layout_type": "strided", + 
"property_type": "undef" + }, + { + "id": 1, + "dtype": "bf16", + "shape": [ + 2, + 8, + 77, + 80 + ], + "stride": [ + 49280, + 6160, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 5, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 1, + "name": "scale_div", + "kind": "Multiply", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 5, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 3, + "dtype": "bf16", + "shape": [ + 1 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "constant" + } + ], + "outputs": [ + { + "id": 6, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 3, + "name": "softmax", + "kind": "SoftMax", + "attrs": { + "axis": { + "type": "s64", + "value": -1 + } + }, + "inputs": [ + { + "id": 6, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 7, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 4, + "name": "matmul_v", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 0 + } + }, + "inputs": [ + { + "id": 7, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + 
"property_type": "undef" + }, + { + "id": 2, + "dtype": "bf16", + "shape": [ + 2, + 8, + 77, + 80 + ], + "stride": [ + 49280, + 6160, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 8, + "dtype": "bf16", + "shape": [ + 2, + 8, + 1024, + 80 + ], + "stride": [ + 655360, + 81920, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + } + ] +} diff --git a/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json b/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json new file mode 100644 index 00000000000..07c216aa7e2 --- /dev/null +++ b/tests/benchdnn/inputs/graph/complex_fusion/mha/MHA-stable_diffusion-inf-fp32-bs1.json @@ -0,0 +1,276 @@ +{ + "version": "3.6.0", + "engine_kind": "gpu", + "fpmath_mode": "strict", + "input_ports": [ + 0, + 1, + 3, + 2 + ], + "output_ports": [ + 8 + ], + "graph": [ + { + "id": 0, + "name": "matmul_qk", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 1 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 80 + ], + "stride": [ + 655360, + 81920, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 1, + "dtype": "f32", + "shape": [ + 2, + 8, + 77, + 80 + ], + "stride": [ + 49280, + 6160, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 5, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 1, + "name": "scale_div", + "kind": "Multiply", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 5, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + 
"layout_type": "strided", + "property_type": "undef" + }, + { + "id": 3, + "dtype": "f32", + "shape": [ + 1 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "constant" + } + ], + "outputs": [ + { + "id": 6, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 3, + "name": "softmax", + "kind": "SoftMax", + "attrs": { + "axis": { + "type": "s64", + "value": -1 + } + }, + "inputs": [ + { + "id": 6, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 7, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 4, + "name": "matmul_v", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 0 + } + }, + "inputs": [ + { + "id": 7, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 77 + ], + "stride": [ + 630784, + 78848, + 77, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 2, + "dtype": "f32", + "shape": [ + 2, + 8, + 77, + 80 + ], + "stride": [ + 49280, + 6160, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 8, + "dtype": "f32", + "shape": [ + 2, + 8, + 1024, + 80 + ], + "stride": [ + 655360, + 81920, + 80, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + } + ] +} From 331e4a050bf544797e4e9d3622746690cad15555 Mon Sep 17 00:00:00 2001 From: xiang1guo Date: Thu, 16 May 2024 02:20:20 +0000 Subject: [PATCH 134/187] graph: backend: dnnl: fix sdpa decomp kernel to handle unequal q/k/v seq len and fix last reorder's memory address if in-place --- 
src/graph/backend/dnnl/kernels/sdp.hpp | 45 +++++++++++++++++++------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/graph/backend/dnnl/kernels/sdp.hpp b/src/graph/backend/dnnl/kernels/sdp.hpp index 2499254f022..38f0dc7cef0 100644 --- a/src/graph/backend/dnnl/kernels/sdp.hpp +++ b/src/graph/backend/dnnl/kernels/sdp.hpp @@ -68,6 +68,7 @@ class sdp_reorder { reorder_prim_ = reorder(pd); return status::success; } + bool get_inplace() { return is_inplace_; } status_t execute(const stream &astream, const std::unordered_map &args) const { if (is_inplace_) @@ -87,7 +88,7 @@ struct sdp_decomp_config_t { sdp_decomp_config_t() = default; // SDP input dimension - memory::dim batch_size, num_head, seq_len, size_per_head; + memory::dim batch_size, num_head, seq_len_q, size_per_head; // SDP input and output strides memory::dims src1_strides, wei1_strides, wei2_strides, dst_strides, @@ -166,7 +167,7 @@ struct sdp_decomp_config_t { // Initialize SDP input dimension according to the src of mm1 batch_size = src1_user_dims[0]; num_head = src1_user_dims[1]; - seq_len = src1_user_dims[2]; + seq_len_q = src1_user_dims[2]; size_per_head = src1_user_dims[3]; #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP @@ -199,6 +200,14 @@ struct sdp_decomp_config_t { // Record the ops inside of SDP pattern for later usage record_sdp_ops(sg, quantized); + // Update SDPA input params. Sequence length for query and key/value are + // NOT always same. + memory::dim seq_len_kv; + const auto <_wei + = sdp_op[1]->get_input_value(1)->get_logical_tensor(); + const ltw ltw_wei(lt_wei); + seq_len_kv = ltw_wei.vdims()[3]; + // Acquire the data type from input param for later primitive creation. // The src and wei dt of both quantized sdp and float sdp are the same. 
memory::data_type dt_src_user = static_cast( @@ -229,7 +238,7 @@ struct sdp_decomp_config_t { sub_reorder0_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); // per-head: reorder src1 to dense, for first matmul - memory::dims sub_src1_dims = {1, 1, seq_len, size_per_head}; + memory::dims sub_src1_dims = {1, 1, seq_len_q, size_per_head}; src1_strides = ltw(inputs[graph_inport[0]]).vstrides(); sub_src1_md = memory::desc(sub_src1_dims, dt_src_user, {1, 1, src1_strides[2], src1_strides[3]}); @@ -245,7 +254,7 @@ struct sdp_decomp_config_t { // create reorder1 primitive attr dnnl::primitive_attr sub_reorder1_attr = make_primitive_attr(sdp_op[0], mgr); - memory::dims sub_wei1_dims = {1, 1, size_per_head, seq_len}; + memory::dims sub_wei1_dims = {1, 1, size_per_head, seq_len_kv}; auto wei_md = make_dnnl_memory_desc( sdp_op[1]->get_input_value(1)->get_logical_tensor()); wei1_strides = wei_md.get_strides(); @@ -261,9 +270,9 @@ struct sdp_decomp_config_t { // create first matmul primitive attr dnnl::primitive_attr sub_matmul1_attr = make_primitive_attr(sdp_op[1], mgr); - memory::dims sub_mm1_src_dims = {1, 1, seq_len, size_per_head}; - memory::dims sub_mm1_wei_dims = {1, 1, size_per_head, seq_len}; - memory::dims sub_mm1_dst_dims = {1, 1, seq_len, seq_len}; + memory::dims sub_mm1_src_dims = {1, 1, seq_len_q, size_per_head}; + memory::dims sub_mm1_wei_dims = {1, 1, size_per_head, seq_len_kv}; + memory::dims sub_mm1_dst_dims = {1, 1, seq_len_q, seq_len_kv}; sub_mm1_src_md = memory::desc(sub_mm1_src_dims, dt_src_user, tag::abcd); sub_mm1_wei_md = memory::desc(sub_mm1_wei_dims, dt_wei, tag::abdc); @@ -306,7 +315,7 @@ struct sdp_decomp_config_t { // create reorder2 primitive attr dnnl::primitive_attr sub_reorder2_attr = make_primitive_attr(sdp_op[3], mgr); - memory::dims sub_wei2_dims = {1, 1, seq_len, size_per_head}; + memory::dims sub_wei2_dims = {1, 1, seq_len_kv, size_per_head}; wei2_strides = ltw(inputs[graph_inport[4]]).vstrides(); sub_wei2_user_md = 
memory::desc(sub_wei2_dims, dt_wei_user, {1, 1, wei2_strides[2], wei2_strides[3]}); @@ -320,9 +329,9 @@ struct sdp_decomp_config_t { // create second matmul primitive attr dnnl::primitive_attr sub_matmul2_attr = make_primitive_attr(sdp_op[4], mgr); - memory::dims sub_mm2_src_dims = {1, 1, seq_len, seq_len}; - memory::dims sub_mm2_wei_dims = {1, 1, seq_len, size_per_head}; - memory::dims sub_mm2_dst_dims = {1, 1, seq_len, size_per_head}; + memory::dims sub_mm2_src_dims = {1, 1, seq_len_q, seq_len_kv}; + memory::dims sub_mm2_wei_dims = {1, 1, seq_len_kv, size_per_head}; + memory::dims sub_mm2_dst_dims = {1, 1, seq_len_q, size_per_head}; auto sub_mm2_src_md = memory::desc(sub_mm2_src_dims, dt_src_user, tag::abcd); sub_mm2_wei_md = memory::desc(sub_mm2_wei_dims, dt_wei, tag::abcd); @@ -334,7 +343,7 @@ struct sdp_decomp_config_t { // per-head: reorder dst2 from dense to strided primitive_attr sub_reorder3_attr; sub_reorder3_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - memory::dims sub_dst_dims = {1, 1, seq_len, size_per_head}; + memory::dims sub_dst_dims = {1, 1, seq_len_q, size_per_head}; auto out_lt = sdp_op[4]->get_output_value(0)->get_logical_tensor(); dst_strides = ltw(out_lt).vstrides(); sub_dst_md = memory::desc(sub_dst_dims, dt_src_user, tag::abcd); @@ -1144,6 +1153,10 @@ class sdp_decomp_kernel_t : public kernel_base_t { auto &sub_dst_user_tid = res->mem_map[sdp_cfg_.sub_dst_user.get()][tid]; + // matmul2 + auto &sub_mm2_dst_tid + = res->mem_map[sdp_cfg_.sub_mm2_dst.get()][tid]; + const size_t sub_src1_offset = (bo * sdp_cfg_.src1_strides[0] + bi * sdp_cfg_.src1_strides[1]) @@ -1169,6 +1182,14 @@ class sdp_decomp_kernel_t : public kernel_base_t { sub_dst_user_tid.set_data_handle( dst2_user_pointer + sub_dst_user_offset); + // If the last reorder is inplace, it means we don't have to do + // extra reorder, thus we should set matmul's output to the user's + // output directly. 
+ if (sdp_cfg_.sub_reorder3.get_inplace()) { + sub_mm2_dst_tid.set_data_handle( + dst2_user_pointer + sub_dst_user_offset); + } + // in parallel region - these primitives should use single thread. sdp_cfg_.sub_reorder0.execute(strm, res->sub_reorder0_args[tid]); sdp_cfg_.sub_reorder1.execute(strm, res->sub_reorder1_args[tid]); From b4a5351129f9efd2073f31d5e6f93ffcd84802fc Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Fri, 26 Apr 2024 00:11:06 +0800 Subject: [PATCH 135/187] benchdnn: graph: inputs: add a cases for lnorm + tc + multiply + quantize --- .../inputs/graph/pattern/harness_int8_all | 1 + .../int8/int8_lnorm_tc_multiply_quantize.json | 261 ++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 tests/benchdnn/inputs/graph/pattern/int8/int8_lnorm_tc_multiply_quantize.json diff --git a/tests/benchdnn/inputs/graph/pattern/harness_int8_all b/tests/benchdnn/inputs/graph/pattern/harness_int8_all index 44687a3c79b..e51df7d3bde 100644 --- a/tests/benchdnn/inputs/graph/pattern/harness_int8_all +++ b/tests/benchdnn/inputs/graph/pattern/harness_int8_all @@ -115,5 +115,6 @@ #layernorm --reset --case=pattern/int8/int8_lnorm_gelu_quantize.json --reset --case=pattern/int8/int8_lnorm_multiply_quantize.json +--reset --case=pattern/int8/int8_lnorm_tc_multiply_quantize.json #softmax --reset --case=pattern/int8/int8_softmax_add.json diff --git a/tests/benchdnn/inputs/graph/pattern/int8/int8_lnorm_tc_multiply_quantize.json b/tests/benchdnn/inputs/graph/pattern/int8/int8_lnorm_tc_multiply_quantize.json new file mode 100644 index 00000000000..50cfa7b7284 --- /dev/null +++ b/tests/benchdnn/inputs/graph/pattern/int8/int8_lnorm_tc_multiply_quantize.json @@ -0,0 +1,261 @@ +{ + "version": "3.5.0", + "engine_kind": "cpu", + "fpmath_mode": "strict", + "input_ports": [ + 0, + 1, + 2, + 5 + ], + "output_ports": [ + 7 + ], + "graph": [ + { + "id": 0, + "name": "layernorm", + "kind": "LayerNorm", + "attrs": { + "begin_norm_axis": { + "type": "s64", + "value": -1 + }, + 
"use_affine": { + "type": "bool", + "value": 1 + }, + "keep_stats": { + "type": "bool", + "value": 0 + }, + "epsilon": { + "type": "f32", + "value": 0.0625 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "bf16", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 1, + "dtype": "f32", + "shape": [ + 512 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 2, + "dtype": "f32", + "shape": [ + 512 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 3, + "dtype": "bf16", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 1, + "name": "typecast", + "kind": "TypeCast", + "attrs": {}, + "inputs": [ + { + "id": 3, + "dtype": "bf16", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 4, + "dtype": "f32", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 2, + "name": "multiply", + "kind": "Multiply", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 4, + "dtype": "f32", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 5, + "dtype": "f32", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 6, + "dtype": "f32", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 3, + "name": "quantize", 
+ "kind": "Quantize", + "attrs": { + "axis": { + "type": "s64", + "value": 0 + }, + "qtype": { + "type": "string", + "value": "per_tensor" + }, + "scales": { + "type": "f32[]", + "value": [ + 0.5 + ] + } + }, + "inputs": [ + { + "id": 6, + "dtype": "f32", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 7, + "dtype": "s8", + "shape": [ + 1, + 128, + 512 + ], + "stride": [ + 65536, + 512, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + } + ] +} From 0cfa33b3e395b134f883bb9a89c6d41d9f12d372 Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Tue, 7 May 2024 17:31:44 +0800 Subject: [PATCH 136/187] benchdnn: graph: inputs: correct op kind in quantize.json --- tests/benchdnn/inputs/graph/op/f32/quantize.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/benchdnn/inputs/graph/op/f32/quantize.json b/tests/benchdnn/inputs/graph/op/f32/quantize.json index f5eeaba3a81..f5649c690d3 100644 --- a/tests/benchdnn/inputs/graph/op/f32/quantize.json +++ b/tests/benchdnn/inputs/graph/op/f32/quantize.json @@ -5,8 +5,8 @@ "graph": [ { "id": 0, - "name": "DEQUANTIZE_0", - "kind": "Dequantize", + "name": "QUANTIZE_0", + "kind": "Quantize", "attrs": { "zps": { "type": "s64[]", @@ -32,7 +32,7 @@ "inputs": [ { "id": 0, - "dtype": "u8", + "dtype": "f32", "shape": [ 1, 32, @@ -52,7 +52,7 @@ "outputs": [ { "id": 1, - "dtype": "f32", + "dtype": "u8", "shape": [ 1, 32, From 9eb693f7776a4e7ca7e37b3fe67a6b1de214b65a Mon Sep 17 00:00:00 2001 From: "Wang, Zhitao" Date: Tue, 7 May 2024 01:35:23 +0000 Subject: [PATCH 137/187] tests: benchdnn: graph: add p_nums knob for checking parition nums --- tests/benchdnn/graph/bench_graph.cpp | 9 ++++++--- tests/benchdnn/graph/graph.cpp | 21 +++++++++++++++++++- tests/benchdnn/graph/graph.hpp | 10 ++++++++-- tests/benchdnn/graph/parser.cpp | 29 ++++++++++++++++++++++++++++ 
tests/benchdnn/graph/parser.hpp | 3 +++ 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/tests/benchdnn/graph/bench_graph.cpp b/tests/benchdnn/graph/bench_graph.cpp index 21144a38533..eda0d67c401 100644 --- a/tests/benchdnn/graph/bench_graph.cpp +++ b/tests/benchdnn/graph/bench_graph.cpp @@ -27,6 +27,7 @@ namespace graph { void check_correctness(const settings_t &s) { for_(const auto &i_in_shapes : s.in_shapes_vec) for_(const auto &i_op_attrs : s.op_attrs_vec) + for_(const auto &i_expected_n_partition : s.expected_n_partition_vec) for_(const auto &i_fpmath_mode : s.fpmath_mode_vec) for (const auto &i_mb : s.mb) { deserialized_graph dg; @@ -35,9 +36,9 @@ void check_correctness(const settings_t &s) { fw.rewrite(dg); BENCHDNN_PRINT(7, "[INFO] Graph dump:\n%s\n", dg.get_string().c_str()); - const prb_t prb(dg); - const auto &cpp_pstr = case_to_str( - s.json_file, i_in_shapes, i_op_attrs, i_fpmath_mode, i_mb); + const prb_t prb(dg, i_expected_n_partition); + const auto &cpp_pstr = case_to_str(s.json_file, i_in_shapes, i_op_attrs, + i_fpmath_mode, i_expected_n_partition, i_mb); const char *pstr = cpp_pstr.c_str(); BENCHDNN_PRINT(1, "run: %s\n", pstr); res_t res {}; @@ -61,6 +62,8 @@ int bench(int argc, char **argv) { || parse_batch(bench, argv[0]) || parse_input_shapes(s.in_shapes_vec, argv[0]) || parse_op_attrs(s.op_attrs_vec, argv[0]) + || parse_graph_expected_n_partitions( + s.expected_n_partition_vec, argv[0]) || parse_graph_fpmath_mode(s.fpmath_mode_vec, argv[0]) || parse_mb(s.mb, def.mb, argv[0]) || parse_reset(s, argv[0]); if (!parsed_options) { diff --git a/tests/benchdnn/graph/graph.cpp b/tests/benchdnn/graph/graph.cpp index f71adc169a2..cae97e53ce6 100644 --- a/tests/benchdnn/graph/graph.cpp +++ b/tests/benchdnn/graph/graph.cpp @@ -341,7 +341,8 @@ using namespace dnnl::graph; std::string case_to_str(const std::string &json_file, const std::map &in_shapes, const std::map &op_attrs, - const std::string &fpmath_mode, const int64_t mb) { + const 
std::string &fpmath_mode, const size_t expected_n_partitions, + const int64_t mb) { std::stringstream s; dump_global_params(s); @@ -374,6 +375,11 @@ std::string case_to_str(const std::string &json_file, s << "--attr-fpmath=" << fpmath_mode << " "; } + if (expected_n_partitions != 0) { + s << "--expected-n-partitions=" << std::to_string(expected_n_partitions) + << " "; + } + s << "--case=" << json_file; return s.str(); } @@ -463,6 +469,19 @@ int doit(const prb_t *prb, res_t *res) { if (aop.kind_ == "End") { end_opid_v.emplace_back(aop.id_); } } + if (prb->expected_n_partition != 0) { + // If the expected partition num is specified by user with command line + // knob + if (partitions.size() != prb->expected_n_partition) { + BENCHDNN_PRINT(0, + "Error: the expected number of partitions (%zu) doesn't " + "coincide with the actual number of partitions returned " + "(%zu).\n ", + prb->expected_n_partition, partitions.size()); + return res->state = FAILED, FAIL; + } + } + if (partitions.empty()) { BENCHDNN_PRINT(0, "%s\n", "Error: partitions are empty"); return res->state = FAILED, FAIL; diff --git a/tests/benchdnn/graph/graph.hpp b/tests/benchdnn/graph/graph.hpp index 3d0ef6be739..a2d45aafccf 100644 --- a/tests/benchdnn/graph/graph.hpp +++ b/tests/benchdnn/graph/graph.hpp @@ -47,6 +47,9 @@ struct settings_t : public base_settings_t { std::string json_file; std::vector> in_shapes_vec {{{0, "default"}}}; std::vector> op_attrs_vec {{{0, "default"}}}; + // `0` means not specified by user with command line knob, will skip + // the partition num check. + std::vector expected_n_partition_vec {0}; // `default` means not specified by user with command line knob. 
std::vector fpmath_mode_vec {"default"}; @@ -61,7 +64,8 @@ struct settings_t : public base_settings_t { // TODO evaluate prb_t struct struct prb_t { - prb_t(const deserialized_graph &dg) : dg(dg) { + prb_t(const deserialized_graph &dg, const size_t &expected_n_partition) + : dg(dg), expected_n_partition(expected_n_partition) { const std::string &fpmath_mode = dg.get_fpmath_mode(); this->fpmath_mode = static_cast( @@ -69,13 +73,15 @@ struct prb_t { } deserialized_graph dg; + size_t expected_n_partition; dnnl::fpmath_mode fpmath_mode; }; std::string case_to_str(const std::string &json_file, const std::map &in_shapes, const std::map &op_attrs, - const std::string &fpmath_mode, const int64_t mb); + const std::string &fpmath_mode, const size_t expected_n_partitions, + const int64_t mb); struct perf_report_t : public base_perf_report_t { perf_report_t(const std::string case_str, const char *perf_template) diff --git a/tests/benchdnn/graph/parser.cpp b/tests/benchdnn/graph/parser.cpp index 2533b04c976..af042ac60e7 100644 --- a/tests/benchdnn/graph/parser.cpp +++ b/tests/benchdnn/graph/parser.cpp @@ -85,6 +85,35 @@ bool parse_op_attrs(std::vector> &op_attrs_vec, return parse_key_value(op_attrs_vec, op_attrs_str), true; } +bool parse_graph_expected_n_partitions( + std::vector &expected_n_partition_vec, const char *str) { + std::string expected_n_partitions_str; + if (!parse_string(expected_n_partitions_str, str, "expected-n-partitions")) + return false; + + std::stringstream ss(expected_n_partitions_str); + std::string expected_n_partitions; + while (std::getline(ss, expected_n_partitions, ',')) { + if (!expected_n_partitions.empty()) { + expected_n_partition_vec.clear(); + + const auto int_expected_n_partitions + = std::stoi(expected_n_partitions); + if (int_expected_n_partitions >= 0) { + expected_n_partition_vec.emplace_back( + int_expected_n_partitions); + } else { + BENCHDNN_PRINT(0, + "Error: expected-n-partitions option supports only" + "non-negative numbers, but 
`%d` was specified.\n", + int_expected_n_partitions); + SAFE_V(FAIL); + } + } + } + return true; +} + bool parse_graph_fpmath_mode( std::vector &fpmath_mode_vec, const char *str) { std::string graph_attrs_str; diff --git a/tests/benchdnn/graph/parser.hpp b/tests/benchdnn/graph/parser.hpp index 1d34f9b3e74..1e8150b3bfd 100644 --- a/tests/benchdnn/graph/parser.hpp +++ b/tests/benchdnn/graph/parser.hpp @@ -29,6 +29,9 @@ bool parse_input_shapes( bool parse_op_attrs(std::vector> &op_attrs_vec, const char *str); +bool parse_graph_expected_n_partitions( + std::vector &expected_n_partition_vec, const char *str); + bool parse_graph_fpmath_mode( std::vector &fpmath_mode_vec, const char *str); From 0ad65f929111622ca1ee583673ab661bab3474fa Mon Sep 17 00:00:00 2001 From: "Wang, Zhitao" Date: Tue, 7 May 2024 01:35:52 +0000 Subject: [PATCH 138/187] tests: benchdnn: doc: update option description for expeceted partition nums --- tests/benchdnn/doc/driver_graph.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/benchdnn/doc/driver_graph.md b/tests/benchdnn/doc/driver_graph.md index c85072d9240..3738bbef326 100644 --- a/tests/benchdnn/doc/driver_graph.md +++ b/tests/benchdnn/doc/driver_graph.md @@ -50,6 +50,9 @@ where *graph-knobs* are: Multiple attributes value changes may be specified using the `*` delimeter. Multiple ops modification may be specified using the `+` delimeter. By default, the option value is empty, meaning values are taken from original graph. + - `--expected-n-partitions=INT` -- Specify the number of expected partitions + returned from the graph. `INT` is a non-negative integer value. When `INT` + value is `0` (the default), the check is skipped. and *graph-case* is a JSON file which is dumped by a library or created from scratch. It must be passed to the graph driver as `--case=JSON_FILE`. 
Refer to the JSON From 211b4ba11f291595d3ab10b1c049aba92b65cfbf Mon Sep 17 00:00:00 2001 From: Ankit Manerikar Date: Mon, 20 May 2024 10:50:20 -0700 Subject: [PATCH 139/187] doc: verbose: add message catalogue for verbose diagnostics --- doc/performance_considerations/verbose.md | 3 +- .../verbose_table.md | 98 +++++++++++++++++++ .../performance_profiling_and_inspection.rst | 1 + src/common/verbose_msg.hpp | 2 +- 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 doc/performance_considerations/verbose_table.md diff --git a/doc/performance_considerations/verbose.md b/doc/performance_considerations/verbose.md index 4bda2cd93c0..62ed1ca894e 100644 --- a/doc/performance_considerations/verbose.md +++ b/doc/performance_considerations/verbose.md @@ -149,7 +149,8 @@ onednn_verbose,primitive,create:dispatch,matmul,cpu,matmul,brg:avx512_core_bf16, Above, we can see that the highest performance implementations were not dispatched either because they required a higher ISA, or because they did not support that datatype configuration. - +A complete list of verbose messages encountered in the dispatch mode +can be found [here](https://oneapi-src.github.io/oneDNN/dev_guide_verbose_table.html) along with their explanation. ### Enable ONEDNN_VERBOSE with timestamps diff --git a/doc/performance_considerations/verbose_table.md b/doc/performance_considerations/verbose_table.md new file mode 100644 index 00000000000..594dea606cd --- /dev/null +++ b/doc/performance_considerations/verbose_table.md @@ -0,0 +1,98 @@ + +Verbose Message Catalogue {#dev_guide_verbose_table} +======================================================== + +The following catalogue lists verbose messages, explanations, and additional information for: primitive creation and dispatch checks for primitive implementations; and implementation failures that occur during engine/memory object creation. 
+ +## Primitive Creation/Dispatching + +| VERBOSE MESSAGE | SUBSTRING | PRIMITIVE | EXPLANATION | +|:----------------------------------------------------------------------|:----------|:------------|:--------------| +|**Bad/Invalid Arguments** | | | | | +|`bad algorithm` | | all | Bad or invalid algorithm [`dnnl::algorithm`](https://oneapi-src.github.io/oneDNN/enum_dnnl_algorithm.html) selected for the current primitive implementation. The choice and availability of the algorithm depends on the specific implementation selected for the primitive. For example, oneDNN supports Winograd convolution only on GPU and AArch64 CPU systems. | +|`bad propagation kind` | | all | Incorrect propagation kind [`dnnl::prop_kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_prop_kind.html) (`forward_training`,`backward_weights`, `backward_data`, etc.) selected for the current primitive implementation. | +|`bad param <p>` |`p` - initialization parameter for [`dnnl::primitive_desc`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_desc-2.html) | all | Invalid parameter passed for the initialization of the primitive descriptor.
**Example**: For the [`group_normalization`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_group_normalization.html) primitive, this message is displayed when the value passed for the `groups` parameter does not evenly divide the number of channels for the source tensor. | +|`one of the mandatory arguments is nullptr` | | all | A NULL pointer argument exception for the primitive methods. | +|`bad flags` | | all | Bad or unsupported flags specified for the primitive attributes [`dnnl::primitive_attr`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_attr-2.html) during initialization of the primitive descriptor [`dnnl::primitive_desc`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_desc-2.html). | +|**Unsupported Arguments/Parameters**|||| +|`unsupported isa` | | all | Primitive implementation does not support the current ISA. This typically results in the dispatching of the next best supporting implementation for the ISA.| +|`unsupported datatype` | | all | Tensor datatype is not supported for the current implementation of the primitive. Depending on the primitive, this may correspond to the source, weight, bias or destination tensors.| +|`unsupported datatype combination` | | all | Datatype combination for source, weight, bias or destination tensors is not supported for the current primitive implementation. | +|`unsupported attr` | | all | Bad or unsupported attributes [`dnnl::primitive_attr`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_attr-2.html) passed to the primitive descriptor for the current implementation. Since attributes are separately created from the corresponding primitive descriptor, the selected primitive implementation may not support the attribute configuration. | +|`unsupported post-ops` | | all | Unsupported post-ops configuration [`dnnl::post_ops`](https://oneapi-src.github.io/oneDNN/struct_dnnl_post_ops-2.html) passed on to the primitive descriptor. 
Similar to `dnnl::primitive_attr`, the selected implementation may not support the post-op configuration during primitive creation.| +|`unsupported scales configuration` | | all | Unsupported scales configuration specified for the primitive attributes [`dnnl::primitive_attr`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_attr-2.html). | +|`unsupported zero-point configuration` | | all | Unsupported zero-point configuration specified for the primitive attributes [`dnnl::primitive_attr`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_attr-2.html).| +|`unsupported bias configuration` | | all | Unsupported bias data configuration specified for the descriptors of compute-type primitives.| +|`unsupported sparse md configuration` | | all | Current primitive implementation does not support sparse data operations. | +|`unsupported format tag` | | all | Unsupported format tag [`dnnl::memory::format_tag`](https://oneapi-src.github.io/oneDNN/enum_dnnl_memory_format_tag.html) encountered during primitive operation.| +|`unsupported format tag for <t>` |`t` - tensor | all | Unsupported format tag [`dnnl::memory::format_tag`](https://oneapi-src.github.io/oneDNN/enum_dnnl_memory_format_tag.html) for specified tensor during primitive operation.| +|`unsupported format kind` | | all | Unsupported format kind [`dnnl::memory::format_kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_memory_format_kind.html?highlight=format_kind) encountered during primitive operation.| +|`runtime dimension is not supported` | | all | Current implementation does not support processing runtime-specified shapes and strides using `DNNL_RUNTIME_DIM_VAL`. | +|**Tensor Operations**|||| +|`tensor <t> has no elements` |`t` - tensor | all | Empty tensor passed as data to the primitive. Depending on the primitive, this may correspond to the source, weights or destination tensors. | +|`<t> has a bad number of dimensions <ndims>` |`t`- tensor
`ndims`- number of tensor dimensions | all | Tensor data has bad or invalid number of dimensions for the current primitive operation.
**Example**: The [`convolution`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_convolution.html) primitive expects only 1D, 2D or 3D tensors for operations and prints this message for any other data with higher dimensions. | +|`bad dimensions <t>:<axis>` |`t`- tensor
`axis`- axis | all | Tensor `<t>` has an invalid dimension along the specified axis.
**Example**: The [`concat`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_concat.html) primitive prints this message when the destination tensor dimension along the concatenated axis does not match the sum of the dimensions of the concatenated tensors. | +|`dimension <t0>:<a0> is inconsistent with <t1>:<a1>` |`t0, t1` - tensors,
`a0, a1` - tensor axes | all | Tensors `t0, t1` have inconsistent dimensions along axes `a0` and `a1` respectively.
**Example**: This is encountered for the [`matmul`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_matmul.html) primitive when the input matrices have mismatching dimensions. | +|`tensors <t0> and <t1> have inconsistent number of dimensions` |`t0, t1` - tensors | all |Tensors `t0, t1` have inconsistent number of dimensions for primitive operation.| +|`tensors <t0> and <t1> have inconsistent datatypes` |`t0, t1` - tensors | all | Tensors `t0, t1` have inconsistent datatypes for primitive operation.| +|**Unsupported Combinations**|||| +|`sparse encoding is not supported on this isa` | | all | *(self-explanatory)* | +|`datatype configuration not supported on this isa` | | all | *(self-explanatory)*| +|`datatype and propagation kind mismatch` | | all | *(self-explanatory)*| +|`inconsistent <t0> and <t1> mds` |`t0, t1` - tensors | all | Tensors `t0, t1` have inconsistent memory descriptors [`dnnl::memory::desc`](https://oneapi-src.github.io/oneDNN/struct_dnnl_memory_desc-2.html) for the primitive operation. | +|**Implementation Heuristics/Features**|||| +|`unsupported feature for implementation: <msg>` |`msg` - feature description | all | Current implementation is skipped because it does not support the specified feature for primitive operation. | +|`<msg> feature unavailable for device` |`msg` - feature description | all | Current implementation is skipped because the selected device does not support the specified feature for primitive operation.| +|`unsupported feature for padding: <msg>` |`msg` - feature description | all | Current implementation is skipped because of a padding inconsistency or unsupported feature related to padding.| +|`heuristic fail: <h>` |`h` - implementation heuristic | all | Implementation skipped due to specified heuristic.| +|`blocking heuristic fail: <h>` |`h` - implementation heuristic | all | Current implementation is skipped because of the specified inconsistency or implementation heuristic related to blocking. 
| +|**Primitive-Specific Messages**|||| +|`heuristic fail for 1x1 convolution: ` |`h` - implementation heuristic | [`convolution`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_convolution.html) | Implementation skipped due to specified heuristic related to 1x1 convolution.| +|` offsets do not fit into

datatype` |`o` - {`input`, `output`},
`dt` - datatype | [`convolution`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_convolution.html) | I/O dimension offsets do not fit into the specified datatype range for the kernel implementation. | +|`failed shape restrictions` | | [`convolution`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_convolution.html), [`gnorm`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_group_normalization.html) |Implementation skipped because the current data layout/shapes exceeds the range supported by the current implementation.| +|`alpha and beta parameters are not properly set` | | [`eltwise`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_eltwise.html) | Alpha and beta parameters are not properly set for the elementwise algorithm. | +|`large shapes fall back` | | [`gemm`](https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html?highlight=gemm#algorithms) | Heuristic to skip current implementation for large tensor shapes for better performance.| +|`only trivial strides are supported` | | [`gemm`](https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html?highlight=gemm#algorithms), [`rnn`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_rnn.html) | Current implementation for the primitive does not process non-trivial stride values.| +|`unsupported fpmath mode` | | [`matmul`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_matmul.html) | [Floating-point math mode](https://oneapi-src.github.io/oneDNN/group_dnnl_api_fpmath_mode.html?highlight=math%20mode) is not supported by the current primitive implementation.| +|`small shapes fall back` | | [`matmul`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_matmul.html) | Heuristic to skip current implementation for small tensor shapes for better performance. 
| +|`incompatible gemm format` | | [`matmul`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_matmul.html), [`ip`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_inner_product.html) | Specified GeMM format is incompatible with the current primitive implementation.| +|`unsupported tensor layout` | | [`reorder`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_reorder.html) | The data layout for the source/destination tensor is not supported by the current implementation.| +|`bad axis` | | [`softmax`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_softmax.html), [`shuffle`](https://oneapi-src.github.io/oneDNN/group_dnnl_api_shuffle.html) | Bad or invalid axis specified for softmax/shuffle operation. | +|`unsupported architecture` | `d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html) | [`gemm`](https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html?highlight=gemm#algorithms) | Unsupported architecture for specified device-type. Typically encountered when current GPU device does not support the primitive.| +|**Miscellaneous**|||| +|`failed to create nested primitive ` |`pm` - [`dnnl::primitive`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive-2.html) | all | Descriptor initialization for the nested primitive implementation was unsuccessful. | +|`failed to create descriptor` |`pm` -[`dnnl::primitive`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive-2.html), [`dnnl::memory`](https://oneapi-src.github.io/oneDNN/struct_dnnl_memory-2.html) | all | Descriptor initialization for the primitive or memory object was unsuccessful.| +|`bad accumulation mode` | | all | Bad or invalid [accumulation mode](https://oneapi-src.github.io/oneDNN/enum_dnnl_accumulation_mode.html) specified for primitive attribute [`dnnl::primitive_attr`](https://oneapi-src.github.io/oneDNN/struct_dnnl_primitive_attr-2.html). 
| +|`unsupported md flag` |`t` - tensor | all | Bad or unsupported flags specified for the memory descriptor [`dnnl::memory::desc`](https://oneapi-src.github.io/oneDNN/struct_dnnl_memory_desc-2.html). | +|`problem is not mathematically consistent` | | all | *(self-explanatory)* | +|`workspace mismatch between forward and backward primitive descriptors`| | all | *(self-explanatory)*| +|`workspace initialization failed` | | all | [Workspace](https://oneapi-src.github.io/oneDNN/dev_guide_inference_and_training_aspects.html?highlight=workspace#workspace) descriptor initialization was unsuccessful during primitive creation.| +|`invalid datatype for ` |`t` - tensor | all | The datatype for the tensor/data processed by the primitive is invalid.
**Example**: This is encountered when an undefined datatype `data_type::undef` is specified for the accumulator.| +|`failed to run kernel deterministically` | | all | Failed to run the application in [deterministic mode](https://oneapi-src.github.io/oneDNN/dev_guide_attributes_deterministic.html?highlight=deterministic). | +||||| + +## Engine Creation + +| VERBOSE MESSAGE | SUBSTRING | ENGINE | EXPLANATION | +|:-----------------------------------------------------|:----------|:------------|:--------------| +|`bad engine kind` | | all | Invalid value for [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html) encountered during engine creation. | +|`invalid <d> device in environment: index <i>` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html),
`i` - device index | all | Device of type `dnnl::engine::kind` and index `i` is invalid for the current environment. | +|`no device is available` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html) | all | No device of type `dnnl::engine::kind` was found during engine creation. | +|` devices are available but was queried` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html),
`n` - number of `d` devices,
`i` - queried device index | all | Queried index is out-of-range for device of type `dnnl::engine::kind`. | +|`device not found in the given context` | | all | *(self-explanatory)* | +|`unsupported platform (expected got )` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html),
`d0` - queried platform,
`d1` - available platform | `sycl`, `opencl` | Unsupported device platform encountered during engine creation. | +|`failed to create engine with index ` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html),
`i` - device index |all | Engine creation was unsuccessful for specified device index and kind. | +|`unsupported backend` |`d` - [`dnnl::engine::kind`](https://oneapi-src.github.io/oneDNN/enum_dnnl_engine_kind.html) | `sycl` | *(self-explanatory)* | +|`profiling capabilities are not supported` | | all | Experimental profiling ([ONEDNN_EXPERIMENTAL_PROFILING](https://oneapi-src.github.io/oneDNN/dev_guide_experimental.html?highlight=profiling#onednn-experimental-profiling)) is not enabled for the application.| +||||| + +## Memory Creation and Related Operations + +| VERBOSE MESSAGE | EXPLANATION | +|:------------------------------------------|:--------------| +|`bad arguments for memory descriptor` | Bad or unsupported values passed to the memory descriptor [`dnnl::memory::desc`](https://oneapi-src.github.io/oneDNN/struct_dnnl_memory_desc-2.html) during memory object creation. | +|`invalid memory index` | An out-of-range value encountered for memory handle during data mapping. | +|`unsupported memory stride` | Memory descriptor initialization failed due to unsupported value for memory strides. | +|`scratchpad memory limit exceeded` | [Scratchpad](https://oneapi-src.github.io/oneDNN/dev_guide_attributes_scratchpad.html?highlight=scratchpad) space is exhausted during GEMM kernel initialization. 
| +|`scratchpad initialization unsuccessful` | *(self-explanatory)* | +||||| \ No newline at end of file diff --git a/doc/rst/performance_profiling_and_inspection.rst b/doc/rst/performance_profiling_and_inspection.rst index f354eab1e0b..4a942169f07 100644 --- a/doc/rst/performance_profiling_and_inspection.rst +++ b/doc/rst/performance_profiling_and_inspection.rst @@ -5,6 +5,7 @@ Performance Profiling and Inspection :maxdepth: 1 dev_guide_verbose + dev_guide_verbose_table dev_guide_performance_settings dev_guide_benchdnn dev_guide_profilers diff --git a/src/common/verbose_msg.hpp b/src/common/verbose_msg.hpp index aac3bec955d..59983f59e4b 100644 --- a/src/common/verbose_msg.hpp +++ b/src/common/verbose_msg.hpp @@ -98,7 +98,7 @@ #define VERBOSE_ISA_DT_MISMATCH \ "datatype configuration not supported on this isa" #define VERBOSE_OFFSET_DT_MISMATCH "%s offsets do not fit into %s datatype" -#define VERBOSE_PROPKIND_DT_MISMATCH "data type and propagation kind mismatch" +#define VERBOSE_PROPKIND_DT_MISMATCH "datatype and propagation kind mismatch" #define VERBOSE_WS_MISMATCH \ "workspace mismatch between forward and backward primitive " \ "descriptors" From 0ab6977c1b1a87fb75dae05809d4e6d3ce508427 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 17 May 2024 16:33:29 -0700 Subject: [PATCH 140/187] gpu: intel: jit: ir: add missing const --- src/gpu/intel/jit/utils/trace.cpp | 2 +- src/gpu/intel/jit/utils/trace.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpu/intel/jit/utils/trace.cpp b/src/gpu/intel/jit/utils/trace.cpp index 93b4a6afc84..49eee86802a 100644 --- a/src/gpu/intel/jit/utils/trace.cpp +++ b/src/gpu/intel/jit/utils/trace.cpp @@ -34,7 +34,7 @@ ir_utils::debug_profiler_t &get_trace_profiler() { #if defined(DNNL_DEV_MODE) void trace_pass( - const char *pass_name, const stmt_t &stmt, ir_context_t &ir_ctx) { + const char *pass_name, const stmt_t &stmt, const ir_context_t &ir_ctx) { trace_stop(pass_name); ir_trace() << "=== After 
" << pass_name << std::endl; ir_trace() << stmt << std::endl; diff --git a/src/gpu/intel/jit/utils/trace.hpp b/src/gpu/intel/jit/utils/trace.hpp index 5aceefabf2c..2cb4fccd3c8 100644 --- a/src/gpu/intel/jit/utils/trace.hpp +++ b/src/gpu/intel/jit/utils/trace.hpp @@ -60,10 +60,10 @@ inline void trace_perf() {}; #if defined(DNNL_DEV_MODE) void trace_pass( - const char *pass_name, const stmt_t &stmt, ir_context_t &ir_ctx); + const char *pass_name, const stmt_t &stmt, const ir_context_t &ir_ctx); #else -inline void trace_pass( - const char *pass_name, const stmt_t &stmt, ir_context_t &ir_ctx) {}; +inline void trace_pass(const char *pass_name, const stmt_t &stmt, + const ir_context_t &ir_ctx) {}; #endif } // namespace jit From 22e01d0c888c4d2424601e9fe1fbb3fdf4f2aa1d Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 17 May 2024 16:33:59 -0700 Subject: [PATCH 141/187] gpu: intel: jit: trace constraints --- src/gpu/intel/jit/ir/ir.hpp | 27 +++++++++++++++++++++++++++ src/gpu/intel/jit/utils/trace.cpp | 1 + 2 files changed, 28 insertions(+) diff --git a/src/gpu/intel/jit/ir/ir.hpp b/src/gpu/intel/jit/ir/ir.hpp index e9ae2f5ad80..ce4073e5190 100644 --- a/src/gpu/intel/jit/ir/ir.hpp +++ b/src/gpu/intel/jit/ir/ir.hpp @@ -841,6 +841,33 @@ class constraint_set_t { int max_proven_gcd(const expr_t &var) const; + std::string str() const { + std::ostringstream oss; + oss << "relations:" << (relations_.empty() ? " (empty)\n" : "\n"); + for (auto &r : relations_) { + oss << "\t" << r.first << ":"; + bool first = true; + for (auto &s : r.second) { + oss << (first ? " " : ", ") << s; + first = false; + } + oss << "\n"; + } + + oss << "modulus_info:" + << (modulus_infos_.empty() ? " (empty)\n" : "\n"); + for (auto &m : modulus_infos_) { + oss << "\t" << m.first << ":"; + bool first = true; + for (auto &s : m.second) { + oss << (first ? 
" " : ", ") << s; + first = false; + } + oss << "\n"; + } + return oss.str(); + } + private: bool can_prove_modulus(const expr_t &e) const { modulus_info_t unknown(e); diff --git a/src/gpu/intel/jit/utils/trace.cpp b/src/gpu/intel/jit/utils/trace.cpp index 49eee86802a..74a74c1f172 100644 --- a/src/gpu/intel/jit/utils/trace.cpp +++ b/src/gpu/intel/jit/utils/trace.cpp @@ -40,6 +40,7 @@ void trace_pass( ir_trace() << stmt << std::endl; auto grf_usage = get_grf_usage(stmt, ir_ctx.hw().grf_size()); if (!grf_usage.is_empty()) ir_trace() << grf_usage << std::endl; + ir_trace() << ir_ctx.cset() << std::endl; } #endif From 86041ec86a74f10f77578b7fae6abbab66c695c5 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 17 May 2024 16:34:42 -0700 Subject: [PATCH 142/187] gpu: intel: jit: pass: sort by cost per byte in cse --- src/gpu/intel/jit/pass/cse.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/jit/pass/cse.cpp b/src/gpu/intel/jit/pass/cse.cpp index e7d0b3d0170..aa6e781269a 100644 --- a/src/gpu/intel/jit/pass/cse.cpp +++ b/src/gpu/intel/jit/pass/cse.cpp @@ -424,7 +424,8 @@ class cse_context_t { it != sorted_var_entries.end();) { std::sort(it, sorted_var_entries.end(), [&](const cse_var_entry_t *a, const cse_var_entry_t *b) { - return a->cost() > b->cost(); + // Sort by cost per byte + return a->cost() * b->size() > b->cost() * a->size(); }); while (it != sorted_var_entries.end()) { auto &e = **it; From 337cb2d3816cab0cfacc957fc76124cc3c7ae03d Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 17 May 2024 17:19:04 -0700 Subject: [PATCH 143/187] gpu: intel: jit: fix cse algorithm The current CSE algorithm has a few issues. First, the initial cost computation is not done in topological order, so whether a variable is allocated or not during this computation is semi-random. Because of this, the implementation will remove sub-optimal expression from allocation. 
Second, greedy addition overprioritizes leaf computations which share a common dependency. By switching to greedy removal, shared computations will be more appropriately prioritized. This patch fixes both these issues while switching to a greedy removal algorithm. --- src/gpu/intel/jit/pass/cse.cpp | 197 ++++++++++++++++++++------------- 1 file changed, 120 insertions(+), 77 deletions(-) diff --git a/src/gpu/intel/jit/pass/cse.cpp b/src/gpu/intel/jit/pass/cse.cpp index aa6e781269a..cfb70bb6867 100644 --- a/src/gpu/intel/jit/pass/cse.cpp +++ b/src/gpu/intel/jit/pass/cse.cpp @@ -77,7 +77,12 @@ class cse_var_entry_t { const cse_expr_t *cse_expr() const { return cse_expr_; } - bool allocated() const { return allocated_; } + bool unallocated() const { return allocated_ == allocated_t::no; } + bool allocated() const { return allocated_ == allocated_t::yes; } + + void set_unallocated() { allocated_ = allocated_t::no; } + void set_allocated() { allocated_ = allocated_t::yes; } + void mark() { allocated_ = allocated_t::mark; } int size() const { return utils::rnd_up( @@ -91,33 +96,16 @@ class cse_var_entry_t { var2entry_ = &var2entry; } - void add_back_ref(cse_var_entry_t *e) { - ir_assert(e != this); - back_refs_.insert(e); - } - - void mark_as_allocated() { - allocated_ = true; - update_back_ref_cost(); - } - void recompute_cost() { cost_ = expr_cost(cse_expr_->expr) * cse_expr_->refs; } private: - void update_back_ref_cost() { - for (auto *e : back_refs_) { - e->recompute_cost(); - e->update_back_ref_cost(); - } - } - int expr_cost(const expr_t &e) { if (is_var(e)) { auto it = var2entry_->find(e); if (it == var2entry_->end()) return 0; - if (it->second->allocated_) return 0; + if (it->second->allocated()) return 0; // If variable is not allocated, its value // has to be recomputed every time. 
return it->second->cost(); @@ -138,9 +126,15 @@ class cse_var_entry_t { const cse_expr_t *cse_expr_ = nullptr; int cost_ = 0; - bool allocated_ = false; - std::unordered_set back_refs_; + enum class allocated_t { + no, + yes, + mark // Used for topological sorting algorithm + }; + + allocated_t allocated_ = allocated_t::no; + const object_map_t *var2entry_ = nullptr; }; @@ -374,72 +368,121 @@ class cse_context_t { return true; } - void set_skip_exprs(const stmt_t &root, int limit, int grf_size) { - // Initialize variable-entry for each potential CSE variable. - std::vector var_entries; - for (auto &kv : cse_exprs_) { - auto &cse_expr = kv.second; - if (cse_expr.cse_var.is_empty()) continue; - var_entries.emplace_back(&cse_expr); - } - // Create mapping from CSE var to entry. - object_map_t var2entry; - for (auto &e : var_entries) { - var2entry.emplace(e.cse_expr()->cse_var, &e); - e.set_var2entry(var2entry); - } - // Initialize back references. - for (auto &e : var_entries) { - auto vars = find_objects(e.cse_expr()->expr); - for (auto &v : vars) { - auto it = var2entry.find(v); - if (it == var2entry.end()) continue; - it->second->add_back_ref(&e); + void set_skip_exprs( + const stmt_t &root, int usage, int limit, int grf_size) { + struct var_entries_t { + var_entries_t( + const object_eq_map_t &cse_exprs) { + for (auto &kv : cse_exprs) { + auto &cse_expr = kv.second; + if (cse_expr.cse_var.is_empty()) continue; + entries_.emplace_back(&cse_expr); + } + + for (auto &e : entries_) { + var2entry_.emplace(e.cse_expr()->cse_var, &e); + e.set_var2entry(var2entry_); + } + + topological_sort(); + + for (auto &e : entries_) { + gpu_assert(e.allocated()) + << "unallocated: " << e.cse_expr()->cse_var << " = " + << e.cse_expr()->expr << "\n"; + } } - var2entry.emplace(e.cse_expr()->cse_var, &e); - } - // Initialize cost. - for (auto &e : var_entries) { - e.recompute_cost(); - } - // Initialize statement-entry for each potential statement of CSE - // variable attachement. 
- std::unordered_map - stmt_entries; - cse_memory_usage_visitor_t mem_usage_visitor( - stmt_entries, cse_exprs_, grf_size); - mem_usage_visitor.visit(root); - for (auto &kv : stmt_entries) - kv.second.propagate_usage_up(); - - // Greedily find the variable with the highest current complexity that - // won't exceed the usage limit, mark it as allocated and recompute - // complexity for other dependent vars. Stop once there are no - // such variables. + + std::vector::iterator begin() { + return entries_.begin(); + }; + std::vector::iterator end() { + return entries_.end(); + }; + + private: + // Depth first search visitor for topological_sort() + void visit(cse_var_entry_t &e, + std::vector::reverse_iterator &head) { + if (e.allocated()) return; + gpu_assert(e.unallocated()) + << "Cyclic expression dependency detected"; + e.mark(); + + for (auto &dep : find_objects(e.cse_expr()->expr)) { + auto it = var2entry_.find(dep); + if (it != var2entry_.end()) visit(*(it->second), head); + } + e.set_allocated(); + + *head++ = &e; + } + + // Topological sort `entries_` and mark all expressions as allocated. + // Topological sort is required for correct iteration order when + // updating node costs. Uses a depth first search based algorithm. + void topological_sort() { + std::vector e_sorted( + entries_.size(), nullptr); + auto head = e_sorted.rbegin(); + for (auto it = entries_.begin(); it != entries_.end(); it++) { + if (it->unallocated()) { visit(*it, head); } + } + + std::vector entries; + entries.reserve(entries_.size()); + for (auto e_ptr : e_sorted) { + entries.emplace_back(*e_ptr); + } + entries_ = std::move(entries); + for (auto &e : entries_) { + var2entry_[e.cse_expr()->cse_var] = &e; + } + } + + std::vector entries_; + object_map_t var2entry_; + }; + + var_entries_t var_entries(cse_exprs_); + + // Greedily remove the least beneficial variable until memory usage + // limit is met. 
std::vector sorted_var_entries; - for (auto &e : var_entries) + for (auto &e : var_entries) { sorted_var_entries.push_back(&e); + } - for (auto it = sorted_var_entries.begin(); - it != sorted_var_entries.end();) { + int overflow_size = usage - limit; + auto it = sorted_var_entries.begin(); + while (overflow_size > 0 && it != sorted_var_entries.end()) { + // Update costs. + for (auto &e : var_entries) { + e.recompute_cost(); + } std::sort(it, sorted_var_entries.end(), [&](const cse_var_entry_t *a, const cse_var_entry_t *b) { // Sort by cost per byte - return a->cost() * b->size() > b->cost() * a->size(); + return a->cost() * b->size() < b->cost() * a->size(); }); - while (it != sorted_var_entries.end()) { - auto &e = **it; - auto &stmt_entry = stmt_entries.at(e.cse_expr()->path.back()); - if (stmt_entry.try_allocate(e.size(), limit)) { - e.mark_as_allocated(); - ++it; - break; - } - ++it; - } + auto &e = **it; + + ir_trace() << "cse_pass: unmarking " << e.cse_expr()->expr + << " with cost " << e.cost() << ", size " << e.size() + << ", and cost per byte " << (double)e.cost() / e.size() + << "\n"; + + e.set_unallocated(); + overflow_size -= e.size(); + ++it; } // Skip not allocated variables. + + // TODO: Rather than rerun CSE, just delete `let_t` and substitute + // variables with their value. This needs to be performed in the reverse + // order on `var_entries` to ensure no substitutions are missed in + // computation chains. for (auto &e : var_entries) { if (e.allocated()) continue; skip_exprs_.insert(e.cse_expr()->orig_expr); @@ -772,7 +815,7 @@ stmt_t eliminate_common_subexprs_impl(const stmt_t &_stmt, cse_context_t &ctx, ir_trace() << "CSE exceeded GRF usage limit. Usage: " << memory_usage << ", limit: " << memory_usage_limit << ". Retry CSE and skip some expressions..." 
<< std::endl; - ctx.set_skip_exprs(_stmt, memory_usage_limit, grf_size); + ctx.set_skip_exprs(_stmt, memory_usage, memory_usage_limit, grf_size); ctx.reset_cse_exprs(); return stmt_t(); } From 7dc0868aeebdbb0da1f471176d917a4de55f9bb9 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 3 May 2024 13:30:32 -0700 Subject: [PATCH 144/187] gpu: intel: jit: gemm: add larger thin m OHS kernels Add more kernels targeting 8 < m <=32. --- src/gpu/intel/jit/gemm/kernel.db | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index a6c26bce413..f40a645ddb2 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -15,8 +15,8 @@ *******************************************************************************/ /*@kcatalog@*/ -kcatalog::FlatCatalog<1023> _CATALOG_ -{1, 8379, 1023, { +kcatalog::FlatCatalog<1024> _CATALOG_ +{1, 8380, 1024, { {{'9', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 as8x2 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab l4 acb nmk", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 as16 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 
0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, @@ -1039,5 +1039,6 @@ kcatalog::FlatCatalog<1023> _CATALOG_ {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S1,16@32 aB32x2 aB wg 1x4x8 kr cb4 ks32 af vav di hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.0672e+06, 762832, 500.274, 77177.4, 0, 0, 1.36188, 1.19635, 3.53273, 8.13743, 0.0615908, 0.0615908, 0, 0.79968, 1.28461, 0.929214, 4.37091e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S32@48 aB32/16x2 aB wg 8x4 cb4x2 ks32 af vav di hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {964729, 752109, 0, 0, 0, 0, 1.46062, 1.46554, 1.01724, 2.17327, 0.0239322, 0.0239322, 0, 1, 1.25665, 0.945813, 3.00962e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S64@64 aB16x2 aB wg 4x8 cb4 ks64 af vav di hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.00341e+06, 675337, 0, 0, 0, 0, 2.06693, 1.45219, 1.47193, 3.06315, 0.0374548, 0.0374548, 0, 1, 1.33339, 0.377183, 1.26633e-11}}}, -{{'G', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, 
-1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 kr af vav ar sb64 bm0 bk0 sys np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}} +{{'G', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav ar sb64 bm0 bk0 sys nmk np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, +{{'G', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 9, -1}, {-1, 64, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav ar sb64 bm0 bk0 sys nmk np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, }} From 7c4ece7ea892ddd67f916c13e330f0283b1a9304 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 15 May 2024 13:55:00 -0700 Subject: [PATCH 145/187] cpu: x64: jit_brgemm_post_ops: cleanup * Remove jcp in a favor of brg; * Renamed internal members to have underscore; * Removed unnecessary members; * Added TODO design comments for a future reference; --- src/cpu/x64/jit_brgemm_conv.cpp | 22 +- src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp | 9 +- src/cpu/x64/jit_brgemm_post_ops.hpp | 245 ++++++++++---------- 3 files changed, 154 insertions(+), 122 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv.cpp b/src/cpu/x64/jit_brgemm_conv.cpp index 0e2aa49ddac..71a0c4ce590 100644 --- a/src/cpu/x64/jit_brgemm_conv.cpp +++ b/src/cpu/x64/jit_brgemm_conv.cpp @@ -758,10 +758,13 @@ status_t 
brgemm_convolution_fwd_t::add_po_kernel( bcfg->LDD = (is_init && jcp.use_buffer) ? jcp.LDC : jcp.LDD; bcfg->dt_c = (!is_init && jcp.use_buffer) ? jcp.acc_dt : jcp.dst_dt; // inp bcfg->dt_d = (is_init && jcp.use_buffer) ? jcp.acc_dt : jcp.dst_dt; // out + bcfg->typesize_C = types::data_type_size(bcfg->dt_c); + bcfg->typesize_D = types::data_type_size(bcfg->dt_d); bcfg->alpha = !is_init && IMPLICATION(jcp.with_sum, jcp.use_buffer); bcfg->beta = is_init ? 0 : 1; + // See the comment in `add_po_kernels` why `*_pd->attr()` is needed so far. CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops(jcp, *bcfg, *_pd->attr()))); + new jit_brgemm_kernel_post_ops(*bcfg, *_pd->attr()))); kernels_po_[ker_idx]->create_kernel(); return status::success; } @@ -781,6 +784,18 @@ void brgemm_convolution_fwd_t::add_po_kernels( if (init_bcast_dim > 0) { if (brgs[brg_idx]) { + // Note: The particular line below means a copy of brgemm_desc + // object. The copy here is due to: + // * PD creation time passed, original objects can't be modified. + // * PO kernel requires (for some reason) custom values for certain + // members in brgemm descriptor. + // When the copy is performed, it erases underlying memory for + // attributes and dst_md, which means they can't be used in any + // further call due to the temporary object on stack (after copy) + // will be destroyed and the address of, e.g. the address of the sum + // scale (used in the post-ops kernel), will be invalidated. + // This copy puts restrictions on what objects can be used in + // sub-calls and a developer should be careful about that. 
auto init_cfg = *(brgs[brg_idx]); auto ker_init_idx = get_ker_po_idx(init_bcast_dim - 1, false, i_N); if (init_cfg.load_dim > 0 && kernels_po_[ker_init_idx] == nullptr) { @@ -1625,8 +1640,9 @@ void brgemm_convolution_fwd_t::perform_outwork( auto call_outwork_ker = [&](bool is_postwork, bool has_postcomp, int ow_pw_s, int ow_pw_l) { auto ker_po_idx = get_ker_po_idx(ow_pw_l - 1, is_postwork, is_oc_tail); - const auto outwork_ker = kernels_po_[ker_po_idx].get(); - assert(outwork_ker != nullptr && ow_pw_l == outwork_ker->brg.bcast_dim); + const auto &outwork_ker = kernels_po_[ker_po_idx].get(); + assert(outwork_ker != nullptr + && ow_pw_l == outwork_ker->get_bcast_dim()); if (is_postwork) { p.apply_comp = has_postcomp; p.a_zp_compensation = has_postcomp && jcp.src_zero_point diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp index a4cb990712e..bb79bd02f17 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp @@ -294,15 +294,18 @@ status_t brgemm_convolution_bwd_strided_t::add_po_kernel( bcfg->LDD = (is_init && jcp.use_buffer) ? jcp.LDC : jcp.LDD; bcfg->dt_c = (!is_init && jcp.use_buffer) ? jcp.acc_dt : jcp.dst_dt; bcfg->dt_d = (is_init && jcp.use_buffer) ? jcp.acc_dt : jcp.dst_dt; + bcfg->typesize_C = types::data_type_size(bcfg->dt_c); + bcfg->typesize_D = types::data_type_size(bcfg->dt_d); bcfg->alpha = (!is_init && IMPLICATION(jcp.with_sum, jcp.use_buffer)) ? 1 : 0; bcfg->beta = is_init ? 0 : 1; CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops(jcp, *bcfg, *_pd->attr()))); + new jit_brgemm_kernel_post_ops(*bcfg, *_pd->attr()))); kernels_po_[ker_idx]->create_kernel(); return status::success; } +// TODO: consolidate with jit_brgemm_conv.cpp version. 
template void brgemm_convolution_bwd_strided_t::add_po_kernels( int i_N, int init_bcast_dim, int po_bcast_dim) { @@ -340,6 +343,7 @@ void brgemm_convolution_bwd_strided_t::add_po_kernels( } } } + template int brgemm_convolution_bwd_strided_t::get_comp_ker_idx(const int kd_b, const int kd_e, const int kh_b, const int kh_e, const int kw_b, @@ -982,7 +986,8 @@ void brgemm_convolution_bwd_strided_t::perform_outwork(char *dst_base, auto ker_po_idx = get_ker_po_idx(iw_pw_l - 1, is_postwork, is_ic_tail); const auto outwork_ker = kernels_po_[ker_po_idx].get(); const auto comp_iw_s = get_comp_iw(iw_pw_s); - assert(outwork_ker != nullptr && iw_pw_l == outwork_ker->brg.bcast_dim); + assert(outwork_ker != nullptr + && iw_pw_l == outwork_ker->get_bcast_dim()); if (is_postwork) { p.apply_comp = has_postcomp; p.a_zp_compensation = has_postcomp && jcp.src_zero_point diff --git a/src/cpu/x64/jit_brgemm_post_ops.hpp b/src/cpu/x64/jit_brgemm_post_ops.hpp index e741d232519..4ee2ae6df28 100644 --- a/src/cpu/x64/jit_brgemm_post_ops.hpp +++ b/src/cpu/x64/jit_brgemm_post_ops.hpp @@ -364,23 +364,25 @@ struct brgemm_kernel_post_ops_t { template struct jit_brgemm_kernel_post_ops : public jit_generator { - jit_brgemm_kernel_post_ops(const jit_brgemm_conv_conf_t &ajcp, + // TODO: the proper design should replace `brgemm_desc_t` argument and + // introduce a dedicated struct with members properly initialized. This will + // let avoiding a `brgemm_desc_t` object copy which is unsafe due to `attr` + // member. 
+ jit_brgemm_kernel_post_ops( const brgemm_desc_t &abrg, const primitive_attr_t &aattr) : jit_generator(jit_name(), abrg.isa_impl) - , brg(abrg) - , jcp(ajcp) - , attr(aattr) - , postops_injector_(nullptr) - , with_binary_non_scalar_bcast_(brg.with_binary + , brg_(abrg) + , attr_(aattr) + , with_binary_non_scalar_bcast_(brg_.with_binary && binary_injector:: any_binary_postop_rhs_non_scalar_broadcast( - brg.attr()->post_ops_, - memory_desc_wrapper(brg.dst_md()))) { + attr_.post_ops_, + memory_desc_wrapper(brg_.dst_md()))) { bool has_f8_e5m2_binary_postops = false; bool has_f8_e4m3_binary_postops = false; - if (brg.with_binary) { - const auto &post_ops = attr.post_ops_; + if (brg_.with_binary) { + const auto &post_ops = attr_.post_ops_; for (int i = 0; i < post_ops.len(); i++) { const auto &entry = post_ops.entry_[i]; if (!entry.is_binary()) continue; @@ -391,25 +393,27 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } } - if (brg.is_bf16_emu) + if (brg_.is_bf16_emu) bf16_emu_ = utils::make_unique(this, emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_scratch, emu_reserv_4, emu_reserv_4); - if (brg.is_fp8_via_convert() || has_f8_e5m2_binary_postops + if (brg_.is_fp8_via_convert() || has_f8_e5m2_binary_postops || has_f8_e4m3_binary_postops) { - if (utils::one_of(data_type::f8_e5m2, brg.dt_a, brg.dt_b, brg.dt_d) + if (utils::one_of( + data_type::f8_e5m2, brg_.dt_a, brg_.dt_b, brg_.dt_d) || has_f8_e5m2_binary_postops) f8_e5m2_emulator_ = utils::make_unique( this, emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_mask, emu_scratch); - if (utils::one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_d) + if (utils::one_of( + data_type::f8_e4m3, brg_.dt_a, brg_.dt_b, brg_.dt_d) || has_f8_e4m3_binary_postops) f8_e4m3_emulator_ = utils::make_unique( this, emu_reserv_1, emu_reserv_2, emu_reserv_3, emu_reserv_4, emu_reserv_5, emu_scratch); } - if (brg.beta != 0) { + if (brg_.beta != 0) { static constexpr bool preserve_gpr = true; static constexpr bool preserve_vmm = 
true; static constexpr bool use_exact_tail_scalar_bcast = false; @@ -418,8 +422,8 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { static_cast(vmm_tmp(4).getIdx()), this->r14, this->r15, this->r13, preserve_gpr, preserve_vmm, GET_OFF(ptr_binary_post_ops_rhs), GET_OFF(dst_orig), - memory_desc_wrapper(brg.dst_md()), - static_cast(brg.load_dim % brg.ld_block), + memory_desc_wrapper(brg_.dst_md()), + static_cast(brg_.load_dim % brg_.ld_block), k_tail_mask, use_exact_tail_scalar_bcast}; const binary_injector::static_params_t bsp(this->param1, bcast_set_t {broadcasting_strategy_t::scalar, @@ -430,7 +434,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { broadcasting_strategy_t::no_broadcast}, rhs_sp, f8_e5m2_emulator_.get(), f8_e4m3_emulator_.get()); - const bool save_state = jcp.with_eltwise; + const bool save_state = brg_.with_eltwise; const auto &reserved_eltwise_gpr = reg_reserved_eltwise; const auto reserved_eltwise_maskr = Xbyak::Opmask(1); @@ -439,34 +443,39 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { postops_injector_ = utils::make_unique< injector::jit_uni_postops_injector_t>( - this, attr.post_ops_, bsp, esp); + this, attr_.post_ops_, bsp, esp); } - const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); + const auto &wei_scales = attr_.scales_.get(DNNL_ARG_WEIGHTS); // per_oc: conv: 1 << 0, (1 << 1) + (1 << 0) (with groups) // per_oc: ip: 1 << 0 is_oc_scale_ = utils::one_of(wei_scales.mask_, 1 << 0, (1 << 1) + (1 << 0)); - LDD_ = brg.LDD; - inp_dt_ = brg.dt_c; - out_dt_ = brg.dt_d; - bia_dt_ = jcp.bia_dt; - inp_typesize_ = types::data_type_size(inp_dt_); - out_typesize_ = types::data_type_size(out_dt_); - bia_typesize_ = (jcp.with_bias) ? 
types::data_type_size(bia_dt_) : 0; + inp_dt_ = brg_.dt_c; + out_dt_ = brg_.dt_d; + bia_dt_ = brg_.dt_bias; + + inp_typesize_ = brg_.typesize_C; + out_typesize_ = brg_.typesize_D; + bia_typesize_ = brg_.typesize_bias; } ~jit_brgemm_kernel_post_ops() = default; DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brgemm_kernel_post_ops) - brgemm_desc_t brg; - jit_brgemm_conv_conf_t jcp; - const primitive_attr_t &attr; + // Used for assertion on implementation side in debug mode. + int get_bcast_dim() const { return brg_.bcast_dim; } private: - int LDD_; + // This can't be a reference, otherwise, `get_bcast_dim()` would return + // rubbish due to brgemm_desc argument is a copy on stack (see comment + // above). + // This means a copy at construction time. + // This class is ridiculously broken. + brgemm_desc_t brg_; + const primitive_attr_t &attr_; data_type_t inp_dt_; data_type_t out_dt_; @@ -554,28 +563,28 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { Vmm vmm_tmp(int i) const { return Vmm(max_vregs_ - 1 - i); } int zp_c_values_offset(int n, bool is_tail = false) const noexcept { - if (brg.zp_type_c == brgemm_broadcast_t::per_n) { - return (is_tail) ? sizeof(int32_t) * brg.ldb_tail - : sizeof(int32_t) * n * brg.ld_block; + if (brg_.zp_type_c == brgemm_broadcast_t::per_n) { + return (is_tail) ? sizeof(int32_t) * brg_.ldb_tail + : sizeof(int32_t) * n * brg_.ld_block; } return 0; } int zp_comp_a_vpad_offset( int n, int m, bool is_tail = false) const noexcept { - return (is_tail) ? sizeof(int32_t) * (brg.ldb_tail + m * brg.LDB) - : sizeof(int32_t) * (n * brg.ld_block + m * brg.LDB); + return (is_tail) ? sizeof(int32_t) * (brg_.ldb_tail + m * brg_.LDB) + : sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); } int mb_zp_comp_a_offset(int m_block) const noexcept { - return sizeof(int32_t) * m_block * brg.LDB; + return sizeof(int32_t) * m_block * brg_.LDB; } int compensation_vpad_offset( int n, int m, bool is_tail = false) const noexcept { - return (is_tail) ? 
sizeof(int32_t) * (brg.ldb_tail + m * brg.LDB) - : sizeof(int32_t) * (n * brg.ld_block + m * brg.LDB); + return (is_tail) ? sizeof(int32_t) * (brg_.ldb_tail + m * brg_.LDB) + : sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); } int mb_compensation_offset(int m_block) const noexcept { - return sizeof(int32_t) * m_block * brg.LDB; + return sizeof(int32_t) * m_block * brg_.LDB; } template @@ -609,13 +618,13 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { break; case data_type::f16: vcvtph2ps(vmm, op); break; case data_type::f8_e5m2: - if (brg.is_fp8_via_convert()) + if (brg_.is_fp8_via_convert()) f8_e5m2_emulator_->vcvt_f8_to_f32(vmm, op); else assert(!"Not supported yet"); break; case data_type::f8_e4m3: - if (brg.is_fp8_via_convert()) + if (brg_.is_fp8_via_convert()) f8_e4m3_emulator_->vcvt_f8_to_f32(vmm, op); else assert(!"Not supported yet"); @@ -632,7 +641,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { Vmm vector(int m, int n, int n_block) { return Vmm(m * n_block + n); }; void inject_attr_postops(int m_block, int n_block, int tail = 0) { - const auto &p = attr.post_ops_; + const auto &p = attr_.post_ops_; const int sum_idx = p.find(primitive_kind::sum); const auto k_mask = tail == 0 ? 
k_full_mask : k_tail_mask; const auto sum_dt = p.get_sum_dt(out_dt_); @@ -657,7 +666,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { for (int n = 0; n < n_block; n++) { const auto vmm = vector(m, n, n_block); const auto addr = ptr[aux_reg_out - + out_typesize_ * (m * LDD_ + n * brg.ld_block)]; + + out_typesize_ * (m * brg_.LDD + n * brg_.ld_block)]; const auto vmm_prev_dst = vmm_tmp(0); cvt2ps(sum_dt, vmm_prev_dst, addr, tail, false, k_mask); @@ -678,7 +687,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } }; - if (jcp.with_sum) { + if (brg_.with_sum) { postops_injector_->set_lambda_injector( primitive_kind::sum, sum_injector); } @@ -690,7 +699,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { for (int n = 0; n < n_block; n++) { const auto vmm_idx = vector(m, n, n_block).getIdx(); const size_t aux_output_offset - = out_typesize_ * (m * LDD_ + n * brg.ld_block); + = out_typesize_ * (m * brg_.LDD + n * brg_.ld_block); rhs_arg_params.vmm_idx_to_out_reg.emplace(vmm_idx, aux_reg_out); rhs_arg_params.vmm_idx_to_out_elem_off_val.emplace( @@ -705,7 +714,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { void apply_comp(int m_block, int n_block, int tail = 0) { auto k_mask = (tail == 0) ? k_full_mask : k_tail_mask; const bool has_tail = tail > 0; - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { auto vmm_zp_a_val = vmm_tmp(1); mov(reg_zp_a_val, ptr[rsp + reg_zp_a_val_offs_]); uni_vpbroadcastd(vmm_zp_a_val, reg_zp_a_val.cvt32()); @@ -715,7 +724,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { const size_t zp_comp_offset - = sizeof(int32_t) * (n * brg.ld_block + m * brg.LDB); + = sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); auto zp_comp_a_addr = is_superset(isa, avx512_core) ? 
EVEX_compress_addr(aux_reg_zp_a_comp, zp_comp_offset) : ptr[aux_reg_zp_a_comp + zp_comp_offset]; @@ -734,13 +743,13 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(aux_reg_s8s8_comp, ptr[rsp + aux_reg_s8s8_comp_offs_]); const auto vmm_comp = vmm_tmp(0); for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { const size_t s8s8_comp_offset - = sizeof(int32_t) * (n * brg.ld_block + m * brg.LDB); + = sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); auto comp_addr = is_superset(isa, avx512_core) ? EVEX_compress_addr( @@ -775,23 +784,23 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { void apply_post_ops(int m_block, int n_block, int tail = 0) { const auto vector = [=](int m, int n) { return Vmm(m * n_block + n); }; auto k_mask = (tail == 0) ? k_full_mask : k_tail_mask; - const auto req_comp = brg.is_int8 && brg.beta != 0 - && (brg.req_s8s8_compensation - || brg.zp_type_a != brgemm_broadcast_t::none); + const auto req_comp = brg_.is_int8 && brg_.beta != 0 + && (brg_.req_s8s8_compensation + || brg_.zp_type_a != brgemm_broadcast_t::none); - // brg.alpha == 0 means initialize registers, 1 means read from input - // brg.beta == 0 means skip postwork, 1 means do postwork + // brg_.alpha == 0 means initialize registers, 1 means read from input + // brg_.beta == 0 means skip postwork, 1 means do postwork // req_comp == true -> convert accumulated values to f32 after applying // compensation to avoid the loss of accuracy when converting s32 to f32 for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { - if (brg.alpha == 0) { + if (brg_.alpha == 0) { // have to init vmm each time because vectors may have been // changed in the previous iterations uni_vpxor(vector(m, n), vector(m, n), vector(m, n)); } else { auto inp_addr = ptr[aux_reg_in - + inp_typesize_ * (m * brg.LDC + n * brg.ld_block)]; + + inp_typesize_ * (m * brg_.LDC + n * 
brg_.ld_block)]; cvt2ps(inp_dt_, vector(m, n), inp_addr, tail, false, k_mask, req_comp); } @@ -799,11 +808,11 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (req_comp) maybe_apply_comp(m_block, n_block, tail); - if (brg.beta != 0) { + if (brg_.beta != 0) { for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { const auto addr = ptr[aux_reg_scales - + is_oc_scale_ * sizeof(float) * (n * brg.ld_block)]; + + is_oc_scale_ * sizeof(float) * (n * brg_.ld_block)]; auto vmm = vector(m, n); if (IMPLICATION(tail > 0, isa_has_masks(isa))) { vmm = maybe_mask(vector(m, n), tail > 0, false, k_mask); @@ -816,11 +825,11 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } } - if (brg.beta != 0 && jcp.with_bias) { + if (brg_.beta != 0 && brg_.with_bias) { for (int n = 0; n < n_block; n++) { auto vmm_bias = vmm_tmp(0); auto bias_addr = ptr[aux_reg_bias - + bia_typesize_ * (n * brg.ld_block)]; + + bia_typesize_ * (n * brg_.ld_block)]; cvt2ps(bia_dt_, vmm_bias, bias_addr, tail, false, k_mask); for (int m = 0; m < m_block; m++) { vaddps(vector(m, n), vmm_bias); @@ -830,7 +839,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (postops_injector_) inject_attr_postops(m_block, n_block, tail); - if (brg.beta != 0 && brg.with_dst_scales) { + if (brg_.beta != 0 && brg_.with_dst_scales) { mov(aux_reg_dst_scales, ptr[rsp + reg_dst_scales_offs_]); const auto addr = ptr[aux_reg_dst_scales]; auto vmm_scales = vmm_tmp(0); @@ -848,10 +857,10 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } } - if (brg.beta != 0 && brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.beta != 0 && brg_.zp_type_c != brgemm_broadcast_t::none) { mov(aux_reg_zp_c_values, ptr[rsp + aux_reg_zp_c_values_offs_]); auto vmm_zp_c = vmm_tmp(0); - if (brg.zp_type_c == brgemm_broadcast_t::per_tensor) { + if (brg_.zp_type_c == brgemm_broadcast_t::per_tensor) { if (is_superset(isa, avx512_core)) vcvtdq2ps(vmm_zp_c, EVEX_compress_addr(aux_reg_zp_c_values, 
0, true)); @@ -861,7 +870,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } } for (int n = 0; n < n_block; n++) { - if (brg.zp_type_c == brgemm_broadcast_t::per_n) { + if (brg_.zp_type_c == brgemm_broadcast_t::per_n) { int zp_c_off = zp_c_values_offset(n); auto zp_c_addr = is_superset(isa, avx512_core) ? EVEX_compress_addr(aux_reg_zp_c_values, zp_c_off) @@ -886,7 +895,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { data_type::f32, out_dt_); } - if (brg.is_bf16_emu) bf16_emu_->init_vcvtneps2bf16(); + if (brg_.is_bf16_emu) bf16_emu_->init_vcvtneps2bf16(); for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { @@ -894,7 +903,8 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { // of `n`, implying n_block must be equal to `1`. assert(IMPLICATION(tail > 0, n_block == 1)); auto vmm = vector(m, n); - const size_t offset = out_typesize_ * (m * LDD_ + n * brg.ld_block); + const size_t offset + = out_typesize_ * (m * brg_.LDD + n * brg_.ld_block); const auto addr = ptr[aux_reg_out + offset]; if (dt_requires_saturation) { @@ -913,7 +923,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { case data_type::f32: case data_type::s32: uni_vmovups(addr, vmm_masked); break; case data_type::bf16: - if (brg.is_bf16_emu) { + if (brg_.is_bf16_emu) { bf16_emu_->vcvtneps2bf16(vmm_low, vmm); vmovdqu16(addr, vmm_low_masked); } else { @@ -926,14 +936,14 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { vmovdqu16(addr, vmm_low_masked); break; case data_type::f8_e5m2: - if (brg.is_fp8_via_convert()) { + if (brg_.is_fp8_via_convert()) { f8_e5m2_emulator_->vcvt_f32_to_f8(vmm_low2, vmm); vmovdqu8(addr, vmm_low2_masked); } else assert(!"Not supported yet"); break; case data_type::f8_e4m3: - if (brg.is_fp8_via_convert()) { + if (brg_.is_fp8_via_convert()) { f8_e4m3_emulator_->vcvt_f32_to_f8(vmm_low2, vmm); vmovdqu8(addr, vmm_low2_masked); } else @@ -953,18 +963,18 @@ struct jit_brgemm_kernel_post_ops : public 
jit_generator { void loop_by_N(int m_block, int nb2, int nb2_tail, int nb_tail) { - if (brg.alpha) { mov(aux_reg_in, reg_in); } - if (brg.beta != 0) { - if (jcp.with_bias) mov(aux_reg_bias, reg_bias); - if (brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.alpha) { mov(aux_reg_in, reg_in); } + if (brg_.beta != 0) { + if (brg_.with_bias) mov(aux_reg_bias, reg_bias); + if (brg_.zp_type_c != brgemm_broadcast_t::none) { mov(aux_reg_zp_c_values, ptr[rsp + reg_zp_c_values_offs_]); mov(ptr[rsp + aux_reg_zp_c_values_offs_], aux_reg_zp_c_values); } - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(aux_reg_zp_a_comp, ptr[rsp + reg_zp_a_comp_offs_]); mov(ptr[rsp + aux_reg_zp_a_comp_offs_], aux_reg_zp_a_comp); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(aux_reg_s8s8_comp, ptr[rsp + reg_s8s8_comp_offs_]); mov(ptr[rsp + aux_reg_s8s8_comp_offs_], aux_reg_s8s8_comp); } @@ -975,28 +985,28 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { for (int n_loop_ = 0; n_loop_ < nb2; n_loop_++) { apply_post_ops(m_block, n_block2_); - const auto oc_l_offset = n_block2_ * brg.ld_block; + const auto oc_l_offset = n_block2_ * brg_.ld_block; add(aux_reg_out, out_typesize_ * oc_l_offset); - if (brg.alpha != 0) { + if (brg_.alpha != 0) { add(aux_reg_in, inp_typesize_ * oc_l_offset); } - if (brg.beta != 0) { - if (jcp.with_bias) + if (brg_.beta != 0) { + if (brg_.with_bias) add(aux_reg_bias, bia_typesize_ * oc_l_offset); - if (brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.zp_type_c != brgemm_broadcast_t::none) { mov(aux_reg_zp_c_values, ptr[rsp + aux_reg_zp_c_values_offs_]); add(aux_reg_zp_c_values, zp_c_values_offset(n_block2_)); mov(ptr[rsp + aux_reg_zp_c_values_offs_], aux_reg_zp_c_values); } - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(aux_reg_zp_a_comp, ptr[rsp + aux_reg_zp_a_comp_offs_]); add(aux_reg_zp_a_comp, 
sizeof(int32_t) * oc_l_offset); mov(ptr[rsp + aux_reg_zp_a_comp_offs_], aux_reg_zp_a_comp); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(aux_reg_s8s8_comp, ptr[rsp + aux_reg_s8s8_comp_offs_]); add(aux_reg_s8s8_comp, sizeof(int32_t) * oc_l_offset); mov(ptr[rsp + aux_reg_s8s8_comp_offs_], aux_reg_s8s8_comp); @@ -1007,28 +1017,28 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } if (nb2_tail > 0) { apply_post_ops(m_block, nb2_tail); - const auto oc_l_offset = nb2_tail * brg.ld_block; + const auto oc_l_offset = nb2_tail * brg_.ld_block; add(aux_reg_out, out_typesize_ * oc_l_offset); - if (brg.alpha != 0) { + if (brg_.alpha != 0) { add(aux_reg_in, inp_typesize_ * oc_l_offset); } - if (brg.beta != 0) { - if (jcp.with_bias) + if (brg_.beta != 0) { + if (brg_.with_bias) add(aux_reg_bias, bia_typesize_ * oc_l_offset); - if (brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.zp_type_c != brgemm_broadcast_t::none) { mov(aux_reg_zp_c_values, ptr[rsp + aux_reg_zp_c_values_offs_]); add(aux_reg_zp_c_values, zp_c_values_offset(nb2_tail)); mov(ptr[rsp + aux_reg_zp_c_values_offs_], aux_reg_zp_c_values); } - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(aux_reg_zp_a_comp, ptr[rsp + aux_reg_zp_a_comp_offs_]); add(aux_reg_zp_a_comp, sizeof(int32_t) * oc_l_offset); mov(ptr[rsp + aux_reg_zp_a_comp_offs_], aux_reg_zp_a_comp); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(aux_reg_s8s8_comp, ptr[rsp + aux_reg_s8s8_comp_offs_]); add(aux_reg_s8s8_comp, sizeof(int32_t) * oc_l_offset); mov(ptr[rsp + aux_reg_s8s8_comp_offs_], aux_reg_s8s8_comp); @@ -1040,22 +1050,23 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (nb_tail > 0) { apply_post_ops(m_block, 1, nb_tail); - if (brg.alpha != 0) { add(aux_reg_in, inp_typesize_ * (nb_tail)); } - if (brg.beta != 0) { - if (jcp.with_bias) add(aux_reg_bias, bia_typesize_ * (nb_tail)); - if 
(brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.alpha != 0) { add(aux_reg_in, inp_typesize_ * (nb_tail)); } + if (brg_.beta != 0) { + if (brg_.with_bias) + add(aux_reg_bias, bia_typesize_ * (nb_tail)); + if (brg_.zp_type_c != brgemm_broadcast_t::none) { mov(aux_reg_zp_c_values, ptr[rsp + aux_reg_zp_c_values_offs_]); add(aux_reg_zp_c_values, zp_c_values_offset(1, nb_tail)); mov(ptr[rsp + aux_reg_zp_c_values_offs_], aux_reg_zp_c_values); } - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(aux_reg_zp_a_comp, ptr[rsp + aux_reg_zp_a_comp_offs_]); add(aux_reg_zp_a_comp, sizeof(int32_t) * nb_tail); mov(ptr[rsp + aux_reg_zp_a_comp_offs_], aux_reg_zp_a_comp); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(aux_reg_s8s8_comp, ptr[rsp + aux_reg_s8s8_comp_offs_]); add(aux_reg_s8s8_comp, sizeof(int32_t) * nb_tail); mov(ptr[rsp + aux_reg_s8s8_comp_offs_], aux_reg_s8s8_comp); @@ -1071,22 +1082,22 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { sub(rsp, stack_space_needed_); - int nb = brg.load_dim / brg.ld_block; - int nb_tail = brg.load_dim % brg.ld_block; + int nb = brg_.load_dim / brg_.ld_block; + int nb_tail = brg_.load_dim % brg_.ld_block; int nb2 = nb / n_block2_; int nb2_tail = nb % n_block2_; int n_block = (nb2 == 0) ? nstl::max(1, nb2_tail) : n_block2_; - int m_max_regs = (brg.is_bf16_emu + int m_max_regs = (brg_.is_bf16_emu ? 24 - : (brg.is_fp8_via_convert() ? 23 : max_vregs_ - 4)); + : (brg_.is_fp8_via_convert() ? 
23 : max_vregs_ - 4)); m_max_regs /= n_block; - int m_block = nstl::min(brg.bcast_dim, m_max_regs); + int m_block = nstl::min(brg_.bcast_dim, m_max_regs); - int mb = brg.bcast_dim / m_block; - int mb_tail = brg.bcast_dim % m_block; + int mb = brg_.bcast_dim / m_block; + int mb_tail = brg_.bcast_dim % m_block; if (isa_has_masks(isa)) { const auto full_mask = size_t {0xffffffffffffffff}; @@ -1099,29 +1110,29 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { kmovq(k_tail_mask, reg_mask); } - if (brg.alpha != 0) { mov(reg_in, ptr[param1 + GET_OFF(ptr_in)]); } - if (brg.beta != 0) { + if (brg_.alpha != 0) { mov(reg_in, ptr[param1 + GET_OFF(ptr_in)]); } + if (brg_.beta != 0) { mov(reg_scales, ptr[param1 + GET_OFF(ptr_scales)]); mov(reg_apply_comp, ptr[param1 + GET_OFF(apply_comp)]); mov(ptr[rsp + reg_apply_comp_offs_], reg_apply_comp); - if (jcp.with_bias) mov(reg_bias, ptr[param1 + GET_OFF(ptr_bias)]); - if (brg.zp_type_c != brgemm_broadcast_t::none) { + if (brg_.with_bias) mov(reg_bias, ptr[param1 + GET_OFF(ptr_bias)]); + if (brg_.zp_type_c != brgemm_broadcast_t::none) { mov(reg_zp_c_values, ptr[param1 + GET_OFF(c_zp_values)]); mov(ptr[rsp + reg_zp_c_values_offs_], reg_zp_c_values); } - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(reg_zp_a_comp, ptr[param1 + GET_OFF(a_zp_compensation)]); mov(ptr[rsp + reg_zp_a_comp_offs_], reg_zp_a_comp); mov(reg_zp_a_val, ptr[param1 + GET_OFF(a_comp_val)]); mov(ptr[rsp + reg_zp_a_val_offs_], reg_zp_a_val); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(reg_s8s8_comp, ptr[param1 + GET_OFF(s8s8_compensation)]); mov(ptr[rsp + reg_s8s8_comp_offs_], reg_s8s8_comp); } - if (brg.with_dst_scales) { + if (brg_.with_dst_scales) { mov(reg_dst_scales, ptr[param1 + GET_OFF(ptr_dst_scales)]); mov(ptr[rsp + reg_dst_scales_offs_], reg_dst_scales); } @@ -1131,21 +1142,21 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { for (int mb_ = 0; 
mb_ < mb; mb_++) { loop_by_N(m_block, nb2, nb2_tail, nb_tail); - if (brg.alpha != 0) - add(reg_in, inp_typesize_ * (m_block * brg.LDC)); - if (brg.beta != 0) { - if (brg.zp_type_a != brgemm_broadcast_t::none) { + if (brg_.alpha != 0) + add(reg_in, inp_typesize_ * (m_block * brg_.LDC)); + if (brg_.beta != 0) { + if (brg_.zp_type_a != brgemm_broadcast_t::none) { mov(reg_zp_a_comp, ptr[rsp + reg_zp_a_comp_offs_]); add(reg_zp_a_comp, mb_zp_comp_a_offset(m_block)); mov(ptr[rsp + reg_zp_a_comp_offs_], reg_zp_a_comp); } - if (brg.req_s8s8_compensation) { + if (brg_.req_s8s8_compensation) { mov(reg_s8s8_comp, ptr[rsp + reg_s8s8_comp_offs_]); add(reg_s8s8_comp, mb_compensation_offset(m_block)); mov(ptr[rsp + reg_s8s8_comp_offs_], reg_s8s8_comp); } } - add(reg_out, out_typesize_ * (m_block * LDD_)); + add(reg_out, out_typesize_ * (m_block * brg_.LDD)); } if (mb_tail > 0) loop_by_N(mb_tail, nb2, nb2_tail, nb_tail); @@ -1155,7 +1166,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { if (postops_injector_) postops_injector_->prepare_table(/* generate = */ true); - if (brg.is_fp8_via_convert()) { + if (brg_.is_fp8_via_convert()) { if (f8_e5m2_emulator_) f8_e5m2_emulator_->prepare_table(); if (f8_e4m3_emulator_) f8_e4m3_emulator_->prepare_table(); } From daac78c3657e390c2360fb105d2612ce000c8f56 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Thu, 16 May 2024 11:00:10 -0700 Subject: [PATCH 146/187] cpu: x64: jit_brgemm_post_ops: renaming --- src/cpu/x64/jit_brgemm_conv.cpp | 4 ++-- src/cpu/x64/jit_brgemm_conv.hpp | 2 +- src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp | 4 ++-- src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp | 2 +- src/cpu/x64/jit_brgemm_post_ops.hpp | 12 ++++++------ 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv.cpp b/src/cpu/x64/jit_brgemm_conv.cpp index 71a0c4ce590..0f7ebcbcc03 100644 --- a/src/cpu/x64/jit_brgemm_conv.cpp +++ b/src/cpu/x64/jit_brgemm_conv.cpp @@ -764,7 +764,7 @@ status_t 
brgemm_convolution_fwd_t::add_po_kernel( bcfg->beta = is_init ? 0 : 1; // See the comment in `add_po_kernels` why `*_pd->attr()` is needed so far. CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops(*bcfg, *_pd->attr()))); + new jit_brgemm_kernel_post_ops_t(*bcfg, *_pd->attr()))); kernels_po_[ker_idx]->create_kernel(); return status::success; } @@ -1625,7 +1625,7 @@ void brgemm_convolution_fwd_t::perform_outwork( const auto ow_f = (kdh_l <= 0) ? ow : ker_ow_f; assert(ow <= ow_s && ow_s <= ow_f && ow_f <= ow + M); - brgemm_kernel_post_ops_t p; + brgemm_kernel_post_ops_args_t p; if (do_postwork) { p.ptr_bias = (void *)(bias_w); p.ptr_scales = (void *)(&btc.oscales[jcp.is_oc_scale * g_oc]); diff --git a/src/cpu/x64/jit_brgemm_conv.hpp b/src/cpu/x64/jit_brgemm_conv.hpp index 2db203e2216..40791792939 100644 --- a/src/cpu/x64/jit_brgemm_conv.hpp +++ b/src/cpu/x64/jit_brgemm_conv.hpp @@ -247,7 +247,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { brgemm_containers::brgemm_kernel_container_t brgemm_kernels_; brgemm_containers::brgemm_palette_container_t brgemm_palettes_; - std::vector>> kernels_po_; + std::vector>> kernels_po_; std::unique_ptr copy_to_pbuffer_; diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp index bb79bd02f17..37f3995920a 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp @@ -300,7 +300,7 @@ status_t brgemm_convolution_bwd_strided_t::add_po_kernel( = (!is_init && IMPLICATION(jcp.with_sum, jcp.use_buffer)) ? 1 : 0; bcfg->beta = is_init ? 0 : 1; CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops(*bcfg, *_pd->attr()))); + new jit_brgemm_kernel_post_ops_t(*bcfg, *_pd->attr()))); kernels_po_[ker_idx]->create_kernel(); return status::success; } @@ -970,7 +970,7 @@ void brgemm_convolution_bwd_strided_t::perform_outwork(char *dst_base, auto iw_f = (kdh_l <= 0) ? 
iw : ker_iw_f; assert(iw <= iw_s && iw_s <= iw_f && iw_f <= iw + M); - brgemm_kernel_post_ops_t p; + brgemm_kernel_post_ops_args_t p; if (do_postwork) { p.ptr_bias = (void *)(bias_w); p.ptr_scales = (void *)(&oscales[jcp.is_ic_scale * g_ic]); diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp index 63cf0aac7b5..a8587ea01d4 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp @@ -206,7 +206,7 @@ struct brgemm_convolution_bwd_strided_t : public primitive_t { brgemm_containers::brgemm_kernel_container_t brg_kernels_; brgemm_containers::brgemm_palette_container_t brgemm_palettes_; - std::vector>> kernels_po_; + std::vector>> kernels_po_; using Vmm = typename cpu_isa_traits::Vmm; diff --git a/src/cpu/x64/jit_brgemm_post_ops.hpp b/src/cpu/x64/jit_brgemm_post_ops.hpp index 4ee2ae6df28..22e1f770dbf 100644 --- a/src/cpu/x64/jit_brgemm_post_ops.hpp +++ b/src/cpu/x64/jit_brgemm_post_ops.hpp @@ -344,9 +344,9 @@ struct jit_brgemm_kernel_diff_bias_t : public jit_generator { #undef GET_OFF -#define GET_OFF(field) offsetof(brgemm_kernel_post_ops_t, field) +#define GET_OFF(field) offsetof(brgemm_kernel_post_ops_args_t, field) -struct brgemm_kernel_post_ops_t { +struct brgemm_kernel_post_ops_args_t { void *ptr_in; void *ptr_out; void *ptr_bias; @@ -362,13 +362,13 @@ struct brgemm_kernel_post_ops_t { }; template -struct jit_brgemm_kernel_post_ops : public jit_generator { +struct jit_brgemm_kernel_post_ops_t : public jit_generator { // TODO: the proper design should replace `brgemm_desc_t` argument and // introduce a dedicated struct with members properly initialized. This will // let avoiding a `brgemm_desc_t` object copy which is unsafe due to `attr` // member. 
- jit_brgemm_kernel_post_ops( + jit_brgemm_kernel_post_ops_t( const brgemm_desc_t &abrg, const primitive_attr_t &aattr) : jit_generator(jit_name(), abrg.isa_impl) , brg_(abrg) @@ -461,9 +461,9 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { bia_typesize_ = brg_.typesize_bias; } - ~jit_brgemm_kernel_post_ops() = default; + ~jit_brgemm_kernel_post_ops_t() = default; - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brgemm_kernel_post_ops) + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brgemm_kernel_post_ops_t) // Used for assertion on implementation side in debug mode. int get_bcast_dim() const { return brg_.bcast_dim; } From 96dd41996f41e9c1a653c1e4ebc7db66200fa3cd Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Thu, 16 May 2024 13:28:50 -0700 Subject: [PATCH 147/187] cpu: x64: jit_brgemm_post_ops: introduce a template-free class version --- src/cpu/x64/jit_brgemm_conv.cpp | 5 +- src/cpu/x64/jit_brgemm_conv.hpp | 2 +- src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp | 5 +- src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp | 2 +- src/cpu/x64/jit_brgemm_post_ops.cpp | 37 ++++++++ src/cpu/x64/jit_brgemm_post_ops.hpp | 94 +++++++++++++-------- 6 files changed, 106 insertions(+), 39 deletions(-) create mode 100644 src/cpu/x64/jit_brgemm_post_ops.cpp diff --git a/src/cpu/x64/jit_brgemm_conv.cpp b/src/cpu/x64/jit_brgemm_conv.cpp index 0f7ebcbcc03..c4d8b356237 100644 --- a/src/cpu/x64/jit_brgemm_conv.cpp +++ b/src/cpu/x64/jit_brgemm_conv.cpp @@ -764,8 +764,9 @@ status_t brgemm_convolution_fwd_t::add_po_kernel( bcfg->beta = is_init ? 0 : 1; // See the comment in `add_po_kernels` why `*_pd->attr()` is needed so far. 
CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops_t(*bcfg, *_pd->attr()))); - kernels_po_[ker_idx]->create_kernel(); + jit_brgemm_kernel_post_ops_base_t::create( + isa, *bcfg, *_pd->attr()))); + kernels_po_[ker_idx]->generate_kernel(); return status::success; } diff --git a/src/cpu/x64/jit_brgemm_conv.hpp b/src/cpu/x64/jit_brgemm_conv.hpp index 40791792939..2b84bc7a576 100644 --- a/src/cpu/x64/jit_brgemm_conv.hpp +++ b/src/cpu/x64/jit_brgemm_conv.hpp @@ -247,7 +247,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { brgemm_containers::brgemm_kernel_container_t brgemm_kernels_; brgemm_containers::brgemm_palette_container_t brgemm_palettes_; - std::vector>> kernels_po_; + std::vector> kernels_po_; std::unique_ptr copy_to_pbuffer_; diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp index 37f3995920a..0cadd8b60f7 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.cpp @@ -300,8 +300,9 @@ status_t brgemm_convolution_bwd_strided_t::add_po_kernel( = (!is_init && IMPLICATION(jcp.with_sum, jcp.use_buffer)) ? 1 : 0; bcfg->beta = is_init ? 
0 : 1; CHECK(safe_ptr_assign(kernels_po_[ker_idx], - new jit_brgemm_kernel_post_ops_t(*bcfg, *_pd->attr()))); - kernels_po_[ker_idx]->create_kernel(); + jit_brgemm_kernel_post_ops_base_t::create( + isa, *bcfg, *_pd->attr()))); + kernels_po_[ker_idx]->generate_kernel(); return status::success; } diff --git a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp index a8587ea01d4..494070bb6ee 100644 --- a/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp +++ b/src/cpu/x64/jit_brgemm_conv_bwd_strided.hpp @@ -206,7 +206,7 @@ struct brgemm_convolution_bwd_strided_t : public primitive_t { brgemm_containers::brgemm_kernel_container_t brg_kernels_; brgemm_containers::brgemm_palette_container_t brgemm_palettes_; - std::vector>> kernels_po_; + std::vector> kernels_po_; using Vmm = typename cpu_isa_traits::Vmm; diff --git a/src/cpu/x64/jit_brgemm_post_ops.cpp b/src/cpu/x64/jit_brgemm_post_ops.cpp new file mode 100644 index 00000000000..12e88db16d2 --- /dev/null +++ b/src/cpu/x64/jit_brgemm_post_ops.cpp @@ -0,0 +1,37 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "cpu/x64/jit_brgemm_post_ops.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace x64 { + +jit_brgemm_kernel_post_ops_base_t *jit_brgemm_kernel_post_ops_base_t::create( + cpu_isa_t isa, const brgemm_desc_t &abrg, + const primitive_attr_t &aattr) { + if (utils::one_of(isa, avx2, avx2_vnni, avx2_vnni_2)) { + return new jit_brgemm_kernel_post_ops_t(abrg, aattr); + } else { + return new jit_brgemm_kernel_post_ops_t(abrg, aattr); + } +} + +} // namespace x64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/x64/jit_brgemm_post_ops.hpp b/src/cpu/x64/jit_brgemm_post_ops.hpp index 22e1f770dbf..b23a34b8924 100644 --- a/src/cpu/x64/jit_brgemm_post_ops.hpp +++ b/src/cpu/x64/jit_brgemm_post_ops.hpp @@ -361,8 +361,29 @@ struct brgemm_kernel_post_ops_args_t { void *ptr_dst_scales; }; -template -struct jit_brgemm_kernel_post_ops_t : public jit_generator { +// This is a shim user interface that allows to create a template-free object +// of post-ops class. +struct jit_brgemm_kernel_post_ops_base_t { + // `isa` argument specifies the `Vmm` type the kernel to be generated for. + // Rest arguments are propagated as is to the underlying class. + static jit_brgemm_kernel_post_ops_base_t *create(cpu_isa_t isa, + const brgemm_desc_t &abrg, const primitive_attr_t &aattr); + + virtual ~jit_brgemm_kernel_post_ops_base_t() = default; + + virtual status_t generate_kernel() = 0; + + virtual void operator()(brgemm_kernel_post_ops_args_t *args) const = 0; + + virtual int get_bcast_dim() const = 0; +}; + +// An implementation class for post-ops based on `Vmm` template argument. +// `Vmm` is propagated further to uni_postops injector class. +// Shouldn't be called directly on implementation side. 
+template +struct jit_brgemm_kernel_post_ops_t : public jit_brgemm_kernel_post_ops_base_t, + public jit_generator { // TODO: the proper design should replace `brgemm_desc_t` argument and // introduce a dedicated struct with members properly initialized. This will @@ -373,6 +394,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { : jit_generator(jit_name(), abrg.isa_impl) , brg_(abrg) , attr_(aattr) + , max_vregs_(isa_num_vregs(brg_.isa_impl)) , with_binary_non_scalar_bcast_(brg_.with_binary && binary_injector:: any_binary_postop_rhs_non_scalar_broadcast( @@ -441,9 +463,12 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { const eltwise_injector::static_params_t esp { save_state, reserved_eltwise_gpr, reserved_eltwise_maskr}; - postops_injector_ = utils::make_unique< - injector::jit_uni_postops_injector_t>( - this, attr_.post_ops_, bsp, esp); + auto st = safe_ptr_assign(postops_injector_, + po_injector_t::create( + this, brg_.isa_impl, attr_.post_ops_, bsp, esp)); + if (st != status::success) { + assert(!"postops_injector creation failed"); + } } const auto &wei_scales = attr_.scales_.get(DNNL_ARG_WEIGHTS); @@ -461,12 +486,21 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { bia_typesize_ = brg_.typesize_bias; } + // These two methods are required for a base class to work since it's not + // derived from the jit_generator. + status_t generate_kernel() override { + return jit_generator::create_kernel(); + } + void operator()(brgemm_kernel_post_ops_args_t *args) const override { + return jit_generator::operator()(args); + } + ~jit_brgemm_kernel_post_ops_t() = default; DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brgemm_kernel_post_ops_t) // Used for assertion on implementation side in debug mode. 
- int get_bcast_dim() const { return brg_.bcast_dim; } + int get_bcast_dim() const override { return brg_.bcast_dim; } private: // This can't be a reference, otherwise, `get_bcast_dim()` would return @@ -480,16 +514,16 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { data_type_t inp_dt_; data_type_t out_dt_; data_type_t bia_dt_; - // TODO: get rid of this map because it requires updates with every new isa - static constexpr cpu_isa_t po_isa_t = utils::map(isa, avx512_core, avx2, - avx2, avx2_vnni, avx2, avx2_vnni_2, avx2_vnni_2, avx512_core_fp16, - avx512_core_fp16, avx10_1_512_amx_fp16, avx512_core_fp16); - std::unique_ptr> - postops_injector_; + + using Vmm_lower_t = typename vreg_traits::Vmm_lower_t; + using Vmm_lower2_t = typename vreg_traits::Vmm_lower_t; + using po_injector_t = injector::jit_uni_postops_injector_base_t; + std::unique_ptr postops_injector_; std::unique_ptr bf16_emu_; std::unique_ptr f8_e5m2_emulator_; std::unique_ptr f8_e4m3_emulator_; + int max_vregs_; const bool with_binary_non_scalar_bcast_; int inp_typesize_; @@ -497,14 +531,8 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { int bia_typesize_; int is_oc_scale_; - constexpr static int max_vregs_ = cpu_isa_traits::n_vregs; using reg64_t = const Xbyak::Reg64; - using Vmm = typename utils::conditional::type; - using Vmm_lower_t = typename vreg_traits::Vmm_lower_t; - using Vmm_lower2_t = typename vreg_traits::Vmm_lower_t; // Register decomposition const reg64_t reg_reserved_eltwise = rax; @@ -590,7 +618,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { template const T maybe_mask(const T vmm_in, bool mask_flag, bool store, Xbyak::Opmask ktail_mask) { - assert(IMPLICATION(mask_flag, isa_has_masks(isa))); + assert(IMPLICATION(mask_flag, isa_has_masks(brg_.isa_impl))); return mask_flag ? (store ? 
vmm_in | ktail_mask : vmm_in | ktail_mask | T_z) : vmm_in; @@ -605,7 +633,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { // no tail and full vmm must be processed. && tail_size > 0; - if (IMPLICATION(is_tail, isa_has_masks(isa))) { + if (IMPLICATION(is_tail, isa_has_masks(brg_.isa_impl))) { const Vmm vmm = maybe_mask(vmm_in, is_tail, store, ktail_mask); switch (type_in) { case data_type::f32: @@ -654,7 +682,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { auto vmm_sum_zp = vmm_tmp(1); if (*p_sum_zp != 0) { mov(reg_ptr_sum_zp, (size_t)p_sum_zp); - if (is_superset(isa, avx512_core)) { + if (is_superset(brg_.isa_impl, avx512_core)) { vcvtdq2ps(vmm_sum_zp, ptr_b[reg_ptr_sum_zp]); } else { vpbroadcastd(vmm_sum_zp, ptr[reg_ptr_sum_zp]); @@ -675,7 +703,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { if (*p_sum_scale == 1.f) uni_vaddps(vmm, vmm, vmm_prev_dst); else { - if (is_superset(isa, avx512_core)) { + if (is_superset(brg_.isa_impl, avx512_core)) { vfmadd231ps( vmm, vmm_prev_dst, ptr_b[reg_ptr_sum_scale]); } else { @@ -725,10 +753,10 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { for (int n = 0; n < n_block; n++) { const size_t zp_comp_offset = sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); - auto zp_comp_a_addr = is_superset(isa, avx512_core) + auto zp_comp_a_addr = is_superset(brg_.isa_impl, avx512_core) ? 
EVEX_compress_addr(aux_reg_zp_a_comp, zp_comp_offset) : ptr[aux_reg_zp_a_comp + zp_comp_offset]; - if (IMPLICATION(has_tail, isa_has_masks(isa))) { + if (IMPLICATION(has_tail, isa_has_masks(brg_.isa_impl))) { auto vmm_zp_comp_a_masked = maybe_mask( vmm_zp_comp_a, has_tail, false, k_mask); vmovups(vmm_zp_comp_a_masked, zp_comp_a_addr); @@ -751,11 +779,11 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { const size_t s8s8_comp_offset = sizeof(int32_t) * (n * brg_.ld_block + m * brg_.LDB); - auto comp_addr = is_superset(isa, avx512_core) + auto comp_addr = is_superset(brg_.isa_impl, avx512_core) ? EVEX_compress_addr( aux_reg_s8s8_comp, s8s8_comp_offset) : ptr[aux_reg_s8s8_comp + s8s8_comp_offset]; - if (IMPLICATION(tail > 0, isa_has_masks(isa))) { + if (IMPLICATION(tail > 0, isa_has_masks(brg_.isa_impl))) { auto vmm_comp_masked = maybe_mask(vmm_comp, tail > 0, false, k_mask); vmovups(vmm_comp_masked, comp_addr); @@ -814,7 +842,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { const auto addr = ptr[aux_reg_scales + is_oc_scale_ * sizeof(float) * (n * brg_.ld_block)]; auto vmm = vector(m, n); - if (IMPLICATION(tail > 0, isa_has_masks(isa))) { + if (IMPLICATION(tail > 0, isa_has_masks(brg_.isa_impl))) { vmm = maybe_mask(vector(m, n), tail > 0, false, k_mask); vmulps(vmm, vmm, addr); } else { @@ -843,12 +871,12 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { mov(aux_reg_dst_scales, ptr[rsp + reg_dst_scales_offs_]); const auto addr = ptr[aux_reg_dst_scales]; auto vmm_scales = vmm_tmp(0); - if (!isa_has_masks(isa)) vmovups(vmm_scales, addr); + if (!isa_has_masks(brg_.isa_impl)) vmovups(vmm_scales, addr); for_(int m = 0; m < m_block; m++) for (int n = 0; n < n_block; n++) { auto vmm = vector(m, n); - if (isa_has_masks(isa)) { + if (isa_has_masks(brg_.isa_impl)) { vmm = maybe_mask(vector(m, n), tail > 0, false, k_mask); vmulps(vmm, vmm, addr); } else { @@ -861,7 +889,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator 
{ mov(aux_reg_zp_c_values, ptr[rsp + aux_reg_zp_c_values_offs_]); auto vmm_zp_c = vmm_tmp(0); if (brg_.zp_type_c == brgemm_broadcast_t::per_tensor) { - if (is_superset(isa, avx512_core)) + if (is_superset(brg_.isa_impl, avx512_core)) vcvtdq2ps(vmm_zp_c, EVEX_compress_addr(aux_reg_zp_c_values, 0, true)); else { @@ -872,7 +900,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { for (int n = 0; n < n_block; n++) { if (brg_.zp_type_c == brgemm_broadcast_t::per_n) { int zp_c_off = zp_c_values_offset(n); - auto zp_c_addr = is_superset(isa, avx512_core) + auto zp_c_addr = is_superset(brg_.isa_impl, avx512_core) ? EVEX_compress_addr(aux_reg_zp_c_values, zp_c_off) : ptr[aux_reg_zp_c_values + zp_c_off]; cvt2ps(data_type::s32, vmm_zp_c, zp_c_addr, tail, false, @@ -911,7 +939,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { saturate_cvt_f32(vmm, vmm_lbound, vmm_ubound, out_dt_); } - if (is_superset(isa, avx512_core)) { + if (is_superset(brg_.isa_impl, avx512_core)) { auto vmm_masked = maybe_mask(vmm, tail > 0, true, k_mask); Vmm_lower_t vmm_low = Vmm_lower_t(vmm.getIdx()); Vmm_lower2_t vmm_low2 = Vmm_lower2_t(vmm_low.getIdx()); @@ -1099,7 +1127,7 @@ struct jit_brgemm_kernel_post_ops_t : public jit_generator { int mb = brg_.bcast_dim / m_block; int mb_tail = brg_.bcast_dim % m_block; - if (isa_has_masks(isa)) { + if (isa_has_masks(brg_.isa_impl)) { const auto full_mask = size_t {0xffffffffffffffff}; const auto tail_mask = size_t((1 << nb_tail) - 1); From fe84eaf396050a03a5b75482255110c069eec38e Mon Sep 17 00:00:00 2001 From: "Pirogov, Vadim" Date: Tue, 21 May 2024 16:00:38 -0700 Subject: [PATCH 148/187] fixup: gpu: jit: gemm: microkernel provider --- src/gpu/intel/jit/gemm/ukernel_lmr.db | 21 +++++++++++---------- src/gpu/intel/jit/gemm/ukernel_mlr.db | 21 +++++++++++---------- src/gpu/intel/jit/gemm/ukernel_mmr.db | 21 +++++++++++---------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/src/gpu/intel/jit/gemm/ukernel_lmr.db 
b/src/gpu/intel/jit/gemm/ukernel_lmr.db index 03bafb96895..0a392fe5944 100644 --- a/src/gpu/intel/jit/gemm/ukernel_lmr.db +++ b/src/gpu/intel/jit/gemm/ukernel_lmr.db @@ -1,16 +1,17 @@ /******************************************************************************* -* INTEL CONFIDENTIAL -* Copyright 2024 Intel Corporation. +* Copyright 2024 Intel Corporation * -* This software and the related documents are Intel copyrighted materials, and -* your use of them is governed by the express license under which they were -* provided to you (License). Unless the License provides otherwise, you may not -* use, modify, copy, publish, distribute, disclose or transmit this software or -* the related documents without Intel's prior written permission. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at * -* This software and the related documents are provided as is, with no express -* or implied warranties, other than those that are expressly stated in the -* License. +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. *******************************************************************************/ /*@kcatalog@*/ diff --git a/src/gpu/intel/jit/gemm/ukernel_mlr.db b/src/gpu/intel/jit/gemm/ukernel_mlr.db index bc6ecf4c9e8..8f82bfac275 100644 --- a/src/gpu/intel/jit/gemm/ukernel_mlr.db +++ b/src/gpu/intel/jit/gemm/ukernel_mlr.db @@ -1,16 +1,17 @@ /******************************************************************************* -* INTEL CONFIDENTIAL -* Copyright 2024 Intel Corporation. 
+* Copyright 2024 Intel Corporation * -* This software and the related documents are Intel copyrighted materials, and -* your use of them is governed by the express license under which they were -* provided to you (License). Unless the License provides otherwise, you may not -* use, modify, copy, publish, distribute, disclose or transmit this software or -* the related documents without Intel's prior written permission. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at * -* This software and the related documents are provided as is, with no express -* or implied warranties, other than those that are expressly stated in the -* License. +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. *******************************************************************************/ /*@kcatalog@*/ diff --git a/src/gpu/intel/jit/gemm/ukernel_mmr.db b/src/gpu/intel/jit/gemm/ukernel_mmr.db index 461b5c3a0fd..a673e416e73 100644 --- a/src/gpu/intel/jit/gemm/ukernel_mmr.db +++ b/src/gpu/intel/jit/gemm/ukernel_mmr.db @@ -1,16 +1,17 @@ /******************************************************************************* -* INTEL CONFIDENTIAL -* Copyright 2022-2024 Intel Corporation. +* Copyright 2022-2024 Intel Corporation * -* This software and the related documents are Intel copyrighted materials, and -* your use of them is governed by the express license under which they were -* provided to you (License). 
Unless the License provides otherwise, you may not -* use, modify, copy, publish, distribute, disclose or transmit this software or -* the related documents without Intel's prior written permission. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at * -* This software and the related documents are provided as is, with no express -* or implied warranties, other than those that are expressly stated in the -* License. +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. *******************************************************************************/ /*@kcatalog@*/ From 34b0c1e648d2ff7b9685f6af8deb972dfb2b95e6 Mon Sep 17 00:00:00 2001 From: Daniel Youssif Date: Fri, 17 May 2024 10:42:42 -0700 Subject: [PATCH 149/187] Revert "gpu: jit: conv: update lut for yolo_v4 int8 mb16 and resent_v1.5 int8 mb8" This reverts commit 3c9d69ce26ba3d13bc4940710062b1e23a2b41b1. 
--- src/gpu/intel/jit/conv/lookup_table_data.cpp | 1621 ++++++++---------- 1 file changed, 752 insertions(+), 869 deletions(-) diff --git a/src/gpu/intel/jit/conv/lookup_table_data.cpp b/src/gpu/intel/jit/conv/lookup_table_data.cpp index ed33e8dd726..33de49c1a0d 100644 --- a/src/gpu/intel/jit/conv/lookup_table_data.cpp +++ b/src/gpu/intel/jit/conv/lookup_table_data.cpp @@ -26,877 +26,760 @@ namespace jit { // clang-format off const std::vector &get_conv_lookup_table_data() { static std::vector data = { - 0x00000000000001d6, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x0018000000100101, 0x6869343663690000, 0x6f3436636f343031, 0x7033686b34303168, - 0x0003000000083168, 0x0000000202020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000008, 0x0909000000200202, 0x00200a0a00000010, - 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001d00000010, 0x3177693832316369, 0x3436636f36313830, 0x6b3631383031776f, - 0x0000083077703177, 0x0402020000000100, 0x0a00000001000000, 0x000003000000040a, 0x0900000020020200, 0x100a0a0000001009, 0x04ffffffff000000, 0x0100000003000000, - 0x0400000004000000, 0x0100000001000000, 0x00001b0000001001, 0x6869363532636900, 0x6f323135636f3632, 0x687333686b333168, 0x0000000831687032, 0x0008020200000003, - 0x0000000307070000, 0x0002000000030808, 0x000000080a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0a0a000000100909, 0xffffffff00000040, 0x0000000300000004, - 0x0000000400000001, 0x0000000100000004, 0x001c000000100101, 0x6932313563690000, 0x323031636f363268, 0x33686b3331686f34, 0x0008316870326873, 0x0202000000030000, - 0x0003070700000010, 0x0000000308080000, 0x00080a0a00000002, 0x000000020d0d0000, 0x0020020200000003, 0x0000001009090000, 0xffff000000200a0a, 0x000300000004ffff, - 0x0004000000010000, 0x0001000000040000, 0x0000001001010000, 0x353263690000001c, 0x6f34303732776936, 0x3732776f35353263, 0x30777031776b3430, 0x0000000100000008, - 0x0002000000080202, 
0x000000080a0a0000, 0x0003000000020d0d, 0x0000002002020000, 0x0a0a000000100909, 0xffffffff00000020, 0x0000000300000004, 0x0000000400000001, - 0x0000000100000004, 0x0019000000080001, 0x6936353263690000, 0x323031636f343168, 0x31686b3431686f34, 0x0100000008306870, 0x0000080202000000, 0x040a0a0000000100, - 0x0200000003000000, 0x400a0a0000002002, 0x0000100d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x6900000017000000, - 0x6f36313468693363, 0x363134686f323363, 0x000831687033686b, 0x0707000000010000, 0x0000000100000003, 0x0004000000040d0d, 0x0000000402020000, 0x0909000000080808, - 0x00200a0a00000010, 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001a00000010, 0x3677693635326369, 0x6f363532636f3637, - 0x7031776b36373677, 0x0001000000083077, 0x0000000802020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000001009090000, 0xffff000000200a0a, - 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000001001010000, 0x323163690000001c, 0x6f34303732776938, 0x3732776f38323163, 0x30777031776b3430, - 0x0000000100000008, 0x0002000000040202, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0a0a000000100909, 0xffffffff00000020, 0x0000000300000004, - 0x0000000400000001, 0x0000000100000004, 0x001c000000100101, 0x6936353263690000, 0x31636f3430373277, 0x34303732776f3832, 0x000830777031776b, 0x0202000000010000, - 0x0000000200000008, 0x0d0d000000040a0a, 0x0000000300000008, 0x0909000000200202, 0x00200a0a00000010, 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, - 0x0001000000010000, 0x0000001a00000008, 0x3777693832316369, 0x6f323135636f3438, 0x7031776b34383777, 0x0001000000083077, 0x0000000402020000, 0x00040a0a00000001, - 0x0202000000030000, 0x00200a0a00000020, 0x000000100d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0010010100000001, 0x63690000001c0000, - 0x3632333477693436, 0x34776f3436636f34, 0x7031776b34363233, 
0x0001000000083077, 0x0000000202020000, 0x00020d0d00000001, 0x0202000000030000, 0x0010090900000020, - 0x000000200a0a0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0008000100000001, 0x63690000001b0000, 0x3639317769363532, 0x776f34323031636f, - 0x777031776b363931, 0x0000010000000830, 0x0200000008020200, 0x0000080a0a000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0xffffff0000001c0d, - 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x1800000008000100, 0x3832316369000000, 0x3231636f38326869, 0x33686b3832686f38, 0x0300000008316870, - 0x0000040202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000040a0a00, 0x000003000000020d, 0x0a00000020020200, 0x100d0d000000200a, 0x04ffffffff000000, - 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001c0000001001, 0x6869383231636900, 0x363532636f343031, 0x7333686b3235686f, 0x0000083168703268, - 0x0202020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020202000000, 0x2002020000000300, 0x0000100909000000, 0x00000000200a0a00, 0x0300000004000000, - 0x0400000001000000, 0x0100000004000000, 0x0000100101000000, 0x3163690000001b00, 0x636f323568693832, 0x6b3632686f363532, 0x3168703268733368, 0x0000000300000008, - 0x0707000000040202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000008, 0x0202000000030000, 0x0010090900000020, 0x000000200a0a0000, 0x00000004ffffffff, - 0x0000000100000003, 0x0000000400000004, 0x0010010100000001, 0x6369000000180000, 0x6f38303268693233, 0x383032686f343663, 0x000831687033686b, 0x0707000000020000, - 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000002, 0x0909000000200202, 0x00200a0a00000010, 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, - 0x0101000000010000, 0x0000001e00000010, 0x3177693832316369, 0x3231636f36313830, 0x3631383031776f38, 0x000830777031776b, 0x0202000000010000, 0x0000000100000004, - 0x0003000000040a0a, 0x0000002002020000, 0x0a0a000000100909, 0xffffffff00000020, 0x0000000300000004, 
0x0000000400000001, 0x0000000100000004, 0x001c000000100101, - 0x7769343663690000, 0x33636f3436323334, 0x3436323334776f32, 0x000830777031776b, 0x0202000000010000, 0x0000000100000002, 0x0003000000020a0a, 0x0000002002020000, - 0x0a0a000000100909, 0xffffffff00000010, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001a000000100101, 0x6932313563690000, 0x3532636f36373677, - 0x776b363736776f35, 0x0000000830777031, 0x0010020200000001, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000030000, 0x0010090900000020, 0x000000200a0a0000, - 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0008000100000001, 0x63690000001a0000, 0x3438377769323135, 0x37776f383231636f, 0x30777031776b3438, - 0x0000000100000008, 0x0002000000100202, 0x000000080a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000100a0a, 0xffffffff0000001c, 0x0000000300000004, - 0x0000000400000001, 0x0000000100000004, 0x001c000000080001, 0x6932313563690000, 0x323031636f383268, 0x31686b3431686f34, 0x0008306870326873, 0x0202000000010000, - 0x0000000100000010, 0x0003000000040a0a, 0x0000002002020000, 0x0d0d000000400a0a, 0xffffffff00000010, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, - 0x001b000000100101, 0x6936353263690000, 0x323135636f323568, 0x7333686b3632686f, 0x0000083168703268, 0x0402020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000020202000000, 0x2002020000000300, 0x0000100909000000, 0x00000000200a0a00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000100101000000, - 0x3363690000001b00, 0x636f363134686932, 0x6b383032686f3436, 0x3168703268733368, 0x0000000200000008, 0x0808000000030707, 0x0000000100000003, 0x0003000000020a0a, - 0x0000002002020000, 0x0a0a000000100909, 0xffffffff00000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001c000000080001, 0x3432303163690000, - 0x3032636f34316869, 0x31686b37686f3834, 0x0008306870326873, 0x0202000000010000, 0x0000000100000020, 0x0003000000020a0a, 0x0000002002020000, 
0x0d0d000000400a0a, - 0xffffffff00000008, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001a000000080001, 0x6932313563690000, 0x323135636f343168, 0x687333686b37686f, - 0x0000000831687032, 0x0004020200000003, 0x0000000307070000, 0x0001000000030808, 0x0000000402020000, 0x0020020200000003, 0x000000400a0a0000, 0x0000000000070d0d, - 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000001001010000, 0x3135636900000018, 0x35636f3331686932, 0x686b3331686f3231, 0x0000000831687033, - 0x0008020200000003, 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, 0x0020020200000003, 0x0000001009090000, 0x0000000000200a0a, 0x0003000000040000, - 0x0004000000010000, 0x0001000000040000, 0x0000000800010000, 0x343663690000001a, 0x636f363331337769, 0x36333133776f3436, 0x000830777031776b, 0x0202000000010000, - 0x0000000200000002, 0x0d0d000000040a0a, 0x0000000300000004, 0x0a0a000000200202, 0x00100d0d00000010, 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, - 0x0001000000010000, 0x0000001b00000008, 0x3568693635326369, 0x686f323135636f36, 0x32687331686b3832, 0x0100000008306870, 0x0000080202000000, 0x040a0a0000000100, - 0x0200000003000000, 0x200a0a0000002002, 0x00001c0d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x6900000018000000, - 0x3235686938323163, 0x35686f383231636f, 0x0831687033686b32, 0x0200000003000000, 0x0307070000000202, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, - 0x1009090000002002, 0x0000200a0a000000, 0x0000040000000000, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x6900000018000000, 0x3632686936353263, - 0x32686f323135636f, 0x0831687033686b36, 0x0200000003000000, 0x0307070000000402, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, 0x1009090000002002, - 0x0000200a0a000000, 0x0000040000000000, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x6900000018000000, 0x3632686936353263, 0x32686f363532636f, - 0x0831687033686b36, 
0x0200000003000000, 0x0307070000000402, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, - 0x0000040000000000, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x690000001b000000, 0x3177693432303163, 0x6f323135636f3936, 0x7031776b39363177, - 0x0001000000083077, 0x0000002002020000, 0x00080a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x0000001009090000, 0xffff000000100a0a, 0x000300000004ffff, - 0x0004000000010000, 0x0001000000040000, 0x0000000800010000, 0x3436636900000016, 0x3436636f36356869, 0x7033686b3635686f, 0x0003000000083168, 0x0000000202020000, - 0x0808000000030707, 0x0000000100000003, 0x0003000000020a0a, 0x0000002002020000, 0x0d0d000000200a0a, 0xffffffff0000001c, 0x0000000300000005, 0x0000000700000001, - 0x0000000100000007, 0x0018000000010001, 0x6936393863690000, 0x383231636f343677, 0x7031776b3436776f, 0x0001000000103077, 0x0000000702020000, 0x0008020200000001, - 0x0202000000030000, 0x00100a0a00000010, 0x000000080d0d0000, 0x0000000500000000, 0x0000000300000003, 0x0000000100000007, 0x0018000100000007, 0x63690000001a0000, - 0x3934776938343032, 0x776f38343032636f, 0x30777031776b3934, 0x0000000100000010, 0x0002000000310d0d, 0x0000000402020000, 0x0003000000040a0a, 0x0000002002020000, - 0x0a0a000000100909, 0x0000000300000040, 0x0000000300000004, 0x0000000700000003, 0x0000000700000001, 0x001a000000010001, 0x3268693363690000, 0x6f383231636f3432, - 0x687334686b363568, 0x0000000830687034, 0x00080c0c00000001, 0x0a0a000000010000, 0x0000000400000002, 0x0808000000040202, 0x00080a0a00000004, 0x000000100d0d0000, - 0x0000000400000000, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x63690000001b0000, 0x3633313377693436, 0x33776f363532636f, 0x777031776b363331, - 0x0000010000000830, 0x0100000002020200, 0x0000080d0d000000, 0x2002020000000300, 0x0000200909000000, 0x02000000200a0a00, 0x0300000004000000, 0x0700000001000000, - 0x0100000007000000, 0x0000020001000000, 0x3363690000001c00, 
0x3430333277693032, 0x32776f303436636f, 0x777031776b343033, 0x0000010000000830, 0x0200000014020200, - 0x0000040a0a000000, 0x03000000080d0d00, 0x0000100202000000, 0x0d000000200a0a00, 0xffffff000000180d, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, - 0x2c00000020010100, 0x6869316369000000, 0x3630303177693038, 0x38686f31636f3035, 0x3536303031776f30, 0x3332776b31686b30, 0x0831317770306870, 0x0800000001000000, - 0x0000010000000308, 0x04000000080d0d00, 0x0000020202000000, 0x0900000008080800, 0x080a0a0000002009, 0x0400000001000000, 0x0100000003000000, 0x0400000004000000, - 0x0100000001000000, 0x00002c0000002001, 0x3038686931636900, 0x3035363030317769, 0x6f3038686f31636f, 0x6b30353630303177, 0x68703332776b3168, 0x0000083131777030, - 0x0308080000000100, 0x0d00000001000000, 0x000004000000080d, 0x0800000004020200, 0x2009090000000808, 0x0000080a0a000000, 0x0000040000000100, 0x0000010000000300, - 0x0000070000000700, 0x0200010000000100, 0x690000001a000000, 0x3268693038323163, 0x6f30383231636f34, 0x687033686b343268, 0x0000030000000831, 0x0700000050020200, - 0x0308080000000307, 0x0a00000001000000, 0x000003000000040a, 0x0a00000010020200, 0x180d0d000000200a, 0x04ffffffff000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x00001c0000000200, 0x7769303639636900, 0x3233636f36313239, 0x6b36313239776f30, 0x0000083077703177, 0x3c02020000000100, 0x0a00000002000000, - 0x080d0d000000040a, 0x0200000003000000, 0x200a0a0000001002, 0x0000200d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x0800010000000100, - 0x690000001b000000, 0x3177693432303163, 0x6f363532636f3639, 0x7031776b36393177, 0x0001000000083077, 0x0000002002020000, 0x00080a0a00000002, 0x000000040d0d0000, - 0x0020020200000003, 0x000000100a0a0000, 0xffff000000100d0d, 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x323163690000001b, - 0x31636f3635686938, 0x686b3832686f3832, 0x0831687032687333, 0x0200000003000000, 0x0307070000000402, 
0x0000030808000000, 0x040a0a0000000200, 0x0000080d0d000000, - 0x2002020000000300, 0x0000200909000000, 0x03000000200a0a00, 0x0100000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000002c00, - 0x3031776930386869, 0x6f31636f30353630, 0x303031776f303868, 0x776b31686b303536, 0x3177703068703332, 0x0000010000000831, 0x0100000003080800, 0x0000080d0d000000, - 0x0808080000000300, 0x0000080a0a000000, 0x01000000100d0d00, 0x0100000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3163690000002c00, - 0x3031776930386869, 0x6f31636f30353630, 0x303031776f303868, 0x776b31686b303536, 0x3177703068703332, 0x0000000000000831, 0x020d0d0000000100, 0x0800000003000000, - 0x080a0a0000001708, 0x00000f0d0d000000, 0x0000040000000000, 0x0000010000000300, 0x0000040000000400, 0x8001010000000100, 0x6900000016000000, 0x6f36356869343663, - 0x6b3635686f343663, 0x0000083168703368, 0x0202020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000080d0d00, 0x0000200202000000, - 0x0a00000020090900, 0x000003000000200a, 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, 0x1b00000080010100, 0x3230316369000000, 0x636f363931776934, - 0x363931776f363532, 0x000830777031776b, 0x0202000000010000, 0x0000000200000020, 0x0d0d000000080a0a, 0x0000000300000004, 0x0909000000200202, 0x00200a0a00000020, - 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001500000001, 0x3568693635326369, 0x6b35686f3231636f, 0x0000103168703368, - 0x0202020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000080202000000, 0x1002020000000300, 0x0000100a0a000000, 0x00000000080d0d00, 0x0100000005000000, - 0x0800000003000000, 0x0800000001000000, 0x0000200001000000, 0x3863690000001400, 0x6f38636f35326469, 0x647035646b333264, 0x0000010000001031, 0x00000000170d0d00, - 0x0200000003000000, 0x1009090000000802, 0x0000100a0a000000, 0x0000040000000000, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 
0x690000001d000000, - 0x3277693032393163, 0x303436636f343033, 0x776b34303332776f, 0x0000000830777031, 0x0078020200000001, 0x0a0a000000020000, 0x00080d0d00000004, 0x0202000000030000, - 0x00200a0a00000010, 0x000000180d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000180000, 0x6f36366869303233, - 0x3436686f30323363, 0x000830687033686b, 0x0202000000030000, 0x0003070700000014, 0x0000000308080000, 0x00080a0a00000002, 0x000000020d0d0000, 0x0010020200000003, - 0x000000100a0a0000, 0x0000000000200d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x3233636900000018, 0x36636f3834686930, - 0x686b3834686f3034, 0x0000000831687033, 0x0014020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000040d0d0000, 0x0010020200000004, 0x0000000209090000, - 0x0d0d000000200a0a, 0x000000000000000c, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001c000000010001, 0x6930363963690000, 0x33636f3639303477, - 0x36393034776f3032, 0x001030777031776b, 0x0202000000010000, 0x000000020000001e, 0x0d0d000000040a0a, 0x0000000300000004, 0x0a0a000000200202, 0x00200d0d00000020, - 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000002, 0x3477693036396369, 0x303233636f363930, 0x776b36393034776f, - 0x0000001030777031, 0x001e020200000001, 0x0a0a000000020000, 0x00080d0d00000002, 0x0202000000030000, 0x00400a0a00000020, 0x000000200d0d0000, 0x0000000500000003, - 0x0000000300000003, 0x000000010000000a, 0x004001010000000a, 0x63690000001b0000, 0x3235327769383231, 0x32776f363532636f, 0x77703631776b3733, 0x0000020000001030, - 0x0d00000002090900, 0x0000020000004f0d, 0x0a00000004020200, 0x000003000000020a, 0x0900000020020200, 0x400a0a0000000809, 0x0400000002000000, 0x0100000003000000, - 0x0700000007000000, 0x0100000001000000, 0x0000180000000200, 0x6869303436636900, 0x6f303436636f3834, 0x687033686b383468, 0x0000030000000831, 0x0700000028020200, - 0x0308080000000307, 
0x0d00000001000000, 0x000004000000040d, 0x0900000010020200, 0x200a0a0000000209, 0x00000c0d0d000000, 0x0000050000000000, 0x0000010000000100, - 0x0000080000000800, 0x0000010000000100, 0x6900000013000008, 0x636f303168693863, 0x7033686b38686f38, 0x0001000000103068, 0x0000000307070000, 0x0000000400000000, - 0x0808000000080202, 0x00100a0a00000003, 0x000000080d0d0000, 0x0000000500000000, 0x0000000100000001, 0x0000000b0000000b, 0x0800000100000001, 0x6369000000130000, - 0x38636f3031686938, 0x687033686b38686f, 0x0000010000000830, 0x0000000003070700, 0x0200000004000000, 0x0308080000000802, 0x0000080a0a000000, 0x00000000080d0d00, - 0x0300000005000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3163690000001500, 0x33636f3768693832, 0x7033686b37686f32, 0x0002000000103168, - 0x0000000307070000, 0x0001000000030808, 0x0000000402020000, 0x0020020200000003, 0x000000100a0a0000, 0x0000000000080d0d, 0x0003000000050000, 0x0007000000010000, - 0x0001000000070000, 0x0000000100010000, 0x3231636900000015, 0x3233636f37686938, 0x687033686b37686f, 0x0000020000001031, 0x0800000003070700, 0x0000010000000308, - 0x0300000008020200, 0x0000100202000000, 0x0d000000100a0a00, 0x000000000000080d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1800000002000100, - 0x3032336369000000, 0x3233636f36396869, 0x33686b3639686f30, 0x0300000008316870, 0x0000140202000000, 0x0800000003070700, 0x0000010000000308, 0x04000000020d0d00, - 0x0000100202000000, 0x0a00000002090900, 0x100d0d000000200a, 0x0400000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000200, - 0x6869303639636900, 0x6f303436636f3834, 0x687033686b383468, 0x0000030000000831, 0x070000003c020200, 0x0308080000000307, 0x0d00000001000000, 0x000004000000040d, - 0x0900000010020200, 0x200a0a0000000209, 0x00000c0d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x0800010000000100, 0x6900000019000000, - 0x3934776932313563, 0x776f38343032636f, 0x30777031776b3934, 
0x0000000100000008, 0x0002000000100202, 0x000000080a0a0000, 0x0003000000020d0d, 0x0000002002020000, - 0x0d0d000000100a0a, 0xffffffff00000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x0016000000010001, 0x6869363163690000, 0x686f3631636f3631, - 0x30687033686b3431, 0x0000000200000008, 0x0808000000030707, 0x0000000100000002, 0x0004000000020d0d, 0x0000001002020000, 0x0a0a000000020808, 0x00080d0d00000008, - 0x0004000000030000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001700000001, 0x3332686932336369, 0x3232686f33636f32, 0x0830687039686b34, - 0x0700000002000000, 0x0908080000000907, 0x0d00000001000000, 0x000003000000020d, 0x0a00000020020200, 0x1c0d0d000000080a, 0x0400000001000000, 0x0100000003000000, - 0x0400000004000000, 0x0100000001000000, 0x0000170000002000, 0x3268693233636900, 0x32686f33636f3233, 0x30687039686b3432, 0x0000000200000008, 0x0808000000090707, - 0x0000000100000009, 0x0003000000040d0d, 0x0000002002020000, 0x0a0a000000200909, 0x0000000100000008, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, - 0x001b000000100101, 0x3432303163690000, 0x32636f3936317769, 0x6b393631776f3535, 0x0000083077703177, 0x2002020000000100, 0x0a00000002000000, 0x040d0d000000040a, - 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, 0x000004ffffffff00, 0x0000030000000100, 0x0000010000000800, 0x0000010000000800, 0x6900000013000008, - 0x636f303168693263, 0x7033686b38686f38, 0x0001000000083068, 0x000000080c0c0000, 0x0000000300000000, 0x0909000000080202, 0x00080a0a00000010, 0x0004000000000000, - 0x0001000000010000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000080, 0x6931636934343167, 0x6f31636f31353168, 0x687333686b353768, 0x0000000830687032, - 0x0003070700000001, 0x0003000000000000, 0x0000004001010000, 0x0909000000030808, 0x0000000000000008, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, - 0x001b000000020001, 0x6930323363690000, 0x303233636f363968, 0x7333686b3834686f, 0x0000083168703268, 
0x1402020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000040d0d000000, 0x1002020000000400, 0x0000020909000000, 0x0d000000200a0a00, 0x0000000000000c0d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, - 0x1900000001000100, 0x3034366369000000, 0x3231636f36316869, 0x686b3631686f3038, 0x0000001031687033, 0x0005020200000003, 0x0000000307070000, 0x0001000000030808, - 0x0000000402020000, 0x0020020200000003, 0x000000200a0a0000, 0x0000000000100d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, - 0x3436636900000019, 0x31636f3631686930, 0x6b3631686f303832, 0x0000103168703368, 0x1402020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, - 0x03000000020d0d00, 0x0000200202000000, 0x0d000000100a0a00, 0x000003000000080d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1600000002000100, - 0x6869346369000000, 0x6f303233636f3639, 0x687033686b363968, 0x0000010000000831, 0x0100000003070700, 0x0000020d0d000000, 0x0402020000000500, 0x0000040808000000, - 0x0a00000002090900, 0x100d0d000000200a, 0x04ffffffff000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000200, 0x6869303639636900, - 0x6f303233636f3639, 0x687033686b363968, 0x0000030000000831, 0x070000003c020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, - 0x200a0a0000001002, 0x0000180d0d000000, 0x000005ffffffff00, 0x0000030000000300, 0x0000010000000700, 0x1000010000000700, 0x6900000019000000, 0x3368693432303163, - 0x686f343233636f38, 0x31687033686b3833, 0x0000000200000010, 0x0d0d000000020c0c, 0x0000000100000026, 0x0003000000040a0a, 0x0000002002020000, 0x0a0a000000100909, - 0x0000000300000020, 0x0000000300000005, 0x0000000700000003, 0x0000000700000001, 0x001c000000400001, 0x7769343663690000, 0x3231636f32323031, 0x6b37303031776f38, - 0x0010307770363177, 0x0909000000020000, 0x00070d0d00000004, 0x0202000000020000, 0x00020a0a00000002, 0x0202000000030000, 0x00400a0a00000020, 
0x000000100d0d0000, - 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, 0x6369000000180000, 0x636f383231686933, 0x33686b3436686f38, 0x0008306870326873, - 0x0707000000010000, 0x0000000100000003, 0x0004000000020d0d, 0x0000000402020000, 0x0a0a000000080808, 0x00080d0d00000008, 0x0005000000030000, 0x0001000000030000, - 0x0007000000070000, 0x0001000000010000, 0x0000001b00000001, 0x3368693034366369, 0x686f303436636f32, 0x32687333686b3631, 0x0300000010316870, 0x0000140202000000, - 0x0800000003070700, 0x0000020000000308, 0x0d000000080a0a00, 0x000003000000020d, 0x0a00000020020200, 0x080d0d000000100a, 0x0500000003000000, 0x0100000003000000, - 0x0700000007000000, 0x0100000001000000, 0x00001b0000000200, 0x6869303436636900, 0x6f303436636f3233, 0x687333686b363168, 0x0000001031687032, 0x0014020200000003, - 0x0000000307070000, 0x0001000000030808, 0x000000020d0d0000, 0x0020020200000004, 0x0000000209090000, 0x0d0d000000400a0a, 0x0000000300000008, 0x0000000300000004, - 0x0000000700000001, 0x0000000100000007, 0x001c000000010001, 0x6930363963690000, 0x33636f3639303477, 0x36393034776f3032, 0x000830777031776b, 0x0202000000010000, - 0x000000020000003c, 0x0d0d000000040a0a, 0x0000000300000004, 0x0a0a000000100202, 0x00200d0d00000020, 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, - 0x0101000000010000, 0x0000001a00000080, 0x3432326869336369, 0x3131686f3436636f, 0x7032687337686b32, 0x0001000000083368, 0x0000000707070000, 0x00020a0a00000002, - 0x000000080d0d0000, 0x0004020200000004, 0x0000000808080000, 0x0a0a000000200909, 0x0000000300000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, - 0x0015000000010001, 0x3368693363690000, 0x33686f3031636f32, 0x0830687033686b30, 0x0700000002000000, 0x0308080000000307, 0x0d00000001000000, 0x000003000000040d, - 0x0a00000020020200, 0x080d0d000000080a, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001b0000000800, 0x6869383231636900, - 0x6f383231636f3635, 
0x687333686b383268, 0x0000000831687032, 0x0004020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000040a0a0000, 0x0003000000020d0d, - 0x0000002002020000, 0x0d0d000000200a0a, 0xffffffff00000010, 0x0000000300000005, 0x0000000100000002, 0x0000000700000007, 0x0018000000100001, 0x3432303163690000, - 0x3631636f38336869, 0x7033686b3833686f, 0x0002000000103168, 0x0000000307070000, 0x0002000000030808, 0x0000000202020000, 0x0003000000020505, 0x0000004002020000, - 0x0a0a000000100909, 0x0000000300000010, 0x0000000100000005, 0x0000000800000003, 0x0000000800000001, 0x0030000000200001, 0x3264693663690000, 0x3277693832686936, - 0x3632646f36636f37, 0x3632776f3632686f, 0x776b33686b33646b, 0x7030687031647038, 0x0002000000103377, 0x000000020c0c0000, 0x00010000001a0d0d, 0x0000000202020000, - 0x0008080800000003, 0x0000001009090000, 0x0000000000100a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x343663690000001b, - 0x636f363532776930, 0x3532776f30383231, 0x0830777031776b36, 0x0200000001000000, 0x0000020000002802, 0x0d000000080a0a00, 0x000003000000040d, 0x0a00000010020200, - 0x200d0d000000100a, 0x05ffffffff000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x00001f0000001000, 0x7769383231636900, 0x31636f3030303531, - 0x303531776f343230, 0x30777031776b3030, 0x0000000100000010, 0x0002000000fa0d0d, 0x0000000402020000, 0x0003000000080a0a, 0x0000002002020000, 0x0a0a000000100909, - 0x0000000300000020, 0x0000000300000004, 0x0000000100000002, 0x0000000700000007, 0x002f000000010001, 0x3168693463690000, 0x3239317769303830, 0x33686f3633636f30, - 0x6b303436776f3036, 0x33687333776b3368, 0x7770306870337773, 0x0000030000000830, 0x0800000003070700, 0x030a0a0000000308, 0x0500000001000000, 0x0000030000000205, - 0x0500000008020200, 0x100a0a0000001e05, 0x05ffffffff000000, 0x0100000001000000, 0x0800000008000000, 0x0100000001000000, 0x00001e0000006400, 0x3932776931636900, - 0x6f3031636f323335, 0x31776b3530393577, 0x1030777035777330, 
0x0000000000000000, 0x0800000003000000, 0x0a09090000000a08, 0x0000100a0a000000, 0x0000040000000000, - 0x0000030000000100, 0x0000010000000800, 0x0100010000000800, 0x690000001a000000, 0x6f34323268693363, 0x3635686f38323163, 0x687034687334686b, 0x0000010000001030, - 0x000000001c0c0c00, 0x0200000003000000, 0x200a0a0000000302, 0x0000080d0d000000, 0x0000040000000000, 0x0000010000000100, 0x0000040000000400, 0x0100010000000100, - 0x6700000017000000, 0x3268693263693233, 0x3832686f32636f38, 0x000831687033686b, 0x0707000000010000, 0x0000000100000003, 0x0004000000020d0d, 0x0000000202020000, - 0x0a0a000000030808, 0x000e0d0d00000008, 0x0004000000000000, 0x0001000000010000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000080, 0x6931636934343167, - 0x6f31636f31393168, 0x687333686b353968, 0x0000000830687032, 0x0003070700000001, 0x0003000000000000, 0x0000004001010000, 0x0909000000030808, 0x0000000000000008, - 0x0000000300000005, 0x0000000700000003, 0x0000000700000001, 0x0018000000200001, 0x6932313563690000, 0x363834636f393168, 0x7033686b3931686f, 0x0001000000103168, - 0x000000130d0d0000, 0x0000000300000000, 0x0909000000200202, 0x00400a0a00000010, 0x0005000000030000, 0x0002000000030000, 0x0007000000010000, 0x0001000000070000, - 0x0000001c00000010, 0x3577693635326369, 0x383231636f353236, 0x776b35323635776f, 0x0000001030777031, 0x00040a0a00000001, 0x0202000000010000, 0x0000000300000004, - 0x0909000000400202, 0x00200a0a00000010, 0x0005000000030000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001b00000080, 0x3133776934366369, - 0x6f363532636f3633, 0x31776b3633313377, 0x0000000010307770, 0x0a00000001000000, 0x000003000000040a, 0x0a00000040020200, 0x200d0d000000400a, 0x0500000003000000, - 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000000100, 0x6930383231636900, 0x383231636f343677, 0x31776b3436776f30, 0x0100000010307770, - 0x0000280202000000, 0x020a0a0000000200, 0x0000020d0d000000, 0x2002020000000300, 0x0000100a0a000000, 
0x03000000100d0d00, 0x0300000005000000, 0x0700000001000000, - 0x0100000007000000, 0x0000020001000000, 0x3163690000001a00, 0x6f34367769303832, 0x36776f3038323163, 0x1030777031776b34, 0x0200000001000000, 0x0000020000002802, - 0x0d000000020a0a00, 0x000003000000080d, 0x0a00000020020200, 0x080d0d000000200a, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, - 0x00001a0000008001, 0x6869323135636900, 0x6f323135636f3431, 0x32687333686b3768, 0x0300000008316870, 0x0000100202000000, 0x0800000003070700, 0x0000020000000308, - 0x0a00000004090900, 0x000003000000080a, 0x0900000020020200, 0x200a0a0000002009, 0x0500000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, - 0x0000170000002001, 0x3134686933636900, 0x34686f3631636f36, 0x31687033686b3631, 0x0000000100000010, 0x0000000000030707, 0x0202000000040000, 0x0008080800000004, - 0x0000002009090000, 0x0003000000100a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000002001010000, 0x6933636900000017, 0x3631636f36313468, - 0x33686b363134686f, 0x0100000010316870, 0x0000030707000000, 0x080d0d0000000100, 0x0200000004000000, 0x0408080000000402, 0x0000200909000000, 0x03000000100a0a00, - 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001900, 0x6f38346869303239, 0x3834686f30343663, 0x000831687033686b, - 0x0202000000030000, 0x0003070700000078, 0x0000000308080000, 0x00040d0d00000001, 0x0202000000040000, 0x0002090900000010, 0x000000200a0a0000, 0xffff0000000c0d0d, - 0x000100000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000002001010000, 0x343232670000001c, 0x3331337769316369, 0x3133776f31636f36, 0x30777031776b3633, - 0x0000000000000008, 0x0000000200000000, 0x0909000000200101, 0x0000000000000010, 0x0000000100000005, 0x0000000400000001, 0x0000000100000004, 0x0030000000010001, - 0x6369383832670000, 0x7769383231686931, 0x686f31636f363532, 0x363532776f383231, 0x686435776b35686b, 0x7038687033776433, 0x0001000000103877, 
0x0000000507070000, - 0x0000000300000000, 0x0808000000100101, 0x00100d0d00000005, 0x0005000000000000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001800000080, - 0x3268693832316369, 0x686f383231636f38, 0x31687033686b3832, 0x0000000300000010, 0x0707000000020202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000002, - 0x0202000000030000, 0x0020090900000040, 0x000000400a0a0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x6369000000180000, - 0x6f38326869383231, 0x3832686f38323163, 0x000831687033686b, 0x0202000000030000, 0x0003070700000004, 0x0000000308080000, 0x00040a0a00000002, 0x000000040d0d0000, - 0x0020020200000003, 0x0000002009090000, 0x0003000000200a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000800010000, 0x693363690000001c, - 0x3637636f34323268, 0x31686b3431686f38, 0x3068703631687336, 0x0000000200000010, 0x0808000000100707, 0x0000000100000004, 0x0004000000080a0a, 0x0000000402020000, - 0x0a0a000000040808, 0x000e0d0d00000040, 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3368693032336369, - 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000008, 0x0707000000140202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000002, 0x0a0a000000100202, - 0x00100d0d00000020, 0x0004ffffffff0000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000002, 0x3977693034366369, 0x303233636f363132, - 0x776b36313239776f, 0x0000000830777031, 0x0028020200000001, 0x0a0a000000020000, 0x00080d0d00000002, 0x0202000000030000, 0x00200a0a00000010, 0x000000200d0d0000, - 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x63690000001c0000, 0x3331337769363532, 0x776f383231636f36, 0x7031776b36333133, - 0x0001000000083077, 0x0000000802020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000001009090000, 0x0003000000100a0a, 0x0003000000050000, - 0x0007000000030000, 
0x0007000000010000, 0x0000001000010000, 0x3135636900000018, 0x34636f3031686932, 0x686b3031686f3638, 0x0000001031687033, 0x000a0c0c00000002, - 0x0000000a0d0d0000, 0x0000000300000000, 0x0909000000200202, 0x00400a0a00000010, 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, - 0x0000001800000005, 0x3168693231356369, 0x686f323135636f34, 0x31687033686b3431, 0x0000000300000008, 0x0707000000040202, 0x0003080800000003, 0x0202000000010000, - 0x0000000300000004, 0x0a0a000000200202, 0x000e0d0d00000010, 0x0005000000000000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001700000001, - 0x3168693832316369, 0x31686f3233636f34, 0x1031687033686b34, 0x0700000002000000, 0x0308080000000307, 0x0200000001000000, 0x0000030000000402, 0x0a00000020020200, - 0x080d0d000000100a, 0x0500000000000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x0000170000000100, 0x6869383231636900, 0x686f3436636f3233, - 0x31687033686b3233, 0x0000000200000010, 0x0808000000030707, 0x0000000100000003, 0x0003000000040202, 0x0000002002020000, 0x0d0d000000100a0a, 0x0000000000000008, - 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001a000000020001, 0x3032393163690000, 0x3231636f34326869, 0x686b3432686f3038, 0x0000000831687033, - 0x0078020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000040a0a0000, 0x0010020200000003, 0x000000200a0a0000, 0xffff000000180d0d, 0x000300000004ffff, - 0x0004000000010000, 0x0001000000040000, 0x0000002000010000, 0x6932336700000017, 0x636f383268693863, 0x33686b3832686f38, 0x0100000008316870, 0x0000030707000000, - 0x080d0d0000000100, 0x0200000004000000, 0x0408080000000802, 0x0000200909000000, 0x02000000080a0a00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, - 0x0000100101000000, 0x3263690000001b00, 0x3936317769383430, 0x31776f323135636f, 0x30777031776b3936, 0x0000000100000008, 0x0002000000400202, 0x000000080a0a0000, - 0x0003000000080d0d, 0x0000002002020000, 0x0a0a000000100909, 
0xffffffff00000040, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0019000000010001, - 0x6930343663690000, 0x383231636f363168, 0x33686b3631686f30, 0x0300000008316870, 0x0000140202000000, 0x0800000003070700, 0x0000010000000308, 0x0300000002020200, - 0x0000100202000000, 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000500, 0x0000040000000100, 0x0000010000000400, 0x2a00000020010100, 0x6869346369000000, - 0x3931776930383031, 0x31686f31636f3032, 0x323931776f303830, 0x7033776b33686b30, 0x0000103177703168, 0x0307070000000100, 0x0400000000000000, 0x0000040202000000, - 0x0900000008080800, 0x100a0a0000002009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00002a0000002001, 0x3031686934636900, - 0x3032393177693038, 0x383031686f31636f, 0x6b30323931776f30, 0x31687033776b3368, 0x0100000010317770, 0x0000030707000000, 0x080d0d0000000100, 0x0200000004000000, - 0x0408080000000402, 0x0000200909000000, 0x03000000100a0a00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000100101000000, 0x3563690000001900, - 0x636f333168693231, 0x3331686f34323031, 0x000831687033686b, 0x0202000000030000, 0x0003070700000008, 0x0000000308080000, 0x0002020200000001, 0x0202000000030000, - 0x0010090900000020, 0x000000200a0a0000, 0x0000000400000000, 0x0000000100000003, 0x0000000700000007, 0x0020010100000001, 0x6369000000190000, 0x6f33316869323135, - 0x31686f3432303163, 0x0831687033686b33, 0x0200000003000000, 0x0307070000002002, 0x0000030808000000, 0x080a0a0000000100, 0x0200000003000000, 0x2009090000001002, - 0x0000200a0a000000, 0x0000040000000200, 0x0000010000000300, 0x0000040000000400, 0x2001010000000100, 0x6900000019000000, 0x3331686932313563, 0x686f34323031636f, - 0x31687033686b3331, 0x0000000300000008, 0x0707000000100202, 0x0003080800000003, 0x0a0a000000020000, 0x00020d0d00000008, 0x0202000000030000, 0x0020090900000020, - 0x000000100a0a0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 
0x63690000001a0000, 0x3233686930383231, 0x686f30383231636f, - 0x31687033686b3233, 0x0000000300000010, 0x0707000000050202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00200d0d00000020, - 0x0005000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, 0x6869303832316369, 0x30383231636f3233, 0x7033686b3233686f, - 0x0003000000103168, 0x0000002802020000, 0x0808000000030707, 0x0000000100000003, 0x0004000000040d0d, 0x0000002002020000, 0x0a0a000000020909, 0x00080d0d00000020, - 0x0004000000030000, 0x0002000000030000, 0x0007000000010000, 0x0001000000070000, 0x0000001f00000010, 0x7769383830316369, 0x35636f3030303531, 0x30303531776f3231, - 0x0830777031776b30, 0x0a00000001000000, 0x000002000000100a, 0x0500000004020200, 0x0000030000000805, 0x0900000020020200, 0x200a0a0000001009, 0x0400000003000000, - 0x0100000001000000, 0x0700000007000000, 0x0100000001000000, 0x00001b0000000100, 0x3163693231356700, 0x6f31636f32337769, 0x776433776b323377, 0x0000000832777031, - 0x0000000100000000, 0x0003000000040d0d, 0x0000000801010000, 0x0d0d000000030808, 0xffffffff00000008, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, - 0x001b000000080001, 0x6936353263690000, 0x363532636f383268, 0x7333686b3431686f, 0x0000083168703268, 0x0402020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000020202000000, 0x2002020000000300, 0x0000200a0a000000, 0x000000000e0d0d00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, - 0x3163690000001800, 0x636f323377693832, 0x6b3233776f363532, 0x0000083077703177, 0x0202020000000100, 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, - 0x080a0a0000004002, 0x0000080d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, 0x3233776938323163, - 0x33776f363532636f, 0x0830777031776b32, 0x0200000001000000, 0x0000020000000402, 0x0d000000040a0a00, 0x000003000000040d, 0x0a00000020020200, 
0x080d0d000000080a, - 0x05ffffffff000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000320000001001, 0x3164693233636900, 0x3377693333686937, 0x31646f3233636f33, - 0x33776f3333686f37, 0x6b33686b33646b33, 0x3168703164703377, 0x0400000010317770, 0x0000020202000000, 0x0700000003060600, 0x0308080000000307, 0x0d00000001000000, - 0x000003000000020d, 0x0900000010020200, 0x200a0a0000001009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, - 0x6869303436636900, 0x6f303436636f3233, 0x687033686b323368, 0x0000030000001031, 0x0700000014020200, 0x0308080000000307, 0x0a00000001000000, 0x000003000000080a, - 0x0a00000020020200, 0x200d0d000000100a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000200, 0x6869303436636900, - 0x6f303436636f3233, 0x687033686b323368, 0x0000030000001031, 0x0700000014020200, 0x0308080000000307, 0x0a00000001000000, 0x000003000000080a, 0x0a00000020020200, - 0x200d0d000000200a, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001d0000001001, 0x7769383231636900, 0x36636f3436323334, - 0x3436323334776f34, 0x000830777031776b, 0x0202000000010000, 0x0000000200000004, 0x0d0d000000020a0a, 0x0000000300000002, 0x0909000000200202, 0x00200a0a00000010, - 0x0004ffffffff0000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001800000008, 0x3168693635326369, 0x686f363532636f34, 0x31687033686b3431, - 0x0000000300000008, 0x0707000000040202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000002, 0x0a0a000000200202, 0x000e0d0d00000020, 0x0004000000000000, - 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001800000001, 0x3168693635326369, 0x686f363532636f34, 0x31687033686b3431, 0x0000000200000008, - 0x0808000000030707, 0x0000000100000003, 0x0003000000080202, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000000000008, 0x0000000300000005, 0x0000000400000001, - 0x0000000100000004, 
0x0017000000200001, 0x3863693233670000, 0x6f38636f38326869, 0x687033686b383268, 0x0000020000001031, 0x0800000003070700, 0x0000010000000308, - 0x03000000040d0d00, 0x0000200202000000, 0x0a00000020090900, 0x000003000000100a, 0x0000010000000500, 0x0000010000000200, 0x0000080000000800, 0x1300000800000100, - 0x6869386369000000, 0x38686f38636f3031, 0x001030687033686b, 0x0707000000020000, 0x0003080800000003, 0x0505000000010000, 0x0000000300000002, 0x0909000000100202, - 0x00080a0a00000010, 0x0005000000000000, 0x0002000000010000, 0x000b000000010000, 0x00010000000b0000, 0x0000001300000800, 0x6f30316869386369, 0x33686b38686f3863, - 0x0200000008306870, 0x0000030707000000, 0x0100000003080800, 0x0000020505000000, 0x0802020000000300, 0x0000100909000000, 0x00000000080a0a00, 0x0300000004000000, - 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3163690000003100, 0x3030316869343230, 0x35636f3137317769, 0x6f303031686f3231, 0x6b33686b31373177, - 0x3177643168643377, 0x0008327770326870, 0x0202000000030000, 0x0003070700000020, 0x0000000308080000, 0x00080a0a00000001, 0x0202000000030000, 0x00400a0a00000020, - 0x000000100d0d0000, 0x0000000500000003, 0x0000000200000003, 0x0000000700000001, 0x0010000100000007, 0x6369000000180000, 0x6f39316869323135, 0x3931686f36383463, - 0x001031687033686b, 0x0707000000030000, 0x0003080800000003, 0x000000100a0a0000, 0x0002020200000002, 0x0000000805050000, 0x0040020200000003, 0x0000001009090000, - 0x0003000000200a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3436636900000038, 0x3268693432326469, 0x6f30363177693432, - 0x343232646f323363, 0x31776f343232686f, 0x33686b33646b3036, 0x687031647033776b, 0x0000001031777031, 0x0004020200000004, 0x0000000306060000, 0x0808000000030707, - 0x0000000000000003, 0x0010020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, - 0x3436636900000038, 0x3268693432326469, 0x6f30363177693432, 
0x343232646f323363, 0x31776f343232686f, 0x33686b33646b3036, 0x687031647033776b, 0x0000001031777031, - 0x0003060600000003, 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, 0x0020020200000003, 0x000000200a0a0000, 0x0000000000200d0d, 0x0003000000050000, - 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x353263690000001a, 0x636f323135686936, 0x323135686f363532, 0x001031687033686b, 0x0202000000030000, - 0x0003070700000008, 0x0000000308080000, 0x00040a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000200d0d, 0x0001000000050000, - 0x0008000000030000, 0x0008000000010000, 0x0000002000010000, 0x6938636900000014, 0x646f38636f353264, 0x32647035646b3532, 0x0000000100000010, 0x0000000000190d0d, - 0x0202000000030000, 0x0010090900000008, 0x000000100a0a0000, 0x0000000400000000, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x6369000000190000, - 0x3934776938343032, 0x34776f323135636f, 0x0830777031776b39, 0x0200000001000000, 0x0000020000004002, 0x0d000000040a0a00, 0x000003000000040d, 0x0900000020020200, - 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, 0x6869303639636900, 0x6f303436636f3233, - 0x687033686b323368, 0x0000030000000831, 0x070000003c020200, 0x0308080000000307, 0x0a00000001000000, 0x000003000000080a, 0x0a00000010020200, 0x100d0d000000200a, - 0x05ffffffff000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x0000160000001000, 0x6869363532636900, 0x686f363834636f35, 0x1031687033686b35, - 0x0c00000002000000, 0x050d0d000000050c, 0x0a00000001000000, 0x000003000000040a, 0x0900000020020200, 0x200a0a0000001009, 0x0400000003000000, 0x0100000003000000, - 0x0700000007000000, 0x0100000001000000, 0x00001a0000000100, 0x6930363532636900, 0x383231636f343677, 0x31776b3436776f30, 0x0100000008307770, 0x0000140202000000, - 0x0802020000000200, 0x0000020d0d000000, 0x1002020000000300, 0x0000100a0a000000, 0x00000000200d0d00, 
0x0300000004000000, 0x0700000001000000, 0x0100000007000000, - 0x0000200101000000, 0x3163690000001900, 0x6f30316869303832, 0x3031686f36343563, 0x000831687033686b, 0x0202000000030000, 0x0003070700000050, 0x0000000308080000, - 0x00080a0a00000002, 0x000000020d0d0000, 0x0010020200000003, 0x0000002009090000, 0x0003000000100a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, - 0x0000000100010000, 0x343663690000001c, 0x6f34323031776930, 0x3031776f30343663, 0x30777031776b3432, 0x0000000100000010, 0x0002000000140202, 0x000000040a0a0000, - 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000400a0a, 0x0000000300000008, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001c000000020001, - 0x6930343663690000, 0x36636f3432303177, 0x34323031776f3034, 0x001030777031776b, 0x0202000000010000, 0x0000000200000014, 0x0d0d000000040a0a, 0x0000000300000008, - 0x0a0a000000200202, 0x00100d0d00000040, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001b00000010, 0x3768693832316369, - 0x686f383231636f35, 0x32687333686b3833, 0x0300000010316870, 0x0000040202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000020a0a00, 0x000003000000040d, - 0x0900000020020200, 0x400a0a0000001009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000190000001000, 0x6934323031636900, - 0x343233636f383368, 0x7033686b3833686f, 0x0003000000103168, 0x0000002002020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000008, - 0x0909000000200202, 0x00400a0a00000010, 0x0005000000030000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001600000020, 0x3268693434316369, - 0x32686f343431636f, 0x001031687033686b, 0x0202000000030000, 0x0003070700000005, 0x0000000308080000, 0x0002090900000002, 0x000000020a0a0000, 0x0020020200000003, - 0x0000001009090000, 0x0003000000100a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3135636900000016, 
0x36636f3031686932, - 0x7033686b3031686f, 0x0003000000083168, 0x0000000402020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000080202, 0x0000001002020000, 0x0d0d000000080a0a, - 0x0000000000000008, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001b000000010001, 0x6930343663690000, 0x3231636f36353277, 0x6b363532776f3038, - 0x0000103077703177, 0x1402020000000100, 0x0a00000002000000, 0x080d0d000000020a, 0x0200000003000000, 0x200a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, - 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x690000001b000000, 0x3532776930343663, 0x6f30383231636f36, 0x7031776b36353277, 0x0001000000103077, - 0x0000001402020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000100d0d, 0x0003000000050000, 0x0001000000020000, - 0x0007000000070000, 0x0000001000010000, 0x3532636900000018, 0x32636f3833686936, 0x686b3833686f3635, 0x0000001031687033, 0x0003070700000003, 0x0000000308080000, - 0x0002000000080a0a, 0x0000000202020000, 0x0003000000080505, 0x0000004002020000, 0x0a0a000000100909, 0x0000000300000020, 0x0000000300000005, 0x0000000700000001, - 0x0000000100000007, 0x001b000000100001, 0x6936353263690000, 0x323135636f353768, 0x7331686b3833686f, 0x0000103068703268, 0x0802020000000100, 0x0a00000002000000, - 0x020d0d000000040a, 0x0200000003000000, 0x1009090000002002, 0x0000400a0a000000, 0x0000050000000300, 0x0000030000000300, 0x0000010000000700, 0x1000010000000700, - 0x6900000018000000, 0x3833686936353263, 0x33686f363532636f, 0x1031687033686b38, 0x0c00000002000000, 0x020d0d000000260c, 0x0200000002000000, 0x020a0a0000000802, - 0x0200000003000000, 0x1009090000002002, 0x0000400a0a000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x2000010000000100, 0x6700000017000000, - 0x3568693463693233, 0x3635686f34636f36, 0x001031687033686b, 0x0707000000020000, 0x0003080800000003, 0x0d0d000000010000, 0x0000000300000008, 0x0909000000100202, - 0x00100a0a00000020, 
0x0004000000000000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001800000080, 0x3168693635326369, 0x686f363532636f34, - 0x31687033686b3431, 0x0000000300000008, 0x0707000000080202, 0x0003080800000003, 0x0909000000020000, 0x00040a0a00000004, 0x0202000000030000, 0x0020090900000020, - 0x000000200a0a0000, 0x0000000500000003, 0x0000000300000003, 0x000000010000000a, 0x002001010000000a, 0x6369000000300000, 0x3268693632646936, 0x36636f3732776938, - 0x3632686f3632646f, 0x6b33646b3632776f, 0x31647038776b3368, 0x0010337770306870, 0x0c0c000000020000, 0x001a0d0d0000000d, 0x0003000000000000, 0x0000000808080000, - 0x0a0a000000100909, 0x0000000200000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001c000000020001, 0x6930363963690000, 0x36636f3430333277, - 0x34303332776f3034, 0x000830777031776b, 0x0202000000010000, 0x000000020000003c, 0x0d0d000000040a0a, 0x0000000300000008, 0x0a0a000000100202, 0x00180d0d00000020, - 0x0004ffffffff0000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001400000001, 0x3568693635326369, 0x686b35686f36636f, 0x0000000831687033, - 0x0002020200000003, 0x0000000307070000, 0x0001000000030808, 0x0000000802020000, 0x0010020200000003, 0x000000080a0a0000, 0x0000000000080d0d, 0x0003000000050000, - 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3436636900000015, 0x6f31636f32336869, 0x687033686b323368, 0x0000030000001031, 0x0700000002020200, - 0x0308080000000307, 0x0d00000001000000, 0x000003000000040d, 0x0a00000020020200, 0x080d0d000000100a, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x0000150000000100, 0x6869323135636900, 0x35686f3432636f35, 0x000831687033686b, 0x0202000000030000, 0x0003070700000004, 0x0000000308080000, - 0x0008020200000001, 0x0202000000030000, 0x00080a0a00000010, 0x000000080d0d0000, 0x0000000400000000, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, - 0x63690000001a0000, 0x3631686930383231, 0x686f30383231636f, 
0x31687033686b3631, 0x0000000300000008, 0x0707000000280202, 0x0003080800000003, 0x0202000000010000, - 0x0000000300000002, 0x0a0a000000100202, 0x00100d0d00000020, 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, - 0x6869303832316369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000083168, 0x0000005002020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000020a0a, - 0x0000001002020000, 0x0d0d000000200a0a, 0x0000000200000010, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x6930363963690000, - 0x303436636f323368, 0x7033686b3233686f, 0x0003000000103168, 0x0000001e02020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000004, - 0x0a0a000000200202, 0x00080d0d00000020, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3368693036396369, - 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000010, 0x07070000001e0202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000002, 0x0909000000200202, - 0x00400a0a00000002, 0x000000100d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001b0000, 0x3637357769303436, - 0x776f30383231636f, 0x777031776b363735, 0x0000010000000830, 0x0200000028020200, 0x0000080a0a000000, 0x03000000080d0d00, 0x0000100202000000, 0x0d000000200a0a00, - 0x000000000000180d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1800000001000100, 0x3832316369000000, 0x3231636f32336869, 0x33686b3233686f38, - 0x0200000008316870, 0x0000030707000000, 0x0100000003080800, 0x0000080202000000, 0x1002020000000300, 0x0000200a0a000000, 0x00000000100d0d00, 0x0300000004000000, - 0x0700000003000000, 0x0700000001000000, 0x0000080001000000, 0x3563690000001b00, 0x636f333777693231, 0x6b3533776f323135, 0x3077703277733477, 0x0000000100000008, - 0x0002000000080909, 0x0000000802020000, 0x0003000000080a0a, 0x0000002002020000, 0x0d0d000000200a0a, 
0x0000000300000010, 0x0000000300000005, 0x0000000700000001, - 0x0000000100000007, 0x000a000000010001, 0x3235313163690000, 0x000000103834636f, 0x0009020200000001, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000100202, - 0x00080d0d00000010, 0x0004000000000000, 0x0001000000010000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x6931636932313567, 0x776f31636f323377, - 0x31777033776b3233, 0x0000000000000008, 0x0000000300000000, 0x0808000000080101, 0x00080d0d00000003, 0x0005000000000000, 0x0003000000030000, 0x0001000000070000, - 0x0001000000070000, 0x0000001700000018, 0x6869386369323367, 0x35686f38636f3635, 0x1031687033686b36, 0x0c00000002000000, 0x0e0d0d000000020c, 0x0300000000000000, - 0x0000080202000000, 0x0a00000020090900, 0x000000000000100a, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3032336369000000, - 0x636f363930347769, 0x393034776f303233, 0x0830777031776b36, 0x0200000001000000, 0x0000020000001402, 0x0d000000040a0a00, 0x000003000000040d, 0x0a00000010020200, - 0x200d0d000000200a, 0x05ffffffff000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x00001b0000001000, 0x6869383231636900, 0x6f383231636f3537, - 0x687333686b383368, 0x0000001031687032, 0x00130c0c00000001, 0x0202000000020000, 0x00020a0a00000004, 0x0202000000030000, 0x0010090900000020, 0x000000400a0a0000, - 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001a0000, 0x3631686930363532, 0x686f30383231636f, 0x31687033686b3631, - 0x0000000300000010, 0x07070000000a0202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00100d0d00000020, 0x0005000000000000, - 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, 0x6869303635326369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000103168, - 0x0000000a02020000, 0x0808000000030707, 0x0000000100000003, 0x0004000000080202, 0x0000002002020000, 0x0a0a000000020909, 0x00100d0d00000020, 
0x0005000000000000, - 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3668693034366369, 0x686f303233636f34, 0x31687033686b3436, 0x0000000300000010, - 0x0707000000140202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000002, 0x0202000000030000, 0x00400a0a00000020, 0x000000100d0d0000, 0x0000000500000003, - 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x6369000000180000, 0x6f34366869303436, 0x3436686f30323363, 0x001031687033686b, 0x0202000000030000, - 0x0003070700000014, 0x0000000308080000, 0x00080d0d00000001, 0x0202000000040000, 0x0002090900000020, 0x000000200a0a0000, 0x0003000000080d0d, 0x0003000000040000, - 0x0007000000010000, 0x0001000000070000, 0x0000000600010000, 0x3135636900000026, 0x3277693832686932, 0x686f323135636f31, 0x686b3132776f3832, 0x7031687033776b33, - 0x0003000000083177, 0x0000000802020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000020202, 0x0000002002020000, 0x0d0d000000100a0a, 0x0000000000000015, - 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0019000000010001, 0x3032393163690000, 0x3436636f32336869, 0x33686b3233686f30, 0x0300000008316870, - 0x0000780202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000080a0a00, 0x000003000000020d, 0x0a00000010020200, 0x100d0d000000200a, 0x04ffffffff000000, - 0x0100000001000000, 0x0700000007000000, 0x0100000001000000, 0x0000190000008000, 0x6931636930346700, 0x6f31636f30353168, 0x7033686b30353168, 0x0001000000083168, - 0x0000000307070000, 0x00080d0d00000001, 0x0101000000030000, 0x0003080800000010, 0x0000000809090000, 0x0000000500000001, 0x0000000200000003, 0x0000000700000001, - 0x0010000100000007, 0x6369000000190000, 0x3833686934323031, 0x33686f343233636f, 0x1031687033686b38, 0x0700000003000000, 0x0308080000000307, 0x00000b0a0a000000, - 0x0402020000000200, 0x0000080505000000, 0x4002020000000300, 0x0000100909000000, 0x03000000200a0a00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, - 0x0000010001000000, 
0x3363690000001700, 0x33636f3233326869, 0x686b343232686f32, 0x0000000830687039, 0x0009070700000002, 0x0000000208080000, 0x00020a0a00000002, - 0x000000040d0d0000, 0x0004020200000004, 0x0000000808080000, 0x0d0d000000100a0a, 0x000000010000001c, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, - 0x001d000000010001, 0x3038323163690000, 0x636f343230317769, 0x323031776f303436, 0x0830777031776b34, 0x0200000001000000, 0x0000020000005002, 0x0d000000040a0a00, - 0x000003000000080d, 0x0a00000010020200, 0x200d0d000000200a, 0x05ffffffff000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000004000, - 0x7769383231636900, 0x6f383231636f3831, 0x777034776b353177, 0x0000020000001030, 0x0800000004020200, 0x0000010000000408, 0x03000000080d0d00, 0x0000200202000000, - 0x0a00000008090900, 0xffffff000000200a, 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x1b00000001000100, 0x6869336369000000, 0x3432636f34323031, - 0x37686b363532686f, 0x0008336870346873, 0x0707000000010000, 0x0000000200000007, 0x0d0d000000020a0a, 0x0000000400000008, 0x0808000000040202, 0x00100a0a00000008, - 0x000000200d0d0000, 0x0000000400000001, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000180000, 0x6f34366869303233, 0x3436686f30323363, - 0x000831687033686b, 0x0202000000030000, 0x0003070700000014, 0x0000000308080000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0010020200000003, 0x000000100a0a0000, - 0xffff000000100d0d, 0x000300000004ffff, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323163690000001a, 0x636f323135686938, 0x323135686f383231, - 0x000831687033686b, 0x0202000000030000, 0x0003070700000004, 0x0000000308080000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x000000200a0a0000, - 0x0003000000100d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000001800010000, 0x6932336700000017, 0x636f363568693863, 0x33686b3635686f38, - 0x0100000008316870, 0x0000030707000000, 0x080d0d0000000100, 
0x0200000004000000, 0x0408080000000802, 0x00000c0909000000, 0x02000000080a0a00, 0x0300000005000000, - 0x0700000003000000, 0x0700000001000000, 0x0000400001000000, 0x3163690000001800, 0x636f383177693832, 0x6b3531776f383231, 0x0000103077703477, 0x4009090000000100, - 0x0200000002000000, 0x020a0a0000000202, 0x0200000003000000, 0x100a0a0000001002, 0x0000100d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, - 0x0200010000000100, 0x690000001a000000, 0x3468693038323163, 0x6f30383231636f38, 0x687033686b383468, 0x0000030000000831, 0x0700000050020200, 0x0308080000000307, - 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, 0x200a0a0000001002, 0x0000180d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, - 0x1001010000000100, 0x690000001c000000, 0x3830317769343663, 0x776f3436636f3631, 0x31776b3631383031, 0x0100000008307770, 0x0000020202000000, 0x020d0d0000000100, - 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000014000000, - 0x636f396869323363, 0x33686b37686f3233, 0x0200000008306870, 0x0000030707000000, 0x0100000003080800, 0x0000020202000000, 0x1002020000000300, 0x0000100a0a000000, - 0x00000000070d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3363690000001c00, 0x3432303177693032, 0x31776f303436636f, - 0x777031776b343230, 0x0000010000000830, 0x0200000014020200, 0x0000080a0a000000, 0x03000000080d0d00, 0x0000100202000000, 0x0d000000100a0a00, 0x000000000000200d, - 0x0000030000000500, 0x00000a0000000300, 0x00000a0000000100, 0x1c00000040010100, 0x6934366369000000, 0x31636f3232303177, 0x37303031776f3832, 0x103077703631776b, - 0x0900000002000000, 0x130d0d0000000209, 0x0200000002000000, 0x020a0a0000000202, 0x0200000003000000, 0x0809090000002002, 0x0000400a0a000000, 0x0000050000000200, - 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001c000000, 0x3277693036353263, 
0x30383231636f3635, 0x31776b363532776f, 0x0100000010307770, - 0x0000500202000000, 0x020a0a0000000200, 0x0000080d0d000000, 0x2002020000000300, 0x0000200a0a000000, 0x03000000100d0d00, 0x0300000005000000, 0x0700000001000000, - 0x0100000007000000, 0x0000020001000000, 0x3263690000001c00, 0x3635327769303635, 0x776f30383231636f, 0x777031776b363532, 0x0000010000001030, 0x0200000050020200, - 0x0000020a0a000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000400a0a00, 0x000003000000100d, 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, - 0x1a00000008000100, 0x3231356369000000, 0x32636f3438377769, 0x6b343837776f3635, 0x0000083077703177, 0x1002020000000100, 0x0a00000002000000, 0x040d0d000000080a, - 0x0200000003000000, 0x200a0a0000002002, 0x00001c0d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000000a000000, - 0x34636f3235313163, 0x0000010000000838, 0x0100000009020200, 0x0000080202000000, 0x1002020000000300, 0x0000080a0a000000, 0x00000000080d0d00, 0x0300000004000000, - 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3263690000001800, 0x636f323377693635, 0x6b3233776f383231, 0x0000083077703177, 0x0000010000000000, - 0x0300000004020200, 0x0000400202000000, 0x0d000000080a0a00, 0x000000000000100d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1800000001000100, - 0x3635326369000000, 0x3231636f32337769, 0x31776b3233776f38, 0x0000000008307770, 0x0200000001000000, 0x0000030000000802, 0x0a00000020020200, 0x080d0d000000080a, - 0x0400000000000000, 0x0100000001000000, 0x0400000004000000, 0x0100000001000000, 0x00001c0000002001, 0x3163693635326700, 0x636f363930347769, 0x6b36393034776f31, - 0x0000083077703177, 0x0000000000000000, 0x2001010000000200, 0x0000100909000000, 0x0000040000000000, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, - 0x6900000018000000, 0x3233776938323163, 0x33776f323135636f, 0x0830777031776b32, 0x0100000000000000, 0x0000020202000000, 0x4002020000000300, 
0x0000080a0a000000, - 0x00000000080d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x636f323377693832, 0x6b3233776f323135, - 0x0000083077703177, 0x0000010000000000, 0x0300000008020200, 0x0000100202000000, 0x0d000000100a0a00, 0x000000000000100d, 0x0000030000000400, 0x0000040000000100, + 0x0000000000000196, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x6869323363690000, 0x6f3233636f323135, 0x7033686b32313568, + 0x0003000000083168, 0x0000000202020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000004, 0x0a0a000000100202, 0x00200d0d00000010, + 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3668693032336369, 0x686f303233636f34, 0x31687033686b3436, + 0x0000000300000010, 0x07070000000a0202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000008, 0x0909000000200202, 0x00200a0a00000002, 0x000000080d0d0000, + 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000180000, 0x6f34366869303233, 0x3436686f30323363, 0x001031687033686b, + 0x0202000000030000, 0x000307070000000a, 0x0000000308080000, 0x00020a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000100d0d, + 0x0003000000050000, 0x000a000000030000, 0x000a000000010000, 0x0000002000010000, 0x6936636900000014, 0x646f31636f363264, 0x30647032646b3532, 0x0000000200000010, + 0x0d0d000000020909, 0x0000000000000019, 0x0008020200000004, 0x0000000208080000, 0x0a0a000000080909, 0x0000000000000010, 0x0000000300000005, 0x0000000700000003, + 0x0000000700000001, 0x0016000000100001, 0x6869343663690000, 0x686f3436636f3537, 0x31687033686b3537, 0x0000000200000010, 0x0d0d000000050c0c, 0x0000000100000005, + 0x0003000000040202, 0x0000001002020000, 0x0a0a000000100909, 0x0000000300000040, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, + 0x6930343663690000, 
0x303233636f343668, 0x7033686b3436686f, 0x0003000000083168, 0x0000002802020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000080a0a, + 0x0000000300000004, 0x0a0a000000100202, 0x00100d0d00000010, 0x0004ffffffff0000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, + 0x3368693034366369, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000008, 0x0707000000280202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000008, + 0x0a0a000000100202, 0x00100d0d00000020, 0x0004ffffffff0000, 0x0003000000030000, 0x0001000000070000, 0x0001000000070000, 0x0000001c00000008, 0x3277693231356369, + 0x6f323135636f3939, 0x777338776b333777, 0x0000000830777034, 0x0008090900000001, 0x0202000000020000, 0x00040a0a00000008, 0x0202000000030000, 0x00400a0a00000010, + 0x000000100d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000000a0000, 0x3834636f32353131, 0x0000000100000008, + 0x0001000000090202, 0x0000000802020000, 0x0010020200000003, 0x000000080a0a0000, 0x0000000000080d0d, 0x0003000000050000, 0x0007000000030000, 0x0007000000010000, + 0x0000000100010000, 0x693363690000001a, 0x3231636f34323268, 0x34686b3635686f38, 0x0010306870346873, 0x0001000000000000, 0x000000020a0a0000, 0x0008020200000004, + 0x0000000408080000, 0x0d0d000000100a0a, 0x0000000300000010, 0x0000000300000005, 0x0000000a00000003, 0x0000000a00000001, 0x001a000000010001, 0x3268693363690000, + 0x6f383231636f3432, 0x687334686b363568, 0x0000001030687034, 0x00070c0c00000001, 0x0a0a000000010000, 0x0000000400000008, 0x0808000000040202, 0x00100a0a00000004, + 0x000000080d0d0000, 0x0000000400000002, 0x0000000300000003, 0x0000000100000007, 0x0010010100000007, 0x63690000001d0000, 0x3531776938383031, 0x6f323135636f3030, + 0x31776b3030353177, 0x0100000008307770, 0x0001f40d0d000000, 0x0802020000000200, 0x0000080a0a000000, 0x2002020000000300, 0x0000100909000000, 0x03000000200a0a00, + 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 
0x0000020001000000, 0x3263690000001c00, 0x3635327769303635, 0x776f30383231636f, 0x777031776b363532, + 0x0000010000001030, 0x0200000050020200, 0x0000020a0a000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000400a0a00, 0x000003000000100d, 0x0000030000000500, + 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3635326369000000, 0x636f363532776930, 0x3532776f30383231, 0x1030777031776b36, 0x0200000001000000, + 0x0000020000005002, 0x0d000000020a0a00, 0x000003000000080d, 0x0a00000020020200, 0x100d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, + 0x0100000001000000, 0x0000190000000200, 0x6930323931636900, 0x303436636f323368, 0x7033686b3233686f, 0x0003000000103168, 0x0000003c02020000, 0x0808000000030707, + 0x0000000100000003, 0x0004000000020d0d, 0x0000002002020000, 0x0a0a000000020909, 0x00100d0d00000040, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, + 0x0001000000010000, 0x0000001900000001, 0x6869303239316369, 0x6f303436636f3233, 0x687033686b323368, 0x0000030000001031, 0x070000003c020200, 0x0308080000000307, + 0x0a00000002000000, 0x020d0d000000040a, 0x0200000003000000, 0x200a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, + 0x0100010000000100, 0x6900000018000000, 0x3639303477693463, 0x393034776f34636f, 0x1030777031776b36, 0x0100000000000000, 0x0000020d0d000000, 0x0402020000000400, + 0x0000040808000000, 0x0d000000100a0a00, 0x000003000000100d, 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, 0x1b00000080010100, 0x3635326369000000, + 0x3135636f36356869, 0x31686b3832686f32, 0x0008306870326873, 0x0202000000010000, 0x0000000100000008, 0x0003000000080a0a, 0x0000002002020000, 0x0a0a000000200909, + 0x0000000200000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x0018000000200101, 0x6934383363690000, 0x363532636f363268, 0x7033686b3632686f, + 0x0003000000083168, 0x0000000c02020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000040a0a, 
0x0000000300000004, 0x0909000000200202, 0x00200a0a00000020, + 0x0005000000030000, 0x0002000000010000, 0x000b000000010000, 0x00010000000b0000, 0x0000001300000800, 0x6f30316869326369, 0x33686b38686f3863, 0x0200000008306870, + 0x0000030707000000, 0x0100000003080800, 0x0000020505000000, 0x0802020000000300, 0x0000100909000000, 0x00000000080a0a00, 0x0100000005000000, 0x0100000002000000, + 0x0800000008000000, 0x0008000001000000, 0x3263690000001300, 0x6f38636f30316869, 0x30687033686b3868, 0x0000000200000010, 0x0808000000030707, 0x0000000000000003, + 0x0010020200000003, 0x0000001009090000, 0x0000000000080a0a, 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x3631636900000018, + 0x33636f3830326869, 0x686b383032686f32, 0x0000000831687033, 0x0003070700000002, 0x0000000308080000, 0x00020a0a00000001, 0x0202000000030000, 0x00100a0a00000020, + 0x000000100d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001c0000, 0x3735776930323931, 0x6f30383231636f36, + 0x7031776b36373577, 0x0001000000083077, 0x0000007802020000, 0x00080a0a00000002, 0x000000080d0d0000, 0x0010020200000003, 0x000000200a0a0000, 0x0000000000180d0d, + 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x393163690000001a, 0x636f363168693032, 0x3631686f30383231, 0x000831687033686b, + 0x0202000000030000, 0x000307070000003c, 0x0000000308080000, 0x0002020200000001, 0x0202000000030000, 0x00200a0a00000010, 0x000000100d0d0000, 0x0000000500000000, + 0x0000000300000003, 0x000000010000000a, 0x004001010000000a, 0x6369000000190000, 0x6f30397769363532, 0x3935776f36353263, 0x103077703233776b, 0x0900000002000000, + 0x3b0d0d0000000209, 0x0200000001000000, 0x0000030000000802, 0x0900000020020200, 0x400a0a0000000809, 0x0400000002000000, 0x0100000003000000, 0x0700000007000000, + 0x0100000001000000, 0x00001a0000000100, 0x6930383231636900, 0x383231636f323368, 0x33686b3233686f30, 0x0300000008316870, 0x0000500202000000, 
0x0800000003070700, + 0x0000020000000308, 0x0d000000040a0a00, 0x000003000000020d, 0x0a00000010020200, 0x100d0d000000100a, 0x05ffffffff000000, 0x0300000003000000, 0x0100000007000000, + 0x0100000007000000, 0x0000190000004000, 0x7769363532636900, 0x6f363532636f3039, 0x703233776b393577, 0x0002000000103077, 0x0000004009090000, 0x0002000000040d0d, + 0x0000000802020000, 0x0003000000040a0a, 0x0000001002020000, 0x0d0d000000400a0a, 0xffffffff00000010, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, + 0x002c000000200101, 0x3868693163690000, 0x3536303031776930, 0x3038686f31636f30, 0x303536303031776f, 0x703332776b31686b, 0x0010313177703068, 0x0808000000010000, + 0x0000000100000003, 0x0004000000080d0d, 0x0000000202020000, 0x0909000000080808, 0x00100a0a00000020, 0x0005000000030000, 0x0001000000030000, 0x0004000000040000, + 0x0101000000010000, 0x0000002c00000020, 0x6930386869316369, 0x6f30353630303177, 0x776f3038686f3163, 0x686b303536303031, 0x3068703332776b31, 0x0000001031317770, + 0x0003080800000001, 0x0d0d000000010000, 0x0000000400000004, 0x0808000000040202, 0x0020090900000008, 0x000000100a0a0000, 0x0000000400000003, 0x0000000100000003, + 0x0000000400000004, 0x0c80010100000001, 0x6369000000190000, 0x6f34367769383434, 0x36776f3038303263, 0x0830777031776b34, 0x0200000001000000, 0x0000020000000e02, + 0x0d000000040a0a00, 0x000003000000080d, 0x0900000020020200, 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, + 0x00001900000c8001, 0x7769383434636900, 0x30383032636f3436, 0x7031776b3436776f, 0x0001000000083077, 0x0000001c02020000, 0x00020a0a00000002, 0x000000080d0d0000, + 0x0010020200000003, 0x0000002009090000, 0x0003000000200a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3231636900000019, + 0x636f323368693038, 0x6b3233686f303436, 0x0000083168703368, 0x5002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000080a0a000000, 0x03000000020d0d00, + 0x0000100202000000, 
0x0d000000200a0a00, 0xffffff000000100d, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, 0x1a00000001000100, 0x3832316369000000, + 0x31636f3831686930, 0x6b3631686f303832, 0x0000083068703368, 0x2802020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020202000000, 0x1002020000000300, + 0x0000200a0a000000, 0x00000000100d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x636f386869303832, + 0x6b38686f30383231, 0x0000083168703368, 0x1402020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020202000000, 0x2002020000000300, 0x0000100a0a000000, + 0x00000000080d0d00, 0x0300000005000000, 0x0700000003000000, 0x0700000001000000, 0x0000100001000000, 0x3163690000001800, 0x6f38336869343230, 0x6b3833686f363163, + 0x0000103168703368, 0x260d0d0000000100, 0x0300000000000000, 0x0000200202000000, 0x0a00000010090900, 0x000003000000100a, 0x0000010000000500, 0x00000b0000000300, + 0x00000b0000000100, 0x1300000800000100, 0x6869326369000000, 0x38686f38636f3031, 0x000830687033686b, 0x0c0c000000020000, 0x00080d0d00000004, 0x0004000000000000, + 0x0000000202020000, 0x0909000000030808, 0x00080a0a00000008, 0x0005000000000000, 0x0003000000010000, 0x0001000000080000, 0x0001000000080000, 0x0000001300000800, + 0x6f30316869326369, 0x33686b38686f3863, 0x0200000010306870, 0x0000040c0c000000, 0x00000000080d0d00, 0x0200000004000000, 0x0308080000000202, 0x0000100909000000, + 0x00000000100a0a00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001c00, 0x3635327769303832, 0x776f30383231636f, + 0x777031776b363532, 0x0000010000000830, 0x0200000050020200, 0x0000080a0a000000, 0x03000000040d0d00, 0x0000100202000000, 0x0d000000100a0a00, 0xffffff000000200d, + 0x00000300000004ff, 0x0000070000000300, 0x0000070000000100, 0x2600000003000100, 0x3635326369000000, 0x3631776930316869, 0x31686f363532636f, 0x35686b3631776f30, + 0x777032687035776b, 0x0000010000000832, 0x020000000a0c0c00, 
0x0000080202000000, 0x03000000040a0a00, 0x0000100202000000, 0x0d000000400a0a00, 0x000003000000100d, + 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3239316369000000, 0x636f363532776930, 0x3532776f30383231, 0x0830777031776b36, + 0x0200000001000000, 0x0000020000007802, 0x0d000000080a0a00, 0x000003000000040d, 0x0a00000010020200, 0x200d0d000000100a, 0x05ffffffff000000, 0x0100000003000000, + 0x0700000007000000, 0x0100000001000000, 0x0000170000000100, 0x7769363938636900, 0x776f3233636f3934, 0x30777031776b3934, 0x0000000100000010, 0x0001000000070202, + 0x0000000802020000, 0x0010020200000003, 0x000000100a0a0000, 0x0000000000080d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, + 0x393163690000001d, 0x3432303177693032, 0x31776f303436636f, 0x777031776b343230, 0x0000010000000830, 0x0200000078020200, 0x0000080a0a000000, 0x03000000040d0d00, + 0x0000100202000000, 0x0d000000200a0a00, 0xffffff000000200d, 0x00000300000005ff, 0x0000010000000200, 0x0000070000000700, 0x3200000010000100, 0x6932336369000000, + 0x6933336869373164, 0x6f3233636f333377, 0x6f3333686f373164, 0x686b33646b333377, 0x7031647033776b33, 0x0000103177703168, 0x0306060000000300, 0x0000030707000000, + 0x0000000003080800, 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, 0x0000050000000300, 0x0000020000000300, 0x0000070000000100, 0x4000010000000700, + 0x690000001c000000, 0x3230317769343663, 0x776f383231636f32, 0x3631776b37303031, 0x0200000010307770, 0x0000100808000000, 0x01000000040a0a00, 0x0000080505000000, + 0x4002020000000300, 0x0000200505000000, 0xff000000200a0a00, 0x0300000004ffffff, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3663690000001c00, + 0x3432303177693034, 0x31776f303436636f, 0x777031776b343230, 0x0000010000000830, 0x0200000028020200, 0x0000080a0a000000, 0x03000000080d0d00, 0x0000100202000000, + 0x0d000000100a0a00, 0x000000000000200d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 
0x1800000002000100, 0x3032336369000000, 0x3436636f32336869, + 0x33686b3233686f30, 0x0300000010316870, 0x00000a0202000000, 0x0800000003070700, 0x0000010000000308, 0x04000000040d0d00, 0x0000200202000000, 0x0a00000002090900, + 0x080d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, 0x6869303233636900, 0x6f303436636f3233, + 0x687033686b323368, 0x0000030000001031, 0x070000000a020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000020a, 0x0200000003000000, 0x200a0a0000002002, + 0x0000080d0d000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000016000000, 0x3636686930323363, 0x6b3436686f34636f, + 0x0000083068703368, 0x0302020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040202000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000080a0a00, + 0xffffff000000080d, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, 0x1600000001000100, 0x6869346369000000, 0x6f303233636f3436, 0x687033686b343668, + 0x0000010000000831, 0x0100000003070700, 0x0000020a0a000000, 0x0402020000000400, 0x0000040808000000, 0x0d000000400a0a00, 0x000000000000100d, 0x0000030000000500, + 0x0000040000000100, 0x0000010000000400, 0x3900000020010100, 0x6469336369000000, 0x7769343232686938, 0x6f3631636f343232, 0x6f323131686f3864, 0x6b31646b32313177, + 0x32687333776b3368, 0x6870306470327773, 0x0000001031777031, 0x0003070700000001, 0x0004000000000000, 0x0000000402020000, 0x0909000000080808, 0x00100a0a00000020, + 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001b00000001, 0x3368693034366369, 0x686f303436636f32, 0x32687333686b3631, + 0x0300000008316870, 0x00000a0202000000, 0x0800000003070700, 0x0000010000000308, 0x0300000004020200, 0x0000100202000000, 0x0d000000200a0a00, 0x000000000000100d, + 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1600000002000100, 0x3032336369000000, 0x6f34636f34366869, 0x687033686b343668, 
0x0000030000001031, + 0x070000000a020200, 0x0308080000000307, 0x0300000000000000, 0x0000200202000000, 0x0d000000100a0a00, 0x000003000000080d, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 0x1600000001000100, 0x3032336369000000, 0x6f34636f34366869, 0x687033686b343668, 0x0000030000001031, 0x070000000a020200, 0x0308080000000307, + 0x0d00000001000000, 0x000003000000020d, 0x0a00000020020200, 0x080d0d000000100a, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, + 0x0000180000000100, 0x6869303436636900, 0x6f303436636f3433, 0x687033686b323368, 0x0000030000000830, 0x0700000028020200, 0x0308080000000307, 0x0a00000001000000, + 0x000003000000020a, 0x0a00000010020200, 0x100d0d000000200a, 0x0400000000000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x0000170000000100, + 0x3036686933636900, 0x36686f3233636f38, 0x31687033686b3830, 0x0000000100000008, 0x0001000000030707, 0x000000020d0d0000, 0x0004020200000004, 0x0000000808080000, + 0x0d0d000000100a0a, 0x0000000300000020, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001a000000010001, 0x6936353263690000, 0x3532636f30353168, + 0x686b303531686f36, 0x0000000831687033, 0x0010020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000040a0a0000, 0x0010020200000003, 0x000000400a0a0000, + 0x0003000000100d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x693363690000001a, 0x3436636f30303368, 0x37686b303531686f, + 0x0008336870326873, 0x0707000000020000, 0x0002080800000007, 0x0a0a000000010000, 0x0000000400000002, 0x0808000000040202, 0x00200a0a00000004, 0x0000000f0d0d0000, + 0x0000000500000000, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001a0000, 0x3631686930383231, 0x686f30383231636f, 0x31687033686b3631, + 0x0000000300000010, 0x0707000000050202, 0x0003080800000003, 0x0202000000010000, 0x0000000400000008, 0x0909000000200202, 0x00200a0a00000002, 0x000000100d0d0000, + 0x0000000500000000, 
0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001a0000, 0x3631686930383231, 0x686f30383231636f, 0x31687033686b3631, + 0x0000000300000010, 0x0707000000050202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00100d0d00000020, 0x0004000000000000, + 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000001, 0x7769303635326369, 0x383231636f363532, 0x776b363532776f30, 0x0000000830777031, + 0x00a0020200000001, 0x0a0a000000020000, 0x00040d0d00000008, 0x0202000000030000, 0x00100a0a00000010, 0x000000200d0d0000, 0x00000005ffffffff, 0x0000000100000003, + 0x0000000700000007, 0x0002000100000001, 0x63690000001a0000, 0x3436776930363532, 0x776f30383231636f, 0x30777031776b3436, 0x0000000100000010, 0x0002000000500202, + 0x000000020a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000300000008, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, + 0x001a000000010001, 0x3036353263690000, 0x3231636f34367769, 0x776b3436776f3038, 0x0000001030777031, 0x0014020200000001, 0x0202000000010000, 0x0000000300000004, + 0x0a0a000000200202, 0x00100d0d00000020, 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000001, 0x7769303832316369, + 0x30383231636f3436, 0x7031776b3436776f, 0x0001000000083077, 0x0000000a02020000, 0x0008020200000002, 0x000000020d0d0000, 0x0010020200000003, 0x000000100a0a0000, + 0x0000000000200d0d, 0x0001000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x6932636900000014, 0x686f32636f363568, 0x31687033686b3635, + 0x0000000100000008, 0x0000000000030707, 0x0202000000040000, 0x0003080800000002, 0x000000080a0a0000, 0x0000000000070d0d, 0x0003000000040000, 0x0007000000010000, + 0x0001000000070000, 0x0000000100010000, 0x693463690000002f, 0x3177693038303168, 0x6f3633636f303239, 0x3436776f30363368, 0x7333776b33686b30, 0x3068703377733368, + 0x0100000008307770, 0x0000030707000000, 0x020a0a0000000200, 
0x0000040d0d000000, 0x0402020000000400, 0x0000040808000000, 0x0d000000200a0a00, 0xffffff000000100d, + 0x00000300000005ff, 0x0000070000000100, 0x0000010000000700, 0x1b00000010000100, 0x3635326369000000, 0x3135636f38336869, 0x33686b3931686f32, 0x0010316870326873, + 0x0202000000030000, 0x0003070700000008, 0x0000000308080000, 0x00040a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000001009090000, 0x0003000000200a0a, + 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323363690000001b, 0x33636f3436686930, 0x686b3233686f3032, 0x0831687032687333, + 0x0200000003000000, 0x0307070000001402, 0x0000030808000000, 0x080a0a0000000200, 0x0000020d0d000000, 0x1002020000000300, 0x0000100a0a000000, 0xff000000100d0d00, + 0x0300000004ffffff, 0x0100000002000000, 0x0700000007000000, 0x0000020001000000, 0x3263690000001a00, 0x6f30303168693635, 0x3031686f36353263, 0x0831687033686b30, + 0x0700000003000000, 0x0308080000000307, 0x0000080a0a000000, 0x0802020000000200, 0x0000040505000000, 0x1002020000000300, 0x0000190505000000, 0x03000000200a0a00, + 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3363690000001c00, 0x3432303177693032, 0x31776f303436636f, 0x777031776b343230, + 0x0000010000001030, 0x020000000a020200, 0x0000020a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000100d, 0x0000030000000500, + 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3032336369000000, 0x636f343230317769, 0x323031776f303436, 0x1030777031776b34, 0x0200000001000000, + 0x0000020000000a02, 0x0d000000040a0a00, 0x000003000000080d, 0x0a00000020020200, 0x100d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0400000004000000, + 0x0100000001000000, 0x00001a0000008000, 0x6869323135636900, 0x6f323135636f3431, 0x32687333686b3768, 0x0300000010316870, 0x0000080202000000, 0x0800000003070700, + 0x0000020000000308, 0x0d000000040a0a00, 0x000003000000080d, 0x0900000040020200, 0x400a0a0000002009, 
0x0400000003000000, 0x0100000003000000, 0x0700000007000000, + 0x0100000001000000, 0x0000180000000100, 0x6869303436636900, 0x6f303436636f3436, 0x687033686b343668, 0x0000030000000831, 0x0700000028020200, 0x0308080000000307, + 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, 0x100a0a0000001002, 0x0000200d0d000000, 0x000004ffffffff00, 0x0000010000000100, 0x0000040000000400, + 0x0100010000000100, 0x670000001b000000, 0x3168693263693233, 0x35686f32636f3231, 0x7032687333686b36, 0x0001000000083168, 0x0000000307070000, 0x00020d0d00000001, + 0x0202000000040000, 0x0003080800000002, 0x000000080a0a0000, 0x00000000000e0d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000002000010000, + 0x3233636900000017, 0x33636f3231356869, 0x33686b323135686f, 0x0200000010316870, 0x0000030707000000, 0x0100000003080800, 0x0000080d0d000000, 0x2002020000000300, + 0x0000100909000000, 0x03000000100a0a00, 0x0300000005000000, 0x0700000003000000, 0x0700000001000000, 0x0000040001000000, 0x3863690000002800, 0x3177693034366869, + 0x686f38636f343230, 0x323031776f303436, 0x7033776b33686b34, 0x0000103177703168, 0x200c0c0000000100, 0x0400000000000000, 0x0000080202000000, 0x0a00000003080800, + 0x100d0d000000100a, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x0000380000000100, 0x3264693233636900, 0x6934323268693432, + 0x3233636f30363177, 0x32686f343232646f, 0x6b303631776f3432, 0x33776b33686b3364, 0x7770316870316470, 0x0000030000000831, 0x0700000003060600, 0x0308080000000307, + 0x0a00000002000000, 0x020d0d000000020a, 0x0200000003000000, 0x100a0a0000002002, 0x0000140d0d000000, 0x0000040000000300, 0x0000030000000100, 0x0000010000000800, + 0x6400010000000800, 0x690000001e000000, 0x3335393277693163, 0x35776f3031636f32, 0x733031776b353039, 0x0000103077703577, 0x050d0d0000000100, 0x0200000000000000, + 0x0000100909000000, 0x00000000100a0a00, 0x0100000005000000, 0x0800000001000000, 0x0100000008000000, 0x0004000101000000, 0x3233670000001700, 
0x6f36356869346369, + 0x686b3635686f3463, 0x0000001031687033, 0x0003070700000001, 0x0d0d000000010000, 0x0000000400000008, 0x0808000000040202, 0x0010090900000003, 0x000000100a0a0000, + 0x0000000500000000, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001c0000, 0x3930347769303436, 0x776f303233636f36, 0x7031776b36393034, + 0x0001000000103077, 0x0000001402020000, 0x00020a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000200d0d, 0x0003000000050000, + 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x343663690000001c, 0x6f36393034776930, 0x3034776f30323363, 0x30777031776b3639, 0x0000000100000010, + 0x0001000000140202, 0x000000020a0a0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, + 0x0000008001010000, 0x353263690000001b, 0x636f363931776936, 0x3931776f34323031, 0x0830777031776b36, 0x0200000001000000, 0x0000020000000802, 0x0d000000080a0a00, + 0x000003000000040d, 0x0900000020020200, 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001c0000008001, + 0x6869323135636900, 0x34323031636f3832, 0x7331686b3431686f, 0x0000083068703268, 0x1002020000000100, 0x0900000002000000, 0x080a0a0000000409, 0x0200000003000000, + 0x2009090000002002, 0x0000200a0a000000, 0x0000040000000300, 0x0000030000000300, 0x0000010000000700, 0x0400010000000700, 0x6900000016000000, 0x6f38646936353263, + 0x6b38646f32313563, 0x0000083164703364, 0x080c0c0000000100, 0x0200000002000000, 0x080a0a0000000202, 0x0200000003000000, 0x080a0a0000002002, 0x0000100d0d000000, + 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x6900000018000000, 0x3436686930343663, 0x36686f303436636f, 0x1031687033686b34, + 0x0200000003000000, 0x0307070000001402, 0x0000030808000000, 0x080d0d0000000100, 0x0200000004000000, 0x0209090000002002, 0x0000400a0a000000, 0x03000000080d0d00, + 0x0300000005000000, 
0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3663690000001800, 0x636f343668693034, 0x6b3436686f303436, 0x0000103168703368, + 0x1402020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000100d, + 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1700000020000100, 0x6932336369000000, 0x6f33636f32333268, 0x7039686b34323268, 0x0002000000103068, + 0x0000000907070000, 0x0001000000090808, 0x000000080d0d0000, 0x0020020200000003, 0x0000002009090000, 0x0003000000100a0a, 0x0003000000050000, 0x0004000000010000, + 0x0001000000040000, 0x0000002000010000, 0x3233636900000017, 0x33636f3233326869, 0x39686b343232686f, 0x0200000010306870, 0x0000090707000000, 0x0100000009080800, + 0x0000080d0d000000, 0x2002020000000300, 0x0000200909000000, 0x03000000100a0a00, 0x0300000005000000, 0x0700000003000000, 0x0700000001000000, 0x0000100001000000, + 0x3363690000001a00, 0x36636f3030336869, 0x686b303531686f34, 0x1033687032687337, 0x0c00000002000000, 0x0a0d0d000000020c, 0x0400000000000000, 0x0000040202000000, + 0x0900000008080800, 0x400a0a0000001009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000000200, 0x6930323931636900, + 0x383231636f363168, 0x33686b3631686f30, 0x0300000010316870, 0x00003c0202000000, 0x0800000003070700, 0x0000010000000308, 0x04000000020d0d00, 0x0000200202000000, + 0x0a00000002090900, 0x080d0d000000400a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000000100, 0x6930323931636900, + 0x383231636f363168, 0x33686b3631686f30, 0x0300000010316870, 0x0000080202000000, 0x0800000003070700, 0x0000010000000308, 0x0300000008020200, 0x0000200202000000, + 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1800000010000100, 0x3635326369000000, 0x3532636f38336869, + 0x33686b3833686f36, 0x0300000010316870, 0x0000080202000000, 
0x0800000003070700, 0x0000020000000308, 0x0d000000020a0a00, 0x000003000000080d, 0x0900000020020200, + 0x400a0a0000001009, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000200, 0x6930363532636900, 0x3231636f34343177, + 0x6b343431776f3038, 0x0000083077703177, 0x5002020000000100, 0x0200000001000000, 0x0000040000000202, 0x0900000010020200, 0x100a0a0000000209, 0x0000100d0d000000, + 0x0000050000000000, 0x0000020000000100, 0x00000b0000000100, 0x0000010000000b00, 0x6900000014000008, 0x636f303168693863, 0x33686b38686f3031, 0x0200000008306870, + 0x0000030707000000, 0x0100000003080800, 0x0000020a0a000000, 0x0802020000000300, 0x0000100909000000, 0x00000000080a0a00, 0x0100000005000000, 0x0100000002000000, + 0x0800000008000000, 0x0008000001000000, 0x3863690000001400, 0x3031636f30316869, 0x687033686b38686f, 0x0000020000001030, 0x0800000003070700, 0x0000000000000308, + 0x1002020000000300, 0x0000100909000000, 0x00000000100a0a00, 0x0300000005000000, 0x0a00000003000000, 0x0a00000001000000, 0x0000200101000000, 0x3863690000001400, + 0x6f38636f35326469, 0x647035646b353264, 0x0000020000001032, 0x0d000000190c0c00, 0x000000000000050d, 0x0802020000000300, 0x0000100909000000, 0x02000000100a0a00, + 0x0300000005000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3163690000001700, 0x636f333168693832, 0x686b3331686f3233, 0x0000001031687033, + 0x0003070700000002, 0x0000000308080000, 0x0004020200000001, 0x0202000000030000, 0x00100a0a00000020, 0x000000080d0d0000, 0x0000000500000000, 0x0000000200000003, + 0x0000000700000001, 0x0010000100000007, 0x63690000001b0000, 0x6f35376869383231, 0x3833686f38323163, 0x687032687333686b, 0x0000030000001031, 0x0800000003070700, + 0x040a0a0000000308, 0x0300000000000000, 0x0000400202000000, 0x0a00000010090900, 0x000003000000200a, 0x0000030000000500, 0x0000070000000300, 0x0000070000000100, + 0x1c00000010000100, 0x3635326369000000, 0x636f353236357769, 0x323635776f383231, 0x1030777031776b35, 
0x0d00000001000000, 0x0000020000007d0d, 0x0a00000008020200, + 0x000003000000020a, 0x0900000010020200, 0x400a0a0000001009, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000000100, + 0x3232686933636900, 0x686f383231636f34, 0x34687334686b3635, 0x0100000008306870, 0x0000040707000000, 0x020a0a0000000100, 0x0200000004000000, 0x0408080000000402, + 0x0000200a0a000000, 0x00000000080d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3263690000001800, 0x636f386869303635, + 0x6b38686f30383231, 0x0000103168703368, 0x5002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000080d0d00, 0x0000200202000000, + 0x0a00000002090900, 0x000003000000400a, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1800000001000100, 0x3635326369000000, 0x3231636f38686930, + 0x33686b38686f3038, 0x0300000010316870, 0x0000500202000000, 0x0800000003070700, 0x0000010000000308, 0x03000000020a0a00, 0x0000200202000000, 0x0d000000100a0a00, + 0x000003000000080d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x2000000001000100, 0x3635326369000000, 0x3434313236327769, 0x32776f383231636f, + 0x31776b3434313236, 0x0100000010307770, 0x0000080202000000, 0x020a0a0000000200, 0x0000020d0d000000, 0x2002020000000300, 0x0000400a0a000000, 0x03000000200d0d00, + 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001d00, 0x3230317769303832, 0x776f303436636f34, 0x7031776b34323031, + 0x0001000000103077, 0x0000002802020000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, + 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323163690000001d, 0x3432303177693038, 0x31776f303436636f, 0x777031776b343230, 0x0000010000001030, + 0x0200000028020200, 0x0000040a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000200d, 0x0000030000000400, 
0x0000040000000100, 0x0000010000000400, 0x1900000080010100, 0x3231356369000000, 0x3032636f39347769, 0x776b3934776f3834, 0x0000000830777031, 0x0010020200000001, 0x0909000000020000, - 0x00080a0a00000004, 0x0202000000030000, 0x0020090900000020, 0x000000200a0a0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, - 0x63690000001d0000, 0x3031776930383231, 0x6f303436636f3432, 0x31776b3432303177, 0x0100000010307770, 0x0000280202000000, 0x040a0a0000000200, 0x0000040d0d000000, - 0x2002020000000300, 0x0000200a0a000000, 0x03000000200d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001d00, - 0x3230317769303832, 0x776f303436636f34, 0x7031776b34323031, 0x0001000000103077, 0x0000002802020000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, - 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3532636900000018, 0x31636f3868693036, - 0x686b38686f303832, 0x0000001031687033, 0x0050020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000020a0a0000, 0x0020020200000003, 0x000000100a0a0000, - 0x0003000000080d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x3532636900000018, 0x31636f3868693036, 0x686b38686f303832, - 0x0000001031687033, 0x0050020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000020a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0a0a000000020909, - 0x0000000300000040, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0020000000010001, 0x6936353263690000, 0x6f34343132363277, 0x3632776f38323163, - 0x7031776b34343132, 0x0001000000103077, 0x0000000802020000, 0x00020a0a00000002, 0x000000020d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000200d0d, - 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x693363690000001a, 0x3231636f34323268, 0x34686b3635686f38, 0x0008306870346873, - 0x0707000000010000, 
0x0000000100000004, 0x0004000000020a0a, 0x0000000402020000, 0x0a0a000000040808, 0x00080d0d00000020, 0x0005000000000000, 0x0003000000030000, - 0x0001000000070000, 0x0001000000070000, 0x0000001c00000010, 0x3577693635326369, 0x383231636f353236, 0x776b35323635776f, 0x0000001030777031, 0x007d0d0d00000001, - 0x0202000000020000, 0x00020a0a00000008, 0x0202000000030000, 0x0010090900000010, 0x000000400a0a0000, 0x0000000500000003, 0x0000000200000003, 0x0000000700000001, - 0x0010000100000007, 0x63690000001b0000, 0x6f35376869383231, 0x3833686f38323163, 0x687032687333686b, 0x0000030000001031, 0x0800000003070700, 0x040a0a0000000308, - 0x0300000000000000, 0x0000400202000000, 0x0a00000010090900, 0x000003000000200a, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1600000001000100, - 0x6869346369000000, 0x6f303233636f3436, 0x687033686b343668, 0x0000010000001031, 0x0000000003070700, 0x0200000004000000, 0x0408080000000402, 0x0000400a0a000000, - 0x03000000100d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3463690000001600, 0x3233636f34366869, 0x33686b3436686f30, - 0x0100000010316870, 0x0000030707000000, 0x040d0d0000000100, 0x0200000005000000, 0x0408080000000402, 0x0000020909000000, 0x0d000000400a0a00, 0x000003000000080d, - 0x0000030000000500, 0x00000a0000000300, 0x00000a0000000100, 0x1400000020010100, 0x6469386369000000, 0x32646f38636f3532, 0x1032647035646b35, 0x0c00000002000000, - 0x050d0d000000190c, 0x0300000000000000, 0x0000080202000000, 0x0a00000010090900, 0x000002000000100a, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, - 0x1a00000001000100, 0x3239316369000000, 0x31636f3631686930, 0x6b3631686f303832, 0x0000103168703368, 0x0802020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000080202000000, 0x2002020000000300, 0x0000200a0a000000, 0x00000000100d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, - 0x3163690000001a00, 0x6f36316869303239, 0x31686f3038323163, 
0x1031687033686b36, 0x0200000003000000, 0x0307070000003c02, 0x0000030808000000, 0x020d0d0000000100, - 0x0200000004000000, 0x0209090000002002, 0x0000400a0a000000, 0x03000000080d0d00, 0x0300000005000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, - 0x3163690000001700, 0x636f333168693832, 0x686b3331686f3233, 0x0000001031687033, 0x0003070700000002, 0x0000000308080000, 0x0004020200000001, 0x0202000000030000, - 0x00100a0a00000020, 0x000000080d0d0000, 0x0000000500000000, 0x0000000200000001, 0x0000000800000001, 0x0800000100000008, 0x6369000000140000, 0x31636f3031686938, - 0x7033686b38686f30, 0x0002000000103068, 0x0000000307070000, 0x0000000000030808, 0x0202000000030000, 0x0010090900000010, 0x000000100a0a0000, 0x0000000500000000, - 0x0000000200000001, 0x0000000b00000001, 0x080000010000000b, 0x6369000000140000, 0x31636f3031686938, 0x7033686b38686f30, 0x0002000000083068, 0x0000000307070000, - 0x0001000000030808, 0x000000020a0a0000, 0x0008020200000003, 0x0000001009090000, 0x0000000000080a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, - 0x0000001000010000, 0x3532636900000018, 0x32636f3833686936, 0x686b3833686f3635, 0x0000001031687033, 0x0008020200000003, 0x0000000307070000, 0x0002000000030808, - 0x000000020a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0a0a000000100909, 0x0000000300000040, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, - 0x001c000000020001, 0x3036353263690000, 0x31636f3434317769, 0x343431776f303832, 0x000830777031776b, 0x0202000000010000, 0x0000000100000050, 0x0004000000020202, - 0x0000001002020000, 0x0a0a000000020909, 0x00100d0d00000010, 0x0005000000000000, 0x0003000000030000, 0x0001000000070000, 0x0001000000070000, 0x0000001a00000010, - 0x3030336869336369, 0x3531686f3436636f, 0x7032687337686b30, 0x0002000000103368, 0x000000020c0c0000, 0x00000000000a0d0d, 0x0202000000040000, 0x0008080800000004, - 0x0000001009090000, 0x0003000000400a0a, 0x0003000000040000, 0x0007000000030000, 0x0007000000010000, 
0x0000000400010000, 0x3532636900000016, 0x3135636f38646936, - 0x7033646b38646f32, 0x0001000000083164, 0x000000080c0c0000, 0x0002020200000002, 0x000000080a0a0000, 0x0020020200000003, 0x000000080a0a0000, 0x0003000000100d0d, - 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000000800010000, 0x3231636900000018, 0x35636f3832686938, 0x686b3832686f3231, 0x0000000830687031, - 0x0004020200000001, 0x0a0a000000010000, 0x0000000300000004, 0x0a0a000000200202, 0x001c0d0d00000020, 0x0005ffffffff0000, 0x0001000000030000, 0x0004000000040000, - 0x0001000000010000, 0x0000001700000020, 0x3332686932336369, 0x3232686f33636f32, 0x1030687039686b34, 0x0700000002000000, 0x0908080000000907, 0x0d00000001000000, - 0x000003000000080d, 0x0900000020020200, 0x100a0a0000002009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000170000002000, - 0x3268693233636900, 0x32686f33636f3233, 0x30687039686b3432, 0x0000000200000010, 0x0808000000090707, 0x0000000100000009, 0x0003000000080d0d, 0x0000002002020000, - 0x0a0a000000200909, 0x0000000300000010, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x6930343663690000, 0x303436636f343668, - 0x7033686b3436686f, 0x0003000000103168, 0x0000001402020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000004, 0x0a0a000000200202, - 0x00100d0d00000020, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3668693034366369, 0x686f303436636f34, - 0x31687033686b3436, 0x0000000300000010, 0x0707000000140202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000008, 0x0909000000200202, 0x00400a0a00000002, - 0x000000080d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x63690000001c0000, 0x6f38326869323135, 0x31686f3432303163, - 0x7032687331686b34, 0x0001000000083068, 0x0000001002020000, 0x0004090900000002, 0x000000080a0a0000, 0x0020020200000003, 0x0000002009090000, 
0x0003000000200a0a, - 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x353263690000001b, 0x636f363931776936, 0x3931776f34323031, 0x0830777031776b36, - 0x0200000001000000, 0x0000020000000802, 0x0d000000080a0a00, 0x000003000000040d, 0x0900000020020200, 0x200a0a0000002009, 0x0500000003000000, 0x0100000003000000, - 0x0700000007000000, 0x0100000001000000, 0x00001c0000000100, 0x7769303436636900, 0x3233636f36393034, 0x6b36393034776f30, 0x0000103077703177, 0x1402020000000100, - 0x0a00000001000000, 0x000003000000020a, 0x0a00000020020200, 0x200d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, - 0x00001c0000000200, 0x7769303436636900, 0x3233636f36393034, 0x6b36393034776f30, 0x0000103077703177, 0x1402020000000100, 0x0a00000002000000, 0x080d0d000000020a, - 0x0200000003000000, 0x400a0a0000002002, 0x0000200d0d000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x0500010000000100, 0x6900000018000000, - 0x3635686938323163, 0x35686f383231636f, 0x0831687033686b36, 0x0200000003000000, 0x0307070000000202, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, - 0x200a0a0000002002, 0x00000e0d0d000000, 0x0000050000000000, 0x0000030000000100, 0x0000010000000800, 0x0000010000000800, 0x6900000014000008, 0x636f303168693863, - 0x33686b38686f3031, 0x0200000010306870, 0x0000040c0c000000, 0x00000000080d0d00, 0x0200000003000000, 0x1009090000000802, 0x0000100a0a000000, 0x0000050000000000, - 0x0000030000000100, 0x0000010000000b00, 0x0000010000000b00, 0x6900000014000008, 0x636f303168693863, 0x33686b38686f3031, 0x0200000008306870, 0x0000020c0c000000, - 0x00000000080d0d00, 0x0200000003000000, 0x0809090000000802, 0x0000100a0a000000, 0x0000050000000000, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, + 0x00080a0a00000004, 0x0202000000030000, 0x0020090900000020, 0x000000200a0a0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, 0x0020000100000001, + 0x3367000000170000, 
0x3635686934636932, 0x6b3635686f34636f, 0x0000083168703368, 0x0307070000000100, 0x0d00000001000000, 0x000004000000080d, 0x0800000008020200, + 0x2009090000000408, 0x0000080a0a000000, 0x0000040000000200, 0x0000010000000300, 0x0000070000000700, 0x2000010000000100, 0x6700000017000000, 0x3568693463693233, + 0x3635686f34636f36, 0x000831687033686b, 0x0707000000010000, 0x0000000100000003, 0x0004000000080d0d, 0x0000000402020000, 0x0909000000040808, 0x00080a0a00000020, + 0x0005000000020000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3668693231356369, 0x686f323135636f34, 0x31687033686b3436, + 0x0000000300000010, 0x0707000000100202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000030000, 0x00200a0a00000020, 0x000000100d0d0000, + 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000160000, 0x636f373168693233, 0x686b3731686f3233, 0x0000001031687033, + 0x0002020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000020a0a0000, 0x0003000000040d0d, 0x0000001002020000, 0x0d0d000000100a0a, 0x0000000300000008, + 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x0017000000200001, 0x6869323363690000, 0x686f33636f323135, 0x687033686b323135, 0x0000020000000831, + 0x0800000003070700, 0x0000010000000308, 0x03000000080d0d00, 0x0000200202000000, 0x0a00000010090900, 0x000002000000080a, 0x0000030000000400, 0x0000070000000100, + 0x0000010000000700, 0x1700000020000100, 0x6932336369000000, 0x6f33636f32313568, 0x7033686b32313568, 0x0002000000083168, 0x0000000307070000, 0x0001000000030808, + 0x000000080d0d0000, 0x0020020200000003, 0x0000001009090000, 0x0002000000080a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, + 0x323163690000001a, 0x636f323168693038, 0x3231686f30383231, 0x000831687033686b, 0x0202000000030000, 0x0003070700000028, 0x0000000308080000, 0x0002020200000002, + 0x000000020d0d0000, 0x0010020200000004, 0x0000000209090000, 
0x0d0d000000200a0a, 0xffffffff00000006, 0x0000000100000004, 0x0000000400000001, 0x0000000100000004, + 0x001c000000200101, 0x6369363532670000, 0x6f36393034776931, 0x36393034776f3163, 0x000830777031776b, 0x0000000000000000, 0x0101000000020000, 0x0010090900000020, + 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3377693635326369, 0x776f383231636f32, 0x30777031776b3233, + 0x0000000000000008, 0x0008020200000001, 0x0202000000030000, 0x00080a0a00000020, 0x000000080d0d0000, 0x0000000400000000, 0x0000000100000003, 0x0000000400000004, + 0x0001000100000001, 0x6369000000180000, 0x6f32337769363532, 0x3233776f38323163, 0x000830777031776b, 0x0001000000000000, 0x0000000402020000, 0x0040020200000003, + 0x000000080a0a0000, 0x0000000000100d0d, 0x0003000000050000, 0x000a000000030000, 0x000a000000010000, 0x0000004001010000, 0x343663690000001c, 0x636f323230317769, + 0x303031776f383231, 0x3077703631776b37, 0x0000000200000010, 0x0d0d000000020909, 0x0000000200000013, 0x0a0a000000020202, 0x0000000300000002, 0x0909000000200202, + 0x00400a0a00000008, 0x0004000000020000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000001, 0x3177693032336369, 0x303436636f343230, + 0x776b34323031776f, 0x0000000830777031, 0x0014020200000001, 0x0a0a000000020000, 0x00080d0d00000008, 0x0202000000030000, 0x00100a0a00000010, 0x000000200d0d0000, + 0x0000000400000000, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000140000, 0x33636f3968693233, 0x7033686b37686f32, 0x0002000000083068, + 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, 0x0010020200000003, 0x000000100a0a0000, 0x0000000000070d0d, 0x0003000000050000, 0x0004000000010000, + 0x0001000000040000, 0x0000008000010000, 0x3135636900000019, 0x32636f3934776932, 0x6b3934776f383430, 0x0000103077703177, 0x0802020000000100, 0x0900000002000000, + 0x040a0a0000000809, 0x0200000003000000, 0x1009090000004002, 0x0000400a0a000000, 0x0000050000000300, 
0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000016000000, 0x636f343668693463, 0x6b3436686f323135, 0x0000103168703368, 0x0307070000000100, 0x0d00000001000000, 0x000004000000040d, 0x0800000004020200, - 0x400a0a0000000408, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000040000000400, 0x8000010000000100, 0x6900000019000000, 0x3934776932313563, - 0x776f38343032636f, 0x30777031776b3934, 0x0000000100000010, 0x0002000000080202, 0x0000000809090000, 0x0003000000040a0a, 0x0000004002020000, 0x0a0a000000100909, - 0x0000000300000040, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001a000000100101, 0x6932313563690000, 0x3135636f36373677, 0x776b363736776f32, - 0x0000000830777031, 0x0010020200000001, 0x0a0a000000020000, 0x00020d0d00000004, 0x0202000000030000, 0x0010090900000020, 0x000000400a0a0000, 0x0000000400000000, - 0x0000000100000003, 0x0000000700000007, 0x0020000100000001, 0x6369000000170000, 0x6f32313568693233, 0x6b323135686f3363, 0x0000083168703368, 0x0307070000000200, - 0x0000030808000000, 0x080d0d0000000100, 0x0200000003000000, 0x1009090000002002, 0x0000080a0a000000, 0x0000040000000200, 0x0000010000000300, 0x0000040000000400, - 0x2000010000000100, 0x6900000017000000, 0x3231356869323363, 0x323135686f33636f, 0x000831687033686b, 0x0707000000020000, 0x0003080800000003, 0x0d0d000000010000, - 0x0000000300000008, 0x0909000000200202, 0x00080a0a00000010, 0x0004000000020000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, - 0x6869303832316369, 0x30383231636f3231, 0x7033686b3231686f, 0x0003000000083168, 0x0000002802020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020202, - 0x0000000400000002, 0x0909000000100202, 0x00200a0a00000002, 0x000000060d0d0000, 0x00000004ffffffff, 0x0000000300000001, 0x0000000100000008, 0x0064000100000008, - 0x63690000001e0000, 0x3233353932776931, 0x3935776f3031636f, 0x77733031776b3530, 0x0000001030777035, 0x00050d0d00000001, 0x0002000000000000, 
0x0000001009090000, - 0x0000000000100a0a, 0x0001000000050000, 0x0008000000010000, 0x0001000000080000, 0x0000040001010000, 0x6932336700000017, 0x636f363568693463, 0x33686b3635686f34, - 0x0100000010316870, 0x0000030707000000, 0x080d0d0000000100, 0x0200000004000000, 0x0308080000000402, 0x0000100909000000, 0x00000000100a0a00, 0x0300000004000000, - 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3663690000001800, 0x636f343668693034, 0x6b3436686f303436, 0x0000083168703368, 0x2802020000000300, - 0x0000030707000000, 0x0200000003080800, 0x0000080a0a000000, 0x03000000020d0d00, 0x0000100202000000, 0x0d000000100a0a00, 0xffffff000000200d, 0x00000100000004ff, - 0x0000040000000100, 0x0000010000000400, 0x1b00000001000100, 0x6369323367000000, 0x636f323131686932, 0x33686b3635686f32, 0x0008316870326873, 0x0707000000010000, - 0x0000000100000003, 0x0004000000020d0d, 0x0000000202020000, 0x0a0a000000030808, 0x000e0d0d00000008, 0x0005000000000000, 0x0001000000030000, 0x0004000000040000, - 0x0001000000010000, 0x0000001a00000080, 0x3168693231356369, 0x686f323135636f34, 0x7032687333686b37, 0x0003000000103168, 0x0000000802020000, 0x0808000000030707, - 0x0000000200000003, 0x0d0d000000040a0a, 0x0000000300000008, 0x0909000000400202, 0x00400a0a00000020, 0x0004000000030000, 0x0001000000030000, 0x0004000000040000, - 0x0101000000010000, 0x0000001a00000010, 0x3177693231356369, 0x6f363532636f3936, 0x7031776b39363177, 0x0001000000083077, 0x0000001002020000, 0x00040a0a00000002, - 0x000000040d0d0000, 0x0020020200000003, 0x0000001009090000, 0xffff000000200a0a, 0x000300000004ffff, 0x0001000000020000, 0x0007000000070000, 0x0000000200010000, - 0x353263690000001a, 0x636f303031686936, 0x303031686f363532, 0x000831687033686b, 0x0707000000030000, 0x0003080800000003, 0x000000080a0a0000, 0x0008020200000002, - 0x0000000405050000, 0x0010020200000003, 0x0000001905050000, 0x0003000000200a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, - 0x323363690000001b, 
0x33636f3436686930, 0x686b3233686f3032, 0x0831687032687333, 0x0200000003000000, 0x0307070000001402, 0x0000030808000000, 0x080a0a0000000200, - 0x0000020d0d000000, 0x1002020000000300, 0x0000100a0a000000, 0xff000000100d0d00, 0x0300000005ffffff, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, - 0x3363690000001c00, 0x3432303177693032, 0x31776f303436636f, 0x777031776b343230, 0x0000010000001030, 0x020000000a020200, 0x0000040a0a000000, 0x03000000080d0d00, - 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1c00000002000100, 0x3032336369000000, - 0x636f343230317769, 0x323031776f303436, 0x1030777031776b34, 0x0200000001000000, 0x0000020000000a02, 0x0d000000020a0a00, 0x000003000000040d, 0x0a00000020020200, - 0x100d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001b0000001000, 0x6869363532636900, 0x6f323135636f3833, - 0x687333686b393168, 0x0000001031687032, 0x0008020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, - 0x0a0a000000100909, 0x0000000300000020, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x002f000000010001, 0x3168693463690000, 0x3239317769303830, - 0x33686f3633636f30, 0x6b303436776f3036, 0x33687333776b3368, 0x7770306870337773, 0x0000010000000830, 0x0200000003070700, 0x0000020a0a000000, 0x04000000040d0d00, - 0x0000040202000000, 0x0a00000004080800, 0x100d0d000000200a, 0x04ffffffff000000, 0x0100000001000000, 0x0700000007000000, 0x0100000001000000, 0x0000140000000100, - 0x3635686932636900, 0x6b3635686f32636f, 0x0000083168703368, 0x0307070000000100, 0x0400000000000000, 0x0000020202000000, 0x0a00000003080800, 0x070d0d000000080a, - 0x0400000000000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001b0000000800, 0x3377693436636900, 0x363532636f363331, 0x776b36333133776f, - 0x0000000830777031, 0x0002020200000001, 0x0a0a000000010000, 
0x0000000300000004, 0x0a0a000000200202, 0x00100d0d00000020, 0x0004ffffffff0000, 0x0001000000030000, - 0x0007000000070000, 0x0001000000010000, 0x0000001a00000001, 0x7769303832316369, 0x30383231636f3436, 0x7031776b3436776f, 0x0001000000083077, 0x0000000a02020000, - 0x0008020200000002, 0x000000020d0d0000, 0x0010020200000003, 0x000000100a0a0000, 0x0000000000200d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, - 0x0000000100010000, 0x353263690000001a, 0x636f343677693036, 0x3436776f30383231, 0x001030777031776b, 0x0202000000010000, 0x0000000100000014, 0x0003000000040202, - 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000000000010, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001a000000020001, 0x3036353263690000, - 0x3231636f34367769, 0x776b3436776f3038, 0x0000001030777031, 0x0050020200000001, 0x0a0a000000020000, 0x00080d0d00000002, 0x0202000000030000, 0x00200a0a00000020, - 0x000000080d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001c0000, 0x3532776930363532, 0x6f30383231636f36, - 0x7031776b36353277, 0x0001000000083077, 0x000000a002020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0010020200000003, 0x000000100a0a0000, 0xffff000000200d0d, - 0x000300000005ffff, 0x0007000000030000, 0x0007000000010000, 0x0000000400010000, 0x6938636900000028, 0x3031776930343668, 0x36686f38636f3432, 0x34323031776f3034, - 0x687033776b33686b, 0x0000001031777031, 0x00200c0c00000001, 0x0004000000000000, 0x0000000802020000, 0x0a0a000000030808, 0x00100d0d00000010, 0x0005000000030000, - 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001700000020, 0x3135686932336369, 0x3135686f33636f32, 0x1031687033686b32, 0x0700000002000000, - 0x0308080000000307, 0x0d00000001000000, 0x000003000000080d, 0x0900000020020200, 0x100a0a0000001009, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, - 0x0100000001000000, 0x0000380000000100, 0x3264693233636900, 0x6934323268693432, 0x3233636f30363177, 
0x32686f343232646f, 0x6b303631776f3432, 0x33776b33686b3364, - 0x7770316870316470, 0x0000030000000831, 0x0700000003060600, 0x0308080000000307, 0x0a00000002000000, 0x020d0d000000020a, 0x0200000003000000, 0x100a0a0000002002, - 0x0000140d0d000000, 0x0000050000000300, 0x0000030000000300, 0x0000010000000a00, 0x4001010000000a00, 0x6900000019000000, 0x3039776936353263, 0x35776f363532636f, - 0x3077703233776b39, 0x0000000200000010, 0x0d0d000000020909, 0x000000010000003b, 0x0003000000080202, 0x0000002002020000, 0x0a0a000000080909, 0x0000000200000040, - 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001a000000010001, 0x3038323163690000, 0x3231636f32336869, 0x686b3233686f3038, 0x0000000831687033, - 0x0050020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000040a0a0000, 0x0003000000020d0d, 0x0000001002020000, 0x0d0d000000100a0a, 0xffffffff00000010, - 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x0017000000080001, 0x6932313563690000, 0x38343032636f3768, 0x687031686b37686f, 0x0000010000000830, - 0x0100000010020200, 0x0000020a0a000000, 0x2002020000000300, 0x0000400a0a000000, 0xff000000080d0d00, 0x0100000005ffffff, 0x0100000002000000, 0x0800000008000000, - 0x0008000001000000, 0x3263690000001300, 0x6f38636f30316869, 0x30687033686b3868, 0x0000000200000010, 0x0808000000030707, 0x0000000000000003, 0x0010020200000003, - 0x0000001009090000, 0x0000000000080a0a, 0x0001000000050000, 0x0001000000020000, 0x000b0000000b0000, 0x0000080000010000, 0x6932636900000013, 0x686f38636f303168, - 0x0830687033686b38, 0x0700000002000000, 0x0308080000000307, 0x0500000001000000, 0x0000030000000205, 0x0900000008020200, 0x080a0a0000001009, 0x0400000000000000, - 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x0000180000002001, 0x6869343833636900, 0x6f363532636f3632, 0x687033686b363268, 0x0000030000000831, - 0x070000000c020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, 0x2009090000002002, 0x0000200a0a000000, 
0x0000040000000300, - 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, 0x6900000018000000, 0x3235686938323163, 0x35686f363532636f, 0x0831687033686b32, 0x0200000003000000, - 0x0307070000000202, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, 0x0000040000000000, 0x0000010000000300, - 0x0000070000000700, 0x0200010000000100, 0x690000001c000000, 0x3577693032393163, 0x30383231636f3637, 0x31776b363735776f, 0x0100000008307770, 0x0000780202000000, - 0x080a0a0000000200, 0x0000080d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0x00000000180d0d00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, - 0x0000010001000000, 0x3163690000001800, 0x636f383032686936, 0x6b383032686f3233, 0x0000083168703368, 0x0307070000000200, 0x0000030808000000, 0x020a0a0000000100, - 0x0200000003000000, 0x100a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, - 0x3233686930323363, 0x33686f303436636f, 0x1031687033686b32, 0x0200000003000000, 0x0307070000000a02, 0x0000030808000000, 0x020a0a0000000200, 0x0000040d0d000000, - 0x2002020000000300, 0x0000200a0a000000, 0x03000000080d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3363690000001800, - 0x636f323368693032, 0x6b3233686f303436, 0x0000103168703368, 0x0a02020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000040d0d000000, 0x2002020000000400, - 0x0000020909000000, 0x0d000000200a0a00, 0x000003000000080d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1600000001000100, 0x3032336369000000, - 0x6f34636f36366869, 0x687033686b343668, 0x0000030000000830, 0x0700000003020200, 0x0308080000000307, 0x0200000002000000, 0x080d0d0000000402, 0x0200000003000000, - 0x080a0a0000002002, 0x0000080d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001a000000, 0x3168693032393163, - 0x6f30383231636f36, 
0x687033686b363168, 0x0000030000000831, 0x070000003c020200, 0x0308080000000307, 0x0200000001000000, 0x0000030000000202, 0x0a00000010020200, - 0x100d0d000000200a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000160000000100, 0x3168693233636900, 0x31686f3233636f37, - 0x1031687033686b37, 0x0200000003000000, 0x0307070000000202, 0x0000030808000000, 0x020a0a0000000200, 0x0000040d0d000000, 0x1002020000000300, 0x0000100a0a000000, - 0x03000000080d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3563690000001800, 0x636f343668693231, 0x6b3436686f323135, - 0x0000103168703368, 0x1002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, - 0x000003000000100d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1700000020000100, 0x6369323367000000, 0x34636f3635686934, 0x7033686b3635686f, - 0x0001000000083168, 0x0000000307070000, 0x00080d0d00000001, 0x0202000000040000, 0x0004080800000004, 0x0000002009090000, 0x0002000000080a0a, 0x0003000000040000, - 0x0004000000010000, 0x0001000000040000, 0x0000002000010000, 0x6932336700000017, 0x636f363568693463, 0x33686b3635686f34, 0x0100000008316870, 0x0000030707000000, - 0x080d0d0000000100, 0x0200000004000000, 0x0408080000000802, 0x0000200909000000, 0x02000000080a0a00, 0x0100000004000000, 0x0700000001000000, 0x0100000007000000, - 0x0000010001000000, 0x3135670000001b00, 0x3233776931636932, 0x6b3233776f31636f, 0x3477703377643377, 0x0000000000000008, 0x00040d0d00000001, 0x0101000000030000, - 0x0003080800000008, 0x000000080d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0080010100000001, 0x63690000001b0000, 0x6f36356869363532, - 0x3832686f32313563, 0x687032687331686b, 0x0000010000000830, 0x0100000008020200, 0x0000080a0a000000, 0x2002020000000300, 0x0000200909000000, 0x02000000200a0a00, - 0x0300000005000000, 0x0a00000003000000, 0x0a00000001000000, 
0x0000010001000000, 0x3363690000001a00, 0x31636f3432326869, 0x686b3635686f3832, 0x1030687034687334, - 0x0c00000001000000, 0x000001000000070c, 0x04000000080a0a00, 0x0000040202000000, 0x0a00000004080800, 0x080d0d000000100a, 0x0500000002000000, 0x0300000003000000, - 0x0100000007000000, 0x0100000007000000, 0x00001a0000000100, 0x3232686933636900, 0x686f383231636f34, 0x34687334686b3635, 0x0000000010306870, 0x0a00000001000000, - 0x000004000000020a, 0x0800000008020200, 0x100a0a0000000408, 0x0000100d0d000000, 0x0000040000000300, 0x0000030000000300, 0x0000010000000700, 0x1001010000000700, - 0x690000001d000000, 0x3177693838303163, 0x323135636f303035, 0x776b30303531776f, 0x0000000830777031, 0x01f40d0d00000001, 0x0202000000020000, 0x00080a0a00000008, - 0x0202000000030000, 0x0010090900000020, 0x000000200a0a0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000190000, - 0x3233686930323931, 0x33686f303436636f, 0x1031687033686b32, 0x0200000003000000, 0x0307070000003c02, 0x0000030808000000, 0x040a0a0000000200, 0x0000020d0d000000, - 0x2002020000000300, 0x0000200a0a000000, 0x03000000100d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001900, - 0x6f32336869303239, 0x3233686f30343663, 0x001031687033686b, 0x0202000000030000, 0x000307070000003c, 0x0000000308080000, 0x00020d0d00000001, 0x0202000000040000, - 0x0002090900000020, 0x000000400a0a0000, 0x0003000000100d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x6934636900000018, - 0x34636f3639303477, 0x776b36393034776f, 0x0000001030777031, 0x0000000100000000, 0x0004000000020d0d, 0x0000000402020000, 0x0a0a000000040808, 0x00100d0d00000010, - 0x0005000000030000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001900000020, 0x6936316369323367, 0x6f3631636f343168, 0x687033686b343168, - 0x0000020000001031, 0x0800000003070700, 0x0000010000000308, 0x03000000040d0d00, 0x0000200202000000, 
0x0a00000020090900, 0x000003000000100a, 0x0000030000000400, - 0x0000040000000100, 0x0000010000000400, 0x1a00000008000100, 0x6869336369000000, 0x6f3436636f343232, 0x7337686b32313168, 0x0000083368703268, 0x0707070000000100, - 0x0d00000001000000, 0x000004000000040d, 0x0800000004020200, 0x200a0a0000000808, 0x00001c0d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, - 0x0100010000000100, 0x690000001a000000, 0x6f34323268693363, 0x323131686f343663, 0x687032687337686b, 0x0000020000000833, 0x0800000007070700, 0x0000020000000208, - 0x0d000000020a0a00, 0x000004000000040d, 0x0800000004020200, 0x100a0a0000000408, 0x0000200d0d000000, 0x0000040000000100, 0x0000010000000300, 0x0000040000000400, - 0x0100010000000100, 0x690000001a000000, 0x6f34323268693363, 0x323131686f343663, 0x687032687337686b, 0x0000010000000833, 0x0200000007070700, 0x0000020a0a000000, - 0x04000000040d0d00, 0x0000040202000000, 0x0a00000008080800, 0x200d0d000000100a, 0x0400000001000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, - 0x0000160000000100, 0x3436686934636900, 0x36686f303233636f, 0x0831687033686b34, 0x0700000001000000, 0x0000010000000307, 0x04000000020a0a00, 0x0000040202000000, - 0x0a00000004080800, 0x100d0d000000400a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000160000000100, 0x6869303233636900, - 0x36686f34636f3436, 0x1031687033686b34, 0x0200000003000000, 0x0307070000000a02, 0x0000030808000000, 0x020d0d0000000100, 0x0200000003000000, 0x100a0a0000002002, - 0x0000080d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x6900000016000000, 0x3436686930323363, 0x6b3436686f34636f, - 0x0000103168703368, 0x0a02020000000300, 0x0000030707000000, 0x0000000003080800, 0x0200000003000000, 0x100a0a0000002002, 0x0000080d0d000000, 0x0000040000000300, - 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, 0x3433686930343663, 0x33686f303436636f, 0x0830687033686b32, 
0x0200000003000000, - 0x0307070000002802, 0x0000030808000000, 0x020a0a0000000100, 0x0200000003000000, 0x200a0a0000001002, 0x0000100d0d000000, 0x0000040000000000, 0x0000010000000300, - 0x0000070000000700, 0x0100010000000100, 0x690000001a000000, 0x6f30303368693363, 0x303531686f343663, 0x687032687337686b, 0x0000020000000833, 0x0800000007070700, - 0x0000010000000208, 0x04000000020a0a00, 0x0000040202000000, 0x0a00000004080800, 0x0f0d0d000000200a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x00001a0000000100, 0x6930383231636900, 0x383231636f363168, 0x33686b3631686f30, 0x0300000010316870, 0x0000050202000000, 0x0800000003070700, - 0x0000010000000308, 0x0300000008020200, 0x0000200202000000, 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, - 0x1a00000002000100, 0x3832316369000000, 0x31636f3631686930, 0x6b3631686f303832, 0x0000103168703368, 0x0502020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000080202000000, 0x2002020000000400, 0x0000020909000000, 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, - 0x1c00000001000100, 0x3832316369000000, 0x636f363532776930, 0x3532776f30383231, 0x0830777031776b36, 0x0200000001000000, 0x0000020000005002, 0x0d000000080a0a00, - 0x000003000000040d, 0x0a00000010020200, 0x200d0d000000100a, 0x05ffffffff000000, 0x0300000001000000, 0x0100000008000000, 0x0100000008000000, 0x0000130000080000, - 0x3031686932636900, 0x686b38686f38636f, 0x0000001030687033, 0x00040c0c00000002, 0x000000080d0d0000, 0x0000000400000000, 0x0808000000020202, 0x0010090900000003, - 0x000000100a0a0000, 0x0000000500000000, 0x0000000300000001, 0x000000010000000b, 0x080000010000000b, 0x6369000000130000, 0x38636f3031686932, 0x687033686b38686f, - 0x0000020000000830, 0x0d000000040c0c00, 0x000000000000080d, 0x0202020000000400, 0x0000030808000000, 0x0a00000008090900, 0x000000000000080a, 0x0000030000000500, - 0x0000070000000300, 
0x0000070000000100, 0x1800000010000100, 0x3230316369000000, 0x31636f3833686934, 0x33686b3833686f36, 0x0100000010316870, 0x0000260d0d000000, - 0x0000030000000000, 0x0900000020020200, 0x100a0a0000001009, 0x0400000003000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x00001c0000000800, - 0x7769323135636900, 0x323135636f393932, 0x7338776b3337776f, 0x0000083077703477, 0x0809090000000100, 0x0200000002000000, 0x040a0a0000000802, 0x0200000003000000, - 0x400a0a0000001002, 0x0000100d0d000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, 0x3233686930343663, - 0x33686f303436636f, 0x0831687033686b32, 0x0200000003000000, 0x0307070000002802, 0x0000030808000000, 0x080a0a0000000100, 0x0200000003000000, 0x200a0a0000001002, - 0x0000100d0d000000, 0x000005ffffffff00, 0x0000030000000300, 0x0000010000000700, 0x1000010000000700, 0x6900000016000000, 0x6f35376869343663, 0x6b3537686f343663, - 0x0000103168703368, 0x050c0c0000000200, 0x0000050d0d000000, 0x0402020000000100, 0x0200000003000000, 0x1009090000001002, 0x0000400a0a000000, 0x0000050000000300, - 0x0000030000000300, 0x0000010000000a00, 0x2000010000000a00, 0x6900000014000000, 0x636f363264693663, 0x32646b3532646f31, 0x0200000010306470, 0x0000020909000000, - 0x00000000190d0d00, 0x0200000004000000, 0x0208080000000802, 0x0000080909000000, 0x00000000100a0a00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, - 0x0000010001000000, 0x3663690000001800, 0x636f343668693034, 0x6b3436686f303233, 0x0000083168703368, 0x2802020000000300, 0x0000030707000000, 0x0200000003080800, - 0x0000080a0a000000, 0x03000000040d0d00, 0x0000100202000000, 0x0d000000100a0a00, 0xffffff000000100d, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, - 0x1a00000001000100, 0x3832316369000000, 0x31636f3831686930, 0x6b3631686f303832, 0x0000083068703368, 0x2802020000000300, 0x0000030707000000, 0x0100000003080800, - 0x0000020202000000, 0x1002020000000300, 0x0000200a0a000000, 
0x00000000100d0d00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, - 0x3363690000001700, 0x33636f3830366869, 0x686b383036686f32, 0x0000000831687033, 0x0003070700000001, 0x0d0d000000010000, 0x0000000400000002, 0x0808000000040202, - 0x00100a0a00000008, 0x000000200d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001a0000, 0x3035316869363532, - 0x31686f363532636f, 0x31687033686b3035, 0x0000000300000008, 0x0707000000100202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000004, 0x0a0a000000100202, - 0x00100d0d00000040, 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0101000000010000, 0x0000001900000c80, 0x3677693834346369, 0x6f30383032636f34, - 0x777031776b343677, 0x0000010000000830, 0x020000001c020200, 0x0000020a0a000000, 0x03000000080d0d00, 0x0000100202000000, 0x0a00000020090900, 0x000003000000200a, - 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, 0x1900000c80010100, 0x3834346369000000, 0x3032636f34367769, 0x776b3436776f3038, 0x0000000830777031, - 0x000e020200000001, 0x0a0a000000020000, 0x00080d0d00000004, 0x0202000000030000, 0x0020090900000020, 0x000000200a0a0000, 0x0000000400000003, 0x0000000100000003, - 0x0000000700000007, 0x0001000100000001, 0x6369000000190000, 0x3233686930383231, 0x33686f303436636f, 0x0831687033686b32, 0x0200000003000000, 0x0307070000005002, - 0x0000030808000000, 0x080a0a0000000200, 0x0000020d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0xff000000100d0d00, 0x0300000005ffffff, 0x0100000002000000, - 0x0700000007000000, 0x0000400001000000, 0x3663690000001c00, 0x6f32323031776934, 0x3031776f38323163, 0x77703631776b3730, 0x0000020000001030, 0x0a00000010080800, - 0x000001000000040a, 0x0300000008050500, 0x0000400202000000, 0x0a00000020050500, 0xffffff000000200a, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, - 0x1c00000001000100, 0x3034366369000000, 0x636f343230317769, 0x323031776f303436, 0x0830777031776b34, 
0x0200000001000000, 0x0000020000002802, 0x0d000000080a0a00, - 0x000003000000080d, 0x0a00000010020200, 0x200d0d000000100a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, - 0x6869303233636900, 0x6f303233636f3436, 0x687033686b343668, 0x0000030000001031, 0x070000000a020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000020a, - 0x0200000003000000, 0x400a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x6900000018000000, - 0x3436686930323363, 0x36686f303233636f, 0x1031687033686b34, 0x0200000003000000, 0x0307070000000a02, 0x0000030808000000, 0x080d0d0000000100, 0x0200000004000000, - 0x0209090000002002, 0x0000200a0a000000, 0x03000000080d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3363690000001800, - 0x636f323135686932, 0x6b323135686f3233, 0x0000083168703368, 0x0202020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000040d0d00, - 0x0000100202000000, 0x0d000000100a0a00, 0x000003000000200d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1700000001000100, 0x3639386369000000, - 0x3233636f39347769, 0x7031776b3934776f, 0x0001000000103077, 0x0000000702020000, 0x0008020200000001, 0x0202000000030000, 0x00100a0a00000010, 0x000000080d0d0000, - 0x0000000400000000, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001d0000, 0x3031776930323931, 0x6f303436636f3432, 0x31776b3432303177, - 0x0100000008307770, 0x0000780202000000, 0x080a0a0000000200, 0x0000040d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0xff000000200d0d00, 0x0300000005ffffff, - 0x0400000001000000, 0x0100000004000000, 0x0000200101000000, 0x3363690000003900, 0x3432326869386469, 0x31636f3432327769, 0x3131686f38646f36, 0x646b323131776f32, - 0x7333776b33686b31, 0x3064703277733268, 0x0010317770316870, 0x0707000000010000, 0x0000000000000003, 0x0004020200000004, 0x0000000808080000, 
0x0a0a000000200909, - 0x0000000300000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001b000000010001, 0x6930343663690000, 0x303436636f323368, 0x7333686b3631686f, - 0x0000083168703268, 0x0a02020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000040202000000, 0x1002020000000300, 0x0000200a0a000000, 0x00000000100d0d00, - 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x636f386869303832, 0x6b38686f30383231, 0x0000083168703368, - 0x1402020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020202000000, 0x2002020000000300, 0x0000100a0a000000, 0x00000000080d0d00, 0x0300000004000000, - 0x0400000001000000, 0x0100000004000000, 0x0000080001000000, 0x3563690000001600, 0x35636f3768693231, 0x33686b37686f3231, 0x0300000008316870, 0x0000040202000000, - 0x0800000003070700, 0x0000010000000308, 0x0300000004020200, 0x0000200202000000, 0x0d000000400a0a00, 0x000000000000070d, 0x0000030000000500, 0x0000040000000100, - 0x0000010000000400, 0x2c00000020010100, 0x6869316369000000, 0x3630303177693038, 0x38686f31636f3035, 0x3536303031776f30, 0x3332776b31686b30, 0x1031317770306870, - 0x0800000001000000, 0x0000010000000308, 0x04000000040d0d00, 0x0000040202000000, 0x0900000008080800, 0x100a0a0000002009, 0x0500000003000000, 0x0100000003000000, - 0x0700000007000000, 0x0100000001000000, 0x00002c0000002001, 0x3038686931636900, 0x3035363030317769, 0x6f3038686f31636f, 0x6b30353630303177, 0x68703332776b3168, - 0x0000103131777030, 0x0308080000000100, 0x0d00000001000000, 0x000004000000080d, 0x0800000002020200, 0x2009090000000808, 0x0000100a0a000000, 0x0000050000000300, - 0x0000030000000300, 0x0000010000000700, 0x4000010000000700, 0x6900000019000000, 0x3039776936353263, 0x35776f363532636f, 0x3077703233776b39, 0x0000000200000010, - 0x0d0d000000400909, 0x0000000200000004, 0x0a0a000000080202, 0x0000000300000004, 0x0a0a000000100202, 0x00100d0d00000040, 0x0004ffffffff0000, 0x0003000000030000, - 0x0001000000070000, 
0x0001000000070000, 0x0000002600000003, 0x3168693635326369, 0x32636f3631776930, 0x776f3031686f3635, 0x35776b35686b3631, 0x0008327770326870, - 0x0c0c000000010000, 0x000000020000000a, 0x0a0a000000080202, 0x0000000300000004, 0x0a0a000000100202, 0x00100d0d00000040, 0x0004000000030000, 0x0001000000030000, - 0x0007000000070000, 0x0001000000010000, 0x0000001c00000001, 0x7769303239316369, 0x383231636f363532, 0x776b363532776f30, 0x0000000830777031, 0x0078020200000001, - 0x0a0a000000020000, 0x00040d0d00000008, 0x0202000000030000, 0x00100a0a00000010, 0x000000200d0d0000, 0x00000005ffffffff, 0x0000000200000003, 0x0000000700000001, - 0x0010000100000007, 0x6369000000320000, 0x6869373164693233, 0x636f333377693333, 0x686f3731646f3233, 0x646b3333776f3333, 0x7033776b33686b33, 0x3177703168703164, - 0x0000000300000010, 0x0707000000030606, 0x0003080800000003, 0x0003000000000000, 0x0000002002020000, 0x0a0a000000100909, 0x0000000300000020, 0x0000000100000005, - 0x0000000800000003, 0x0000000800000001, 0x001e000000640001, 0x3277693163690000, 0x3031636f32333539, 0x776b35303935776f, 0x3077703577733031, 0x0000000100000010, - 0x0000000000400d0d, 0x0808000000030000, 0x000a090900000008, 0x000000100a0a0000, 0x0000000500000000, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, - 0x6369000000380000, 0x6934323264693233, 0x3631776934323268, 0x32646f3233636f30, 0x6f343232686f3432, 0x6b33646b30363177, 0x31647033776b3368, 0x0010317770316870, - 0x0606000000030000, 0x0003070700000003, 0x0000000308080000, 0x0002020200000001, 0x0202000000030000, 0x00200a0a00000010, 0x000000200d0d0000, 0x0000000400000000, - 0x0000000100000003, 0x0000000400000004, 0x0008000100000001, 0x6369000000170000, 0x636f363568693436, 0x6b3635686f363532, 0x0000083068703168, 0x0202020000000100, - 0x0a00000001000000, 0x000003000000040a, 0x0a00000020020200, 0x100d0d000000200a, 0x04ffffffff000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, - 0x00001a0000008001, 0x7769323135636900, 0x383231636f343837, 
0x31776b343837776f, 0x0100000008307770, 0x0000100202000000, 0x080a0a0000000100, 0x0200000003000000, - 0x2009090000002002, 0x0000100a0a000000, 0x0000050000000200, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001c000000, 0x3277693038323163, - 0x30383231636f3635, 0x31776b363532776f, 0x0100000010307770, 0x0000280202000000, 0x020a0a0000000200, 0x0000080d0d000000, 0x2002020000000300, 0x0000400a0a000000, - 0x03000000080d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001c00, 0x3635327769303832, 0x776f30383231636f, - 0x777031776b363532, 0x0000010000001030, 0x0200000028020200, 0x0000040a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000200d, - 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, 0x1900000008000100, 0x3430326369000000, 0x35636f3934776938, 0x776b3934776f3231, 0x0000000830777031, - 0x0020020200000001, 0x0a0a000000020000, 0x00020d0d00000008, 0x0202000000030000, 0x00080a0a00000040, 0x000000100d0d0000, 0x00000005ffffffff, 0x0000000300000003, - 0x000000010000000a, 0x002001010000000a, 0x6369000000140000, 0x38636f3532646938, 0x7035646b3332646f, 0x0001000000103164, 0x000000170c0c0000, 0x0000000300000000, - 0x0909000000080202, 0x00100a0a00000010, 0x0004000000020000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001b00000080, 0x3268693635326369, - 0x686f363532636f38, 0x32687333686b3431, 0x0300000008316870, 0x0000080202000000, 0x0800000003070700, 0x0000020000000308, 0x0a00000004090900, 0x000003000000080a, - 0x0900000020020200, 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001a0000008001, 0x7769323135636900, - 0x363532636f343837, 0x31776b343837776f, 0x0100000008307770, 0x0000100202000000, 0x080a0a0000000100, 0x0200000003000000, 0x2009090000002002, 0x0000200a0a000000, - 0x0000040000000200, 0x0000010000000300, 0x0000040000000400, 0x8001010000000100, 0x690000001b000000, 
0x3177693432303163, 0x6f323135636f3639, 0x7031776b36393177, - 0x0001000000083077, 0x0000002002020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000002009090000, 0x0003000000200a0a, 0x0003000000050000, - 0x0001000000020000, 0x0007000000070000, 0x0000001000010000, 0x313563690000001d, 0x6f34343431776932, 0x31776f3432303163, 0x777031776b343434, 0x0000010000001030, - 0x02000000200a0a00, 0x0000020202000000, 0x0300000008050500, 0x0000400202000000, 0x0a00000010090900, 0x000003000000200a, 0x0000030000000400, 0x0000040000000100, - 0x0000010000000400, 0x1400000001000100, 0x6930326369000000, 0x686f3035636f3668, 0x0830687033686b34, 0x0700000002000000, 0x0308080000000307, 0x0300000000000000, - 0x0000200202000000, 0x0d000000080a0a00, 0x000000000000040d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1a00000001000100, 0x3832316369000000, - 0x31636f3231356869, 0x6b323135686f3832, 0x0000103168703368, 0x0402020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000020d0d00, - 0x0000200202000000, 0x0d000000400a0a00, 0x000003000000200d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1800000001000100, 0x3036396369000000, - 0x3233636f34366869, 0x33686b3436686f30, 0x0300000008316870, 0x00003c0202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000080a0a00, 0x000003000000040d, - 0x0a00000010020200, 0x100d0d000000100a, 0x04ffffffff000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00002a0000002001, 0x3031686934636900, - 0x3032393177693038, 0x383031686f31636f, 0x6b30323931776f30, 0x31687033776b3368, 0x0100000008317770, 0x0000030707000000, 0x020d0d0000000100, 0x0200000004000000, - 0x0808080000000402, 0x0000200909000000, 0x02000000080a0a00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3663690000001800, - 0x636f363968693034, 0x6b3639686f303233, 0x0000083168703368, 0x2802020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020d0d000000, 
0x1002020000000400, - 0x0000020909000000, 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1a00000001000100, 0x3231356369000000, - 0x35636f3635326869, 0x6b363532686f3231, 0x0000103168703368, 0x1002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040a0a000000, 0x03000000080d0d00, - 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000200d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x0b00000001000100, 0x3430326369000000, - 0x0010383231636f38, 0x0202000000010000, 0x0000000100000010, 0x0003000000080202, 0x0000001002020000, 0x0d0d000000100a0a, 0x0000000000000008, 0x0000000300000004, - 0x0000000400000001, 0x0000000100000004, 0x001c000000100101, 0x6936353263690000, 0x32636f3430373277, 0x34303732776f3635, 0x000830777031776b, 0x0202000000010000, - 0x0000000200000008, 0x0d0d000000040a0a, 0x0000000300000002, 0x0909000000200202, 0x00400a0a00000010, 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, - 0x0001000000010000, 0x0000001c00000001, 0x6869303832316369, 0x30383231636f3631, 0x687333686b38686f, 0x0000000831687032, 0x0014020200000003, 0x0000000307070000, - 0x0001000000030808, 0x0000000202020000, 0x0020020200000003, 0x000000100a0a0000, 0x0000000000080d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, - 0x0000000100010000, 0x353263690000001a, 0x636f323135686936, 0x323135686f383231, 0x001031687033686b, 0x0202000000030000, 0x0003070700000008, 0x0000000308080000, - 0x00040a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0001000000050000, 0x0008000000010000, 0x0001000000080000, - 0x0000080000010000, 0x6932636900000013, 0x686f38636f303168, 0x1030687033686b38, 0x0700000001000000, 0x0000000000000307, 0x0202020000000400, 0x0000030808000000, - 0x0d000000100a0a00, 0x000000000000080d, 0x0000010000000500, 0x00000b0000000100, 0x0000010000000b00, 0x1300000800000100, 0x6869326369000000, 0x38686f38636f3031, - 0x000830687033686b, 
0x0707000000010000, 0x0000000000000003, 0x0002020200000004, 0x0000000308080000, 0x0d0d000000080a0a, 0x0000000000000008, 0x0000000300000005, - 0x0000000700000003, 0x0000000700000001, 0x0017000000400001, 0x6938323163690000, 0x6f3436636f363177, 0x777032776b353177, 0x0000000000001030, 0x0802020000000100, - 0x0200000004000000, 0x0208080000000802, 0x0000100a0a000000, 0xff000000100d0d00, 0x0300000004ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000200101000000, - 0x3163690000001600, 0x32636f3531686930, 0x33686b3331686f30, 0x0200000008306870, 0x0000030707000000, 0x0200000002080800, 0x0000040a0a000000, 0x04000000040d0d00, - 0x0000100202000000, 0x0900000002080800, 0x080a0a0000002009, 0x0500000003000000, 0x0100000001000000, 0x0400000004000000, 0x0100000001000000, 0x00002c0000000100, - 0x3038686931636900, 0x3035363030317769, 0x6f3038686f31636f, 0x6b30353630303177, 0x68703332776b3168, 0x0000103131777030, 0x0000010000000000, 0x03000000040d0d00, - 0x0000170808000000, 0x0d000000100a0a00, 0x000000000000100d, 0x0000010000000500, 0x0000070000000100, 0x0000010000000700, 0x2c00000001000100, 0x6869316369000000, - 0x3630303177693038, 0x38686f31636f3035, 0x3536303031776f30, 0x3332776b31686b30, 0x1031317770306870, 0x0000000000000000, 0x0800000003000000, 0x100a0a0000001708, - 0x0000100d0d000000, 0x0000040000000000, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000019000000, 0x3168693038323163, 0x6f30383231636f30, - 0x30687033686b3868, 0x0000000300000008, 0x0707000000140202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000002, 0x0a0a000000200202, 0x00080d0d00000010, - 0x0005000000000000, 0x0002000000030000, 0x0007000000010000, 0x0001000000070000, 0x0000001900000040, 0x3977693635326369, 0x776f363532636f30, 0x77703233776b3935, - 0x0000020000001030, 0x0a00000020080800, 0x000002000000080a, 0x0500000008020200, 0x0000030000000405, 0x0500000020020200, 0x200a0a0000001e05, 0x04ffffffff000000, - 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 
0x0000190000000200, 0x6869303436636900, 0x30383231636f3432, 0x7033686b3432686f, 0x0003000000083168, - 0x0000002802020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000040a0a, 0x0000001002020000, 0x0d0d000000200a0a, 0xffffffff00000018, 0x0000000300000005, - 0x0000000700000001, 0x0000000100000007, 0x0017000000200001, 0x6869343663690000, 0x686f33636f363532, 0x687039686b363532, 0x0000030000001034, 0x0700000002020200, - 0x0908080000000907, 0x0d00000001000000, 0x000003000000080d, 0x0900000020020200, 0x100a0a0000002009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x00001a0000000100, 0x6869323135636900, 0x363532636f363532, 0x33686b363532686f, 0x0300000010316870, 0x0000100202000000, 0x0800000003070700, - 0x0000020000000308, 0x0d000000040a0a00, 0x000003000000080d, 0x0a00000020020200, 0x200d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x00001d0000000100, 0x6930323931636900, 0x36636f3432303177, 0x34323031776f3034, 0x001030777031776b, 0x0202000000010000, 0x000000020000003c, - 0x0d0d000000040a0a, 0x0000000300000004, 0x0a0a000000200202, 0x00200d0d00000020, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, - 0x0000001d00000002, 0x7769303239316369, 0x3436636f34323031, 0x6b34323031776f30, 0x0000103077703177, 0x3c02020000000100, 0x0a00000002000000, 0x040d0d000000080a, - 0x0200000003000000, 0x200a0a0000002002, 0x0000200d0d000000, 0x0000050000000300, 0x0000010000000100, 0x0000070000000700, 0x0100010000000100, 0x6900000014000000, - 0x636f363568693263, 0x33686b3635686f32, 0x0100000010316870, 0x0000030707000000, 0x080d0d0000000100, 0x0200000004000000, 0x0308080000000202, 0x0000100a0a000000, - 0x00000000070d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x636f386869303832, 0x6b38686f30383231, - 0x0000103168703368, 0x2802020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020a0a000000, 
0x2002020000000300, 0x0000100a0a000000, 0x03000000080d0d00, - 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001800, 0x636f386869303832, 0x6b38686f30383231, 0x0000103168703368, - 0x2802020000000300, 0x0000030707000000, 0x0000000003080800, 0x0200000004000000, 0x0209090000002002, 0x0000200a0a000000, 0x03000000080d0d00, 0x0300000004000000, - 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3263690000001a00, 0x6f34326869303635, 0x32686f3038323163, 0x0831687033686b34, 0x0200000003000000, - 0x030707000000a002, 0x0000030808000000, 0x040a0a0000000100, 0x0200000003000000, 0x200a0a0000001002, 0x0000180d0d000000, 0x000005ffffffff00, 0x0000010000000300, - 0x0000070000000700, 0x4000010000000100, 0x690000001b000000, 0x3532776938323163, 0x776f363532636f32, 0x703631776b373332, 0x0002000000103077, 0x0000000402020000, - 0x0002000000100808, 0x000000040a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000400a0a, 0xffffffff00000020, 0x0000000300000004, 0x0000000700000001, - 0x0000000100000007, 0x0019000000020001, 0x3038323163690000, 0x3436636f38346869, 0x33686b3834686f30, 0x0300000008316870, 0x0000500202000000, 0x0800000003070700, - 0x0000010000000308, 0x04000000040d0d00, 0x0000100202000000, 0x0a00000002090900, 0x0c0d0d000000200a, 0x05ffffffff000000, 0x0200000003000000, 0x0700000001000000, - 0x0100000007000000, 0x00001b0000004000, 0x7769383231636900, 0x363532636f323532, 0x31776b373332776f, 0x0000001030777036, 0x0010080800000002, 0x000000080a0a0000, - 0x0004020200000002, 0x0000000805050000, 0x0020020200000003, 0x0000002005050000, 0xffff000000200a0a, 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, - 0x0000008001010000, 0x303163690000001c, 0x636f343168693432, 0x6b37686f38343032, 0x3068703268733168, 0x0000000100000008, 0x0002000000200202, 0x0000000409090000, - 0x0003000000080a0a, 0x0000002002020000, 0x0a0a000000200909, 0x0000000300000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 
0x001b000000800101, - 0x6936353263690000, 0x36636f3633313377, 0x6b36333133776f34, 0x0000083077703177, 0x0802020000000100, 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, - 0x1009090000002002, 0x0000080a0a000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x4000010000000100, 0x6900000019000000, 0x3039776936353263, - 0x35776f363532636f, 0x3077703233776b39, 0x0000000200000010, 0x0808000000080202, 0x0000000100000020, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000200a0a, - 0xffffffff00000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0016000000020001, 0x6930323363690000, 0x686f34636f363968, 0x31687033686b3639, - 0x0000000300000008, 0x07070000000a0202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00100d0d00000008, 0x0005000000000000, - 0x0001000000030000, 0x0007000000070000, 0x0101000000010000, 0x0000001a00000020, 0x3230316869336369, 0x3135686f33636f34, 0x7032687337686b32, 0x0002000000103368, - 0x0000000707070000, 0x0001000000020808, 0x000000040d0d0000, 0x0004020200000004, 0x0000000408080000, 0x0a0a000000200909, 0x0000000300000010, 0x0000000300000005, - 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x6938323163690000, 0x6f33636f32313568, 0x7033686b32313568, 0x0003000000103168, 0x0000000402020000, - 0x0808000000030707, 0x0000000100000003, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000100a0a, 0x0000000300000020, 0x0000000300000005, 0x0000000100000002, - 0x0000000700000007, 0x0017000000400001, 0x6938323163690000, 0x6f3436636f363177, 0x777032776b353177, 0x0000020000001030, 0x0a00000002080800, 0x000001000000020a, - 0x0300000002020200, 0x0000100202000000, 0x0a00000010050500, 0xffffff000000200a, 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x1a00000010010100, - 0x3231356369000000, 0x32636f3637367769, 0x6b363736776f3635, 0x0000083077703177, 0x1002020000000100, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, - 0x1009090000002002, 
0x0000200a0a000000, 0x000005ffffffff00, 0x0000020000000300, 0x0000070000000100, 0x1000010000000700, 0x690000001b000000, 0x3537686936353263, - 0x33686f323135636f, 0x7032687331686b38, 0x0001000000103068, 0x000000100a0a0000, 0x0000000300000000, 0x0909000000400202, 0x00200a0a00000010, 0x0004000000030000, - 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001a00000010, 0x3677693635326369, 0x6f383231636f3637, 0x7031776b36373677, 0x0001000000083077, - 0x0000000802020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000001009090000, 0xffff000000100a0a, 0x000300000005ffff, 0x0001000000020000, - 0x0007000000070000, 0x0000004000010000, 0x3231636900000018, 0x31636f3831776938, 0x776b3531776f3832, 0x0000001030777034, 0x0004080800000002, 0x000000040a0a0000, - 0x0004050500000001, 0x0202000000030000, 0x0008050500000020, 0x000000200a0a0000, 0x00000005ffffffff, 0x0000000300000003, 0x0000000100000007, 0x0040000100000007, - 0x6369000000180000, 0x6f38317769363532, 0x3531776f38323163, 0x001030777034776b, 0x0909000000010000, 0x0000000200000008, 0x0a0a000000020202, 0x0000000300000008, - 0x0a0a000000200202, 0x00100d0d00000010, 0x0005ffffffff0000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001500000001, 0x3668693832316369, - 0x6b36686f3233636f, 0x0000103168703368, 0x0307070000000200, 0x0000030808000000, 0x0402020000000100, 0x0200000003000000, 0x100a0a0000002002, 0x0000080d0d000000, - 0x0000050000000000, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, 0x6900000018000000, 0x3835686938323163, 0x35686f383231636f, 0x1030687033686b36, - 0x0200000003000000, 0x0307070000000402, 0x0000030808000000, 0x020a0a0000000200, 0x0000080d0d000000, 0x2002020000000300, 0x0000200a0a000000, 0x03000000070d0d00, - 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x636f383568693832, 0x6b3635686f383231, 0x0000103068703368, - 0x0802020000000300, 0x0000030707000000, 0x0200000003080800, 
0x0000040a0a000000, 0x03000000040d0d00, 0x0000100202000000, 0x0d000000100a0a00, 0x0000030000000e0d, - 0x0000010000000500, 0x0000080000000100, 0x0000010000000800, 0x1400000800000100, 0x6869386369000000, 0x686f3031636f3031, 0x1030687033686b38, 0x0700000001000000, - 0x0000000000000307, 0x0802020000000400, 0x0000030808000000, 0x0d000000100a0a00, 0x000000000000080d, 0x0000010000000500, 0x00000b0000000100, 0x0000010000000b00, - 0x1400000800000100, 0x6869386369000000, 0x686f3031636f3031, 0x1030687033686b38, 0x0700000001000000, 0x0000000000000307, 0x0802020000000400, 0x0000030808000000, - 0x0d000000100a0a00, 0x000000000000080d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1c00000040000100, 0x6934366369000000, 0x31636f3232303177, - 0x37303031776f3832, 0x103077703631776b, 0x0200000002000000, 0x1008080000000202, 0x0a00000002000000, 0x020d0d000000020a, 0x0200000003000000, 0x400a0a0000002002, - 0x0000200d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, 0x6900000018000000, 0x3233776932313563, 0x33776f383231636f, - 0x0830777031776b32, 0x0100000000000000, 0x0000080202000000, 0x4002020000000300, 0x0000080a0a000000, 0x00000000080d0d00, 0x0300000004000000, 0x0700000001000000, - 0x0100000007000000, 0x0000010001000000, 0x3563690000001800, 0x636f323377693231, 0x6b3233776f383231, 0x0000083077703177, 0x0202020000000100, 0x0200000001000000, - 0x0000030000000802, 0x0a00000020020200, 0x080d0d000000080a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000002001, - 0x3138776931636900, 0x776f3436636f3239, 0x3631776b37373138, 0x0100000010307770, 0x0000020808000000, 0x0000040000000000, 0x0800000002020200, 0x400a0a0000000808, - 0x00000d0d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, 0x690000001a000000, 0x6f30303368693363, 0x303531686f323363, - 0x687032687333686b, 0x0000010000000831, 0x0200000003070700, 0x0000020a0a000000, 0x04000000020d0d00, 
0x0000040202000000, 0x0a00000008080800, 0x190d0d000000100a, - 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00000a0000000100, 0x6f34323031636900, 0x0100000010343663, 0x0000080202000000, - 0x0802020000000100, 0x0200000003000000, 0x100a0a0000001002, 0x0000080d0d000000, 0x0000050000000000, 0x0000010000000100, 0x0000070000000700, 0x0100010000000100, - 0x6700000015000000, 0x3868693163693436, 0x686b38686f31636f, 0x0000001031687033, 0x0003070700000001, 0x0003000000000000, 0x0000001001010000, 0x0d0d000000030808, - 0x0000000000000008, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001b000000010001, 0x6930323363690000, 0x303233636f343668, 0x7333686b3233686f, - 0x0000103168703268, 0x0a02020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040a0a000000, 0x03000000020d0d00, 0x0000200202000000, 0x0d000000100a0a00, - 0x000003000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1b00000002000100, 0x3032336369000000, 0x3233636f34366869, 0x33686b3233686f30, - 0x0010316870326873, 0x0202000000030000, 0x000307070000000a, 0x0000000308080000, 0x00040a0a00000002, 0x000000020d0d0000, 0x0020020200000003, 0x000000200a0a0000, - 0x0003000000100d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323363690000001c, 0x6f36393034776930, 0x3034776f30323363, - 0x30777031776b3639, 0x0000000100000010, 0x00020000000a0202, 0x000000020a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000400a0a, 0x0000000300000020, - 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001c000000020001, 0x6930323363690000, 0x33636f3639303477, 0x36393034776f3032, 0x001030777031776b, - 0x0202000000010000, 0x000000020000000a, 0x0d0d000000020a0a, 0x0000000300000008, 0x0a0a000000200202, 0x00200d0d00000020, 0x0005000000030000, 0x0003000000030000, - 0x0001000000070000, 0x0001000000070000, 0x0000001a00000020, 0x3931387769316369, 0x38776f3436636f32, 0x703631776b373731, 0x0002000000103077, 
0x0000000209090000, - 0x0001000000110d0d, 0x000000020a0a0000, 0x0002020200000004, 0x0000001008080000, 0x0a0a000000100909, 0xffffffff00000010, 0x0000000300000004, 0x0000000400000001, - 0x0000000100000004, 0x0027000000010001, 0x3368693463690000, 0x6f30343677693036, 0x303633686f363363, 0x33686b303436776f, 0x777031687033776b, 0x0000010000000831, - 0x0100000003070700, 0x0000020d0d000000, 0x0402020000000400, 0x0000080808000000, 0x0d000000100a0a00, 0xffffff000000200d, 0x00000300000005ff, 0x0000040000000100, - 0x0000010000000400, 0x1a00000080000100, 0x3832316369000000, 0x35636f3438377769, 0x6b343837776f3231, 0x0000103077703177, 0x0202020000000100, 0x0a00000001000000, - 0x000003000000040a, 0x0a00000040020200, 0x1c0d0d000000400a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001e0000000100, - 0x7769323135636900, 0x32636f3633353536, 0x33353536776f3635, 0x1030777031776b36, 0x0200000001000000, 0x0000020000001002, 0x0d000000040a0a00, 0x000003000000020d, - 0x0a00000020020200, 0x200d0d000000400a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000100, 0x6934323031636900, - 0x3032636f36393177, 0x6b363931776f3834, 0x0000103077703177, 0x4002020000000100, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, 0x200a0a0000001002, - 0x0000070d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x4000010000000100, 0x6900000018000000, 0x3831776936353263, 0x31776f383231636f, - 0x1030777034776b35, 0x0200000002000000, 0x0408080000000802, 0x0d00000001000000, 0x000003000000080d, 0x0900000020020200, 0x200a0a0000000809, 0x04ffffffff000000, - 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000100, 0x7769303639636900, 0x3436636f34323031, 0x6b34323031776f30, 0x0000083077703177, - 0x3c02020000000100, 0x0a00000002000000, 0x080d0d000000080a, 0x0200000003000000, 0x100a0a0000001002, 0x0000200d0d000000, 0x0000040000000000, 0x0000010000000100, - 0x0000070000000700, 
0x0100010000000100, 0x670000001c000000, 0x7769316369323135, 0x34776f31636f3233, 0x7037776433776b38, 0x0000000008363177, 0x0d00000001000000, - 0x000003000000080d, 0x0800000008010100, 0x080d0d0000000308, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000190000000100, - 0x7769363532636900, 0x6f3233636f393832, 0x7031776b39383277, 0x0001000000103077, 0x0000000202020000, 0x0008020200000001, 0x0202000000030000, 0x00100a0a00000010, - 0x000000080d0d0000, 0x0000000400000000, 0x0000000100000003, 0x0000000400000004, 0x0008000100000001, 0x63690000001b0000, 0x3331337769363532, 0x33776f3436636f36, - 0x777031776b363331, 0x0000010000000830, 0x0200000004020200, 0x0000080a0a000000, 0x03000000040d0d00, 0x0000400202000000, 0x0d000000080a0a00, 0xffffff000000100d, - 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x1800000001000100, 0x3436346369000000, 0x3631636f34366869, 0x33686b3436686f30, 0x0300000008316870, - 0x00000f0202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000040a0a00, 0x000003000000020d, 0x0a00000020020200, 0x200d0d000000100a, 0x0500000003000000, - 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000190000002001, 0x6869323135636900, 0x34323031636f3331, 0x7033686b3331686f, 0x0003000000103168, - 0x0000002002020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000008, 0x0909000000100202, 0x00200a0a00000020, 0x0005000000030000, - 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3668693036396369, 0x686f303233636f34, 0x31687033686b3436, 0x0000000300000010, - 0x07070000001e0202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000002, 0x0202000000030000, 0x00400a0a00000020, 0x000000100d0d0000, 0x0000000500000003, - 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x6369000000180000, 0x6f34366869303639, 0x3436686f30323363, 0x001031687033686b, 0x0202000000030000, - 0x000307070000001e, 0x0000000308080000, 0x00080d0d00000001, 
0x0202000000040000, 0x0002090900000020, 0x000000200a0a0000, 0x0003000000080d0d, 0x0001000000050000, - 0x0008000000030000, 0x0008000000010000, 0x0000080000010000, 0x6938636900000013, 0x686f38636f303168, 0x1030687033686b38, 0x0c00000002000000, 0x040d0d000000080c, - 0x0300000000000000, 0x0000080202000000, 0x0a00000010090900, 0x000000000000100a, 0x0000010000000500, 0x00000b0000000300, 0x00000b0000000100, 0x1300000800000100, - 0x6869386369000000, 0x38686f38636f3031, 0x000830687033686b, 0x0c0c000000020000, 0x00020d0d00000008, 0x0003000000000000, 0x0000000802020000, 0x0a0a000000080909, - 0x0000000000000008, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x001a000000010001, 0x6936353263690000, 0x3532636f36353268, 0x686b363532686f36, - 0x0000001031687033, 0x0008020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000020a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000400a0a, - 0x0000000300000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001a000000800101, 0x6938323163690000, 0x3135636f34383777, 0x776b343837776f32, - 0x0000000830777031, 0x0004020200000001, 0x0a0a000000020000, 0x00040d0d00000008, 0x0202000000030000, 0x0020090900000020, 0x000000200a0a0000, 0x0000000400000003, - 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001d0000, 0x3432686930383231, 0x686f30383231636f, 0x32687333686b3231, 0x0300000008316870, - 0x0000280202000000, 0x0800000003070700, 0x0000020000000308, 0x0d00000002020200, 0x000004000000020d, 0x0900000010020200, 0x200a0a0000000209, 0x0000060d0d000000, - 0x000005ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x4000010000000100, 0x6900000017000000, 0x3631776938323163, 0x3531776f3436636f, 0x001030777032776b, - 0x0202000000020000, 0x0002080800000004, 0x0909000000010000, 0x0000000300000004, 0x0909000000200202, 0x00200a0a00000010, 0x0005ffffffff0000, 0x0002000000030000, - 0x0007000000010000, 0x0101000000070000, 0x0000003200000010, 0x3731646932336369, 0x3333776933336869, 
0x3731646f3233636f, 0x3333776f3333686f, 0x776b33686b33646b, - 0x7031687031647033, 0x0004000000103177, 0x0000000306060000, 0x0808000000030707, 0x00020a0a00000003, 0x0505000000010000, 0x0000000300000002, 0x0909000000200202, - 0x00100a0a00000010, 0x0004000000030000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001c00000008, 0x3377693635326369, 0x383231636f363331, - 0x776b36333133776f, 0x0000000830777031, 0x0008020200000001, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000030000, 0x00200a0a00000020, 0x000000100d0d0000, - 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001a0000, 0x3631686930363532, 0x686f30383231636f, 0x31687033686b3631, - 0x0000000300000008, 0x0707000000500202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000002, 0x0a0a000000100202, 0x00100d0d00000020, 0x0004000000000000, - 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, 0x6869303635326369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000083168, - 0x000000a002020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000040a0a, 0x0000001002020000, 0x0d0d000000200a0a, 0x0000000200000010, 0x0000000300000005, - 0x0000000100000002, 0x0000000700000007, 0x001c000000080001, 0x6932313563690000, 0x3135636f39393277, 0x38776b3337776f32, 0x0010307770347773, 0x0808000000020000, - 0x00200a0a00000008, 0x0202000000020000, 0x0004050500000008, 0x0202000000030000, 0x0010050500000040, 0x000000100a0a0000, 0x0000000500000003, 0x0000000100000003, - 0x0000000700000007, 0x0001000100000001, 0x63690000001c0000, 0x3230317769303639, 0x776f303436636f34, 0x7031776b34323031, 0x0001000000103077, 0x0000001e02020000, - 0x00040a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, - 0x0000000200010000, 0x363963690000001c, 0x6f34323031776930, 0x3031776f30343663, 0x30777031776b3432, 0x0000000100000010, 0x00020000001e0202, 
0x000000040a0a0000, - 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000300000020, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, - 0x6469363163690000, 0x6f3631636f383231, 0x7033646b38323164, 0x0003000000103164, 0x0000000306060000, 0x0808000000030707, 0x0000000100000003, 0x0003000000040d0d, - 0x0000001002020000, 0x0d0d000000100a0a, 0x0000000300000020, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x002a000000010001, 0x3168693463690000, - 0x3239317769303830, 0x3031686f31636f30, 0x30323931776f3038, 0x687033776b33686b, 0x0000000831777031, 0x0003070700000001, 0x0d0d000000010000, 0x0000000400000004, - 0x0808000000040202, 0x00080a0a00000008, 0x000000200d0d0000, 0x00000005ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, 0x6369000000180000, - 0x6f36316869383231, 0x3631686f38323163, 0x001031687033686b, 0x0707000000020000, 0x0003080800000003, 0x0202000000010000, 0x0000000300000004, 0x0a0a000000200202, - 0x00080d0d00000010, 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000002, 0x7769303635326369, 0x383231636f363735, - 0x776b363735776f30, 0x0000000830777031, 0x00a0020200000001, 0x0a0a000000020000, 0x00080d0d00000008, 0x0202000000030000, 0x00200a0a00000010, 0x000000180d0d0000, - 0x0000000400000000, 0x0000000100000001, 0x0000000700000007, 0x0001000100000001, 0x35670000001d0000, 0x3377693163693231, 0x3436776f31636f32, 0x703531776433776b, - 0x0000000008323377, 0x0d00000001000000, 0x000003000000080d, 0x0800000008010100, 0x080d0d0000000308, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, - 0x0100000001000000, 0x00001c0000000100, 0x6930323931636900, 0x3231636f36353277, 0x6b363532776f3038, 0x0000103077703177, 0x3c02020000000100, 0x0a00000002000000, - 0x040d0d000000040a, 0x0200000003000000, 0x200a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, - 0x690000001c000000, 
0x3277693032393163, 0x30383231636f3635, 0x31776b363532776f, 0x0100000010307770, 0x00003c0202000000, 0x040a0a0000000200, 0x0000080d0d000000, - 0x2002020000000300, 0x0000200a0a000000, 0x03000000100d0d00, 0x0300000005000000, 0x0100000002000000, 0x0a0000000a000000, 0x0000200001000000, 0x3163690000001500, - 0x3233636f35326469, 0x7035646b3532646f, 0x0004000000103264, 0x0000000506060000, 0x0808000000050707, 0x00020a0a00000005, 0x0505000000010000, 0x0000000300000008, - 0x0909000000100202, 0x00100a0a00000020, 0x0004000000020000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001a00000080, 0x3133776934366369, - 0x776f3436636f3633, 0x7031776b36333133, 0x0001000000083077, 0x0000000202020000, 0x0000000300000000, 0x0909000000200202, 0x00400a0a00000010, 0x0004000000000000, - 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000002600000001, 0x3036336869316369, 0x34636f3034367769, 0x36776f303633686f, 0x33776b33686b3034, - 0x0008317770316870, 0x0707000000010000, 0x0000000100000003, 0x0004000000040d0d, 0x0000000402020000, 0x0a0a000000080808, 0x00200d0d00000008, 0x0005ffffffff0000, - 0x0002000000030000, 0x0007000000010000, 0x0001000000070000, 0x0000001d00000010, 0x3177693635326369, 0x323031636f343434, 0x6b34343431776f34, 0x0000103077703177, - 0x200a0a0000000100, 0x0200000002000000, 0x0405050000000202, 0x0200000003000000, 0x1009090000004002, 0x0000200a0a000000, 0x0000040000000300, 0x0000010000000300, - 0x0000040000000400, 0x0800010000000100, 0x690000001b000000, 0x3177693432303163, 0x6f323135636f3639, 0x7031776b36393177, 0x0001000000083077, 0x0000002002020000, - 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0xffff000000200d0d, 0x000300000005ffff, 0x0007000000030000, 0x0007000000010000, - 0x0000004000010000, 0x323163690000001b, 0x636f323532776938, 0x373332776f363532, 0x103077703631776b, 0x0900000002000000, 0x0f0d0d0000000409, 0x0200000002000000, - 0x040a0a0000000402, 0x0200000003000000, 0x400a0a0000002002, 
0x0000100d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x1001010000000100, - 0x690000001a000000, 0x3631776932313563, 0x776f323135636f39, 0x777031776b393631, 0x0000010000000830, 0x0200000010020200, 0x0000080a0a000000, 0x03000000040d0d00, - 0x0000200202000000, 0x0a00000010090900, 0xffffff000000100a, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, 0x1a00000002000100, 0x3635326369000000, - 0x31636f3231686930, 0x6b3231686f303832, 0x0000083168703368, 0xa002020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000020d0d000000, 0x1002020000000400, - 0x0000020909000000, 0x0d000000100a0a00, 0xffffff000000060d, 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x1c00000010010100, 0x6934366369000000, - 0x3231636f38303268, 0x686b343031686f38, 0x0831687032687333, 0x0200000003000000, 0x0307070000000202, 0x0000030808000000, 0x040a0a0000000200, 0x0000040d0d000000, - 0x2002020000000300, 0x0000100909000000, 0xff000000200a0a00, 0x0300000004ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3163690000001700, - 0x636f323368693832, 0x686b3233686f3436, 0x0000000831687033, 0x0003070700000002, 0x0000000308080000, 0x0004020200000001, 0x0202000000030000, 0x00100a0a00000020, - 0x000000080d0d0000, 0x0000000400000000, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001c0000, 0x3930347769303436, 0x776f303233636f36, - 0x7031776b36393034, 0x0001000000083077, 0x0000002802020000, 0x00040a0a00000002, 0x000000040d0d0000, 0x0010020200000003, 0x000000200a0a0000, 0xffff000000200d0d, - 0x000100000005ffff, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x6934366700000017, 0x636f363168693163, 0x33686b3631686f31, 0x0100000010316870, - 0x0000030707000000, 0x020d0d0000000100, 0x0100000003000000, 0x0308080000001001, 0x0000080d0d000000, 0x0000040000000000, 0x0000010000000300, 0x0000070000000700, - 0x0200010000000100, 0x6900000018000000, 0x3639686930343663, 0x39686f303436636f, 0x0831687033686b36, 
0x0200000003000000, 0x0307070000002802, 0x0000030808000000, - 0x040a0a0000000200, 0x0000040d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0xff000000180d0d00, 0x0300000005ffffff, 0x0100000002000000, 0x0700000007000000, - 0x0000400001000000, 0x3263690000001800, 0x636f383177693635, 0x6b3531776f383231, 0x0000103077703477, 0x0408080000000200, 0x0000040a0a000000, 0x0202020000000200, - 0x0000080505000000, 0x2002020000000300, 0x0000100909000000, 0xff000000200a0a00, 0x0300000004ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, - 0x3163690000001600, 0x32636f3531686930, 0x33686b3331686f30, 0x0200000008306870, 0x0000030707000000, 0x0100000002080800, 0x0000020d0d000000, 0x1002020000000400, - 0x0000020808000000, 0x0d000000080a0a00, 0x000003000000080d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3832316369000000, - 0x31636f3631686930, 0x686b38686f303832, 0x1031687032687333, 0x0200000003000000, 0x0307070000002802, 0x0000030808000000, 0x080a0a0000000100, 0x0200000003000000, - 0x100a0a0000002002, 0x0000080d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x690000001c000000, 0x3168693038323163, - 0x6f30383231636f36, 0x32687333686b3868, 0x0300000010316870, 0x0000280202000000, 0x0800000003070700, 0x0000020000000308, 0x0d000000020a0a00, 0x000003000000080d, - 0x0900000020020200, 0x400a0a0000000209, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, 0x6930363532636900, - 0x30383231636f3868, 0x687033686b38686f, 0x0000030000000831, 0x0700000028020200, 0x0308080000000307, 0x0200000001000000, 0x0000030000000202, 0x0a00000020020200, - 0x080d0d000000100a, 0x0400000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001b0000000200, 0x6869303436636900, 0x6f303436636f3834, - 0x687333686b343268, 0x0000000831687032, 0x0028020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000080a0a0000, 0x0010020200000003, 
0x000000100a0a0000, - 0xffff000000180d0d, 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x3135636900000016, 0x3135636f37686932, 0x7033686b37686f32, - 0x0003000000083168, 0x0000001002020000, 0x0808000000030707, 0x0000000200000003, 0x0a0a000000040909, 0x0000000300000004, 0x0909000000200202, 0x00200a0a00000020, - 0x0004000000030000, 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001c00000010, 0x7769343230316369, 0x323031636f393631, 0x776b393631776f34, - 0x0000000830777031, 0x0020020200000001, 0x0a0a000000020000, 0x00080d0d00000008, 0x0202000000030000, 0x0010090900000020, 0x000000200a0a0000, 0x00000005ffffffff, - 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001d0000, 0x636f343232686933, 0x3631686f34323031, 0x343168733431686b, 0x0200000010306870, - 0x00000e0707000000, 0x0200000002080800, 0x0000080a0a000000, 0x04000000020d0d00, 0x0000040202000000, 0x0a00000008080800, 0x080d0d000000100a, 0x0400000000000000, - 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000800, 0x3232686933636900, 0x686f383637636f34, 0x68733631686b3431, 0x0000083068703631, - 0x1007070000000200, 0x0000040808000000, 0x020a0a0000000100, 0x0200000004000000, 0x0408080000000402, 0x0000400a0a000000, 0x000000000e0d0d00, 0x0300000005000000, - 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3563690000001a00, 0x6f38323168693231, 0x3231686f32313563, 0x1031687033686b38, 0x0200000003000000, - 0x0307070000001002, 0x0000030808000000, 0x040a0a0000000200, 0x0000080d0d000000, 0x2002020000000300, 0x0000400a0a000000, 0x03000000100d0d00, 0x0300000004000000, - 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, 0x3163690000001d00, 0x3033327769303832, 0x776f303436636f34, 0x7031776b34303332, 0x0001000000083077, - 0x0000005002020000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0010020200000003, 0x000000200a0a0000, 0xffff000000180d0d, 0x000300000005ffff, 0x0007000000010000, - 0x0001000000070000, 
0x0000000100010000, 0x3231636900000019, 0x636f323368693038, 0x6b3233686f303436, 0x0000103168703368, 0x2802020000000300, 0x0000030707000000, - 0x0200000003080800, 0x0000020a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000400a0a00, 0x000003000000080d, 0x0000030000000500, 0x0000070000000100, - 0x0000010000000700, 0x1900000002000100, 0x3832316369000000, 0x36636f3233686930, 0x686b3233686f3034, 0x0000001031687033, 0x0028020200000003, 0x0000000307070000, - 0x0001000000030808, 0x000000020d0d0000, 0x0020020200000004, 0x0000000209090000, 0x0d0d000000400a0a, 0x0000000300000010 + 0x400a0a0000000408, 0x0000100d0d000000, 0x0000050000000300, 0x0000030000000100, 0x0000010000000b00, 0x0000010000000b00, 0x6900000014000008, 0x636f303168693863, + 0x33686b38686f3031, 0x0200000008306870, 0x0000020c0c000000, 0x00000000080d0d00, 0x0200000003000000, 0x0809090000000802, 0x0000100a0a000000, 0x0000050000000000, + 0x0000030000000100, 0x0000010000000800, 0x0000010000000800, 0x6900000014000008, 0x636f303168693863, 0x33686b38686f3031, 0x0200000010306870, 0x0000040c0c000000, + 0x00000000080d0d00, 0x0200000003000000, 0x1009090000000802, 0x0000100a0a000000, 0x0000040000000000, 0x0000010000000300, 0x0000070000000700, 0x0500010000000100, + 0x6900000018000000, 0x3635686938323163, 0x35686f383231636f, 0x0831687033686b36, 0x0200000003000000, 0x0307070000000202, 0x0000030808000000, 0x0202020000000100, + 0x0200000003000000, 0x200a0a0000002002, 0x00000e0d0d000000, 0x0000040000000000, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001a000000, + 0x3135686938323163, 0x686f383231636f32, 0x687033686b323135, 0x0000030000000831, 0x0700000004020200, 0x0308080000000307, 0x0a00000002000000, 0x080d0d000000040a, + 0x0200000003000000, 0x200a0a0000002002, 0x0000100d0d000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, + 0x3436686930323363, 0x36686f303233636f, 0x0831687033686b34, 0x0200000003000000, 0x0307070000001402, 
0x0000030808000000, 0x080a0a0000000200, 0x0000040d0d000000, + 0x1002020000000300, 0x0000100a0a000000, 0xff000000100d0d00, 0x0300000004ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3363690000001b00, + 0x636f343230316869, 0x6b363532686f3432, 0x3368703468733768, 0x0000000100000008, 0x0002000000070707, 0x000000020a0a0000, 0x0004000000080d0d, 0x0000000402020000, + 0x0a0a000000080808, 0x00200d0d00000010, 0x0005000000010000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001600000002, 0x6f34366869346369, + 0x3436686f30323363, 0x001031687033686b, 0x0707000000010000, 0x0000000100000003, 0x0005000000040d0d, 0x0000000402020000, 0x0909000000040808, 0x00400a0a00000002, + 0x000000080d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x6369000000160000, 0x33636f3436686934, 0x686b3436686f3032, + 0x0000001031687033, 0x0003070700000001, 0x0004000000000000, 0x0000000402020000, 0x0a0a000000040808, 0x00100d0d00000040, 0x0005000000030000, 0x0003000000010000, + 0x0001000000080000, 0x0001000000080000, 0x0000001e00000064, 0x3539327769316369, 0x776f3031636f3233, 0x3031776b35303935, 0x0010307770357773, 0x0d0d000000010000, + 0x0000000000000040, 0x0008080800000003, 0x0000000a09090000, 0x0000000000100a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, + 0x3233636900000038, 0x3268693432326469, 0x6f30363177693432, 0x343232646f323363, 0x31776f343232686f, 0x33686b33646b3036, 0x687031647033776b, 0x0000001031777031, + 0x0003060600000003, 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, 0x0010020200000003, 0x000000200a0a0000, 0x0000000000200d0d, 0x0003000000040000, + 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x313563690000001a, 0x636f343837776932, 0x343837776f383231, 0x000830777031776b, 0x0202000000010000, + 0x0000000100000010, 0x0003000000080a0a, 0x0000002002020000, 0x0a0a000000200909, 0x0000000200000010, 0x0000000300000005, 0x0000000700000001, 
0x0000000100000007, + 0x001c000000020001, 0x3038323163690000, 0x31636f3635327769, 0x363532776f303832, 0x001030777031776b, 0x0202000000010000, 0x0000000200000028, 0x0d0d000000040a0a, + 0x0000000300000004, 0x0a0a000000200202, 0x00200d0d00000020, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000001, + 0x7769303832316369, 0x383231636f363532, 0x776b363532776f30, 0x0000001030777031, 0x0028020200000001, 0x0a0a000000020000, 0x00080d0d00000002, 0x0202000000030000, + 0x00400a0a00000020, 0x000000080d0d0000, 0x0000000500000003, 0x0000000300000003, 0x000000010000000a, 0x002001010000000a, 0x6369000000140000, 0x38636f3532646938, + 0x7035646b3332646f, 0x0001000000103164, 0x000000170c0c0000, 0x0000000300000000, 0x0909000000080202, 0x00100a0a00000010, 0x0004000000020000, 0x0001000000030000, + 0x0004000000040000, 0x0101000000010000, 0x0000001b00000080, 0x3268693635326369, 0x686f363532636f38, 0x32687333686b3431, 0x0300000008316870, 0x0000080202000000, + 0x0800000003070700, 0x0000020000000308, 0x0a00000004090900, 0x000003000000080a, 0x0900000020020200, 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, + 0x0400000004000000, 0x0100000001000000, 0x00001b0000008001, 0x6934323031636900, 0x3135636f36393177, 0x776b363931776f32, 0x0000000830777031, 0x0020020200000001, + 0x0a0a000000020000, 0x00040d0d00000008, 0x0202000000030000, 0x0020090900000020, 0x000000200a0a0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, + 0x0080010100000001, 0x63690000001a0000, 0x3438377769323135, 0x37776f363532636f, 0x30777031776b3438, 0x0000000100000008, 0x0001000000100202, 0x000000080a0a0000, + 0x0020020200000003, 0x0000002009090000, 0x0002000000200a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000001800010000, 0x6932336700000017, + 0x636f363568693863, 0x33686b3635686f38, 0x0100000008316870, 0x0000030707000000, 0x080d0d0000000100, 0x0200000004000000, 0x0408080000000802, 0x00000c0909000000, + 0x02000000080a0a00, 
0x0300000005000000, 0x0100000002000000, 0x0700000007000000, 0x0000100001000000, 0x3563690000001d00, 0x3434343177693231, 0x776f34323031636f, + 0x7031776b34343431, 0x0001000000103077, 0x000000200a0a0000, 0x0002020200000002, 0x0000000805050000, 0x0040020200000003, 0x0000001009090000, 0x0003000000200a0a, + 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323163690000001a, 0x636f323135686938, 0x323135686f383231, 0x001031687033686b, + 0x0202000000030000, 0x0003070700000004, 0x0000000308080000, 0x00020a0a00000002, 0x000000020d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000200d0d, + 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x3032636900000014, 0x6f3035636f366869, 0x30687033686b3468, 0x0000000200000008, + 0x0808000000030707, 0x0000000000000003, 0x0020020200000003, 0x000000080a0a0000, 0x0000000000040d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, + 0x0000000200010000, 0x3436636900000018, 0x33636f3639686930, 0x686b3639686f3032, 0x0000000831687033, 0x0028020200000003, 0x0000000307070000, 0x0001000000030808, + 0x000000020d0d0000, 0x0010020200000004, 0x0000000209090000, 0x0d0d000000200a0a, 0x0000000000000010, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, + 0x002a000000200101, 0x3168693463690000, 0x3239317769303830, 0x3031686f31636f30, 0x30323931776f3038, 0x687033776b33686b, 0x0000000831777031, 0x0003070700000001, + 0x0d0d000000010000, 0x0000000400000002, 0x0808000000040202, 0x0020090900000008, 0x000000080a0a0000, 0x0000000400000002, 0x0000000100000003, 0x0000000700000007, + 0x0001000100000001, 0x6369000000180000, 0x6f34366869303639, 0x3436686f30323363, 0x000831687033686b, 0x0202000000030000, 0x000307070000003c, 0x0000000308080000, + 0x00080a0a00000002, 0x000000040d0d0000, 0x0010020200000003, 0x000000100a0a0000, 0xffff000000100d0d, 0x000300000005ffff, 0x0007000000010000, 0x0001000000070000, + 0x0000000100010000, 0x303263690000000b, 0x10383231636f3834, 
0x0200000001000000, 0x0000010000001002, 0x0300000008020200, 0x0000100202000000, 0x0d000000100a0a00, + 0x000000000000080d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1a00000001000100, 0x3231356369000000, 0x35636f3635326869, 0x6b363532686f3231, + 0x0000103168703368, 0x1002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040a0a000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000200a0a00, + 0x000003000000200d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3832316369000000, 0x31636f3631686930, 0x686b38686f303832, + 0x0831687032687333, 0x0200000003000000, 0x0307070000001402, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, 0x100a0a0000002002, 0x0000080d0d000000, + 0x0000050000000000, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001a000000, 0x3135686936353263, 0x686f383231636f32, 0x687033686b323135, + 0x0000030000001031, 0x0700000008020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, 0x200a0a0000002002, 0x0000200d0d000000, + 0x0000050000000300, 0x0000010000000100, 0x00000b0000000b00, 0x0000010000000100, 0x6900000013000008, 0x636f303168693263, 0x7033686b38686f38, 0x0001000000083068, + 0x0000000307070000, 0x0000000400000000, 0x0808000000020202, 0x00080a0a00000003, 0x000000080d0d0000, 0x0000000500000000, 0x0000000100000001, 0x0000000800000008, + 0x0800000100000001, 0x6369000000130000, 0x38636f3031686932, 0x687033686b38686f, 0x0000010000001030, 0x0000000003070700, 0x0200000004000000, 0x0308080000000202, + 0x0000100a0a000000, 0x00000000080d0d00, 0x0300000005000000, 0x0700000003000000, 0x0700000001000000, 0x0000400001000000, 0x3163690000001700, 0x636f363177693832, + 0x776b3531776f3436, 0x0000001030777032, 0x0000000100000000, 0x0004000000080202, 0x0000000802020000, 0x0a0a000000020808, 0x00100d0d00000010, 0x0004ffffffff0000, + 0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001600000020, 0x3531686930316369, 
0x3331686f3032636f, 0x000830687033686b, 0x0707000000020000, + 0x0002080800000003, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000040000, 0x0002080800000010, 0x0000002009090000, 0x0003000000080a0a, 0x0001000000050000, + 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x693163690000002c, 0x3030317769303868, 0x686f31636f303536, 0x36303031776f3038, 0x32776b31686b3035, + 0x3131777030687033, 0x0000000000000010, 0x0000000300000000, 0x0a0a000000170808, 0x00100d0d00000010, 0x0005000000000000, 0x0001000000010000, 0x0004000000040000, + 0x0001000000010000, 0x0000002c00000001, 0x6930386869316369, 0x6f30353630303177, 0x776f3038686f3163, 0x686b303536303031, 0x3068703332776b31, 0x0000001031317770, + 0x0000000100000000, 0x0003000000040d0d, 0x0000001708080000, 0x0d0d000000100a0a, 0x0000000000000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, + 0x0019000000010001, 0x3038323163690000, 0x3231636f30316869, 0x33686b38686f3038, 0x0300000008306870, 0x0000140202000000, 0x0800000003070700, 0x0000010000000308, + 0x0300000002020200, 0x0000200202000000, 0x0d000000100a0a00, 0x000000000000080d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1900000002000100, + 0x3034366369000000, 0x3231636f34326869, 0x686b3432686f3038, 0x0000000831687033, 0x0028020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000040a0a0000, + 0x0010020200000003, 0x000000200a0a0000, 0xffff000000180d0d, 0x000300000005ffff, 0x0001000000020000, 0x0007000000070000, 0x0000004000010000, 0x3532636900000019, + 0x32636f3039776936, 0x776b3935776f3635, 0x0000103077703233, 0x2008080000000200, 0x0000080a0a000000, 0x0802020000000200, 0x0000040505000000, 0x2002020000000300, + 0x00001e0505000000, 0xff000000200a0a00, 0x0300000005ffffff, 0x0700000001000000, 0x0100000007000000, 0x0000200001000000, 0x3663690000001700, 0x636f363532686934, + 0x686b363532686f33, 0x0000001034687039, 0x0002020200000003, 0x0000000907070000, 0x0001000000090808, 0x000000080d0d0000, 0x0020020200000003, 
0x0000002009090000, + 0x0003000000100a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x313563690000001a, 0x636f363532686932, 0x363532686f363532, + 0x001031687033686b, 0x0202000000030000, 0x0003070700000010, 0x0000000308080000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x000000200a0a0000, + 0x0003000000200d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x393163690000001d, 0x3432303177693032, 0x31776f303436636f, + 0x777031776b343230, 0x0000010000001030, 0x020000003c020200, 0x0000080a0a000000, 0x03000000040d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000200d, + 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1d00000001000100, 0x3239316369000000, 0x6f34323031776930, 0x3031776f30343663, 0x30777031776b3432, + 0x0000000100000010, 0x00020000003c0202, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000300000020, 0x0000000300000004, + 0x0000000700000001, 0x0000000100000007, 0x001a000000020001, 0x3036353263690000, 0x3231636f34326869, 0x686b3432686f3038, 0x0000000831687033, 0x00a0020200000003, + 0x0000000307070000, 0x0001000000030808, 0x000000040a0a0000, 0x0010020200000003, 0x000000200a0a0000, 0xffff000000180d0d, 0x000300000005ffff, 0x0007000000010000, + 0x0001000000070000, 0x0000004000010000, 0x323163690000001b, 0x636f323532776938, 0x373332776f363532, 0x103077703631776b, 0x0200000002000000, 0x1008080000000402, + 0x0a00000002000000, 0x080d0d000000040a, 0x0200000003000000, 0x400a0a0000002002, 0x0000200d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, + 0x0200010000000100, 0x6900000019000000, 0x3468693038323163, 0x686f303436636f38, 0x31687033686b3834, 0x0000000300000008, 0x0707000000500202, 0x0003080800000003, + 0x0d0d000000010000, 0x0000000400000004, 0x0909000000100202, 0x00200a0a00000002, 0x0000000c0d0d0000, 0x00000005ffffffff, 0x0000000100000001, 0x0000000700000007, + 0x0001000100000001, 
0x6369000000140000, 0x32636f3635686932, 0x7033686b3635686f, 0x0001000000103168, 0x0000000307070000, 0x00080d0d00000001, 0x0202000000040000, + 0x0003080800000002, 0x000000100a0a0000, 0x0000000000070d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x3231636900000018, + 0x31636f3868693038, 0x686b38686f303832, 0x0000001031687033, 0x0028020200000003, 0x0000000307070000, 0x0000000000030808, 0x0202000000040000, 0x0002090900000020, + 0x000000200a0a0000, 0x0003000000080d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3231636900000018, 0x31636f3868693038, + 0x686b38686f303832, 0x0000001031687033, 0x0028020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000020a0a0000, 0x0020020200000003, 0x000000100a0a0000, + 0x0003000000080d0d, 0x0003000000050000, 0x0001000000020000, 0x0007000000070000, 0x0000004000010000, 0x323163690000001b, 0x636f323532776938, 0x373332776f363532, + 0x103077703631776b, 0x0800000002000000, 0x080a0a0000001008, 0x0200000002000000, 0x0805050000000402, 0x0200000003000000, 0x2005050000002002, 0x0000200a0a000000, + 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, 0x8001010000000100, 0x690000001b000000, 0x3133776936353263, 0x776f3436636f3633, 0x7031776b36333133, + 0x0001000000083077, 0x0000000802020000, 0x00080a0a00000002, 0x000000020d0d0000, 0x0020020200000003, 0x0000001009090000, 0x0003000000080a0a, 0x0003000000040000, + 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x303163690000001c, 0x636f343168693432, 0x6b37686f38343032, 0x3068703268733168, 0x0000000100000008, + 0x0002000000200202, 0x0000000409090000, 0x0003000000080a0a, 0x0000002002020000, 0x0a0a000000200909, 0x0000000300000020, 0x0000000300000005, 0x0000000700000001, + 0x0000000100000007, 0x0019000000400001, 0x6936353263690000, 0x363532636f303977, 0x3233776b3935776f, 0x0200000010307770, 0x0000080202000000, 0x0100000020080800, + 0x0000040d0d000000, 0x2002020000000300, 0x0000200a0a000000, 
0xff000000100d0d00, 0x0300000004ffffff, 0x0700000001000000, 0x0100000007000000, 0x0000020001000000, + 0x3363690000001600, 0x636f363968693032, 0x33686b3639686f34, 0x0300000008316870, 0x00000a0202000000, 0x0800000003070700, 0x0000010000000308, 0x03000000080d0d00, + 0x0000200202000000, 0x0d000000080a0a00, 0x000000000000100d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1a00000020010100, 0x6869336369000000, + 0x6f33636f34323031, 0x7337686b32313568, 0x0000103368703268, 0x0707070000000200, 0x0000020808000000, 0x040d0d0000000100, 0x0200000004000000, 0x0408080000000402, + 0x0000200909000000, 0x03000000100a0a00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, 0x6f32313568693832, + 0x6b323135686f3363, 0x0000103168703368, 0x0402020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000040d0d000000, 0x2002020000000300, 0x0000100a0a000000, + 0x03000000200d0d00, 0x0300000005000000, 0x0100000002000000, 0x0700000007000000, 0x0000400001000000, 0x3163690000001700, 0x636f363177693832, 0x776b3531776f3436, + 0x0000001030777032, 0x0002080800000002, 0x000000020a0a0000, 0x0002020200000001, 0x0202000000030000, 0x0010050500000010, 0x000000200a0a0000, 0x00000005ffffffff, + 0x0000000200000003, 0x0000000700000001, 0x0010000100000007, 0x63690000001b0000, 0x6f35376869363532, 0x3833686f32313563, 0x687032687331686b, 0x0000010000001030, + 0x00000000100a0a00, 0x0200000003000000, 0x1009090000004002, 0x0000200a0a000000, 0x0000050000000300, 0x0000010000000300, 0x0000040000000400, 0x2000010000000100, + 0x6700000019000000, 0x6869363163693233, 0x686f3631636f3431, 0x31687033686b3431, 0x0000000200000010, 0x0808000000030707, 0x0000000100000003, 0x0003000000040d0d, + 0x0000002002020000, 0x0a0a000000200909, 0x0000000300000010, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001a000000010001, 0x3268693363690000, + 0x686f3436636f3432, 0x687337686b323131, 0x0000000833687032, 0x0007070700000001, 0x0a0a000000020000, 
0x00040d0d00000002, 0x0202000000040000, 0x0008080800000004, + 0x000000100a0a0000, 0x0001000000200d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x693363690000001a, 0x3436636f34323268, + 0x37686b323131686f, 0x0008336870326873, 0x0707000000020000, 0x0002080800000007, 0x0a0a000000020000, 0x00040d0d00000002, 0x0202000000040000, 0x0004080800000004, + 0x000000100a0a0000, 0x0001000000200d0d, 0x0003000000050000, 0x0007000000030000, 0x0007000000010000, 0x0000004000010000, 0x3231636900000018, 0x31636f3831776938, + 0x776b3531776f3832, 0x0000001030777034, 0x0040090900000001, 0x0202000000020000, 0x00020a0a00000002, 0x0202000000030000, 0x00100a0a00000010, 0x000000100d0d0000, + 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001a0000, 0x3834686930383231, 0x686f30383231636f, 0x31687033686b3834, + 0x0000000300000008, 0x0707000000500202, 0x0003080800000003, 0x0a0a000000020000, 0x00020d0d00000008, 0x0202000000030000, 0x00200a0a00000010, 0x000000180d0d0000, + 0x00000005ffffffff, 0x0000000200000003, 0x0000000700000001, 0x0040000100000007, 0x6369000000180000, 0x6f38317769383231, 0x3531776f38323163, 0x001030777034776b, + 0x0808000000020000, 0x00040a0a00000004, 0x0505000000010000, 0x0000000300000004, 0x0505000000200202, 0x00200a0a00000008, 0x0005ffffffff0000, 0x0003000000030000, + 0x0001000000070000, 0x0001000000070000, 0x0000001800000040, 0x3177693635326369, 0x776f383231636f38, 0x30777034776b3531, 0x0000000100000010, 0x0002000000080909, + 0x0000000202020000, 0x0003000000080a0a, 0x0000002002020000, 0x0d0d000000100a0a, 0xffffffff00000010, 0x0000000300000005, 0x0000000400000001, 0x0000000100000004, + 0x0015000000010001, 0x6938323163690000, 0x686f3233636f3668, 0x1031687033686b36, 0x0700000002000000, 0x0308080000000307, 0x0200000001000000, 0x0000030000000402, + 0x0a00000020020200, 0x080d0d000000100a, 0x0500000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, 
0x6869383231636900, + 0x6f383231636f3835, 0x687033686b363568, 0x0000030000001030, 0x0700000008020200, 0x0308080000000307, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, + 0x100a0a0000001002, 0x00000e0d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, 0x6900000018000000, 0x3835686938323163, + 0x35686f383231636f, 0x1030687033686b36, 0x0200000003000000, 0x0307070000000402, 0x0000030808000000, 0x020a0a0000000200, 0x0000080d0d000000, 0x2002020000000300, + 0x0000200a0a000000, 0x03000000070d0d00, 0x0100000005000000, 0x0b00000001000000, 0x010000000b000000, 0x0008000001000000, 0x3863690000001400, 0x3031636f30316869, + 0x687033686b38686f, 0x0000010000001030, 0x0000000003070700, 0x0200000004000000, 0x0308080000000802, 0x0000100a0a000000, 0x00000000080d0d00, 0x0100000005000000, + 0x0800000001000000, 0x0100000008000000, 0x0008000001000000, 0x3863690000001400, 0x3031636f30316869, 0x687033686b38686f, 0x0000010000001030, 0x0000000003070700, + 0x0200000004000000, 0x0308080000000802, 0x0000100a0a000000, 0x00000000080d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000400001000000, + 0x3663690000001c00, 0x6f32323031776934, 0x3031776f38323163, 0x77703631776b3730, 0x0000020000001030, 0x0800000002020200, 0x0000020000001008, 0x0d000000020a0a00, + 0x000003000000020d, 0x0a00000020020200, 0x200d0d000000400a, 0x04ffffffff000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000000100, + 0x7769323135636900, 0x6f383231636f3233, 0x777031776b323377, 0x0000010000000830, 0x0100000002020200, 0x0000080202000000, 0x2002020000000300, 0x0000080a0a000000, + 0x00000000080d0d00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000010001000000, 0x3563690000001800, 0x636f323377693231, 0x6b3233776f383231, + 0x0000083077703177, 0x0000010000000000, 0x0300000008020200, 0x0000400202000000, 0x0d000000080a0a00, 0x000000000000080d, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 
0x1a00000020010100, 0x7769316369000000, 0x3436636f32393138, 0x776b37373138776f, 0x0000103077703631, 0x0208080000000100, 0x0400000000000000, + 0x0000020202000000, 0x0a00000008080800, 0x0d0d0d000000400a, 0x04ffffffff000000, 0x0100000003000000, 0x0400000004000000, 0x0100000001000000, 0x00001a0000000100, + 0x3033686933636900, 0x31686f3233636f30, 0x32687333686b3035, 0x0100000008316870, 0x0000030707000000, 0x020a0a0000000200, 0x0000020d0d000000, 0x0402020000000400, + 0x0000080808000000, 0x0d000000100a0a00, 0x000003000000190d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x0a00000001000100, 0x3230316369000000, + 0x0000103436636f34, 0x0802020000000100, 0x0200000001000000, 0x0000030000000802, 0x0a00000010020200, 0x080d0d000000100a, 0x0500000000000000, 0x0100000001000000, + 0x0700000007000000, 0x0100000001000000, 0x0000150000000100, 0x6931636934366700, 0x38686f31636f3868, 0x001031687033686b, 0x0707000000010000, 0x0000000000000003, + 0x0010010100000003, 0x0000000308080000, 0x0000000000080d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x323363690000001b, + 0x33636f3436686930, 0x686b3233686f3032, 0x1031687032687333, 0x0200000003000000, 0x0307070000000a02, 0x0000030808000000, 0x040a0a0000000200, 0x0000020d0d000000, + 0x2002020000000300, 0x0000200a0a000000, 0x03000000100d0d00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3363690000001b00, + 0x636f343668693032, 0x6b3233686f303233, 0x3168703268733368, 0x0000000300000010, 0x07070000000a0202, 0x0003080800000003, 0x0a0a000000020000, 0x00020d0d00000004, + 0x0202000000030000, 0x00100a0a00000020, 0x000000100d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001c0000, + 0x3930347769303233, 0x776f303233636f36, 0x7031776b36393034, 0x0001000000103077, 0x0000000a02020000, 0x00020a0a00000002, 0x000000080d0d0000, 0x0020020200000003, + 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, 
0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x323363690000001c, 0x6f36393034776930, + 0x3034776f30323363, 0x30777031776b3639, 0x0000000100000010, 0x00020000000a0202, 0x000000020a0a0000, 0x0003000000080d0d, 0x0000002002020000, 0x0d0d000000400a0a, + 0x0000000300000020, 0x0000000300000005, 0x0000000700000003, 0x0000000700000001, 0x001a000000200001, 0x3877693163690000, 0x6f3436636f323931, 0x31776b3737313877, + 0x0000001030777036, 0x0002090900000002, 0x000000110d0d0000, 0x00020a0a00000001, 0x0202000000040000, 0x0010080800000002, 0x0000001009090000, 0xffff000000100a0a, + 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x6934636900000027, 0x3436776930363368, 0x33686f3633636f30, 0x6b303436776f3036, + 0x31687033776b3368, 0x0100000008317770, 0x0000030707000000, 0x020d0d0000000100, 0x0200000004000000, 0x0808080000000402, 0x0000100a0a000000, 0xff000000200d0d00, + 0x0300000005ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000800001000000, 0x3163690000001a00, 0x6f34383777693832, 0x3837776f32313563, 0x1030777031776b34, + 0x0200000001000000, 0x0000010000000202, 0x03000000040a0a00, 0x0000400202000000, 0x0d000000400a0a00, 0x0000030000001c0d, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 0x1e00000001000100, 0x3231356369000000, 0x6f36333535367769, 0x3536776f36353263, 0x777031776b363335, 0x0000010000001030, 0x0200000010020200, + 0x0000040a0a000000, 0x03000000020d0d00, 0x0000200202000000, 0x0d000000400a0a00, 0x000003000000200d, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, + 0x1c00000001000100, 0x3230316369000000, 0x636f363931776934, 0x3931776f38343032, 0x1030777031776b36, 0x0200000001000000, 0x0000020000004002, 0x0d000000040a0a00, + 0x000003000000040d, 0x0a00000010020200, 0x070d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x0000180000004000, + 0x7769363532636900, 0x6f383231636f3831, 0x777034776b353177, 0x0000020000001030, 0x0800000008020200, 
0x0000010000000408, 0x03000000080d0d00, 0x0000200202000000, + 0x0a00000008090900, 0xffffff000000200a, 0x00000300000004ff, 0x0000070000000100, 0x0000010000000700, 0x1c00000001000100, 0x3036396369000000, 0x636f343230317769, + 0x323031776f303436, 0x0830777031776b34, 0x0200000001000000, 0x0000020000003c02, 0x0d000000080a0a00, 0x000003000000080d, 0x0a00000010020200, 0x200d0d000000100a, + 0x0400000000000000, 0x0100000001000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000100, 0x3163693231356700, 0x6f31636f32337769, 0x776433776b383477, + 0x0000083631777037, 0x0000010000000000, 0x03000000080d0d00, 0x0000080101000000, 0x0d00000003080800, 0x000000000000080d, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 0x1900000001000100, 0x3635326369000000, 0x33636f3938327769, 0x776b393832776f32, 0x0000001030777031, 0x0002020200000001, 0x0202000000010000, + 0x0000000300000008, 0x0a0a000000100202, 0x00080d0d00000010, 0x0004000000000000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001800000001, + 0x3668693436346369, 0x686f303631636f34, 0x31687033686b3436, 0x0000000300000008, 0x07070000000f0202, 0x0003080800000003, 0x0a0a000000020000, 0x00020d0d00000004, + 0x0202000000030000, 0x00100a0a00000020, 0x000000200d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0020010100000001, 0x6369000000190000, + 0x6f33316869323135, 0x31686f3432303163, 0x1031687033686b33, 0x0200000003000000, 0x0307070000002002, 0x0000030808000000, 0x020a0a0000000200, 0x0000080d0d000000, + 0x1002020000000300, 0x0000200909000000, 0x03000000200a0a00, 0x0300000005000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3263690000001a00, + 0x6f36353268693635, 0x3532686f36353263, 0x1031687033686b36, 0x0200000003000000, 0x0307070000000802, 0x0000030808000000, 0x020a0a0000000200, 0x0000040d0d000000, + 0x2002020000000300, 0x0000400a0a000000, 0x03000000200d0d00, 0x0300000004000000, 0x0400000001000000, 0x0100000004000000, 0x0000800101000000, 
0x3163690000001a00, + 0x6f34383777693832, 0x3837776f32313563, 0x0830777031776b34, 0x0200000001000000, 0x0000020000000402, 0x0d000000080a0a00, 0x000003000000040d, 0x0900000020020200, + 0x200a0a0000002009, 0x0400000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001d0000000200, 0x6930383231636900, 0x383231636f343268, + 0x33686b3231686f30, 0x0008316870326873, 0x0202000000030000, 0x0003070700000028, 0x0000000308080000, 0x0002020200000002, 0x000000020d0d0000, 0x0010020200000004, + 0x0000000209090000, 0x0d0d000000200a0a, 0xffffffff00000006, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0017000000400001, 0x6938323163690000, + 0x6f3436636f363177, 0x777032776b353177, 0x0000020000001030, 0x0800000004020200, 0x0000010000000208, 0x0300000004090900, 0x0000200202000000, 0x0a00000010090900, + 0xffffff000000200a, 0x00000300000005ff, 0x0000010000000200, 0x0000070000000700, 0x3200000010010100, 0x6932336369000000, 0x6933336869373164, 0x6f3233636f333377, + 0x6f3333686f373164, 0x686b33646b333377, 0x7031647033776b33, 0x0000103177703168, 0x0306060000000400, 0x0000030707000000, 0x0a00000003080800, 0x000001000000020a, + 0x0300000002050500, 0x0000200202000000, 0x0a00000010090900, 0x000003000000100a, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1a00000002000100, + 0x3635326369000000, 0x31636f3631686930, 0x6b3631686f303832, 0x0000083168703368, 0xa002020000000300, 0x0000030707000000, 0x0100000003080800, 0x0000040a0a000000, + 0x1002020000000300, 0x0000200a0a000000, 0x02000000100d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3263690000001a00, + 0x6f36316869303635, 0x31686f3038323163, 0x0831687033686b36, 0x0200000003000000, 0x0307070000005002, 0x0000030808000000, 0x0202020000000100, 0x0200000003000000, + 0x200a0a0000001002, 0x0000100d0d000000, 0x0000050000000000, 0x0000020000000300, 0x0000070000000100, 0x0800010000000700, 0x690000001c000000, 0x3932776932313563, + 0x776f323135636f39, 
0x34777338776b3337, 0x0200000010307770, 0x0000080808000000, 0x02000000200a0a00, 0x0000080202000000, 0x0300000004050500, 0x0000400202000000, + 0x0a00000010050500, 0x000003000000100a, 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1c00000002000100, 0x3036396369000000, 0x636f343230317769, + 0x323031776f303436, 0x1030777031776b34, 0x0200000001000000, 0x0000020000001e02, 0x0d000000040a0a00, 0x000003000000080d, 0x0a00000020020200, 0x200d0d000000200a, + 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000100, 0x7769303639636900, 0x3436636f34323031, 0x6b34323031776f30, + 0x0000103077703177, 0x1e02020000000100, 0x0a00000002000000, 0x040d0d000000040a, 0x0200000003000000, 0x200a0a0000002002, 0x0000200d0d000000, 0x0000050000000300, + 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, 0x3832316469363163, 0x3231646f3631636f, 0x1031647033646b38, 0x0600000003000000, + 0x0307070000000306, 0x0000030808000000, 0x040d0d0000000100, 0x0200000003000000, 0x100a0a0000001002, 0x0000200d0d000000, 0x0000040000000300, 0x0000010000000300, + 0x0000040000000400, 0x0100010000000100, 0x690000002a000000, 0x3038303168693463, 0x636f303239317769, 0x6f30383031686f31, 0x33686b3032393177, 0x777031687033776b, + 0x0000010000000831, 0x0100000003070700, 0x0000040d0d000000, 0x0402020000000400, 0x0000080808000000, 0x0d000000080a0a00, 0xffffff000000200d, 0x00000300000005ff, + 0x0000040000000100, 0x0000010000000400, 0x1800000001000100, 0x3832316369000000, 0x3231636f36316869, 0x33686b3631686f38, 0x0200000010316870, 0x0000030707000000, + 0x0100000003080800, 0x0000040202000000, 0x2002020000000300, 0x0000100a0a000000, 0x00000000080d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, + 0x0000020001000000, 0x3263690000001c00, 0x3637357769303635, 0x776f30383231636f, 0x777031776b363735, 0x0000010000000830, 0x02000000a0020200, 0x0000080a0a000000, + 0x03000000080d0d00, 0x0000100202000000, 0x0d000000200a0a00, 
0x000000000000180d, 0x0000010000000400, 0x0000070000000100, 0x0000010000000700, 0x1d00000001000100, + 0x6932313567000000, 0x636f323377693163, 0x33776b3436776f31, 0x3233777035317764, 0x0000000000000008, 0x00080d0d00000001, 0x0101000000030000, 0x0003080800000008, + 0x000000080d0d0000, 0x0000000500000000, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001c0000, 0x3532776930323931, 0x6f30383231636f36, + 0x7031776b36353277, 0x0001000000103077, 0x0000003c02020000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x000000200a0a0000, 0x0003000000100d0d, + 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x393163690000001c, 0x6f36353277693032, 0x32776f3038323163, 0x30777031776b3635, + 0x0000000100000010, 0x00020000003c0202, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000300000010, 0x0000000300000005, + 0x0000000100000002, 0x0000000a0000000a, 0x0015000000200001, 0x3264693163690000, 0x32646f3233636f35, 0x1032647035646b35, 0x0600000004000000, 0x0507070000000506, + 0x0000050808000000, 0x01000000020a0a00, 0x0000080505000000, 0x1002020000000300, 0x0000200909000000, 0x02000000100a0a00, 0x0300000004000000, 0x0400000001000000, + 0x0100000004000000, 0x0000800101000000, 0x3663690000001a00, 0x6f36333133776934, 0x333133776f343663, 0x0830777031776b36, 0x0200000001000000, 0x0000000000000202, + 0x2002020000000300, 0x0000100909000000, 0x00000000400a0a00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, 0x0000010001000000, 0x3163690000001800, + 0x636f323377693832, 0x6b3233776f323135, 0x0000083077703177, 0x0000010000000000, 0x0300000008020200, 0x0000100202000000, 0x0d000000100a0a00, 0x000000000000100d, + 0x0000030000000400, 0x0000040000000100, 0x0000010000000400, 0x1800000001000100, 0x3832316369000000, 0x3135636f32337769, 0x31776b3233776f32, 0x0000000008307770, + 0x0200000001000000, 0x0000030000000202, 0x0a00000040020200, 0x080d0d000000080a, 0x0400000000000000, 
0x0100000003000000, 0x0400000004000000, 0x0100000001000000, + 0x0000260000000100, 0x3633686931636900, 0x636f303436776930, 0x776f303633686f34, 0x776b33686b303436, 0x0831777031687033, 0x0700000001000000, 0x0000010000000307, + 0x04000000040d0d00, 0x0000040202000000, 0x0a00000008080800, 0x200d0d000000080a, 0x04ffffffff000000, 0x0100000001000000, 0x0700000007000000, 0x0100000001000000, + 0x00001b0000000100, 0x3163693231356700, 0x6f31636f32337769, 0x776433776b323377, 0x0000000834777033, 0x0000000100000000, 0x0003000000040d0d, 0x0000000801010000, + 0x0d0d000000030808, 0xffffffff00000008, 0x0000000300000005, 0x0000000100000002, 0x0000000700000007, 0x001d000000100001, 0x6936353263690000, 0x31636f3434343177, + 0x343431776f343230, 0x1030777031776b34, 0x0a00000001000000, 0x000002000000200a, 0x0500000002020200, 0x0000030000000405, 0x0900000040020200, 0x200a0a0000001009, + 0x0500000003000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x00001b0000004000, 0x7769383231636900, 0x363532636f323532, 0x31776b373332776f, + 0x0000001030777036, 0x0004090900000002, 0x0000000f0d0d0000, 0x0004020200000002, 0x000000040a0a0000, 0x0020020200000003, 0x000000400a0a0000, 0xffff000000100d0d, + 0x000300000004ffff, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x353263690000001a, 0x636f323168693036, 0x3231686f30383231, 0x000831687033686b, + 0x0202000000030000, 0x00030707000000a0, 0x0000000308080000, 0x00020d0d00000001, 0x0202000000040000, 0x0002090900000010, 0x000000100a0a0000, 0xffff000000060d0d, + 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x3231636900000017, 0x36636f3233686938, 0x33686b3233686f34, 0x0200000008316870, + 0x0000030707000000, 0x0100000003080800, 0x0000040202000000, 0x2002020000000300, 0x0000100a0a000000, 0x00000000080d0d00, 0x0100000005000000, 0x0700000001000000, + 0x0100000007000000, 0x0000010001000000, 0x3436670000001700, 0x6f36316869316369, 0x686b3631686f3163, 0x0000001031687033, 0x0003070700000001, 
0x0d0d000000010000, + 0x0000000300000002, 0x0808000000100101, 0x00080d0d00000003, 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000001, + 0x3477693034366369, 0x303233636f363930, 0x776b36393034776f, 0x0000000830777031, 0x0028020200000001, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000030000, + 0x00200a0a00000010, 0x000000200d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x6369000000180000, 0x6f36396869303436, + 0x3639686f30343663, 0x000831687033686b, 0x0202000000030000, 0x0003070700000028, 0x0000000308080000, 0x00040a0a00000002, 0x000000040d0d0000, 0x0010020200000003, + 0x000000200a0a0000, 0xffff000000180d0d, 0x000300000005ffff, 0x0001000000020000, 0x0007000000070000, 0x0000004000010000, 0x3532636900000018, 0x31636f3831776936, + 0x776b3531776f3832, 0x0000001030777034, 0x0004080800000002, 0x000000040a0a0000, 0x0002020200000002, 0x0000000805050000, 0x0020020200000003, 0x0000001009090000, + 0xffff000000200a0a, 0x000300000004ffff, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x3031636900000016, 0x3032636f35316869, 0x7033686b3331686f, + 0x0002000000083068, 0x0000000307070000, 0x0001000000020808, 0x000000020d0d0000, 0x0010020200000004, 0x0000000208080000, 0x0d0d000000080a0a, 0x0000000300000008, + 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x3036353263690000, 0x383231636f386869, 0x7033686b38686f30, 0x0003000000083168, + 0x0000002802020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000020202, 0x0000002002020000, 0x0d0d000000100a0a, 0x0000000000000008, 0x0000000300000005, + 0x0000000700000001, 0x0000000100000007, 0x001c000000020001, 0x3038323163690000, 0x3231636f36316869, 0x33686b38686f3038, 0x0010316870326873, 0x0202000000030000, + 0x0003070700000028, 0x0000000308080000, 0x00020a0a00000002, 0x000000080d0d0000, 0x0020020200000003, 0x0000000209090000, 0x0003000000400a0a, 0x0003000000050000, + 0x0007000000010000, 
0x0001000000070000, 0x0000000100010000, 0x323163690000001c, 0x636f363168693038, 0x6b38686f30383231, 0x3168703268733368, 0x0000000300000010, + 0x0707000000280202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00080d0d00000010, 0x0004000000030000, 0x0001000000030000, + 0x0007000000070000, 0x0001000000010000, 0x0000001b00000002, 0x3468693034366369, 0x686f303436636f38, 0x32687333686b3432, 0x0300000008316870, 0x0000280202000000, + 0x0800000003070700, 0x0000010000000308, 0x03000000080a0a00, 0x0000100202000000, 0x0d000000100a0a00, 0xffffff000000180d, 0x00000300000004ff, 0x0000040000000100, + 0x0000010000000400, 0x1600000080010100, 0x3231356369000000, 0x323135636f376869, 0x687033686b37686f, 0x0000030000000831, 0x0700000010020200, 0x0308080000000307, + 0x0900000002000000, 0x040a0a0000000409, 0x0200000003000000, 0x2009090000002002, 0x0000200a0a000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, + 0x0100010000000100, 0x690000001d000000, 0x6f34323268693363, 0x31686f3432303163, 0x3168733431686b36, 0x0000001030687034, 0x000e070700000002, 0x0000000208080000, + 0x00080a0a00000002, 0x000000020d0d0000, 0x0004020200000004, 0x0000000808080000, 0x0d0d000000100a0a, 0x0000000000000008, 0x0000000300000004, 0x0000000700000001, + 0x0000000100000007, 0x001c000000080001, 0x3268693363690000, 0x6f383637636f3432, 0x733631686b343168, 0x0008306870363168, 0x0707000000020000, 0x0004080800000010, + 0x0a0a000000010000, 0x0000000400000002, 0x0808000000040202, 0x00400a0a00000004, 0x0000000e0d0d0000, 0x0000000500000000, 0x0000000100000003, 0x0000000700000007, + 0x0001000100000001, 0x63690000001a0000, 0x3832316869323135, 0x31686f323135636f, 0x31687033686b3832, 0x0000000300000010, 0x0707000000100202, 0x0003080800000003, + 0x0a0a000000020000, 0x00080d0d00000004, 0x0202000000030000, 0x00400a0a00000020, 0x000000100d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, + 0x0002000100000001, 0x63690000001d0000, 0x3332776930383231, 
0x6f303436636f3430, 0x31776b3430333277, 0x0100000008307770, 0x0000500202000000, 0x040a0a0000000200, + 0x0000080d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0xff000000180d0d00, 0x0100000005ffffff, 0x0b00000003000000, 0x0b00000001000000, 0x0008000001000000, + 0x3863690000001300, 0x6f38636f30316869, 0x30687033686b3868, 0x0000000200000008, 0x0d0d000000080c0c, 0x0000000000000002, 0x0008020200000003, 0x0000000809090000, + 0x0000000000080a0a, 0x0001000000050000, 0x0008000000030000, 0x0008000000010000, 0x0000080000010000, 0x6938636900000013, 0x686f38636f303168, 0x1030687033686b38, + 0x0c00000002000000, 0x040d0d000000080c, 0x0300000000000000, 0x0000080202000000, 0x0a00000010090900, 0x000000000000100a, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 0x1800000002000100, 0x3036396369000000, 0x3233636f34366869, 0x33686b3436686f30, 0x0300000010316870, 0x00001e0202000000, 0x0800000003070700, + 0x0000010000000308, 0x04000000080d0d00, 0x0000200202000000, 0x0a00000002090900, 0x080d0d000000200a, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, + 0x0100000001000000, 0x0000180000000100, 0x6869303639636900, 0x6f303233636f3436, 0x687033686b343668, 0x0000030000001031, 0x070000001e020200, 0x0308080000000307, + 0x0a00000002000000, 0x040d0d000000020a, 0x0200000003000000, 0x400a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, + 0x0200010000000100, 0x6900000019000000, 0x3368693038323163, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000010, 0x0707000000280202, 0x0003080800000003, + 0x0d0d000000010000, 0x0000000400000002, 0x0909000000200202, 0x00400a0a00000002, 0x000000100d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, + 0x0001000100000001, 0x6369000000190000, 0x3233686930383231, 0x33686f303436636f, 0x1031687033686b32, 0x0200000003000000, 0x0307070000002802, 0x0000030808000000, + 0x020a0a0000000200, 0x0000040d0d000000, 0x2002020000000300, 0x0000400a0a000000, 0x03000000080d0d00, 
0x0300000005000000, 0x0700000001000000, 0x0100000007000000, + 0x0000400001000000, 0x3163690000001800, 0x636f383177693832, 0x6b3531776f383231, 0x0000103077703477, 0x0402020000000200, 0x0000040808000000, 0x080d0d0000000100, + 0x0200000003000000, 0x0809090000002002, 0x0000200a0a000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001d000000, + 0x3177693038323163, 0x303436636f343230, 0x776b34323031776f, 0x0000000830777031, 0x0050020200000001, 0x0a0a000000020000, 0x00080d0d00000004, 0x0202000000030000, + 0x00200a0a00000010, 0x000000200d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, 0x6369000000170000, 0x636f323332686933, + 0x6b343232686f3233, 0x0000083068703968, 0x0907070000000200, 0x0000020808000000, 0x020a0a0000000200, 0x0000040d0d000000, 0x0402020000000400, 0x0000080808000000, + 0x0d000000100a0a00, 0x0000010000001c0d, 0x0000030000000500, 0x0000010000000200, 0x0000070000000700, 0x1900000010000100, 0x3230316369000000, 0x33636f3833686934, + 0x686b3833686f3432, 0x0000001031687033, 0x0003070700000003, 0x0000000308080000, 0x00020000000b0a0a, 0x0000000402020000, 0x0003000000080505, 0x0000004002020000, + 0x0a0a000000100909, 0x0000000300000020, 0x0000000100000004, 0x0000000700000001, 0x0000000100000007, 0x0019000000800001, 0x3163693034670000, 0x31636f3035316869, + 0x33686b303531686f, 0x0100000008316870, 0x0000030707000000, 0x080d0d0000000100, 0x0100000003000000, 0x0308080000001001, 0x0000080909000000, 0x0000040000000100, + 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000019000000, 0x3368693032393163, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000008, + 0x0707000000780202, 0x0003080800000003, 0x0a0a000000020000, 0x00020d0d00000008, 0x0202000000030000, 0x00200a0a00000010, 0x000000100d0d0000, 0x00000004ffffffff, + 0x0000000100000003, 0x0000000700000007, 0x0006000100000001, 0x6369000000260000, 0x6938326869323135, 0x323135636f313277, 0x3132776f3832686f, 
0x687033776b33686b, + 0x0000000831777031, 0x0008020200000003, 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, 0x0020020200000003, 0x000000100a0a0000, 0x0000000000150d0d, + 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x3436636900000018, 0x33636f3436686930, 0x686b3436686f3032, 0x0000001031687033, + 0x0014020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000080d0d0000, 0x0020020200000004, 0x0000000209090000, 0x0d0d000000200a0a, 0x0000000300000008, + 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0018000000010001, 0x6930343663690000, 0x303233636f343668, 0x7033686b3436686f, 0x0003000000103168, + 0x0000001402020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000004, 0x0a0a000000200202, 0x00100d0d00000040, 0x0005000000030000, + 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000002, 0x6869303635326369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000103168, + 0x0000000a02020000, 0x0808000000030707, 0x0000000100000003, 0x0004000000080202, 0x0000002002020000, 0x0a0a000000020909, 0x00100d0d00000020, 0x0005000000000000, + 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000001, 0x6869303635326369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000103168, + 0x0000000a02020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000080202, 0x0000002002020000, 0x0d0d000000200a0a, 0x0000000000000010, 0x0000000300000005, + 0x0000000700000003, 0x0000000700000001, 0x001b000000100001, 0x6938323163690000, 0x383231636f353768, 0x7333686b3833686f, 0x0000103168703268, 0x130c0c0000000100, + 0x0200000002000000, 0x020a0a0000000402, 0x0200000003000000, 0x1009090000002002, 0x0000400a0a000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, + 0x0100010000000100, 0x690000001c000000, 0x3034776930323363, 0x6f303233636f3639, 0x31776b3639303477, 0x0100000008307770, 0x0000140202000000, 0x040a0a0000000200, + 0x0000040d0d000000, 
0x1002020000000300, 0x0000200a0a000000, 0xff000000200d0d00, 0x0300000005ffffff, 0x0700000003000000, 0x0700000001000000, 0x0000180001000000, + 0x3233670000001700, 0x6f36356869386369, 0x686b3635686f3863, 0x0000001031687033, 0x00020c0c00000002, 0x0000000e0d0d0000, 0x0000000300000000, 0x0909000000080202, + 0x00100a0a00000020, 0x0004000000000000, 0x0001000000010000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x6931636932313567, 0x776f31636f323377, + 0x31777033776b3233, 0x0000000000000008, 0x0000000300000000, 0x0808000000080101, 0x00080d0d00000003, 0x0005000000000000, 0x0001000000030000, 0x0007000000070000, + 0x0001000000010000, 0x0000000a00000001, 0x636f323531316369, 0x0001000000103834, 0x0000000902020000, 0x0008020200000001, 0x0202000000030000, 0x00100a0a00000010, + 0x000000080d0d0000, 0x0000000400000000, 0x0000000300000003, 0x0000000100000007, 0x0008000100000007, 0x63690000001b0000, 0x6f33377769323135, 0x3533776f32313563, + 0x777032777334776b, 0x0000010000000830, 0x0200000008090900, 0x0000080202000000, 0x03000000080a0a00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000100d, + 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1800000001000100, 0x3832316369000000, 0x3231636f32336869, 0x33686b3233686f38, 0x0200000008316870, + 0x0000030707000000, 0x0100000003080800, 0x0000080202000000, 0x1002020000000300, 0x0000200a0a000000, 0x00000000100d0d00, 0x0300000004000000, 0x0700000001000000, + 0x0100000007000000, 0x0000020001000000, 0x3663690000001b00, 0x6f36373577693034, 0x35776f3038323163, 0x30777031776b3637, 0x0000000100000008, 0x0002000000280202, + 0x000000080a0a0000, 0x0003000000080d0d, 0x0000001002020000, 0x0d0d000000200a0a, 0x0000000000000018, 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, + 0x0018000000020001, 0x6930363963690000, 0x303436636f323368, 0x7033686b3233686f, 0x0003000000103168, 0x0000001e02020000, 0x0808000000030707, 0x0000000100000003, + 0x0004000000020d0d, 0x0000002002020000, 0x0a0a000000020909, 
0x00100d0d00000040, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, + 0x0000001800000001, 0x3368693036396369, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000010, 0x07070000001e0202, 0x0003080800000003, 0x0a0a000000020000, + 0x00040d0d00000002, 0x0202000000030000, 0x00200a0a00000020, 0x000000080d0d0000, 0x0000000400000003, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, + 0x63690000001a0000, 0x3631686930383231, 0x686f30383231636f, 0x31687033686b3631, 0x0000000300000008, 0x0707000000500202, 0x0003080800000003, 0x0a0a000000010000, + 0x0000000300000002, 0x0a0a000000100202, 0x00100d0d00000020, 0x0004000000020000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001a00000001, + 0x6869303832316369, 0x30383231636f3631, 0x7033686b3631686f, 0x0003000000083168, 0x0000002802020000, 0x0808000000030707, 0x0000000100000003, 0x0003000000020202, + 0x0000001002020000, 0x0d0d000000200a0a, 0x0000000000000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0015000000010001, 0x6932313563690000, + 0x686f3432636f3568, 0x0831687033686b35, 0x0200000003000000, 0x0307070000000402, 0x0000030808000000, 0x0802020000000100, 0x0200000003000000, 0x080a0a0000001002, + 0x0000080d0d000000, 0x0000050000000000, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x6900000015000000, 0x6f32336869343663, 0x686b3233686f3163, + 0x0000001031687033, 0x0002020200000003, 0x0000000307070000, 0x0001000000030808, 0x000000040d0d0000, 0x0020020200000003, 0x000000100a0a0000, 0x0003000000080d0d, + 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3532636900000014, 0x6f36636f35686936, 0x31687033686b3568, 0x0000000300000008, + 0x0707000000020202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000100202, 0x00080d0d00000008, 0x0004000000000000, 0x0001000000030000, + 0x0007000000070000, 0x0001000000010000, 0x0000001c00000002, 0x3277693036396369, 0x303436636f343033, 
0x776b34303332776f, 0x0000000830777031, 0x003c020200000001, + 0x0a0a000000020000, 0x00080d0d00000004, 0x0202000000030000, 0x00200a0a00000010, 0x000000180d0d0000, 0x00000005ffffffff, 0x0000000300000003, 0x000000010000000a, + 0x002001010000000a, 0x6369000000300000, 0x3268693632646936, 0x36636f3732776938, 0x3632686f3632646f, 0x6b33646b3632776f, 0x31647038776b3368, 0x0010337770306870, + 0x0c0c000000020000, 0x001a0d0d0000000d, 0x0003000000000000, 0x0000000808080000, 0x0a0a000000100909, 0x0000000200000010, 0x0000000300000004, 0x0000000400000001, + 0x0000000100000004, 0x0018000000800101, 0x6936353263690000, 0x363532636f343168, 0x7033686b3431686f, 0x0003000000083168, 0x0000000802020000, 0x0808000000030707, + 0x0000000200000003, 0x0a0a000000040909, 0x0000000300000004, 0x0909000000200202, 0x00200a0a00000020, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, + 0x0001000000010000, 0x0000001700000020, 0x6869346369323367, 0x35686f34636f3635, 0x1031687033686b36, 0x0700000002000000, 0x0308080000000307, 0x0d00000001000000, + 0x000003000000080d, 0x0900000010020200, 0x100a0a0000002009, 0x0500000000000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x0000180000001000, + 0x6869363532636900, 0x6f363532636f3833, 0x687033686b383368, 0x0000020000001031, 0x0d000000260c0c00, 0x000002000000020d, 0x0a00000008020200, 0x000003000000020a, + 0x0900000020020200, 0x400a0a0000001009, 0x0500000003000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001b0000001000, 0x6869363532636900, + 0x6f323135636f3537, 0x687331686b383368, 0x0000001030687032, 0x0008020200000001, 0x0a0a000000020000, 0x00020d0d00000004, 0x0202000000030000, 0x0010090900000020, + 0x000000400a0a0000, 0x0000000500000003, 0x0000000200000003, 0x0000000700000001, 0x0010000100000007, 0x6369000000180000, 0x6f38336869363532, 0x3833686f36353263, + 0x001031687033686b, 0x0707000000030000, 0x0003080800000003, 0x000000080a0a0000, 0x0002020200000002, 0x0000000805050000, 0x0040020200000003, 
0x0000001009090000, + 0x0003000000200a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x343663690000001b, 0x636f363532776930, 0x3532776f30383231, + 0x1030777031776b36, 0x0200000001000000, 0x0000020000001402, 0x0d000000080a0a00, 0x000003000000040d, 0x0a00000020020200, 0x100d0d000000200a, 0x0500000003000000, + 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001b0000000100, 0x7769303436636900, 0x383231636f363532, 0x776b363532776f30, 0x0000001030777031, + 0x0014020200000001, 0x0a0a000000020000, 0x00080d0d00000002, 0x0202000000030000, 0x00200a0a00000020, 0x000000100d0d0000, 0x0000000400000003, 0x0000000100000003, + 0x0000000700000007, 0x0001000100000001, 0x6369000000160000, 0x6f30316869323135, 0x686b3031686f3663, 0x0000000831687033, 0x0004020200000003, 0x0000000307070000, + 0x0001000000030808, 0x0000000802020000, 0x0010020200000003, 0x000000080a0a0000, 0x0000000000080d0d, 0x0003000000050000, 0x0004000000010000, 0x0001000000040000, + 0x0000002001010000, 0x3431636900000016, 0x3431636f32686934, 0x7033686b32686f34, 0x0003000000103168, 0x0000000502020000, 0x0808000000030707, 0x0000000200000003, + 0x0a0a000000020909, 0x0000000300000002, 0x0909000000200202, 0x00100a0a00000010, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, + 0x0000001900000010, 0x6869343230316369, 0x6f343233636f3833, 0x687033686b383368, 0x0000030000001031, 0x0700000020020200, 0x0308080000000307, 0x0a00000002000000, + 0x080d0d000000020a, 0x0200000003000000, 0x1009090000002002, 0x0000400a0a000000, 0x0000050000000300, 0x0000010000000300, 0x0000070000000700, 0x1000010000000100, + 0x690000001b000000, 0x3537686938323163, 0x33686f383231636f, 0x7032687333686b38, 0x0003000000103168, 0x0000000402020000, 0x0808000000030707, 0x0000000200000003, + 0x0d0d000000020a0a, 0x0000000300000004, 0x0909000000200202, 0x00400a0a00000010, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, + 0x0000001c00000002, 
0x3177693034366369, 0x303436636f343230, 0x776b34323031776f, 0x0000001030777031, 0x0014020200000001, 0x0a0a000000020000, 0x00080d0d00000004, + 0x0202000000030000, 0x00400a0a00000020, 0x000000100d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001c0000, + 0x3230317769303436, 0x776f303436636f34, 0x7031776b34323031, 0x0001000000103077, 0x0000001402020000, 0x00040a0a00000002, 0x000000080d0d0000, 0x0020020200000003, + 0x000000400a0a0000, 0x0003000000080d0d, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000002001010000, 0x3231636900000019, 0x636f303168693038, + 0x6b3031686f363435, 0x0000083168703368, 0x5002020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000080a0a000000, 0x03000000020d0d00, 0x0000100202000000, + 0x0a00000020090900, 0x000003000000100a, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1a00000001000100, 0x3635326369000000, 0x31636f3436776930, + 0x6b3436776f303832, 0x0000083077703177, 0x1402020000000100, 0x0200000002000000, 0x020d0d0000000802, 0x0200000003000000, 0x100a0a0000001002, 0x0000200d0d000000, + 0x0000050000000000, 0x0000030000000300, 0x0000010000000700, 0x1000010000000700, 0x6900000016000000, 0x6f35686936353263, 0x6b35686f36383463, 0x0000103168703368, + 0x050c0c0000000200, 0x0000050d0d000000, 0x040a0a0000000100, 0x0200000003000000, 0x1009090000002002, 0x0000200a0a000000, 0x0000040000000300, 0x0000010000000300, + 0x0000070000000700, 0x0100010000000100, 0x6900000018000000, 0x3233686930363963, 0x33686f303436636f, 0x0831687033686b32, 0x0200000003000000, 0x0307070000003c02, + 0x0000030808000000, 0x080a0a0000000100, 0x0200000003000000, 0x200a0a0000001002, 0x0000100d0d000000, 0x000004ffffffff00, 0x0000010000000300, 0x0000040000000400, + 0x8001010000000100, 0x6900000019000000, 0x3477693834303263, 0x776f323135636f39, 0x30777031776b3934, 0x0000000100000008, 0x0002000000400202, 0x000000040a0a0000, + 0x0003000000040d0d, 0x0000002002020000, 0x0a0a000000200909, 
0x0000000300000020, 0x0000000100000005, 0x0000000800000003, 0x0000000800000001, 0x0014000000200001, + 0x3264693863690000, 0x3532646f38636f35, 0x001032647035646b, 0x0d0d000000010000, 0x0000000000000019, 0x0008020200000003, 0x0000001009090000, 0x0000000000100a0a, + 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x353263690000001a, 0x636f323135686936, 0x323135686f363532, 0x001031687033686b, + 0x0202000000030000, 0x0003070700000008, 0x0000000308080000, 0x00040a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x000000400a0a0000, 0x0003000000200d0d, + 0x0003000000050000, 0x0004000000010000, 0x0001000000040000, 0x0000000100010000, 0x3436636900000038, 0x3268693432326469, 0x6f30363177693432, 0x343232646f323363, + 0x31776f343232686f, 0x33686b33646b3036, 0x687031647033776b, 0x0000001031777031, 0x0003060600000003, 0x0000000307070000, 0x0001000000030808, 0x0000000202020000, + 0x0020020200000003, 0x000000200a0a0000, 0x0000000000200d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3436636900000038, + 0x3268693432326469, 0x6f30363177693432, 0x343232646f323363, 0x31776f343232686f, 0x33686b33646b3036, 0x687031647033776b, 0x0000001031777031, 0x0004020200000004, + 0x0000000306060000, 0x0808000000030707, 0x0000000000000003, 0x0010020200000003, 0x000000200a0a0000, 0x0003000000200d0d, 0x0003000000050000, 0x0001000000020000, + 0x0007000000070000, 0x0000001000010000, 0x3135636900000018, 0x34636f3931686932, 0x686b3931686f3638, 0x0000001031687033, 0x0003070700000003, 0x0000000308080000, + 0x0002000000100a0a, 0x0000000202020000, 0x0003000000080505, 0x0000004002020000, 0x0a0a000000100909, 0x0000000300000020, 0x0000000300000004, 0x0000000400000001, + 0x0000000100000004, 0x0031000000010001, 0x3432303163690000, 0x3177693030316869, 0x6f323135636f3137, 0x3731776f30303168, 0x6433776b33686b31, 0x3268703177643168, + 0x0300000008327770, 0x0000200202000000, 0x0800000003070700, 0x0000010000000308, 0x03000000080a0a00, 
0x0000200202000000, 0x0d000000400a0a00, 0x000003000000100d, + 0x0000010000000500, 0x0000010000000200, 0x00000b0000000b00, 0x1300000800000100, 0x6869386369000000, 0x38686f38636f3031, 0x000830687033686b, 0x0707000000020000, + 0x0003080800000003, 0x0505000000010000, 0x0000000300000002, 0x0909000000080202, 0x00080a0a00000010, 0x0005000000000000, 0x0002000000010000, 0x0008000000010000, + 0x0001000000080000, 0x0000001300000800, 0x6f30316869386369, 0x33686b38686f3863, 0x0200000010306870, 0x0000030707000000, 0x0100000003080800, 0x0000020505000000, + 0x1002020000000300, 0x0000100909000000, 0x00000000080a0a00, 0x0300000005000000, 0x0400000001000000, 0x0100000004000000, 0x0000200001000000, 0x3233670000001700, + 0x6f38326869386369, 0x686b3832686f3863, 0x0000001031687033, 0x0003070700000002, 0x0000000308080000, 0x00040d0d00000001, 0x0202000000030000, 0x0020090900000020, + 0x000000100a0a0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, 0x6369000000180000, 0x6f34316869363532, 0x3431686f36353263, + 0x000831687033686b, 0x0707000000020000, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00080d0d00000020, 0x0005000000000000, + 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3368693034366369, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000010, + 0x0707000000140202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00200d0d00000020, 0x0005000000030000, 0x0001000000030000, + 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3368693034366369, 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000010, 0x0707000000140202, + 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00200d0d00000010, 0x0005000000030000, 0x0001000000030000, 0x0007000000070000, + 0x0101000000010000, 0x0000003200000010, 0x3731646932336369, 0x3333776933336869, 0x3731646f3233636f, 0x3333776f3333686f, 0x776b33686b33646b, 
0x7031687031647033, + 0x0004000000103177, 0x0000000202020000, 0x0707000000030606, 0x0003080800000003, 0x0d0d000000010000, 0x0000000300000002, 0x0909000000100202, 0x00200a0a00000010, + 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000001, 0x3377693832316369, 0x776f363532636f32, 0x30777031776b3233, + 0x0000000100000008, 0x0002000000040202, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0d0d000000080a0a, 0xffffffff00000008, 0x0000000300000004, + 0x0000000400000001, 0x0000000100000004, 0x0018000000010001, 0x6938323163690000, 0x363532636f323377, 0x7031776b3233776f, 0x0001000000083077, 0x0000000202020000, + 0x00080a0a00000002, 0x000000020d0d0000, 0x0040020200000003, 0x000000080a0a0000, 0xffff000000080d0d, 0x000100000004ffff, 0x0007000000010000, 0x0001000000070000, + 0x0000000100010000, 0x323135670000001b, 0x6f32337769316369, 0x776b3233776f3163, 0x0832777031776433, 0x0100000000000000, 0x0000040d0d000000, 0x0801010000000300, + 0x0000030808000000, 0xff000000080d0d00, 0x0300000004ffffff, 0x0100000002000000, 0x0700000007000000, 0x0000100001000000, 0x3163690000001f00, 0x3035317769383830, + 0x6f323135636f3030, 0x776b303030353177, 0x0000000830777031, 0x00100a0a00000001, 0x0202000000020000, 0x0008050500000004, 0x0202000000030000, 0x0010090900000020, + 0x000000200a0a0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x63690000001a0000, 0x3233686930383231, 0x686f30383231636f, + 0x31687033686b3233, 0x0000000300000010, 0x0707000000280202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000004, 0x0909000000200202, 0x00200a0a00000002, + 0x000000080d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, 0x0001000100000001, 0x63690000001a0000, 0x3233686930383231, 0x686f30383231636f, + 0x31687033686b3233, 0x0000000300000010, 0x0707000000050202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000008, 0x0a0a000000200202, 0x00200d0d00000020, + 0x0004000000000000, 
0x0001000000030000, 0x0004000000040000, 0x0101000000010000, 0x0000001900000020, 0x3168693231356369, 0x6f34323031636f33, 0x687033686b333168, + 0x0000030000000831, 0x0700000010020200, 0x0308080000000307, 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, 0x2009090000002002, 0x0000100a0a000000, + 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x2001010000000100, 0x6900000019000000, 0x3331686932313563, 0x686f34323031636f, 0x31687033686b3331, + 0x0000000300000008, 0x0707000000200202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000008, 0x0909000000100202, 0x00200a0a00000020, 0x0005000000020000, + 0x0001000000030000, 0x0007000000070000, 0x0101000000010000, 0x0000002a00000020, 0x3830316869346369, 0x6f30323931776930, 0x30383031686f3163, 0x686b30323931776f, + 0x7031687033776b33, 0x0001000000103177, 0x0000000307070000, 0x00080d0d00000001, 0x0202000000040000, 0x0004080800000004, 0x0000002009090000, 0x0003000000100a0a, + 0x0003000000050000, 0x0004000000010000, 0x0001000000040000, 0x0000002001010000, 0x693463690000002a, 0x3177693038303168, 0x686f31636f303239, 0x3931776f30383031, + 0x33776b33686b3032, 0x0010317770316870, 0x0707000000010000, 0x0000000000000003, 0x0004020200000004, 0x0000000808080000, 0x0a0a000000200909, 0x0000000300000010, + 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x0019000000010001, 0x6930343663690000, 0x383231636f363168, 0x33686b3631686f30, 0x0300000008316870, + 0x0000140202000000, 0x0800000003070700, 0x0000010000000308, 0x0300000002020200, 0x0000100202000000, 0x0d000000200a0a00, 0x000000000000100d, 0x0000030000000400, + 0x0000040000000100, 0x0000010000000400, 0x1700000020000100, 0x6369323367000000, 0x38636f3832686938, 0x7033686b3832686f, 0x0001000000083168, 0x0000000307070000, + 0x00080d0d00000001, 0x0202000000040000, 0x0004080800000008, 0x0000002009090000, 0x0002000000080a0a, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, + 0x0000000200010000, 0x393163690000001a, 0x636f343268693032, 
0x3432686f30383231, 0x000831687033686b, 0x0202000000030000, 0x0003070700000078, 0x0000000308080000, + 0x00040a0a00000001, 0x0202000000030000, 0x00200a0a00000010, 0x000000180d0d0000, 0x00000005ffffffff, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, + 0x6369000000170000, 0x6f32336869383231, 0x6b3233686f343663, 0x0000103168703368, 0x0307070000000200, 0x0000030808000000, 0x0402020000000100, 0x0200000003000000, + 0x100a0a0000002002, 0x0000080d0d000000, 0x0000050000000000, 0x0000010000000300, 0x0000040000000400, 0x0100010000000100, 0x6900000017000000, 0x3431686938323163, + 0x3431686f3233636f, 0x001031687033686b, 0x0707000000020000, 0x0003080800000003, 0x0202000000010000, 0x0000000300000004, 0x0a0a000000200202, 0x00080d0d00000010, + 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000005, 0x3168693231356369, 0x686f323135636f34, 0x31687033686b3431, + 0x0000000300000008, 0x0707000000040202, 0x0003080800000003, 0x0202000000010000, 0x0000000300000004, 0x0a0a000000200202, 0x000e0d0d00000010, 0x0005000000000000, + 0x0003000000030000, 0x0001000000070000, 0x0001000000070000, 0x0000001800000010, 0x3168693231356369, 0x686f363834636f30, 0x31687033686b3031, 0x0000000200000010, + 0x0d0d0000000a0c0c, 0x000000000000000a, 0x0020020200000003, 0x0000001009090000, 0x0003000000400a0a, 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, + 0x0000008001010000, 0x353263690000001c, 0x6f36333133776936, 0x3133776f38323163, 0x30777031776b3633, 0x0000000100000008, 0x0002000000080202, 0x000000080a0a0000, + 0x0003000000040d0d, 0x0000002002020000, 0x0a0a000000100909, 0x0000000300000010, 0x0000000300000004, 0x0000000700000001, 0x0000000100000007, 0x001c000000020001, + 0x6930343663690000, 0x33636f3631323977, 0x36313239776f3032, 0x000830777031776b, 0x0202000000010000, 0x0000000200000028, 0x0d0d000000020a0a, 0x0000000300000008, + 0x0a0a000000100202, 0x00200d0d00000020, 0x0004ffffffff0000, 0x0001000000030000, 0x0007000000070000, 
0x0001000000010000, 0x0000001800000001, 0x3368693032336369, + 0x686f303436636f32, 0x31687033686b3233, 0x0000000300000008, 0x0707000000140202, 0x0003080800000003, 0x0a0a000000010000, 0x0000000300000002, 0x0a0a000000100202, + 0x00100d0d00000020, 0x0005ffffffff0000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001c00000008, 0x3432326869336369, 0x31686f383637636f, + 0x3168733631686b34, 0x0000001030687036, 0x0010070700000002, 0x0000000408080000, 0x00080a0a00000001, 0x0202000000040000, 0x0004080800000004, 0x000000400a0a0000, + 0x00030000000e0d0d, 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, 0x0000008001010000, 0x3231636900000018, 0x31636f3832686938, 0x686b3832686f3832, + 0x0000000831687033, 0x0004020200000003, 0x0000000307070000, 0x0002000000030808, 0x000000040a0a0000, 0x0003000000040d0d, 0x0000002002020000, 0x0a0a000000200909, + 0x0000000300000020, 0x0000000300000005, 0x0000000400000001, 0x0000000100000004, 0x0018000000800001, 0x6938323163690000, 0x383231636f383268, 0x7033686b3832686f, + 0x0003000000103168, 0x0000000202020000, 0x0808000000030707, 0x0000000200000003, 0x0d0d000000020a0a, 0x0000000300000004, 0x0909000000400202, 0x00400a0a00000020, + 0x0005000000030000, 0x0001000000010000, 0x0004000000040000, 0x0001000000010000, 0x0000003000000001, 0x6931636938383267, 0x3532776938323168, 0x3231686f31636f36, + 0x686b363532776f38, 0x6433686435776b35, 0x3877703868703377, 0x0000000100000010, 0x0000000000050707, 0x0101000000030000, 0x0005080800000010, 0x000000100d0d0000, + 0x0000000400000000, 0x0000000100000001, 0x0000000400000004, 0x0020010100000001, 0x32670000001c0000, 0x3377693163693432, 0x776f31636f363331, 0x7031776b36333133, + 0x0000000000083077, 0x0002000000000000, 0x0000002001010000, 0x0000000000100909, 0x0003000000040000, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, + 0x3931636900000019, 0x636f383468693032, 0x6b3834686f303436, 0x0000083168703368, 0x7802020000000300, 0x0000030707000000, 0x0100000003080800, 
0x0000040d0d000000, + 0x1002020000000400, 0x0000020909000000, 0x0d000000200a0a00, 0xffffff0000000c0d, 0x00000300000005ff, 0x0000070000000100, 0x0000010000000700, 0x1700000020010100, + 0x6869336369000000, 0x6f3631636f363134, 0x7033686b36313468, 0x0001000000103168, 0x0000000307070000, 0x00080d0d00000001, 0x0202000000040000, 0x0004080800000004, + 0x0000002009090000, 0x0003000000100a0a, 0x0003000000050000, 0x0004000000010000, 0x0001000000040000, 0x0000002001010000, 0x6933636900000017, 0x3631636f36313468, + 0x33686b363134686f, 0x0100000010316870, 0x0000030707000000, 0x0000040000000000, 0x0800000004020200, 0x2009090000000808, 0x0000100a0a000000, 0x0000040000000300, + 0x0000010000000300, 0x0000040000000400, 0x8001010000000100, 0x690000001a000000, 0x3431686932313563, 0x37686f323135636f, 0x687032687333686b, 0x0000030000000831, + 0x0700000010020200, 0x0308080000000307, 0x0900000002000000, 0x080a0a0000000409, 0x0200000003000000, 0x2009090000002002, 0x0000200a0a000000, 0x0000050000000300, + 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x690000001a000000, 0x3677693038323163, 0x6f30383231636f34, 0x777031776b343677, 0x0000010000001030, + 0x0200000028020200, 0x0000020a0a000000, 0x03000000080d0d00, 0x0000200202000000, 0x0d000000200a0a00, 0x000003000000080d, 0x0000030000000500, 0x0000070000000100, + 0x0000010000000700, 0x1a00000001000100, 0x3832316369000000, 0x31636f3436776930, 0x6b3436776f303832, 0x0000103077703177, 0x2802020000000100, 0x0a00000002000000, + 0x020d0d000000020a, 0x0200000003000000, 0x100a0a0000002002, 0x0000100d0d000000, 0x0000050000000300, 0x0000010000000300, 0x0000040000000400, 0x8000010000000100, + 0x690000001b000000, 0x3331337769343663, 0x776f363532636f36, 0x7031776b36333133, 0x0000000000103077, 0x0a0a000000010000, 0x0000000300000004, 0x0a0a000000400202, + 0x00200d0d00000040, 0x0005000000030000, 0x0002000000030000, 0x0007000000010000, 0x0001000000070000, 0x0000001c00000010, 0x3577693635326369, 0x383231636f353236, + 0x776b35323635776f, 
0x0000001030777031, 0x00040a0a00000001, 0x0202000000010000, 0x0000000300000004, 0x0909000000400202, 0x00200a0a00000010, 0x0005000000030000, + 0x0003000000030000, 0x0001000000070000, 0x0001000000070000, 0x0000001800000020, 0x3168693231356369, 0x686f363834636f39, 0x31687033686b3931, 0x0000000100000010, + 0x0000000000130d0d, 0x0202000000030000, 0x0010090900000020, 0x000000400a0a0000, 0x0000000400000003, 0x0000000100000001, 0x0000000700000007, 0x0080000100000001, + 0x31670000001c0000, 0x3168693163693434, 0x39686f31636f3139, 0x7032687333686b35, 0x0001000000083068, 0x0000000307070000, 0x0000000300000000, 0x0808000000400101, + 0x0008090900000003, 0x0004000000000000, 0x0001000000010000, 0x0004000000040000, 0x0001000000010000, 0x0000001700000001, 0x6869326369323367, 0x32686f32636f3832, + 0x0831687033686b38, 0x0700000001000000, 0x0000010000000307, 0x04000000020d0d00, 0x0000020202000000, 0x0a00000003080800, 0x0e0d0d000000080a, 0x0400000000000000, + 0x0300000001000000, 0x0100000008000000, 0x0100000008000000, 0x00001a0000000100, 0x3232686933636900, 0x686f383231636f34, 0x34687334686b3635, 0x0100000010306870, + 0x00001c0c0c000000, 0x0000030000000000, 0x0a00000003020200, 0x080d0d000000200a, 0x0500000000000000, 0x0100000001000000, 0x0800000008000000, 0x0100000001000000, + 0x00001e0000006400, 0x3932776931636900, 0x6f3031636f323335, 0x31776b3530393577, 0x1030777035777330, 0x0000000000000000, 0x0800000003000000, 0x0a09090000000a08, + 0x0000100a0a000000, 0x0000040000000000, 0x0000020000000300, 0x0000070000000100, 0x0100010000000700, 0x690000002f000000, 0x3038303168693463, 0x636f303239317769, + 0x6f303633686f3633, 0x6b33686b30343677, 0x3377733368733377, 0x0008307770306870, 0x0707000000030000, 0x0003080800000003, 0x000000030a0a0000, 0x0002050500000001, + 0x0202000000030000, 0x001e050500000008, 0x000000100a0a0000, 0x00000005ffffffff, 0x0000000300000003, 0x0000000100000007, 0x0010000100000007, 0x63690000001f0000, + 0x3035317769383231, 0x34323031636f3030, 0x6b3030303531776f, 
0x0000103077703177, 0xfa0d0d0000000100, 0x0200000002000000, 0x080a0a0000000402, 0x0200000003000000, + 0x1009090000002002, 0x0000200a0a000000, 0x0000040000000300, 0x0000010000000300, 0x0000070000000700, 0x0100010000000100, 0x690000001b000000, 0x3532776930343663, + 0x6f30383231636f36, 0x7031776b36353277, 0x0001000000083077, 0x0000002802020000, 0x00080a0a00000002, 0x000000040d0d0000, 0x0010020200000003, 0x000000100a0a0000, + 0xffff000000200d0d, 0x000100000005ffff, 0x0008000000030000, 0x0008000000010000, 0x0000002000010000, 0x6936636900000030, 0x6938326869363264, 0x646f36636f373277, + 0x776f3632686f3632, 0x33686b33646b3632, 0x687031647038776b, 0x0000001033777030, 0x00020c0c00000002, 0x0000001a0d0d0000, 0x0002020200000001, 0x0808000000030000, + 0x0010090900000008, 0x000000100a0a0000, 0x0000000500000000, 0x0000000200000003, 0x0000000700000001, 0x0010000100000007, 0x6369000000180000, 0x3833686934323031, + 0x3833686f3631636f, 0x001031687033686b, 0x0707000000020000, 0x0003080800000003, 0x0202000000020000, 0x0002050500000002, 0x0202000000030000, 0x0010090900000040, + 0x000000100a0a0000, 0x0000000400000003, 0x0000000100000003, 0x0000000400000004, 0x0001000100000001, 0x6369000000150000, 0x31636f3233686933, 0x33686b3033686f30, + 0x0200000008306870, 0x0000030707000000, 0x0100000003080800, 0x0000040d0d000000, 0x2002020000000300, 0x0000080a0a000000, 0x03000000080d0d00, 0x0300000004000000, + 0x0400000001000000, 0x0100000004000000, 0x0000800101000000, 0x3363690000001a00, 0x36636f3432326869, 0x686b323131686f34, 0x0833687032687337, 0x0700000001000000, + 0x0000020000000707, 0x0d000000020a0a00, 0x000004000000080d, 0x0800000004020200, 0x2009090000000808, 0x0000200a0a000000, 0x0000040000000300, 0x0000010000000300, + 0x0000070000000700, 0x0100010000000100, 0x690000001c000000, 0x3034776930363963, 0x6f303233636f3639, 0x31776b3639303477, 0x0100000008307770, 0x00003c0202000000, + 0x040a0a0000000200, 0x0000040d0d000000, 0x1002020000000300, 0x0000200a0a000000, 0xff000000200d0d00, 
0x0300000005ffffff, 0x0700000001000000, 0x0100000007000000, + 0x0000020001000000, 0x3663690000001b00, 0x636f323368693034, 0x6b3631686f303436, 0x3168703268733368, 0x0000000300000010, 0x0707000000140202, 0x0003080800000003, + 0x0d0d000000010000, 0x0000000400000002, 0x0909000000200202, 0x00400a0a00000002, 0x000000080d0d0000, 0x0000000500000003, 0x0000000100000003, 0x0000000700000007, + 0x0001000100000001, 0x63690000001b0000, 0x6f32336869303436, 0x3631686f30343663, 0x687032687333686b, 0x0000030000001031, 0x0700000014020200, 0x0308080000000307, + 0x0a00000002000000, 0x020d0d000000080a, 0x0200000003000000, 0x100a0a0000002002, 0x0000080d0d000000, 0x0000040000000300, 0x0000010000000300, 0x0000040000000400, + 0x0100010000000100, 0x6900000018000000, 0x6f38323168693363, 0x686b3436686f3863, 0x0830687032687333, 0x0700000001000000, 0x0000010000000307, 0x04000000020d0d00, + 0x0000040202000000, 0x0a00000008080800, 0x080d0d000000080a, 0x0500000003000000, 0x0300000003000000, 0x0100000007000000, 0x0100000007000000, 0x00001c0000004000, + 0x3177693436636900, 0x383231636f323230, 0x776b37303031776f, 0x0000103077703631, 0x0409090000000200, 0x0000070d0d000000, 0x0202020000000200, 0x0000020a0a000000, + 0x2002020000000300, 0x0000400a0a000000, 0xff000000100d0d00, 0x0300000005ffffff, 0x0700000003000000, 0x0700000001000000, 0x0000100001000000, 0x3163690000001900, + 0x6f38336869343230, 0x3833686f34323363, 0x001031687033686b, 0x0c0c000000020000, 0x00260d0d00000002, 0x0a0a000000010000, 0x0000000300000004, 0x0909000000200202, + 0x00200a0a00000010, 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3968693036396369, 0x686f303233636f36, + 0x31687033686b3639, 0x0000000300000008, 0x07070000003c0202, 0x0003080800000003, 0x0a0a000000020000, 0x00040d0d00000004, 0x0202000000030000, 0x00200a0a00000010, + 0x000000180d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x6369000000160000, 0x33636f3639686934, 
0x686b3639686f3032, + 0x0000000831687033, 0x0003070700000001, 0x0d0d000000010000, 0x0000000500000002, 0x0808000000040202, 0x0002090900000004, 0x000000200a0a0000, 0xffff000000100d0d, + 0x000300000005ffff, 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x3436636900000019, 0x31636f3631686930, 0x6b3631686f303832, 0x0000103168703368, + 0x1402020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000020a0a000000, 0x03000000020d0d00, 0x0000200202000000, 0x0d000000100a0a00, 0x000003000000080d, + 0x0000030000000500, 0x0000070000000100, 0x0000010000000700, 0x1900000001000100, 0x3034366369000000, 0x3231636f36316869, 0x686b3631686f3038, 0x0000001031687033, + 0x0005020200000003, 0x0000000307070000, 0x0001000000030808, 0x0000000402020000, 0x0020020200000003, 0x000000200a0a0000, 0x0000000000100d0d, 0x0003000000040000, + 0x0007000000010000, 0x0001000000070000, 0x0000000200010000, 0x323363690000001b, 0x33636f3639686930, 0x686b3834686f3032, 0x0831687032687333, 0x0200000003000000, + 0x0307070000001402, 0x0000030808000000, 0x040d0d0000000100, 0x0200000004000000, 0x0209090000001002, 0x0000200a0a000000, 0x000000000c0d0d00, 0x0100000004000000, + 0x0700000001000000, 0x0100000007000000, 0x0000800001000000, 0x3431670000001c00, 0x3531686931636934, 0x3537686f31636f31, 0x687032687333686b, 0x0000010000000830, + 0x0000000003070700, 0x0100000003000000, 0x0308080000004001, 0x0000080909000000, 0x0000040000000000, 0x0000030000000100, 0x0000010000000800, 0x0000010000000800, + 0x6900000013000008, 0x636f303168693263, 0x7033686b38686f38, 0x0001000000083068, 0x000000080c0c0000, 0x0000000300000000, 0x0909000000080202, 0x00080a0a00000010, + 0x0004000000000000, 0x0001000000030000, 0x0004000000040000, 0x0001000000010000, 0x0000001700000020, 0x3332686932336369, 0x3232686f33636f32, 0x0830687039686b34, + 0x0700000002000000, 0x0908080000000907, 0x0d00000001000000, 0x000003000000040d, 0x0900000020020200, 0x080a0a0000002009, 0x0400000001000000, 0x0100000003000000, + 0x0400000004000000, 
0x0100000001000000, 0x0000170000000100, 0x3268693233636900, 0x32686f33636f3233, 0x30687039686b3432, 0x0000000200000008, 0x0808000000090707, + 0x0000000100000009, 0x0003000000020d0d, 0x0000002002020000, 0x0d0d000000080a0a, 0x000000010000001c, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, + 0x0016000000010001, 0x6869363163690000, 0x686f3631636f3631, 0x30687033686b3431, 0x0000000200000008, 0x0808000000030707, 0x0000000100000002, 0x0004000000020d0d, + 0x0000001002020000, 0x0a0a000000020808, 0x00080d0d00000008, 0x0004000000030000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, + 0x3468693036396369, 0x686f303436636f38, 0x31687033686b3834, 0x0000000300000008, 0x07070000003c0202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000004, + 0x0909000000100202, 0x00200a0a00000002, 0x0000000c0d0d0000, 0x00000004ffffffff, 0x0000000100000003, 0x0000000700000007, 0x0002000100000001, 0x6369000000180000, + 0x6f36396869303233, 0x3639686f30323363, 0x000831687033686b, 0x0202000000030000, 0x0003070700000014, 0x0000000308080000, 0x00020d0d00000001, 0x0202000000040000, + 0x0002090900000010, 0x000000200a0a0000, 0x0000000000100d0d, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, 0x0000000100010000, 0x3231636900000015, + 0x3233636f37686938, 0x687033686b37686f, 0x0000020000001031, 0x0800000003070700, 0x0000010000000308, 0x0300000008020200, 0x0000100202000000, 0x0d000000100a0a00, + 0x000000000000080d, 0x0000030000000500, 0x0000040000000100, 0x0000010000000400, 0x1500000001000100, 0x3832316369000000, 0x6f3233636f376869, 0x31687033686b3768, + 0x0000000200000010, 0x0808000000030707, 0x0000000100000003, 0x0003000000040202, 0x0000002002020000, 0x0d0d000000100a0a, 0x0000000000000008, 0x0000000100000005, + 0x0000000b00000001, 0x000000010000000b, 0x0013000008000001, 0x3168693863690000, 0x6b38686f38636f30, 0x0000083068703368, 0x0307070000000100, 0x0400000000000000, + 0x0000080202000000, 0x0a00000003080800, 0x080d0d000000080a, 
0x0500000000000000, 0x0100000001000000, 0x0800000008000000, 0x0100000001000000, 0x0000130000080000, + 0x3031686938636900, 0x686b38686f38636f, 0x0000001030687033, 0x0003070700000001, 0x0004000000000000, 0x0000000802020000, 0x0a0a000000030808, 0x00080d0d00000010, + 0x0004000000000000, 0x0001000000030000, 0x0007000000070000, 0x0001000000010000, 0x0000001800000002, 0x3468693034366369, 0x686f303436636f38, 0x31687033686b3834, + 0x0000000300000008, 0x0707000000280202, 0x0003080800000003, 0x0d0d000000010000, 0x0000000400000004, 0x0909000000100202, 0x00200a0a00000002, 0x0000000c0d0d0000, + 0x0000000500000000, 0x0000000300000003, 0x000000010000000a, 0x004001010000000a, 0x63690000001b0000, 0x3235327769383231, 0x32776f363532636f, 0x77703631776b3733, + 0x0000020000001030, 0x0d00000002090900, 0x0000020000004f0d, 0x0a00000004020200, 0x000003000000020a, 0x0900000020020200, 0x400a0a0000000809, 0x0500000002000000, + 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001c0000000200, 0x7769303639636900, 0x3233636f36393034, 0x6b36393034776f30, 0x0000103077703177, + 0x1e02020000000100, 0x0a00000002000000, 0x080d0d000000020a, 0x0200000003000000, 0x400a0a0000002002, 0x0000200d0d000000, 0x0000050000000300, 0x0000010000000300, + 0x0000070000000700, 0x0100010000000100, 0x690000001c000000, 0x3034776930363963, 0x6f303233636f3639, 0x31776b3639303477, 0x0100000010307770, 0x00001e0202000000, + 0x040a0a0000000200, 0x0000040d0d000000, 0x2002020000000300, 0x0000200a0a000000, 0x03000000200d0d00, 0x0300000004000000, 0x0700000001000000, 0x0100000007000000, + 0x0000020001000000, 0x3363690000001800, 0x636f383468693032, 0x6b3834686f303436, 0x0000083168703368, 0x1402020000000300, 0x0000030707000000, 0x0100000003080800, + 0x0000040d0d000000, 0x1002020000000400, 0x0000020909000000, 0x0d000000200a0a00, 0x0000000000000c0d, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, + 0x1800000001000100, 0x3032336369000000, 0x3233636f36366869, 0x33686b3436686f30, 0x0300000008306870, 
0x0000140202000000, 0x0800000003070700, 0x0000020000000308, + 0x0d000000080a0a00, 0x000003000000020d, 0x0a00000010020200, 0x200d0d000000100a, 0x0400000000000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, + 0x00001d0000000200, 0x6930323931636900, 0x36636f3430333277, 0x34303332776f3034, 0x000830777031776b, 0x0202000000010000, 0x0000000200000078, 0x0d0d000000040a0a, + 0x0000000300000008, 0x0a0a000000100202, 0x00180d0d00000020, 0x0005ffffffff0000, 0x0003000000010000, 0x0001000000080000, 0x0001000000080000, 0x0000001400000020, + 0x6f35326469386369, 0x646b3332646f3863, 0x0000001031647035, 0x00170d0d00000001, 0x0003000000000000, 0x0000000802020000, 0x0a0a000000100909, 0x0000000000000010, + 0x0000000300000005, 0x0000000700000001, 0x0000000100000007, 0x0015000000010001, 0x6936353263690000, 0x686f3231636f3568, 0x1031687033686b35, 0x0200000003000000, + 0x0307070000000202, 0x0000030808000000, 0x0802020000000100, 0x0200000003000000, 0x100a0a0000001002, 0x0000080d0d000000, 0x0000040000000000, 0x0000010000000300, + 0x0000040000000400, 0x8001010000000100, 0x690000001b000000, 0x3177693432303163, 0x6f363532636f3639, 0x7031776b36393177, 0x0001000000083077, 0x0000002002020000, + 0x00080a0a00000002, 0x000000040d0d0000, 0x0020020200000003, 0x0000002009090000, 0x0003000000200a0a, 0x0003000000040000, 0x0004000000010000, 0x0001000000040000, + 0x0000008001010000, 0x3436636900000016, 0x3436636f36356869, 0x7033686b3635686f, 0x0003000000083168, 0x0000000202020000, 0x0808000000030707, 0x0000000200000003, + 0x0d0d000000020a0a, 0x0000000300000008, 0x0909000000200202, 0x00200a0a00000020, 0x0004000000030000, 0x0001000000010000, 0x0004000000040000, 0x0001000000010000, + 0x0000002c00000001, 0x6930386869316369, 0x6f30353630303177, 0x776f3038686f3163, 0x686b303536303031, 0x3068703332776b31, 0x0000000831317770, 0x0000000100000000, + 0x0003000000020d0d, 0x0000001708080000, 0x0d0d000000080a0a, 0x000000000000000f, 0x0000000100000004, 0x0000000700000001, 0x0000000100000007, 
0x002c000000010001, + 0x3868693163690000, 0x3536303031776930, 0x3038686f31636f30, 0x303536303031776f, 0x703332776b31686b, 0x0008313177703068, 0x0808000000010000, 0x0000000100000003, + 0x0003000000080d0d, 0x0000000808080000, 0x0d0d000000080a0a, 0x0000000100000010, 0x0000000300000004, 0x0000000400000001, 0x0000000100000004, 0x001b000000800101, + 0x6938323163690000, 0x383231636f363568, 0x7333686b3832686f, 0x0000083168703268, 0x0402020000000300, 0x0000030707000000, 0x0200000003080800, 0x0000040a0a000000, + 0x03000000080d0d00, 0x0000200202000000, 0x0a00000020090900, 0x000003000000200a, 0x0000030000000400, 0x0000070000000100, 0x0000010000000700, 0x1c00000002000100, + 0x3036396369000000, 0x636f363132397769, 0x313239776f303233, 0x0830777031776b36, 0x0200000001000000, 0x0000020000003c02, 0x0d000000040a0a00, 0x000003000000080d, + 0x0a00000010020200, 0x200d0d000000200a, 0x04ffffffff000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00001a0000000200, 0x6930383231636900, + 0x383231636f343268, 0x33686b3432686f30, 0x0300000008316870, 0x0000500202000000, 0x0800000003070700, 0x0000010000000308, 0x03000000040a0a00, 0x0000100202000000, + 0x0d000000200a0a00, 0xffffff000000180d, 0x00000300000004ff, 0x0000040000000100, 0x0000010000000400, 0x2c00000020010100, 0x6869316369000000, 0x3630303177693038, + 0x38686f31636f3035, 0x3536303031776f30, 0x3332776b31686b30, 0x0831317770306870, 0x0800000001000000, 0x0000010000000308, 0x04000000080d0d00, 0x0000040202000000, + 0x0900000008080800, 0x080a0a0000002009, 0x0400000001000000, 0x0100000003000000, 0x0700000007000000, 0x0100000001000000, 0x00002c0000002001, 0x3038686931636900, + 0x3035363030317769, 0x6f3038686f31636f, 0x6b30353630303177, 0x68703332776b3168, 0x0000083131777030, 0x0308080000000100, 0x0d00000001000000, 0x000004000000080d, + 0x0800000002020200, 0x2009090000000808, 0x0000080a0a000000, 0x0000040000000100, 0x0000010000000300, 0x0000070000000700, 0x0200010000000100, 0x690000001c000000, + 0x3332776930323363, 
0x6f303436636f3430, 0x31776b3430333277, 0x0100000008307770, 0x0000140202000000, 0x040a0a0000000200, 0x0000080d0d000000, 0x1002020000000300, + 0x0000200a0a000000, 0xff000000180d0d00, 0x0300000004ffffff, 0x0400000001000000, 0x0100000004000000, 0x0000800101000000, 0x3663690000001b00, 0x6f36333133776934, + 0x3133776f36353263, 0x30777031776b3633, 0x0000000100000008, 0x0001000000020202, 0x000000080d0d0000, 0x0020020200000003, 0x0000002009090000, 0x0002000000200a0a, + 0x0003000000040000, 0x0007000000030000, 0x0007000000010000, 0x0000000100010000, 0x693363690000001a, 0x3231636f34323268, 0x34686b3635686f38, 0x0008306870346873, + 0x0c0c000000010000, 0x0000000100000008, 0x0004000000020a0a, 0x0000000402020000, 0x0a0a000000040808, 0x00100d0d00000008, 0x0005000000000000, 0x0003000000030000, + 0x0001000000070000, 0x0001000000070000, 0x0000001a00000018, 0x7769383430326369, 0x38343032636f3934, 0x7031776b3934776f, 0x0001000000103077, 0x000000310d0d0000, + 0x0004020200000002, 0x000000040a0a0000, 0x0020020200000003, 0x0000001009090000, 0x0003000000400a0a, 0x0003000000050000, 0x0007000000010000, 0x0001000000070000, + 0x0000000100010000, 0x3938636900000018, 0x31636f3436776936, 0x776b3436776f3832, 0x0000001030777031, 0x0007020200000001, 0x0202000000010000, 0x0000000300000008, + 0x0a0a000000100202, 0x00080d0d00000010, 0x0000000000000000 }; return data; }; From 52d32ea6809e6e79247cfc090b8652c084f5fccb Mon Sep 17 00:00:00 2001 From: "Gu, Yonghao" Date: Wed, 15 May 2024 08:48:17 +0000 Subject: [PATCH 150/187] graph: backend: dnnl: use plain layout for sdp_primitive --- src/graph/backend/dnnl/kernels/sdp_primitive.hpp | 4 ++-- src/graph/backend/dnnl/op_executable.cpp | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive.hpp b/src/graph/backend/dnnl/kernels/sdp_primitive.hpp index 800f0062a98..26c7d8e7122 100644 --- a/src/graph/backend/dnnl/kernels/sdp_primitive.hpp +++ b/src/graph/backend/dnnl/kernels/sdp_primitive.hpp 
@@ -239,7 +239,7 @@ class sdp_primitive_kernel_t : public kernel_base_t { // First, dry run on a deep copy subgraph_ = std::make_shared( graph_t::deep_copy(part->get_ops()), p_engine_, - part->get_fpmath_mode(), part->get_use_blocked_layout(), true); + part->get_fpmath_mode(), false, true); CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs)); subgraph_visualizer_t vis(part->id(), [this](const value_t *val) { @@ -292,7 +292,7 @@ class sdp_primitive_kernel_t : public kernel_base_t { // Successfully created the primitive. Rerun the passes again, modifying // the original ops. subgraph_ = std::make_shared(part->get_ops(), p_engine_, - part->get_fpmath_mode(), part->get_use_blocked_layout(), true); + part->get_fpmath_mode(), false, true); CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs)); CHECK(modify_subgraph()); CHECK(cfg_.locate_io(subgraph_, inputs, outputs)); diff --git a/src/graph/backend/dnnl/op_executable.cpp b/src/graph/backend/dnnl/op_executable.cpp index 2a21693ff3d..da5e633d59f 100644 --- a/src/graph/backend/dnnl/op_executable.cpp +++ b/src/graph/backend/dnnl/op_executable.cpp @@ -328,7 +328,9 @@ matmul_executable_t::desc_t matmul_executable_t::create_desc( pd_cache.at(op.get())); return {pd, true}; } - + bool can_use_blocked_layout = true; + if (p_engine.get_kind() == dnnl::engine::kind::gpu) + can_use_blocked_layout = mgr.get_use_blocked_layout(); dnnl::primitive_attr prm_attr; if (op->has_attr(op_attr::fusion_info_key) && op->get_attr(op_attr::fusion_info_key) != -1) { @@ -363,7 +365,9 @@ matmul_executable_t::desc_t matmul_executable_t::create_desc( // convert src memory desc to any when: // 1) not the situation mentioned above // 2) the given md is blocked and convert to queried layout is necessary - if (!use_strided_src || !is_plain(src)) { src = to_format_any(src); } + if (can_use_blocked_layout && (!use_strided_src || !is_plain(src))) { + src = to_format_any(src); + } auto wei = make_dnnl_memory_desc( 
op->get_input_value(1)->get_logical_tensor()); // For non-constant weight, create primitive desc with strided layout when: @@ -378,7 +382,9 @@ matmul_executable_t::desc_t matmul_executable_t::create_desc( && (is_format(wei, dnnl::memory::format_tag::adbc) || is_format(wei, dnnl::memory::format_tag::abdc) || is_format(wei, dnnl::memory::format_tag::acbd))); - if (!use_strided_wei) { wei = to_format_any(wei); } + if (can_use_blocked_layout && !use_strided_wei) { + wei = to_format_any(wei); + } auto dst = make_dnnl_memory_desc( op->get_output_value(0)->get_logical_tensor()); const bool keep_dst_layout = op->has_attr(op_attr::keep_dst_layout) @@ -387,7 +393,7 @@ matmul_executable_t::desc_t matmul_executable_t::create_desc( = ((src.get_ndims() == 2 || src.get_ndims() == 3) && p_engine.get_kind() == dnnl::engine::kind::gpu) || keep_dst_layout; - if (!use_strided_dst) { + if (can_use_blocked_layout && !use_strided_dst) { dst = to_format_any(dst); } else if (dst.get_format_kind() == dnnl::memory::format_kind::any && !keep_dst_layout) { From ea2c4b2fd48dd3916248b1459d5cb02092d2e3a2 Mon Sep 17 00:00:00 2001 From: Zhitao Wang Date: Wed, 22 May 2024 12:06:18 +0800 Subject: [PATCH 151/187] graph: backend: dnnl: enhance getting op direction in datatype check --- .../dnnl/patterns/data_type_check_pass.hpp | 157 ++++++++++++------ src/graph/backend/dnnl/platform.cpp | 4 +- src/graph/backend/dnnl/platform.hpp | 7 + .../graph/unit/backend/dnnl/test_pass.cpp | 46 +---- 4 files changed, 127 insertions(+), 87 deletions(-) diff --git a/src/graph/backend/dnnl/patterns/data_type_check_pass.hpp b/src/graph/backend/dnnl/patterns/data_type_check_pass.hpp index 42c5e64db44..e2e04e1125e 100644 --- a/src/graph/backend/dnnl/patterns/data_type_check_pass.hpp +++ b/src/graph/backend/dnnl/patterns/data_type_check_pass.hpp @@ -17,6 +17,7 @@ #ifndef GRAPH_BACKEND_DNNL_PATTERNS_DATA_TYPE_CHECK_PASS_HPP #define GRAPH_BACKEND_DNNL_PATTERNS_DATA_TYPE_CHECK_PASS_HPP +#include 
"graph/backend/dnnl/kernels/quantize.hpp" #include "graph/backend/dnnl/patterns/pattern_matcher_pass.hpp" #include "graph/backend/dnnl/platform.hpp" #include "graph/backend/fake/pattern_utils.hpp" @@ -32,6 +33,97 @@ namespace pattern { namespace { +platform::dir_t get_op_dir(const std::shared_ptr &aop) { + using namespace dnnl::impl::graph::op_kind; + using namespace dnnl::impl::graph::dnnl_impl::platform; + + const auto &op_kind = aop->get_kind(); + const auto &num_inputs = aop->num_inputs(); + const auto &num_outputs = aop->num_outputs(); + + dir_t dir = dir_t::FLAG_FWD; + switch (op_kind) { + // BatchNorm + case BatchNormForwardTraining: dir = dir_t::FWD_D; break; + case BatchNormInference: dir = dir_t::FWD_I; break; + case BatchNormTrainingBackward: + dir = num_outputs == 1 ? dir_t::BWD_D : dir_t::BWD_DW; + break; + // Convolution + case Convolution: + dir = num_inputs > 2 ? dir_t::FWD_B : dir_t::FWD_I; + break; + case ConvolutionBackwardData: dir = dir_t::BWD_D; break; + case ConvolutionBackwardWeights: dir = dir_t::BWD_W; break; + // ConvTranspose + case ConvTranspose: + dir = num_inputs > 2 ? 
dir_t::FWD_B : dir_t::FWD_I; + break; + case ConvTransposeBackwardData: dir = dir_t::BWD_D; break; + case ConvTransposeBackwardWeights: dir = dir_t::BWD_W; break; + // Eltwise + case Abs: + case Clamp: + case Elu: + case Exp: + case GELU: + case HardSigmoid: + case HardSwish: + case LeakyReLU: + case Log: + case Mish: + case Pow: + case Reciprocal: + case ReLU: + case Round: + case Sigmoid: + case SoftPlus: + case Sqrt: + case Square: + case SquaredDifference: + case Tanh: dir = dir_t::FWD_D; break; + case AbsBackward: + case ClampBackward: + case EluBackward: + case GELUBackward: + case HardSigmoidBackward: + case HardSwishBackward: + case MishBackward: + case ReLUBackward: + case SigmoidBackward: + case SoftPlusBackward: + case SqrtBackward: + case TanhBackward: dir = dir_t::BWD_D; break; + // LayerNorm + case LayerNorm: + // Outputs: SRC, MEAN( optional ), VAR( optional ) + dir = num_outputs == 1 ? dir_t::FWD_I : dir_t::FWD_D; + break; + case LayerNormBackward: dir = dir_t::BWD_DW; break; + // Pool + case MaxPool: + case AvgPool: dir = dir_t::FWD_I; break; + case MaxPoolBackward: + case AvgPoolBackward: dir = dir_t::BWD_D; break; + // PReLU + case PReLU: dir = dir_t::FWD_D; break; + case PReLUBackward: dir = dir_t::BWD_DW; break; + // Resampling + case Interpolate: dir = dir_t::FWD_D; break; + case InterpolateBackward: dir = dir_t::BWD_D; break; + // Softmax + case SoftMax: + case LogSoftmax: dir = dir_t::FWD_D; break; + case SoftMaxBackward: + case LogSoftmaxBackward: dir = dir_t::BWD_D; break; + // Other ops lack of propagation kind, which are always considered as + // forward, including Binary, Concat, Matmul, Reduction and Reorder. 
+ default: dir = dir_t::FLAG_FWD; break; + } + + return dir; +} + bool is_reorder_type(op_kind_t op_kind) { using namespace dnnl::impl::graph::op_kind; static const std::unordered_set reorder_ops {Reorder, Quantize, @@ -40,32 +132,6 @@ bool is_reorder_type(op_kind_t op_kind) { return (reorder_ops.find(op_kind) != reorder_ops.end()); } -bool is_backward_op(op_kind_t op_kind) { - using namespace dnnl::impl::graph::op_kind; - static std::unordered_set backward_op_kind = { - AbsBackward, - AvgPoolBackward, - BatchNormTrainingBackward, - BiasAddBackward, - ConvolutionBackwardData, - ConvolutionBackwardWeights, - ConvTransposeBackwardData, - ConvTransposeBackwardWeights, - HardSigmoidBackward, - InterpolateBackward, - LayerNormBackward, - LogSoftmaxBackward, - MaxPoolBackward, - MishBackward, - PReLUBackward, - ReLUBackward, - SigmoidBackward, - SoftPlusBackward, - TanhBackward, - }; - return backward_op_kind.find(op_kind) != backward_op_kind.end(); -} - } // namespace /*! @@ -85,25 +151,25 @@ class dtype_check_pass_t : public graph::pass::pass_base { // the criteria of pass execution impl::status_t run(graph_t &agraph) override { + using namespace dnnl::impl::graph::dnnl_impl::platform; + // check if current pattern pass can be run on current graph engine_kind_t graph_engine_kind = agraph.get_engine_kind(); if (get_engine_kind() != engine_kind::any_engine && get_engine_kind() != graph_engine_kind) return impl::status::success; - const std::vector dir_to_check { - platform::dir_t::FLAG_INF, platform::dir_t::FLAG_FWD, - platform::dir_t::FLAG_BWD}; - std::unordered_map> unsupported_dt; - unsupported_dt.reserve(dir_to_check.size()); - - for (const auto dir : dir_to_check) { - unsupported_dt.emplace(dir, std::vector {}); - for (const auto &dt : dt_to_check_) { - bool has_dtype_support = platform::get_dtype_support_status( - graph_engine_kind, dt, dir); - if (!has_dtype_support) unsupported_dt.at(dir).emplace_back(dt); + for (const std::shared_ptr &aop : agraph.get_ops()) { + 
dir_t dir = get_op_dir(aop); + if (unsupported_dt.find(dir) == unsupported_dt.end()) { + unsupported_dt.emplace(dir, std::vector {}); + for (const auto &dt : dt_to_check_) { + bool has_dtype_support = platform::get_dtype_support_status( + graph_engine_kind, dt, dir); + if (!has_dtype_support) + unsupported_dt.at(dir).emplace_back(dt); + } } } @@ -120,16 +186,12 @@ class dtype_check_pass_t : public graph::pass::pass_base { bool meet_unsupported_dt {false}; bool meet_reorder {false}; + dir_t dir = get_op_dir(aop); const auto &op_kind = aop->get_kind(); - platform::dir_t dir = platform::dir_t::FLAG_INF; - if (is_backward_op(op_kind)) - dir = platform::dir_t::FLAG_BWD; - else if (op_kind - == dnnl::impl::graph::op_kind::BatchNormForwardTraining) - // Currently, batchnorm forward training is the only forward op - // that provides extra output for training purpose. - dir = platform::dir_t::FLAG_FWD; + if (unsupported_dt.find(dir) == unsupported_dt.end()) { + return impl::status::unimplemented; + } const auto &dt_with_dir = unsupported_dt.at(dir); for (size_t i = 0; i < aop->num_inputs(); ++i) { @@ -179,7 +241,8 @@ class dtype_check_pass_t : public graph::pass::pass_base { if (!reorder_fusion_list.empty()) { pattern_utils_t dnnl_pu; const auto quantize_kernel_creater = []() -> kernel_ptr { - return std::make_shared(); + return std::make_shared< + dnnl::impl::graph::dnnl_impl::quantize_dequantize_t>(); }; dnnl_pu.init_partition(agraph, reorder_fusion_list, quantize_kernel_creater, diff --git a/src/graph/backend/dnnl/platform.cpp b/src/graph/backend/dnnl/platform.cpp index edec4c44ebd..4b8016f7e5c 100644 --- a/src/graph/backend/dnnl/platform.cpp +++ b/src/graph/backend/dnnl/platform.cpp @@ -78,8 +78,8 @@ bool get_dtype_support_status(engine_kind_t eng, data_type_t dtype, dir_t dir) { break; } case dnnl_f16: { - // TODO(zhitao): f16 for backward on gpu? - is_supported = is_gpu(eng); + // f16 is supported on GPU for inference only. 
+ is_supported = is_gpu(eng) && (dir & dir_t::FLAG_FWD); break; } case dnnl_f8_e5m2: { diff --git a/src/graph/backend/dnnl/platform.hpp b/src/graph/backend/dnnl/platform.hpp index 2dee9d6095e..1af9270b6c4 100644 --- a/src/graph/backend/dnnl/platform.hpp +++ b/src/graph/backend/dnnl/platform.hpp @@ -37,6 +37,13 @@ enum dir_t { FLAG_FWD = 32, FLAG_BWD = 64, FLAG_INF = 128, + FWD_D = FLAG_FWD + FLAG_DAT, + FWD_I = FLAG_FWD + FLAG_DAT + FLAG_INF, + FWD_B = FLAG_FWD + FLAG_DAT + FLAG_BIA, + BWD_D = FLAG_BWD + FLAG_DAT, + BWD_DW = FLAG_BWD + FLAG_DAT + FLAG_WEI, + BWD_W = FLAG_BWD + FLAG_WEI, + BWD_WB = FLAG_BWD + FLAG_WEI + FLAG_BIA, }; bool has_cpu_data_type_support(data_type_t data_type); diff --git a/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp b/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp index 30042600848..dfd7c1ecc77 100644 --- a/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp +++ b/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp @@ -30,6 +30,7 @@ #include "backend/dnnl/dnnl_backend.hpp" #include "backend/dnnl/dnnl_partition_impl.hpp" +#include "graph/backend/dnnl/patterns/data_type_check_pass.hpp" #include "graph/backend/dnnl/platform.hpp" #include "graph/unit/unit_test_common.hpp" #include "graph/unit/utils.hpp" @@ -63,42 +64,7 @@ bool is_supported_partition(const std::shared_ptr &p) { && (p->get_assigned_backend()->get_name() != "fake_backend"); } -static inline dir_t get_op_dir(dnnl::impl::graph::op_kind_t op_kind) { - using namespace dnnl::impl::graph::op_kind; - dir_t dir = dir_t::FLAG_INF; - static std::unordered_set backward_op_kind = { - AbsBackward, - AvgPoolBackward, - BatchNormTrainingBackward, - BiasAddBackward, - ConvolutionBackwardData, - ConvolutionBackwardWeights, - ConvTransposeBackwardData, - ConvTransposeBackwardWeights, - HardSigmoidBackward, - InterpolateBackward, - LayerNormBackward, - LogSoftmaxBackward, - MaxPoolBackward, - MishBackward, - PReLUBackward, - ReLUBackward, - SigmoidBackward, - SoftPlusBackward, - 
TanhBackward, - }; - - if (backward_op_kind.find(op_kind) != backward_op_kind.end()) - dir = dir_t::FLAG_BWD; - else if (op_kind == dnnl::impl::graph::op_kind::BatchNormForwardTraining) - // Currently, batchnorm forward training is the only forward op - // that provides extra output for training purpose. - dir = dir_t::FLAG_FWD; - - return dir; -} - -static bool is_supported_dtype(data_type_t dt, dir_t dir = dir_t::FLAG_INF) { +static bool is_supported_dtype(data_type_t dt, dir_t dir = dir_t::FLAG_FWD) { static graph::engine_t *engine = get_engine(); return get_dtype_support_status(engine->kind(), dt, dir); } @@ -4831,7 +4797,10 @@ class test_single_op_pass_t auto pm = pass::pass_manager_t(backend_ptr.get_pass_registry()); pm.run_passes(agraph, "no_config"); - if (!is_supported_dtype(params.data_type, get_op_dir(params.op_kind))) { + const auto aop_ptr = std::make_shared(aop); + if (!is_supported_dtype(params.data_type, + dnnl::impl::graph::dnnl_impl::pattern::get_op_dir( + aop_ptr))) { ASSERT_EQ(agraph.get_num_partitions(), 1U); auto p = agraph.get_partitions().front(); @@ -15213,7 +15182,8 @@ TEST(test_pass_pass_system, LayernormWithSpecialAxis) { auto pm = pass::pass_manager_t(backend_ptr.get_pass_registry()); pm.run_passes(agraph, "no_config"); - if (!is_supported_dtype(data_type::bf16)) { + if (!is_supported_dtype(data_type::bf16, + dnnl::impl::graph::dnnl_impl::platform::dir_t::FWD_I)) { ASSERT_EQ(agraph.get_num_partitions(), 1U); auto p = agraph.get_partitions().front(); From d5971f424c467d04bdf64ed8994f1a41d2be050d Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 21 May 2024 14:31:59 -0700 Subject: [PATCH 152/187] gpu: intel: compute: fix large GRF mode query with ARL-H The workaround is no more relevant, DG2 A0 is used neither in development nor in production. It is also more reliable and correct to assume that systolic support implies large GRF mode support. 
--- src/gpu/intel/compute/compute_engine.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/gpu/intel/compute/compute_engine.hpp b/src/gpu/intel/compute/compute_engine.hpp index 368d563f5f4..73aee48becd 100644 --- a/src/gpu/intel/compute/compute_engine.hpp +++ b/src/gpu/intel/compute/compute_engine.hpp @@ -163,12 +163,7 @@ class compute_engine_t : public engine_t { : device_info_->gpu_arch() >= compute::gpu_arch_t::xe_hpc; } bool mayiuse_large_grf_mode() const { - // XXX: XeHPG 128EU A0 causes hangs with large GRF mode. - if (is_xe_hpg() && device_info()->eu_count() == 128 - && device_info()->stepping_id() == 0 - && device_info()->mayiuse_systolic()) - return false; - return device_info_->gpu_arch() >= compute::gpu_arch_t::xe_hp; + return device_info()->mayiuse_systolic(); } dispatch_t create_dispatch(const memory_desc_t *md = nullptr) const { From 2cc1f6ab013388c4b2e0918f487a9f574c9afd33 Mon Sep 17 00:00:00 2001 From: Tomasz Czeszun Date: Wed, 22 May 2024 10:00:13 -0700 Subject: [PATCH 153/187] tests: benchdnn: eltwise: disable unimplemented test cases for GPU --- tests/benchdnn/eltwise/eltwise.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/benchdnn/eltwise/eltwise.cpp b/tests/benchdnn/eltwise/eltwise.cpp index b2ed404a77f..e5d1fe08c90 100644 --- a/tests/benchdnn/eltwise/eltwise.cpp +++ b/tests/benchdnn/eltwise/eltwise.cpp @@ -284,6 +284,13 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { skip_unimplemented_data_type({prb->dt}, prb->dir, res); skip_unimplemented_sum_po(prb->attr, res, dnnl_eltwise, prb->dt); skip_unimplemented_prelu_po(prb->attr, res, dnnl_eltwise); + + if (is_gpu() && (prb->dt == dnnl_f8_e5m2 || prb->dt == dnnl_f8_e4m3) + && prb->dir == BWD_D) { + res->state = SKIPPED; + res->reason = skip_reason::data_type_not_supported; + return; + } } void skip_invalid_prb(const prb_t *prb, res_t *res) { From 799ac1cc61119b9ccdb0294b72ac1c048b3b68d1 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: 
Tue, 21 May 2024 13:19:54 -0700 Subject: [PATCH 154/187] gpu: intel: jit: pass: remove unused code --- src/gpu/intel/jit/pass/cse.cpp | 130 --------------------------------- 1 file changed, 130 deletions(-) diff --git a/src/gpu/intel/jit/pass/cse.cpp b/src/gpu/intel/jit/pass/cse.cpp index cfb70bb6867..2874ebdb05b 100644 --- a/src/gpu/intel/jit/pass/cse.cpp +++ b/src/gpu/intel/jit/pass/cse.cpp @@ -138,136 +138,6 @@ class cse_var_entry_t { const object_map_t *var2entry_ = nullptr; }; -// Helper class for IR nodes where CSE variables may be -// generated. Entry stores the peak GRF usage and propagates -// additional memory usage up and down the IR tree. -class cse_stmt_entry_t { -public: - bool visited() const { return visited_; } - - void set_bytes(int bytes) { - bytes_ = bytes; - visited_ = true; - }; - - void set_parent(cse_stmt_entry_t *parent) { - parent_ = parent; - parent_->childs_.push_back(this); - } - - bool try_allocate(int size, int limit) { - const auto alloc_size - = utils::rnd_up(size, reg_allocator_t::granularity); - if (bytes_ + alloc_size > limit) return false; - propagate_usage_down(alloc_size); - if (parent_) parent_->propagate_usage_up(this); - return true; - } - - void propagate_usage_up() { - for (auto *c : childs_) { - propagate_usage_up(c); - } - } - -private: - void propagate_usage_up(const cse_stmt_entry_t *child) { - if (child->bytes_ <= bytes_) return; - bytes_ = child->bytes_; - if (parent_) parent_->propagate_usage_up(this); - } - - void propagate_usage_down(int size) { - bytes_ += size; - for (auto *c : childs_) - c->propagate_usage_down(size); - } - - int bytes_ = 0; - bool visited_ = false; - cse_stmt_entry_t *parent_ = nullptr; - std::vector childs_; -}; - -class cse_memory_usage_visitor_t : public ir_visitor_t { -public: - cse_memory_usage_visitor_t( - std::unordered_map - &entries, - const object_eq_map_t &cse_exprs, int grf_size) - : entries_(entries), grf_size_(grf_size) { - for (auto &kv : cse_exprs) { - auto &cse_expr = 
kv.second; - if (cse_expr.cse_var.is_empty()) continue; - auto *obj = cse_expr.path.back(); - entries_.emplace(obj, cse_stmt_entry_t()); - } - } - - ~cse_memory_usage_visitor_t() override { - for (auto &kv : entries_) { - ir_assert(kv.second.visited()) << *kv.first; - } - } - -#define HANDLE_IR_OBJECT(type) \ - void _visit(const type &obj) override { visit_stmt(obj); } - - HANDLE_STMT_IR_OBJECTS() - -#undef HANDLE_IR_OBJECT - -private: - mem_usage_guard_t grf_usage_guard(int size) { - return mem_usage_guard_t(&cur_bytes_, size); - } - - template - void visit_stmt(const T &obj) { - cur_depth_++; - int obj_bytes = 0; - if (auto *alloc = obj.template as_ptr()) { - if (alloc->kind == alloc_kind_t::grf) - obj_bytes = utils::rnd_up(alloc->size, grf_size_); - } else if (auto *let = obj.template as_ptr()) { - obj_bytes = utils::rnd_up( - let->var.type().size(), reg_allocator_t::granularity); - } - - auto guard = grf_usage_guard(obj_bytes); - cse_stmt_entry_t *entry = nullptr; - auto it = entries_.find(&obj); - if (it != entries_.end()) entry = &it->second; - if (entry) { - if (!path_.empty()) entry->set_parent(path_.back()); - path_.push_back(entry); - } - ir_visitor_t::_visit(obj); - auto peak_it = peak_down_bytes_.find(cur_depth_ + 1); - int cur_peak_bytes = obj_bytes; - if (peak_it != peak_down_bytes_.end()) { - cur_peak_bytes += peak_it->second; - peak_down_bytes_.erase(peak_it); - } - peak_down_bytes_[cur_depth_] - = std::max(peak_down_bytes_[cur_depth_], cur_peak_bytes); - if (entry) { - path_.pop_back(); - entry->set_bytes(cur_bytes_ + cur_peak_bytes); - } - cur_depth_--; - } - - std::unordered_map &entries_; - int grf_size_; - - int cur_bytes_ = 0; - std::vector path_; - - int cur_depth_ = 0; - std::unordered_map peak_down_bytes_; // depth -> bytes -}; - // Stores information about all expressions subject to CSEing. 
class cse_context_t { public: From 169e02857fc37a2cf068b44b0c42cf6a1527864a Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Wed, 22 May 2024 06:32:42 -0700 Subject: [PATCH 155/187] gpu: intel: jit: utils: fix performance trace level --- src/gpu/intel/jit/utils/trace.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpu/intel/jit/utils/trace.hpp b/src/gpu/intel/jit/utils/trace.hpp index 2cb4fccd3c8..ffb584b8d04 100644 --- a/src/gpu/intel/jit/utils/trace.hpp +++ b/src/gpu/intel/jit/utils/trace.hpp @@ -32,19 +32,19 @@ class ir_context_t; #ifdef DNNL_DEV_MODE ir_utils::debug_profiler_t &get_trace_profiler(); inline void trace_start() { - if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_TRACE) + if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_PERF) get_trace_profiler().start(); } inline void trace_reset() { - if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_TRACE) + if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_PERF) get_trace_profiler().reset(); } inline void trace_stamp(const char *pass_name) { - if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_TRACE) + if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_PERF) get_trace_profiler().stamp(pass_name); } inline void trace_stop(const char *pass_name) { - if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_TRACE) + if (get_verbose(verbose_t::debuginfo) >= ir_utils::LOG_PERF) get_trace_profiler().stop(pass_name); } inline void trace_perf() { From a4a4237b7f829df49cc7684be6ce0b6ef9632506 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Wed, 22 May 2024 07:46:49 -0700 Subject: [PATCH 156/187] gpu: intel: jit: ir: move register allocation size logic to core --- src/gpu/intel/jit/ir/core.hpp | 12 ++++++++++++ src/gpu/intel/jit/ir/ir.cpp | 22 +++++----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/gpu/intel/jit/ir/core.hpp b/src/gpu/intel/jit/ir/core.hpp index 94160b02a43..3419956b9f6 100644 --- 
a/src/gpu/intel/jit/ir/core.hpp +++ b/src/gpu/intel/jit/ir/core.hpp @@ -28,6 +28,7 @@ #include "common/c_types_map.hpp" #include "common/float16.hpp" #include "common/math_utils.hpp" +#include "gpu/intel/jit/codegen/register_allocator.hpp" #include "gpu/intel/jit/utils/ngen_proxy.hpp" #include "gpu/intel/jit/utils/utils.hpp" @@ -2049,6 +2050,10 @@ class alloc_t : public stmt_impl_t { return attrs[0].as(); } + int register_alloc_size(int grf_size) const { + return (kind == alloc_kind_t::grf) ? utils::rnd_up(size, grf_size) : 0; + } + IR_DECLARE_TRAVERSERS() expr_t buf; @@ -2264,6 +2269,13 @@ class let_t : public stmt_impl_t { return ir_utils::get_hash(var, value, body); } + int register_alloc_size() const { + // Empty objects are allocated in reserved space + // nGEN only claims subregisters at dword granularity + if (value.is_empty()) return 0; + return utils::rnd_up(var.type().size(), reg_allocator_t::granularity); + }; + IR_DECLARE_TRAVERSERS() expr_t var; diff --git a/src/gpu/intel/jit/ir/ir.cpp b/src/gpu/intel/jit/ir/ir.cpp index 272b880720d..54be3ed71db 100644 --- a/src/gpu/intel/jit/ir/ir.cpp +++ b/src/gpu/intel/jit/ir/ir.cpp @@ -20,7 +20,6 @@ #include "common/math_utils.hpp" #include "common/optional.hpp" -#include "gpu/intel/jit/codegen/register_allocator.hpp" #include "gpu/intel/jit/ir/core.hpp" #include "gpu/intel/jit/ir/message.hpp" #include "gpu/intel/jit/pass/simplify.hpp" @@ -41,8 +40,8 @@ class ir_printer_t : public ir_visitor_t { ir_printer_t(std::ostream &out) : out_(out) {} void _visit(const alloc_t &obj) override { - auto guard - = mem_usage_guard(obj.kind == alloc_kind_t::grf ? 
obj.size : 0); + auto grf_size = 1; // Assume all objects are grf aligned + auto guard = mem_usage_guard(obj.register_alloc_size(grf_size)); print_indent(); out_ << "alloc " << obj.buf.as().name << "[" << obj.size << "] (mem_usage: " << mem_usage_bytes_ << ")\n"; @@ -135,11 +134,7 @@ class ir_printer_t : public ir_visitor_t { } void _visit(const let_t &obj) override { - // Empty objects are allocated in reserved space - // nGEN only claims subregisters at dword granularity - int size = obj.value.is_empty() ? 0 - : utils::rnd_up(obj.var.type().size(), - reg_allocator_t::granularity); + int size = obj.register_alloc_size(); auto guard = mem_usage_guard(size); print_indent(); out_ << obj.var << "." << obj.var.type() << " = " << obj.value << "\n"; @@ -670,19 +665,12 @@ class grf_usage_visitor_t : public ir_visitor_t { : grf_size_(grf_size), skip_let_(skip_let), regs_(external_regs) {} void _visit(const alloc_t &obj) override { - int size = (obj.kind == alloc_kind_t::grf ? obj.size : 0); - size = utils::rnd_up(size, grf_size_); - auto guard = grf_usage_guard(size); + auto guard = grf_usage_guard(obj.register_alloc_size(grf_size_)); ir_visitor_t::_visit(obj); } void _visit(const let_t &obj) override { - // Empty objects are allocated in reserved space - // nGEN only claims subregisters at dword granularity - int size = (skip_let_ || obj.value.is_empty()) - ? 0 - : utils::rnd_up( - obj.var.type().size(), reg_allocator_t::granularity); + int size = skip_let_ ? 0 : obj.register_alloc_size(); auto guard = grf_usage_guard(size); ir_visitor_t::_visit(obj); } From 4e84bd31855887cbde8abae1e0699d8d798d91af Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Wed, 22 May 2024 08:06:53 -0700 Subject: [PATCH 157/187] gpu: intel: jit: pass: fix to skip cse_exprs only on critical path The previous algorithm did not consider the allocation stack when deciding which `let_t` statements to skip. The result is that skipping expressions may not reduce register consumption. 
Beyond the unnecessary work this can create, this also can cause kernel generation failure due to the mis-estimation of GRF usage. This patch modifies the existing algorithm to run in allocation stack aware manner and avoid unneeded skips. --- src/gpu/intel/jit/pass/cse.cpp | 266 ++++++++++++++++----------------- 1 file changed, 126 insertions(+), 140 deletions(-) diff --git a/src/gpu/intel/jit/pass/cse.cpp b/src/gpu/intel/jit/pass/cse.cpp index 2874ebdb05b..02dffec703a 100644 --- a/src/gpu/intel/jit/pass/cse.cpp +++ b/src/gpu/intel/jit/pass/cse.cpp @@ -77,12 +77,10 @@ class cse_var_entry_t { const cse_expr_t *cse_expr() const { return cse_expr_; } - bool unallocated() const { return allocated_ == allocated_t::no; } - bool allocated() const { return allocated_ == allocated_t::yes; } + bool allocated() const { return allocated_; } - void set_unallocated() { allocated_ = allocated_t::no; } - void set_allocated() { allocated_ = allocated_t::yes; } - void mark() { allocated_ = allocated_t::mark; } + void set_unallocated() { allocated_ = false; } + void set_allocated() { allocated_ = true; } int size() const { return utils::rnd_up( @@ -126,16 +124,109 @@ class cse_var_entry_t { const cse_expr_t *cse_expr_ = nullptr; int cost_ = 0; + bool allocated_ = true; + const object_map_t *var2entry_ = nullptr; +}; + +// Greedily marks the least beneficial cse entries as unallocated, so that those +// expressions can be skipped in the final CSE output. 
+class cse_skipper_t : public ir_visitor_t { +public: + cse_skipper_t(const object_eq_map_t &cse_exprs, + int grf_limit, int grf_size) + : grf_limit_(grf_limit), grf_size_(grf_size) { + + for (auto &kv : cse_exprs) { + auto &cse_expr = kv.second; + if (cse_expr.cse_var.is_empty()) continue; + entries_.emplace_back(&cse_expr); + } - enum class allocated_t { - no, - yes, - mark // Used for topological sorting algorithm - }; + for (auto &e : entries_) { + var2entry_.emplace(e.cse_expr()->cse_var, &e); + e.set_var2entry(var2entry_); + } + } - allocated_t allocated_ = allocated_t::no; + void _visit(const alloc_t &obj) override { + auto size = obj.register_alloc_size(grf_size_); + grf_usage_ += size; + handle_grf_overflow(); - const object_map_t *var2entry_ = nullptr; + ir_visitor_t::_visit(obj); + + grf_usage_ -= size; + } + + void _visit(const let_t &obj) override { + auto it = var2entry_.find(obj.var); + auto *e = it != var2entry_.end() ? var2entry_.find(obj.var)->second + : nullptr; + + int size = obj.register_alloc_size(); + if (e) { + var_stack_.emplace_back(e); + if (e->allocated()) { grf_usage_ += size; } + } else { + grf_usage_ += size; + } + handle_grf_overflow(); + + ir_visitor_t::_visit(obj); + + if (e) { + if (e->allocated()) grf_usage_ -= size; + var_stack_.pop_back(); + } else { + grf_usage_ -= size; + } + } + + void handle_grf_overflow() { + if (grf_usage_ <= grf_limit_) return; + + std::vector sorted_var_entries = [&]() { + std::vector ret; + for (auto v : var_stack_) { + if (v->allocated()) ret.emplace_back(v); + } + return ret; + }(); + + auto it = sorted_var_entries.begin(); + while (grf_usage_ > grf_limit_ && it != sorted_var_entries.end()) { + // var_stack_ is guaranteed to be in topological order due to + // traversing the IR tree. 
+ for (auto &e : var_stack_) { + e->recompute_cost(); + } + std::sort(it, sorted_var_entries.end(), + [&](const cse_var_entry_t *a, const cse_var_entry_t *b) { + // Sort by cost per byte + return a->cost() * b->size() < b->cost() * a->size(); + }); + auto &e = **it; + + ir_trace() << "cse_pass: skipping " << e.cse_expr()->expr + << " with cost " << e.cost() << ", size " << e.size() + << ", and cost per byte " << (double)e.cost() / e.size() + << "\n"; + + e.set_unallocated(); + grf_usage_ -= e.size(); + ++it; + } + } + + const std::vector &entries() const { return entries_; }; + +private: + std::vector entries_; + object_map_t var2entry_; + std::vector var_stack_; + int grf_usage_ = 0; + int grf_limit_ = 0; + int grf_size_ = 0; }; // Stores information about all expressions subject to CSEing. @@ -238,125 +329,17 @@ class cse_context_t { return true; } - void set_skip_exprs( - const stmt_t &root, int usage, int limit, int grf_size) { - struct var_entries_t { - var_entries_t( - const object_eq_map_t &cse_exprs) { - for (auto &kv : cse_exprs) { - auto &cse_expr = kv.second; - if (cse_expr.cse_var.is_empty()) continue; - entries_.emplace_back(&cse_expr); - } - - for (auto &e : entries_) { - var2entry_.emplace(e.cse_expr()->cse_var, &e); - e.set_var2entry(var2entry_); - } - - topological_sort(); - - for (auto &e : entries_) { - gpu_assert(e.allocated()) - << "unallocated: " << e.cse_expr()->cse_var << " = " - << e.cse_expr()->expr << "\n"; - } - } - - std::vector::iterator begin() { - return entries_.begin(); - }; - std::vector::iterator end() { - return entries_.end(); - }; - - private: - // Depth first search visitor for topological_sort() - void visit(cse_var_entry_t &e, - std::vector::reverse_iterator &head) { - if (e.allocated()) return; - gpu_assert(e.unallocated()) - << "Cyclic expression dependency detected"; - e.mark(); - - for (auto &dep : find_objects(e.cse_expr()->expr)) { - auto it = var2entry_.find(dep); - if (it != var2entry_.end()) visit(*(it->second), 
head); - } - e.set_allocated(); - - *head++ = &e; - } - - // Topological sort `entries_` and mark all expressions as allocated. - // Topological sort is required for correct iteration order when - // updating node costs. Uses a depth first search based algorithm. - void topological_sort() { - std::vector e_sorted( - entries_.size(), nullptr); - auto head = e_sorted.rbegin(); - for (auto it = entries_.begin(); it != entries_.end(); it++) { - if (it->unallocated()) { visit(*it, head); } - } - - std::vector entries; - entries.reserve(entries_.size()); - for (auto e_ptr : e_sorted) { - entries.emplace_back(*e_ptr); - } - entries_ = std::move(entries); - for (auto &e : entries_) { - var2entry_[e.cse_expr()->cse_var] = &e; - } - } - - std::vector entries_; - object_map_t var2entry_; - }; - - var_entries_t var_entries(cse_exprs_); - - // Greedily remove the least beneficial variable until memory usage - // limit is met. - std::vector sorted_var_entries; - for (auto &e : var_entries) { - sorted_var_entries.push_back(&e); - } - - int overflow_size = usage - limit; - auto it = sorted_var_entries.begin(); - while (overflow_size > 0 && it != sorted_var_entries.end()) { - // Update costs. - for (auto &e : var_entries) { - e.recompute_cost(); - } - std::sort(it, sorted_var_entries.end(), - [&](const cse_var_entry_t *a, const cse_var_entry_t *b) { - // Sort by cost per byte - return a->cost() * b->size() < b->cost() * a->size(); - }); - auto &e = **it; - - ir_trace() << "cse_pass: unmarking " << e.cse_expr()->expr - << " with cost " << e.cost() << ", size " << e.size() - << ", and cost per byte " << (double)e.cost() / e.size() - << "\n"; - - e.set_unallocated(); - overflow_size -= e.size(); - ++it; - } - - // Skip not allocated variables. + bool set_skip_exprs(const stmt_t &root, int limit, int grf_size) { + cse_skipper_t skipper(cse_exprs_, limit, grf_size); + skipper.visit(root); // TODO: Rather than rerun CSE, just delete `let_t` and substitute - // variables with their value. 
This needs to be performed in the reverse - // order on `var_entries` to ensure no substitutions are missed in - // computation chains. - for (auto &e : var_entries) { + // variables with their value. + for (auto &e : skipper.entries()) { if (e.allocated()) continue; skip_exprs_.insert(e.cse_expr()->orig_expr); } + return !skip_exprs_.empty(); } void reset_cse_exprs() { cse_exprs_.clear(); } @@ -675,22 +658,25 @@ stmt_t eliminate_common_subexprs_impl(const stmt_t &_stmt, cse_context_t &ctx, stmt = mutator.mutate(stmt); // The second run is the last run. - if (run_idx != 0) return stmt; - - // If memory usage exceeds the limit, exclude some - // expressions from CSE and retry the whole process from - // scratch. - int memory_usage = get_peak_regs(stmt, grf_size) * grf_size; - if (memory_usage > memory_usage_limit) { - ir_trace() << "CSE exceeded GRF usage limit. Usage: " << memory_usage - << ", limit: " << memory_usage_limit - << ". Retry CSE and skip some expressions..." << std::endl; - ctx.set_skip_exprs(_stmt, memory_usage, memory_usage_limit, grf_size); - ctx.reset_cse_exprs(); - return stmt_t(); + if (run_idx != 0) { + gpu_assert( + get_peak_regs(stmt, grf_size) * grf_size <= memory_usage_limit + || get_peak_regs(_stmt, grf_size) * grf_size + >= memory_usage_limit); + return stmt; } - return stmt; + // If memory usage exceeds the limit, exclude some expressions from CSE and + // retry the whole process from scratch. + bool has_skip = ctx.set_skip_exprs(stmt, memory_usage_limit, grf_size); + if (!has_skip) return stmt; + + int memory_usage = get_peak_regs(stmt, grf_size) * grf_size; + ir_trace() << "CSE exceeded GRF usage limit. Usage: " << memory_usage + << ", limit: " << memory_usage_limit + << ". Retry CSE and skip some expressions..." 
<< std::endl; + ctx.reset_cse_exprs(); + return stmt_t(); } stmt_t eliminate_common_subexprs( From 69883d929410c543414ff20bc580971323f53700 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Wed, 22 May 2024 11:30:37 -0700 Subject: [PATCH 158/187] gpu: intel: jit: pass: improve bool variable assignment --- src/gpu/intel/jit/pass/cse.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/gpu/intel/jit/pass/cse.cpp b/src/gpu/intel/jit/pass/cse.cpp index 02dffec703a..c34aa8933b7 100644 --- a/src/gpu/intel/jit/pass/cse.cpp +++ b/src/gpu/intel/jit/pass/cse.cpp @@ -95,14 +95,15 @@ class cse_var_entry_t { } void recompute_cost() { - cost_ = expr_cost(cse_expr_->expr) * cse_expr_->refs; + cost_ = expr_cost(cse_expr_->expr, var2entry_) * cse_expr_->refs; } -private: - int expr_cost(const expr_t &e) { + static int expr_cost(const expr_t &e, + const object_map_t *var2entry) { if (is_var(e)) { - auto it = var2entry_->find(e); - if (it == var2entry_->end()) return 0; + if (var2entry == nullptr) return 0; + auto it = var2entry->find(e); + if (it == var2entry->end()) return 0; if (it->second->allocated()) return 0; // If variable is not allocated, its value // has to be recomputed every time. 
@@ -111,9 +112,12 @@ class cse_var_entry_t { if (is_const(e)) return 0; if (e.is()) return e.type().is_bool(); if (auto *op = e.as_ptr()) { - return expr_cost(op->a) + expr_cost(op->b) + 1; + return expr_cost(op->a, var2entry) + expr_cost(op->b, var2entry) + + 1; + } + if (auto *op = e.as_ptr()) { + return expr_cost(op->a, var2entry) + 1; } - if (auto *op = e.as_ptr()) { return expr_cost(op->a) + 1; } if (auto *s = e.as_ptr()) { if (s->is_broadcast()) return 0; return s->elems(); @@ -122,6 +126,7 @@ class cse_var_entry_t { return 0; } +private: const cse_expr_t *cse_expr_ = nullptr; int cost_ = 0; bool allocated_ = true; @@ -324,7 +329,14 @@ class cse_context_t { || is_const(e)) return false; auto &cse_expr = find_cse_expr(e); - if (cse_expr.refs <= (e.type().is_bool() ? 2 : 1)) return false; + + if (cse_expr.refs <= 1) return false; + if (e.type().is_bool()) { + // Account for possible cost to move bool variable to and from flag + // register + auto cost = cse_var_entry_t::expr_cost(cse_expr.expr, nullptr); + if (cost + cse_expr.refs + 1 >= cost * cse_expr.refs) return false; + } if (skip_exprs_.count(cse_expr.orig_expr) != 0) return false; return true; } From 8852679db87ca455e1c02056e6a75c7d534413c8 Mon Sep 17 00:00:00 2001 From: "Taylor, Deb" Date: Thu, 9 May 2024 14:41:27 -0500 Subject: [PATCH 159/187] doc: Updated broken links in Memory Formats file Signed-off-by: Taylor, Deb --- doc/advanced/understanding_memory_formats.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/advanced/understanding_memory_formats.md b/doc/advanced/understanding_memory_formats.md index 9b04c5c156c..30d2f191179 100644 --- a/doc/advanced/understanding_memory_formats.md +++ b/doc/advanced/understanding_memory_formats.md @@ -137,17 +137,15 @@ For a single image (**N** = 1), this format is very similar to how where the image is kept pixel by pixel and every pixel contains all required information about colors (for instance, three channels for 24bit BMP). 
-NHWC data format is the default one for -[TensorFlow](https://www.tensorflow.org/performance/performance_guide#data_formats). +NHWC is the default data format for image recognition in +[TensorFlow](https://www.tensorflow.org/api_docs/python/tf/conv). This layout corresponds to #dnnl_nhwc or dnnl::memory::format_tag::nhwc. #### CHWN -The last example here for the plain data layout is **CHWN**, which is used by -[Neon](https://neon.nervanasys.com/index.html/design.html#data-layout). -This layout might be very interesting from a vectorization perspective if +The last example here for the plain data layout is **CHWN**. This layout might be very interesting from a vectorization perspective if an appropriate batch size is used, but on the other hand users cannot always have *good* batch size (for example, in case of real-time inference batch is typically 1). @@ -222,6 +220,7 @@ the function above is: dnnl_memory_desc_create_with_strides(&md, ndims, dims, dnnl_f32, strides); ~~~ + In particular, whenever a user creates memory with the #dnnl_nchw format, oneDNN computes the strides and fills the structure on behalf of the user. 
From 4d12649f925479c3440fe16b1da4d1edc5bfea23 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Thu, 16 May 2024 11:58:27 -0700 Subject: [PATCH 160/187] gpu: reduction: remove vectorized loads --- .../intel/ocl/reduction/combined_reduction.cl | 174 +++++++----------- .../ocl/reduction/combined_reduction.cpp | 51 +---- .../ocl/reduction/combined_reduction.hpp | 3 +- 3 files changed, 71 insertions(+), 157 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index ecce525f0e3..8908f6453f2 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -20,7 +20,7 @@ // Define how to read data #define BLOCK_READ_DATA_T(data_ptr) \ - AS_VECT_DATA_T(VECT_BLOCK_READ((const __global BLOCK_DATA_T *)data_ptr)) + AS_DATA_T(BLOCK_READ((const __global BLOCK_DATA_T *)data_ptr)) #define READ_DATA(val) WITH_BLOCK_READ ? BLOCK_READ_DATA_T(&val) : val // Zero-padding defines @@ -105,18 +105,15 @@ __kernel void combined_reduce( __global SRC_DATA_T *src, __global DST_DATA_T *dst POST_OP_ARGS) { // Compute constants deriving from defined constants - const int sg_per_inner_dim - = div_up(div_up(INNER_DIM_SIZE, VECT_DT_N), SUBGROUP_SIZE); + const int sg_per_inner_dim = div_up(INNER_DIM_SIZE, SUBGROUP_SIZE); const int inner_dims_per_sg = min(REDUCTION_SIZE, max(1, SUBGROUP_SIZE / INNER_DIM_SIZE)); - const int num_horiz_reductions = REDUCTION_SIZE / inner_dims_per_sg - / (REDUCE_VECTOR ? 
VECT_DT_N : 1); + const int num_horiz_reductions = REDUCTION_SIZE / inner_dims_per_sg; const int tail_reductions = REDUCTION_SIZE % inner_dims_per_sg; // Direct indices from gws const int sgid = get_global_id(0) / SUBGROUP_SIZE; - const int inner_idx_start - = (sgid % sg_per_inner_dim) * SUBGROUP_SIZE * VECT_DT_N; + const int inner_idx_start = (sgid % sg_per_inner_dim) * SUBGROUP_SIZE; // Handle inner vector packing into subgroups const int sglid = get_sub_group_local_id(); @@ -128,15 +125,12 @@ combined_reduce( || sglid >= INNER_DIM_SIZE * inner_dims_per_sg) return; - const int loop_stride = _SRC_OFF( - 0, inner_dims_per_sg * (REDUCE_VECTOR ? VECT_DT_N : 1), 0); + const int loop_stride = _SRC_OFF(0, inner_dims_per_sg, 0); unroll_for(int oid = 0; oid < OUTER_TILE_SIZE; oid++) { const int outer_idx = sgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; - DEF_ACC_DATA_T acc[VECT_DT_N]; - unroll_for(int v = 0; v < VECT_DT_N; v++) { - init_acc(REDUCTION_ALG, &acc[v]); - } + DEF_ACC_DATA_T acc; + init_acc(REDUCTION_ALG, &acc); int src_off = _SRC_OFF(outer_idx, WITH_BLOCK_READ ? 0 : red_off, WITH_BLOCK_READ ? 
inner_idx_start : inner_idx); @@ -144,134 +138,102 @@ combined_reduce( for (int off = 0; off < num_horiz_reductions; off++, src_off += loop_stride) { // Load - const VECT_DATA_T src_val = READ_DATA(src[src_off]); - const DATA_T *next_val = (DATA_T *)&src_val; + const DATA_T src_val = READ_DATA(src[src_off]); // Accumulate - unroll_for(int v = 0; v < VECT_DT_N; v++) { - acc[v] = reduce(REDUCTION_ALG, acc[v], - TO_DEF_ACC_DATA_T(next_val[v]), POWER); - } + acc = reduce(REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); } if (red_off < tail_reductions) { // Load - const VECT_DATA_T src_val = READ_DATA(src[src_off]); - const DATA_T *next_val = (DATA_T *)&src_val; + const DATA_T src_val = READ_DATA(src[src_off]); // Accumulate - unroll_for(int v = 0; v < VECT_DT_N; v++) { - acc[v] = reduce(REDUCTION_ALG, acc[v], - TO_DEF_ACC_DATA_T(next_val[v]), POWER); - } + acc = reduce(REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); } // Potentially accumulate within the subgroup too // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases - DEF_ACC_DATA_T acc_sg[VECT_DT_N]; - for (int v = 0; v < VECT_DT_N; v++) { - init_acc(SECONDARY_REDUCTION_ALG, &acc_sg[v]); - } + DEF_ACC_DATA_T acc_sg; + init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); unroll_for(int i = 0; i < inner_dims_per_sg; i++) { - unroll_for(int v = 0; v < VECT_DT_N; v++) { - DEF_ACC_DATA_T next = intel_sub_group_shuffle_down(acc[v], - SPECIAL(DEF_ACC_DATA_T, zero), i * INNER_DIM_SIZE); - acc_sg[v] = reduce( - SECONDARY_REDUCTION_ALG, acc_sg[v], next, POWER); - } + DEF_ACC_DATA_T next = intel_sub_group_shuffle_down( + acc, SPECIAL(DEF_ACC_DATA_T, zero), i * INNER_DIM_SIZE); + acc_sg = reduce(SECONDARY_REDUCTION_ALG, acc_sg, next, POWER); } if (sglid < INNER_DIM_SIZE) { - const int final_vec_size = REDUCE_VECTOR ? 
1 : VECT_DT_N; -#if REDUCE_VECTOR - DEF_ACC_DATA_T final_acc[1]; - init_acc(SECONDARY_REDUCTION_ALG, final_acc); - unroll_for(int v = 0; v < VECT_DT_N; v++) { - final_acc[0] = reduce(SECONDARY_REDUCTION_ALG, acc_sg[v], - final_acc[0], POWER); - } -#else - // Just rename the variable to match the REDUCE_VECTOR case - DEF_ACC_DATA_T final_acc[VECT_DT_N]; - for (int v = 0; v < VECT_DT_N; v++) { - final_acc[v] = acc_sg[v]; - } -#endif // REDUCE_VECTOR - // For each result: // 1. (if IS_FINAL) finalize the result // 2. (if IS_FINAL) apply post-ops // 3. write to dst - for (int v = 0; v < final_vec_size; v++) { - const dim_t dst_off - = _DST_OFF(outer_idx, inner_idx + v * SUBGROUP_SIZE); - // finalize the result + const dim_t dst_off = _DST_OFF(outer_idx, inner_idx); + // finalize the result #if IS_FINAL - float res = finalize(REDUCTION_ALG, convert_float(final_acc[v]), - DIV, POWER, EPS); + float res = finalize( + REDUCTION_ALG, convert_float(acc_sg), DIV, POWER, EPS); - // Apply post-ops + // Apply post-ops #if WITH_POST_OP - float dst_val; + float dst_val; #if WITH_SUM - dst_val = DST_TO_REF(dst[dst_off]); + dst_val = DST_TO_REF(dst[dst_off]); #endif // WITH_SUM - // Reconstruct MB/C/D/H/W indices from dst_off - const int mb = (DST_S0 == 0) - ? 0 - : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 - + dst_off / DST_SB0 % DST_B0; - const int c = (DST_S1 == 0) - ? 0 - : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 - + dst_off / DST_SB1 % DST_B1; - const int d = (DST_S2 == 0) - ? 0 - : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 - + dst_off / DST_SB2 % DST_B2; - const int h = (DST_S3 == 0) - ? 0 - : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 - + dst_off / DST_SB3 % DST_B3; - const int w = (DST_S4 == 0) - ? 
0 - : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 - + dst_off / DST_SB4 % DST_B4; - - // Only use post-ops on non-zero-padded elements - if (mb < DST_D0 && c < DST_D1 && d < DST_D2 && h < DST_D3 - && w < DST_D4) { - APPLY_POST_OPS_SERIAL(res, float, dst_val, float, mb, 1, c, - 1, d, 1, h, 1, w, 1, 0, 1); - } + // Reconstruct MB/C/D/H/W indices from dst_off + const int mb = (DST_S0 == 0) + ? 0 + : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 + + dst_off / DST_SB0 % DST_B0; + const int c = (DST_S1 == 0) + ? 0 + : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 + + dst_off / DST_SB1 % DST_B1; + const int d = (DST_S2 == 0) + ? 0 + : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 + + dst_off / DST_SB2 % DST_B2; + const int h = (DST_S3 == 0) + ? 0 + : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 + + dst_off / DST_SB3 % DST_B3; + const int w = (DST_S4 == 0) + ? 0 + : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 + + dst_off / DST_SB4 % DST_B4; + + // Only use post-ops on non-zero-padded elements + if (mb < DST_D0 && c < DST_D1 && d < DST_D2 && h < DST_D3 + && w < DST_D4) { + APPLY_POST_OPS_SERIAL(res, float, dst_val, float, mb, 1, c, 1, + d, 1, h, 1, w, 1, 0, 1); + } #endif // WITH_POST_OP #else - float res = final_acc[v]; + float res = acc_sg; #endif // IS_FINAL - // Write to dst - if (is_dst_zero_padded(dst_off)) res = 0.0f; - dst[dst_off] = IS_FINAL ? TO_DST(res) : res; + // Write to dst + if (is_dst_zero_padded(dst_off)) res = 0.0f; + dst[dst_off] = IS_FINAL ? 
TO_DST(res) : res; - // Reduced + zero-padded dims need extra zeros written + // Reduced + zero-padded dims need extra zeros written #if DST_Z0_IS_REDUCED && DST_Z1_IS_REDUCED - for (int i = 0; i < DST_Z0_SIZE0; i++) { - for (int j = 0; j < DST_Z1_SIZE0; j++) { - if (i == 0 && j == 0) continue; - dst[dst_off + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0] - = TO_DST(0.0f); - } + for (int i = 0; i < DST_Z0_SIZE0; i++) { + for (int j = 0; j < DST_Z1_SIZE0; j++) { + if (i == 0 && j == 0) continue; + dst[dst_off + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0] + = TO_DST(0.0f); } + } #elif DST_Z0_IS_REDUCED - for (int i = 1; i < DST_Z0_SIZE0; i++) { - dst[dst_off + i * DST_Z0_STRIDE0] = TO_DST(0.0f); - } + for (int i = 1; i < DST_Z0_SIZE0; i++) { + dst[dst_off + i * DST_Z0_STRIDE0] = TO_DST(0.0f); + } #elif DST_Z1_IS_REDUCED - for (int j = 1; j < DST_Z1_SIZE0; j++) { - dst[dst_off + j * DST_Z1_STRIDE0] = TO_DST(0.0f); - } -#endif + for (int j = 1; j < DST_Z1_SIZE0; j++) { + dst[dst_off + j * DST_Z1_STRIDE0] = TO_DST(0.0f); } +#endif } } } diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 27a78520925..dd19f799cbf 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -89,54 +89,11 @@ reduction_phase_conf_t::reduction_phase_conf_t( // inner_dim can either be: // 1. packed into a single subgroup (small inner dim), or // 2. split among several subgroups (large inner dim) - const dim_t num_packed_inner_dims - = nstl::clamp(subgroup_size / inner_block.block, dim_t {1}, - reduction_block.block); const dim_t num_split_inner_dims = utils::div_up(inner_block.block, subgroup_size); // S per I - const dim_t num_horiz_reductions - = reduction_block.block / num_packed_inner_dims; - dim_t num_subgroups = outer_block.block * num_split_inner_dims; - // We need to determine 2 variables according to some heuristic: - // 1. 
Vector size (increases block load size) - // 2. Threads per EU (decreases scheduling overhead, in this case) - - // Vector size requirements: - // 1. (required) reductions and inner_dim aligned with no tails on either one - // 2. (heuristic) Block loads should not exceed maximum instruction load size - // 3. (heuristic) EUs should not become unsaturated due to vector size - int nvec = 1; - bool reduce_vec = false; - if (with_block_reads) { - const size_t single_load_size = types::data_type_size(src_type) - * static_cast(subgroup_size); - const int max_load_size = 256; // Set on ATS-M, may depend on arch - const int max_vect_size - = static_cast(max_load_size / single_load_size); - - for (int N : {8, 4, 2}) { - // Related to EU saturation - if (num_subgroups / N < num_EU) continue; - // Related to block load size - if (N > max_vect_size) continue; - if (num_horiz_reductions % N == 0) { - if (num_split_inner_dims == 1 - || num_split_inner_dims % N == 0) { - nvec = N; - reduce_vec = (num_split_inner_dims == 1); - break; - } - } - } - } - vect_size = nvec; - reduce_vector = reduce_vec; - - if (!reduce_vector) num_subgroups /= vect_size; - // Increase num_outer_idxs to use persistent threading to reduce the number of subgroups // and avoid overdispatching outer_tile_size = [this, &compute_engine, &num_EU, &large_grf_mode, @@ -427,17 +384,13 @@ static status_t init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.define_int("IS_FINAL", phase.is_final); kernel_ctx.define_int("IS_FIRST", phase.is_first); - kernel_ctx.define_int("VECT_DT_N", phase.vect_size); - kernel_ctx.define_int("REDUCE_VECTOR", phase.reduce_vector ? 
1 : 0); - // Because the reduction loop is quite tight, we can override the compiler's // loop unrolling logic to increase it a lot and get a bit more speed // Heuristic determined on ATS-m, set to exclude the possibility of // exceeding the instruction cache const dim_t max_unroll = 256; - const dim_t unroll_factor = nstl::clamp( - num_horiz_reductions / (phase.reduce_vector ? phase.vect_size : 1), - dim_t {1}, max_unroll); + const dim_t unroll_factor + = nstl::clamp(num_horiz_reductions, dim_t {1}, max_unroll); kernel_ctx.define_int("UNROLL_FACTOR", unroll_factor); kernel_ctx.define_int("WITH_BLOCK_READ", phase.with_block_reads ? 1 : 0); diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.hpp b/src/gpu/intel/ocl/reduction/combined_reduction.hpp index ce6e6768119..38aa99750d8 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.hpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.hpp @@ -38,8 +38,7 @@ struct reduction_phase_conf_t : public reduction_subproblem_t { data_type_t src_type, dst_type; compute::nd_range_t nd_range; - int vect_size, outer_tile_size; - bool reduce_vector; + int outer_tile_size; bool is_final, is_first; int subgroup_size; bool with_block_reads; From 80d36be3e74014862630a8043e5064f71f060170 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Thu, 16 May 2024 13:51:15 -0700 Subject: [PATCH 161/187] gpu: reduction: optimize subgroup reduction --- .../intel/ocl/reduction/combined_reduction.cl | 75 ++++++++++--------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 8908f6453f2..cbe0a60858e 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -106,10 +106,10 @@ combined_reduce( __global SRC_DATA_T *src, __global DST_DATA_T *dst POST_OP_ARGS) { // Compute constants deriving from defined constants const int sg_per_inner_dim = 
div_up(INNER_DIM_SIZE, SUBGROUP_SIZE); - const int inner_dims_per_sg + const int red_per_sg = min(REDUCTION_SIZE, max(1, SUBGROUP_SIZE / INNER_DIM_SIZE)); - const int num_horiz_reductions = REDUCTION_SIZE / inner_dims_per_sg; - const int tail_reductions = REDUCTION_SIZE % inner_dims_per_sg; + const int num_horiz_reductions = REDUCTION_SIZE / red_per_sg; + const int tail_reductions = REDUCTION_SIZE % red_per_sg; // Direct indices from gws const int sgid = get_global_id(0) / SUBGROUP_SIZE; @@ -120,48 +120,51 @@ combined_reduce( const int inner_idx = inner_idx_start + (sglid % INNER_DIM_SIZE); const int red_off = sglid / INNER_DIM_SIZE; - // Case happens when inner_dim_size is not a multiple/factor of subgroup size - if (inner_idx >= INNER_DIM_SIZE - || sglid >= INNER_DIM_SIZE * inner_dims_per_sg) - return; - - const int loop_stride = _SRC_OFF(0, inner_dims_per_sg, 0); + const int active_channels = min(SUBGROUP_SIZE, red_per_sg * INNER_DIM_SIZE); + ASSUME(active_channels == SUBGROUP_SIZE || !WITH_BLOCK_READ); + const int loop_stride = _SRC_OFF(0, red_per_sg, 0); unroll_for(int oid = 0; oid < OUTER_TILE_SIZE; oid++) { const int outer_idx = sgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; DEF_ACC_DATA_T acc; init_acc(REDUCTION_ALG, &acc); - int src_off = _SRC_OFF(outer_idx, WITH_BLOCK_READ ? 0 : red_off, - WITH_BLOCK_READ ? 
inner_idx_start : inner_idx); - __attribute__((opencl_unroll_hint(UNROLL_FACTOR))) // attr:no-format - for (int off = 0; off < num_horiz_reductions; - off++, src_off += loop_stride) { - // Load - const DATA_T src_val = READ_DATA(src[src_off]); - - // Accumulate - acc = reduce(REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); - } - if (red_off < tail_reductions) { - // Load - const DATA_T src_val = READ_DATA(src[src_off]); + if (sglid < active_channels) { + int src_off = _SRC_OFF(outer_idx, 0, inner_idx_start); + if (!WITH_BLOCK_READ) src_off += sglid; + __attribute__((opencl_unroll_hint(UNROLL_FACTOR))) // attr:no-format + for (int off = 0; off < num_horiz_reductions; + off++, src_off += loop_stride) { + // Load + const DATA_T src_val = READ_DATA(src[src_off]); + + // Accumulate + acc = reduce( + REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + } + if (red_off < tail_reductions) { + // Load + const DATA_T src_val = READ_DATA(src[src_off]); - // Accumulate - acc = reduce(REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + // Accumulate + acc = reduce( + REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + } } - // Potentially accumulate within the subgroup too - // TODO: Change to tree-based reduce to help large inner_dims_per_sg cases - DEF_ACC_DATA_T acc_sg; - init_acc(SECONDARY_REDUCTION_ALG, &acc_sg); - unroll_for(int i = 0; i < inner_dims_per_sg; i++) { - DEF_ACC_DATA_T next = intel_sub_group_shuffle_down( - acc, SPECIAL(DEF_ACC_DATA_T, zero), i * INNER_DIM_SIZE); - acc_sg = reduce(SECONDARY_REDUCTION_ALG, acc_sg, next, POWER); + // Reduce between work items within a thread + DEF_ACC_DATA_T init; + init_acc(SECONDARY_REDUCTION_ALG, &init); + unroll_for(int shift = INNER_DIM_SIZE; shift < active_channels; + shift *= 2) { + DEF_ACC_DATA_T next + = intel_sub_group_shuffle_down(acc, init, shift); + acc = reduce(SECONDARY_REDUCTION_ALG, acc, next, POWER); + DEBUG_PRINT("%d->%d/%d: sg reduce from sglid %d\n", + get_global_id(0), sgid, sglid, 
sglid + shift); } - if (sglid < INNER_DIM_SIZE) { + if (red_off == 0 && inner_idx < INNER_DIM_SIZE) { // For each result: // 1. (if IS_FINAL) finalize the result // 2. (if IS_FINAL) apply post-ops @@ -170,7 +173,7 @@ combined_reduce( // finalize the result #if IS_FINAL float res = finalize( - REDUCTION_ALG, convert_float(acc_sg), DIV, POWER, EPS); + REDUCTION_ALG, convert_float(acc), DIV, POWER, EPS); // Apply post-ops #if WITH_POST_OP @@ -209,7 +212,7 @@ combined_reduce( } #endif // WITH_POST_OP #else - float res = acc_sg; + float res = acc; #endif // IS_FINAL // Write to dst From 9078f188de133e5bcfe286233eb94304f927ffb7 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Thu, 16 May 2024 14:30:09 -0700 Subject: [PATCH 162/187] gpu: reduction: refactor infrequent steps into functions --- .../intel/ocl/reduction/combined_reduction.cl | 123 +++++++++--------- .../ocl/reduction/combined_reduction.cpp | 10 +- 2 files changed, 64 insertions(+), 69 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index cbe0a60858e..2ade501cef4 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -97,6 +97,47 @@ dim_t dst_off_w_zero_padding(dim_t outer, dim_t inner) { *DST_Z1_SIZE1 #endif +void reverse_indexing(dim_t dst_off, int *res) { + // Reconstruct dimension indices from dst_off + res[0] = (DST_S0 == 0) ? 0 + : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 + + dst_off / DST_SB0 % DST_B0; + res[1] = (DST_S1 == 0) ? 0 + : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 + + dst_off / DST_SB1 % DST_B1; + res[2] = (DST_S2 == 0) ? 0 + : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 + + dst_off / DST_SB2 % DST_B2; + res[3] = (DST_S3 == 0) ? 0 + : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 + + dst_off / DST_SB3 % DST_B3; + res[4] = (DST_S4 == 0) ? 
0 + : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 + + dst_off / DST_SB4 % DST_B4; + res[5] = (DST_S5 == 0) ? 0 + : dst_off / DST_S5 % div_up(DST_D5, DST_B5) * DST_B5 + + dst_off / DST_SB5 % DST_B5; +} + +void write_padded_zeros(__global DST_DATA_T *dst) { +#if DST_Z0_IS_REDUCED && DST_Z1_IS_REDUCED + for (int i = 0; i < DST_Z0_SIZE0; i++) { + for (int j = 0; j < DST_Z1_SIZE0; j++) { + if (i == 0 && j == 0) continue; + *(dst + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0) = TO_DST(0.0f); + } + } +#elif DST_Z0_IS_REDUCED + for (int i = 1; i < DST_Z0_SIZE0; i++) { + *(dst + i * DST_Z0_STRIDE0) = TO_DST(0.0f); + } +#elif DST_Z1_IS_REDUCED + for (int j = 1; j < DST_Z1_SIZE0; j++) { + *(dst + j * DST_Z1_STRIDE0) = TO_DST(0.0f); + } +#endif +} + // Specifying wg size since larger work groups reduce performance. // TODO: Look into why this is the case __attribute__((reqd_work_group_size(LWS_SIZE, 1, 1))) // attr:no-format @@ -165,78 +206,34 @@ combined_reduce( } if (red_off == 0 && inner_idx < INNER_DIM_SIZE) { - // For each result: - // 1. (if IS_FINAL) finalize the result - // 2. (if IS_FINAL) apply post-ops - // 3. write to dst const dim_t dst_off = _DST_OFF(outer_idx, inner_idx); - // finalize the result -#if IS_FINAL - float res = finalize( - REDUCTION_ALG, convert_float(acc), DIV, POWER, EPS); - - // Apply post-ops + float res = acc; + if (IS_FINAL) { + res = finalize( + REDUCTION_ALG, convert_float(acc), DIV, POWER, EPS); #if WITH_POST_OP - float dst_val; + float dst_val; #if WITH_SUM - dst_val = DST_TO_REF(dst[dst_off]); + dst_val = DST_TO_REF(dst[dst_off]); #endif // WITH_SUM - - // Reconstruct MB/C/D/H/W indices from dst_off - const int mb = (DST_S0 == 0) - ? 0 - : dst_off / DST_S0 % div_up(DST_D0, DST_B0) * DST_B0 - + dst_off / DST_SB0 % DST_B0; - const int c = (DST_S1 == 0) - ? 0 - : dst_off / DST_S1 % div_up(DST_D1, DST_B1) * DST_B1 - + dst_off / DST_SB1 % DST_B1; - const int d = (DST_S2 == 0) - ? 
0 - : dst_off / DST_S2 % div_up(DST_D2, DST_B2) * DST_B2 - + dst_off / DST_SB2 % DST_B2; - const int h = (DST_S3 == 0) - ? 0 - : dst_off / DST_S3 % div_up(DST_D3, DST_B3) * DST_B3 - + dst_off / DST_SB3 % DST_B3; - const int w = (DST_S4 == 0) - ? 0 - : dst_off / DST_S4 % div_up(DST_D4, DST_B4) * DST_B4 - + dst_off / DST_SB4 % DST_B4; - - // Only use post-ops on non-zero-padded elements - if (mb < DST_D0 && c < DST_D1 && d < DST_D2 && h < DST_D3 - && w < DST_D4) { - APPLY_POST_OPS_SERIAL(res, float, dst_val, float, mb, 1, c, 1, - d, 1, h, 1, w, 1, 0, 1); - } + int idxs[6]; + reverse_indexing(dst_off, idxs); + + // Only use post-ops on non-zero-padded elements + if (idxs[0] < DST_D0 && idxs[1] < DST_D1 && idxs[2] < DST_D2 + && idxs[3] < DST_D3 && idxs[4] < DST_D4 + && idxs[5] < DST_D5) { + APPLY_POST_OPS_SERIAL(res, float, dst_val, float, idxs[0], + 1, idxs[1], 1, idxs[2], 1, idxs[3], 1, idxs[4], 1, + idxs[5], 1); + } #endif // WITH_POST_OP -#else - float res = acc; -#endif // IS_FINAL + } // Write to dst if (is_dst_zero_padded(dst_off)) res = 0.0f; dst[dst_off] = IS_FINAL ? 
TO_DST(res) : res; - - // Reduced + zero-padded dims need extra zeros written -#if DST_Z0_IS_REDUCED && DST_Z1_IS_REDUCED - for (int i = 0; i < DST_Z0_SIZE0; i++) { - for (int j = 0; j < DST_Z1_SIZE0; j++) { - if (i == 0 && j == 0) continue; - dst[dst_off + i * DST_Z0_STRIDE0 + j * DST_Z1_STRIDE0] - = TO_DST(0.0f); - } - } -#elif DST_Z0_IS_REDUCED - for (int i = 1; i < DST_Z0_SIZE0; i++) { - dst[dst_off + i * DST_Z0_STRIDE0] = TO_DST(0.0f); - } -#elif DST_Z1_IS_REDUCED - for (int j = 1; j < DST_Z1_SIZE0; j++) { - dst[dst_off + j * DST_Z1_STRIDE0] = TO_DST(0.0f); - } -#endif + write_padded_zeros(dst + dst_off); } } } diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index dd19f799cbf..9af882b6246 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -447,13 +447,11 @@ status_t combined_reduction_t::pd_t::init_kernel_ctx( // Set post-op macros CHECK(def_attr_info( kernel_ctx, conf.attr_info, attr()->post_ops_, *dst_md())); - if (attr()->post_ops_.len() > 0) { - if (phase.is_final) { - // Can only do this for the final phase, since it overwrites def_data_type for DST - def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST"); - } - def_offsets(conf.off.dst_off, kernel_ctx, "DST", conf.ndims); + if (attr()->post_ops_.len() > 0 && phase.is_final) { + // Can only do this for the final phase, since it overwrites def_data_type for DST + def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST"); } + def_offsets(conf.off.dst_off, kernel_ctx, "DST", conf.ndims); return status; } From e9aefe49320f6db682de2668cf8f13668ea41c9e Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Fri, 17 May 2024 09:11:10 -0700 Subject: [PATCH 163/187] gpu: reduction: implement SLM reduction --- .../intel/ocl/reduction/combined_reduction.cl | 70 +++++++++++++------ .../ocl/reduction/combined_reduction.cpp | 61 +++++++--------- 
.../ocl/reduction/combined_reduction.hpp | 2 +- 3 files changed, 75 insertions(+), 58 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 2ade501cef4..73fc8812117 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -16,6 +16,7 @@ #include "gpu/intel/ocl/ocl_post_ops.h" #include "gpu/intel/ocl/ocl_types.h" +#include "gpu/intel/ocl/ocl_utils.h" #include "gpu/intel/ocl/reduction/ocl_reduction.h" // Define how to read data @@ -138,6 +139,12 @@ void write_padded_zeros(__global DST_DATA_T *dst) { #endif } +#if INNER_DIM_SIZE < SUBGROUP_SIZE +#define SLM_PER_SG INNER_DIM_SIZE +#else +#define SLM_PER_SG SUBGROUP_SIZE +#endif + // Specifying wg size since larger work groups reduce performance. // TODO: Look into why this is the case __attribute__((reqd_work_group_size(LWS_SIZE, 1, 1))) // attr:no-format @@ -149,45 +156,51 @@ combined_reduce( const int sg_per_inner_dim = div_up(INNER_DIM_SIZE, SUBGROUP_SIZE); const int red_per_sg = min(REDUCTION_SIZE, max(1, SUBGROUP_SIZE / INNER_DIM_SIZE)); - const int num_horiz_reductions = REDUCTION_SIZE / red_per_sg; - const int tail_reductions = REDUCTION_SIZE % red_per_sg; + const int wg_reductions = LWS_SIZE / SUBGROUP_SIZE; + const int other_reductions = red_per_sg * wg_reductions; + const int num_horiz_reductions = REDUCTION_SIZE / other_reductions; + const int tail_reductions = REDUCTION_SIZE % other_reductions; // Direct indices from gws - const int sgid = get_global_id(0) / SUBGROUP_SIZE; - const int inner_idx_start = (sgid % sg_per_inner_dim) * SUBGROUP_SIZE; + const int sgid = get_sub_group_id(); + ASSUME(sgid < wg_reductions); + ASSUME(sgid >= 0); + const int tgid = get_global_id(0) / LWS_SIZE; + const int inner_idx_start = (tgid % sg_per_inner_dim) * SUBGROUP_SIZE; // Handle inner vector packing into subgroups const int sglid = get_sub_group_local_id(); + ASSUME(sglid < 
SUBGROUP_SIZE); + ASSUME(sglid >= 0); const int inner_idx = inner_idx_start + (sglid % INNER_DIM_SIZE); const int red_off = sglid / INNER_DIM_SIZE; + const int red_off_tg = red_off + sgid * red_per_sg; const int active_channels = min(SUBGROUP_SIZE, red_per_sg * INNER_DIM_SIZE); ASSUME(active_channels == SUBGROUP_SIZE || !WITH_BLOCK_READ); - const int loop_stride = _SRC_OFF(0, red_per_sg, 0); + const int loop_stride = _SRC_OFF(0, other_reductions, 0); + __local DEF_ACC_DATA_T slm_acc[SLM_PER_SG * wg_reductions]; unroll_for(int oid = 0; oid < OUTER_TILE_SIZE; oid++) { - const int outer_idx = sgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; + const int outer_idx = tgid / sg_per_inner_dim * OUTER_TILE_SIZE + oid; DEF_ACC_DATA_T acc; init_acc(REDUCTION_ALG, &acc); + // Each thread reduces in a loop if (sglid < active_channels) { - int src_off = _SRC_OFF(outer_idx, 0, inner_idx_start); + // red_off_tg - red_off to get the starting point for the subgroup + int src_off = _SRC_OFF( + outer_idx, red_off_tg - red_off, inner_idx_start); if (!WITH_BLOCK_READ) src_off += sglid; - __attribute__((opencl_unroll_hint(UNROLL_FACTOR))) // attr:no-format - for (int off = 0; off < num_horiz_reductions; - off++, src_off += loop_stride) { - // Load + for (int iters = num_horiz_reductions; iters > 0; --iters) { const DATA_T src_val = READ_DATA(src[src_off]); - - // Accumulate acc = reduce( REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + src_off += loop_stride; } - if (red_off < tail_reductions) { - // Load + const int red_off_tg = red_off + sgid * red_per_sg; + if (red_off_tg < tail_reductions) { const DATA_T src_val = READ_DATA(src[src_off]); - - // Accumulate acc = reduce( REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); } @@ -201,11 +214,28 @@ combined_reduce( DEF_ACC_DATA_T next = intel_sub_group_shuffle_down(acc, init, shift); acc = reduce(SECONDARY_REDUCTION_ALG, acc, next, POWER); - DEBUG_PRINT("%d->%d/%d: sg reduce from sglid %d\n", - get_global_id(0), sgid, sglid, 
sglid + shift); } - if (red_off == 0 && inner_idx < INNER_DIM_SIZE) { + // Reduce all threads in work group to one using SLM + if (wg_reductions > 1) { + const int local_idx = sgid * SLM_PER_SG + sglid; + if (red_off == 0 && inner_idx < INNER_DIM_SIZE) { + slm_acc[local_idx] = acc; + } + init_acc(SECONDARY_REDUCTION_ALG, &acc); + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (red_off_tg == 0 && inner_idx < INNER_DIM_SIZE) { + if (wg_reductions > 1) { + unroll_for(int i = 0; i < wg_reductions; i++) { + const int idx = i * SLM_PER_SG + sglid; + acc = reduce( + SECONDARY_REDUCTION_ALG, acc, slm_acc[idx], POWER); + } + } + + // Finalize and write to dst const dim_t dst_off = _DST_OFF(outer_idx, inner_idx); float res = acc; if (IS_FINAL) { diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 9af882b6246..2fe404219ca 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -16,6 +16,7 @@ #include "gpu/intel/ocl/reduction/combined_reduction.hpp" #include "common/c_types_map.hpp" +#include "common/utils.hpp" #include "gpu/intel/block_structure.hpp" #include "gpu/intel/compute/device_info.hpp" #include "gpu/intel/compute/utils.hpp" @@ -85,25 +86,36 @@ reduction_phase_conf_t::reduction_phase_conf_t( const int num_EU = compute_engine->device_info()->eu_count(); const int max_wg_size = static_cast( compute_engine->device_info()->max_wg_size(large_grf_mode)); + compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch(); + int threads_per_eu + = large_grf_mode ? 4 : compute::device_info_t::threads_per_eu(arch); + int num_threads = num_EU * threads_per_eu; // inner_dim can either be: // 1. packed into a single subgroup (small inner dim), or // 2. 
split among several subgroups (large inner dim) + const dim_t num_packed_inner_dims + = nstl::clamp(subgroup_size / inner_block.block, dim_t {1}, + reduction_block.block); const dim_t num_split_inner_dims - = utils::div_up(inner_block.block, subgroup_size); // S per I - - dim_t num_subgroups = outer_block.block * num_split_inner_dims; + = utils::div_up(inner_block.block, subgroup_size); + + int max_slm = utils::div_up( + num_threads, outer_block.block * num_split_inner_dims); + max_slm = nstl::min(max_slm, max_wg_size / subgroup_size); + slm_reductions = [this, &num_packed_inner_dims, &max_slm]() { + const dim_t rem_red = reduction_block.block / num_packed_inner_dims; + // XXX: max_div no longer required + int n_slm = gpu_utils::into( + nstl::min(rem_red, gpu_utils::into(max_slm))); + return gpu_utils::dev_getenv("combined_reduction_n_slm", n_slm); + }(); + dim_t num_subgroups + = outer_block.block * num_split_inner_dims * slm_reductions; // Increase num_outer_idxs to use persistent threading to reduce the number of subgroups // and avoid overdispatching - outer_tile_size = [this, &compute_engine, &num_EU, &large_grf_mode, - &num_subgroups]() -> int { - compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch(); - int threads_per_eu = large_grf_mode - ? 
4 - : compute::device_info_t::threads_per_eu(arch); - int num_threads = num_EU * threads_per_eu; - + outer_tile_size = [this, &arch, &num_threads, &num_subgroups]() -> int { // Enable >1 block sizes only for PVC+, to avoid oldest-first thread arbitration dim_t block_size = 1; if (arch >= compute::gpu_arch_t::xe_hpc) { @@ -117,20 +129,11 @@ reduction_phase_conf_t::reduction_phase_conf_t( << "Invalid choice of persistent thread outer idxs"; num_subgroups /= outer_tile_size; - // Compute the number of threads per EU - this has no major impact - // on average time, but can improve the best times on - // close-to-cache-size problems with high parallelism - const dim_t max_threads = num_subgroups / num_EU; - dim_t threads_per_wg - = nstl::clamp(static_cast(max_wg_size / subgroup_size), - dim_t {1}, max_threads); - threads_per_wg = get_previous_factor(num_subgroups, threads_per_wg); - // Compute the nd_range for this phase compute::range_t gws( gpu_utils::into(num_subgroups * subgroup_size)); compute::range_t lws( - gpu_utils::into(threads_per_wg * subgroup_size)); + gpu_utils::into(slm_reductions * subgroup_size)); nd_range = compute::nd_range_t(gws, lws); is_first = false; @@ -359,13 +362,6 @@ static status_t init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.set_data_type(phase.src_type); - // Used for packing small inner vectors into a subgroup - const dim_t inner_dim_per_sg - = nstl::clamp(phase.subgroup_size / phase.inner_block.block, - dim_t {1}, phase.reduction_block.block); - const dim_t num_horiz_reductions - = phase.reduction_block.block / inner_dim_per_sg; - kernel_ctx.define_int("SUBGROUP_SIZE", phase.subgroup_size); const auto &lws = phase.nd_range.local_range(); if (!lws) return status::runtime_error; @@ -384,15 +380,6 @@ static status_t init_kernel_ctx_common(compute::kernel_ctx_t &kernel_ctx, kernel_ctx.define_int("IS_FINAL", phase.is_final); kernel_ctx.define_int("IS_FIRST", phase.is_first); - // Because the reduction loop is quite 
tight, we can override the compiler's - // loop unrolling logic to increase it a lot and get a bit more speed - // Heuristic determined on ATS-m, set to exclude the possibility of - // exceeding the instruction cache - const dim_t max_unroll = 256; - const dim_t unroll_factor - = nstl::clamp(num_horiz_reductions, dim_t {1}, max_unroll); - kernel_ctx.define_int("UNROLL_FACTOR", unroll_factor); - kernel_ctx.define_int("WITH_BLOCK_READ", phase.with_block_reads ? 1 : 0); switch (conf.alg) { diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.hpp b/src/gpu/intel/ocl/reduction/combined_reduction.hpp index 38aa99750d8..92270fab1bd 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.hpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.hpp @@ -38,7 +38,7 @@ struct reduction_phase_conf_t : public reduction_subproblem_t { data_type_t src_type, dst_type; compute::nd_range_t nd_range; - int outer_tile_size; + int outer_tile_size, slm_reductions; bool is_final, is_first; int subgroup_size; bool with_block_reads; From 7dcb64a392d485c0442146b1e128f46228bb2ac2 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Fri, 17 May 2024 09:54:48 -0700 Subject: [PATCH 164/187] gpu: reduction: add debug prints --- src/gpu/intel/ocl/reduction/combined_reduction.cl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 73fc8812117..e6fa20a3a26 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -145,6 +145,10 @@ void write_padded_zeros(__global DST_DATA_T *dst) { #define SLM_PER_SG SUBGROUP_SIZE #endif +#define DUMP(str, ...) \ + DEBUG_PRINT("%d->%d/%d/%d: " str, get_global_id(0), tgid, sgid, sglid, \ + __VA_ARGS__) + // Specifying wg size since larger work groups reduce performance. 
// TODO: Look into why this is the case __attribute__((reqd_work_group_size(LWS_SIZE, 1, 1))) // attr:no-format @@ -196,13 +200,14 @@ combined_reduce( const DATA_T src_val = READ_DATA(src[src_off]); acc = reduce( REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + DUMP("(iter +%d) src[%d] = %f\n", iters, src_off, src_val); src_off += loop_stride; } - const int red_off_tg = red_off + sgid * red_per_sg; if (red_off_tg < tail_reductions) { const DATA_T src_val = READ_DATA(src[src_off]); acc = reduce( REDUCTION_ALG, acc, TO_DEF_ACC_DATA_T(src_val), POWER); + DUMP("(tail) src[%d] = %f\n", src_off, src_val); } } @@ -214,6 +219,7 @@ combined_reduce( DEF_ACC_DATA_T next = intel_sub_group_shuffle_down(acc, init, shift); acc = reduce(SECONDARY_REDUCTION_ALG, acc, next, POWER); + DUMP("(sg) acc from sglid %d: %f\n", sglid + shift, next); } // Reduce all threads in work group to one using SLM @@ -232,6 +238,7 @@ combined_reduce( const int idx = i * SLM_PER_SG + sglid; acc = reduce( SECONDARY_REDUCTION_ALG, acc, slm_acc[idx], POWER); + DUMP("(wg) acc from wg %d/%d: %f\n", i, idx, slm_acc[idx]); } } @@ -264,6 +271,7 @@ combined_reduce( if (is_dst_zero_padded(dst_off)) res = 0.0f; dst[dst_off] = IS_FINAL ? 
TO_DST(res) : res; write_padded_zeros(dst + dst_off); + DUMP("dst[%d] <- %f\n", dst_off, TO_DST(res)); } } } From 14270e24eb93a8b64aefe9d484bb174eca6c847b Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Fri, 17 May 2024 09:55:08 -0700 Subject: [PATCH 165/187] gpu: reduction: update subproblem splitting heuristic --- src/gpu/intel/ocl/reduction/combined_reduction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 2fe404219ca..216e40ba2e9 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -201,7 +201,7 @@ status_t split_into_phases(const reduction_subproblem_t &subprb, // EU_mult: reduce parallelism to at most num_EU*EU_mult (reduces scheduling overhead?) const int EU_mult = 20; // Target single_phase_threshold horizontal reductions with each phase - const int single_phase_threshold = 256; + const int single_phase_threshold = 1024; // Estimate the number of phases remaining, and divide it up evenly around this target int N = static_cast(std::ceil(std::log2(reduction_elems) From d8abef07cbc921a92ffe89db3b4b6a4e86ebd705 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Mon, 20 May 2024 17:18:10 -0700 Subject: [PATCH 166/187] include, doc: update and fix brgemm texts and labels f --- doc/Doxyfile.in | 2 +- doc/build/link.md | 25 +++++------ doc/rst/index.rst | 1 + doc/rst/ukernels.rst | 10 +++++ doc/sphinx/conf.py | 2 +- doc/ukernel/operations/brgemm.md | 62 +++++++++++----------------- doc/ukernel/operations/transform.md | 10 ++--- examples/ukernels/cpu_brgemm.cpp | 12 ++---- include/oneapi/dnnl/dnnl_ukernel.h | 6 +-- include/oneapi/dnnl/dnnl_ukernel.hpp | 8 ++-- 10 files changed, 68 insertions(+), 70 deletions(-) create mode 100644 doc/rst/ukernels.rst diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index f0411adb8b2..ee54ae50d08 100644 --- a/doc/Doxyfile.in +++ 
b/doc/Doxyfile.in @@ -1962,7 +1962,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_UKERNEL # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/doc/build/link.md b/doc/build/link.md index aa2f80c50dd..52f9e5309bd 100644 --- a/doc/build/link.md +++ b/doc/build/link.md @@ -7,18 +7,19 @@ on how oneDNN was built. ## Header Files -| File | Description | -|:-------------------------------------------|:----------------------------------| -| ``include/oneapi/dnnl/dnnl.h`` | C header | -| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header | -| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header | -| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header | -| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information | -| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API | -| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API | -| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API | -| ``include/oneapi/dnnl/dnnl_ukernel.h`` | C header with ukernel API | -| ``include/oneapi/dnnl/dnnl_ukernel.hpp`` | C++ header with ukernel API | +| File | Description | +|:---------------------------------------------|:-----------------------------------| +| ``include/oneapi/dnnl/dnnl.h`` | C header | +| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header | +| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header | +| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header | +| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information 
| +| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API | +| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API | +| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API | +| ``include/oneapi/dnnl/dnnl_ukernel.h`` | C header for ukernel API | +| ``include/oneapi/dnnl/dnnl_ukernel.hpp`` | C++ header for ukernel API | +| ``include/oneapi/dnnl/dnnl_ukernel_types.h`` | Auxiliary C header for ukernel API | ## Libraries diff --git a/doc/rst/index.rst b/doc/rst/index.rst index cdb6f750d40..8cdad3d559d 100644 --- a/doc/rst/index.rst +++ b/doc/rst/index.rst @@ -11,6 +11,7 @@ oneAPI Deep Neural Network Library Developer Guide and Reference dev_guide_examples performance_profiling_and_inspection advanced_topics + ukernels group_dnnl_api.rst oneAPI Deep Neural Network Library (oneDNN) is an open-source cross-platform performance library of basic building blocks for deep learning applications. The library is optimized for Intel Architecture Processors, Intel Processor Graphics and Xe Architecture graphics. Support for other architectures such as Arm* 64-bit Architecture (AArch64) and OpenPOWER* Power ISA (PPC64) is experimental. diff --git a/doc/rst/ukernels.rst b/doc/rst/ukernels.rst new file mode 100644 index 00000000000..baba53517f9 --- /dev/null +++ b/doc/rst/ukernels.rst @@ -0,0 +1,10 @@ +Ukernels +##################### + +.. 
toctree:: + :maxdepth: 1 + + dev_guide_ukernel_basic_concepts.rst + dev_guide_ukernel_brgemm.rst + dev_guide_ukernel_transform.rst + page_cpu_brgemm_example_cpp.rst diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index 42c0801eeaa..bfaa66dbf22 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -190,7 +190,7 @@ def setup(app): def fixFileNameRefs(app, env, docnames): - replacements = {"page_dev_guide": "dev_guide", "group_Dnnl":"group_dnnl"} + replacements = {"page_dev_guide":"dev_guide", "group_Dnnl":"group_dnnl", "brgemm_pack_B":"brgemm_pack_b"} targetDir = "rst" fileExtension = ".rst" diff --git a/doc/ukernel/operations/brgemm.md b/doc/ukernel/operations/brgemm.md index 70322633eb8..98cd640f328 100644 --- a/doc/ukernel/operations/brgemm.md +++ b/doc/ukernel/operations/brgemm.md @@ -1,4 +1,4 @@ -Batch-reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm} +Batch-Reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm} ======================================= > @@ -8,33 +8,28 @@ Batch-reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm} ## General -The batch-reduce General Matrix Multiplication ukernel (BRGeMM) is an -operation that allows to compute a batch of small matrix -multiplication and accumulate their results in the same destination. +The batch-reduce General Matrix Multiplication ukernel (BRGeMM) is an operation +that computes a small matrix multiplication batch and accumulates their results +in the same destination. -```math -C = \sum_i A_i \cdot B_i -``` +\f$C = \sum_i A_i \cdot B_i\f$ with - \f$A_i\f$ a set of matrices of dimension \f$M \times K\f$ - \f$B_i\f$ a set of matrices of dimension \f$K \times N\f$ -- C matrix of dimension \f$M \times N\f$. +- \f$C\f$ matrix of dimension \f$M \times N\f$. 
-The BRGeMM ukernel also supports accumulation with values already -present in \f$C\f$, as well as post-operation and down-conversion to -another \f$D\f$ matrix: +The BRGeMM ukernel also supports accumulation with values already present in +\f$C\f$, as well as post-operation and down-conversion to another \f$D\f$ +matrix: -```math -D = \operatorname{convert}( \operatorname{post\_ops}(C + \sum_i A_i \cdot B_i, post_ops_args)). -``` +\f$D = \operatorname{convert}( \operatorname{post\_ops}(C + \sum_i A_i \cdot B_i, post\_ops\_args))\f$ ## Data Types -In general, C represents an accumulation buffer. Hence when -computations are carried in floating-point arithmetic, C shall be of -type f32, and when computation is carried in integer arithmetic, C -should be of type s32. +In general, C represents an accumulation buffer. Hence, when computations are +carried in floating-point arithmetic, C shall be of type f32; when computation +is carried in integer arithmetic, C should be of type s32. The BRGeMM ukernel supports the following combinations of data-types. @@ -47,21 +42,14 @@ The BRGeMM ukernel supports the following combinations of data-types. ## Data Representation -Because of hardware restrictions, the BRGeMM ukernel requires specific -data layout. +Because of hardware restrictions, the BRGeMM ukernel requires a specific data +layout. - The -@ref dnnl::ukernel::brgemm_pack_B::need_pack() method can be called to determine -if packing is necessary. If so, +The @ref dnnl_brgemm_pack_B_need_pack method can be called to +determine if packing is necessary. If so, [packB ukernel](@ref dev_guide_ukernel_transform) shall be created to do the actual packing. - - ## Attributes The following ukernel attributes can be set through dedicated setters. @@ -73,12 +61,12 @@ The following ukernel attributes can be set through dedicated setters. 
| Post-op | [Binary](@ref dnnl::post_ops::append_binary) | Applies a @ref dnnl_api_binary operation to the result | General binary post-op restrictions | -@note if zero-points are passed for A/B, fpmath_mode should be set for -the computation to happen over floating-point format (so up-conversion -to floating-point format would happen before computation). If -computation in integer format is needed, BRGeMM ukernel should be -configured without zero-point, and the user should prepare a -compensation term that will be passed to the binary post-op. +@note if zero-points are passed for A/B, fpmath_mode should be set for the +computation to happen over floating-point format (so up-conversion to +floating-point format would happen before computation). If computation in +integer format is needed, BRGeMM ukernel should be configured without +zero-point, and the user should prepare a compensation term that will be passed +to the binary post-op. ## Implementation limitations @@ -86,6 +74,6 @@ BRGeMM ukernel has no known limitations. ## Examples -[BRGeMM ukernel example](@ref brgemm_example_cpp) +[BRGeMM ukernel example](@ref cpu_brgemm_example_cpp) -@copydetails brgemm_example_cpp_short +@copydetails cpu_brgemm_example_cpp diff --git a/doc/ukernel/operations/transform.md b/doc/ukernel/operations/transform.md index 49d9f09f26d..7abe8f7de32 100644 --- a/doc/ukernel/operations/transform.md +++ b/doc/ukernel/operations/transform.md @@ -7,9 +7,9 @@ Data transformation {#dev_guide_ukernel_transform} ## General -The packB ukernel allows to pack BRGeMM B matrices in optimal layout -before executing the [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm). -This is an out-of-place operation. +The packB ukernel allows users to pack BRGeMM B matrices in an optimal layout +before executing the [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm). This is an +out-of-place operation. ## Data Types @@ -37,6 +37,6 @@ No attribute is supported for packB ukernel. 
## Examples -[BRGeMM ukernel example](@ref brgemm_example_cpp) +[BRGeMM ukernel example](@ref cpu_brgemm_example_cpp) -@copydetails brgemm_example_cpp_short +@copydetails cpu_brgemm_example_cpp diff --git a/examples/ukernels/cpu_brgemm.cpp b/examples/ukernels/cpu_brgemm.cpp index 9ae799b530d..0252a783d37 100644 --- a/examples/ukernels/cpu_brgemm.cpp +++ b/examples/ukernels/cpu_brgemm.cpp @@ -14,18 +14,14 @@ * limitations under the License. *******************************************************************************/ -/// @example brgemm.cpp -/// > Annotated version: @ref brgemm_example_cpp -/// -/// @page brgemm_example_cpp_short +/// @example cpu_brgemm.cpp +/// > Annotated version: @ref cpu_brgemm_example_cpp /// +/// @page cpu_brgemm_example_cpp BRGeMM ukernel example /// This C++ API example demonstrates how to create and execute a BRGeMM /// ukernel. /// -/// @page brgemm_example_cpp Example of using BRGeMM ukernel to implement Matmul -/// @copydetails brgemm_example_cpp_short -/// -/// @include brgemm.cpp +/// @include cpu_brgemm.cpp #include #include diff --git a/include/oneapi/dnnl/dnnl_ukernel.h b/include/oneapi/dnnl/dnnl_ukernel.h index 0ae5ec52e65..3143c979ff7 100644 --- a/include/oneapi/dnnl/dnnl_ukernel.h +++ b/include/oneapi/dnnl/dnnl_ukernel.h @@ -124,8 +124,8 @@ dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm, /// Executes a BRGeMM ukernel object with post operations. /// /// @param brgemm BRGeMM ukernel object. -/// @param A_ptr Base pointer to a tensor A. -/// @param B_ptr Base pointer to a tensor B. +/// @param A Base pointer to a tensor A. +/// @param B Base pointer to a tensor B. /// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for /// each batch. A set must be contiguous in memory. A single batch should /// supply offsets for both tensors A and B simultaneously. 
The number of @@ -177,7 +177,7 @@ dnnl_status_t DNNL_API dnnl_brgemm_pack_B_need_pack( const_dnnl_brgemm_pack_B_t brgemm_pack_B, int *need_pack); /// Generates an executable part of BRGeMM ukernel packing B object. -/// @param brgemm BRGeMM ukernel packing B object. +/// @param brgemm_pack_B BRGeMM ukernel packing B object. /// @returns #dnnl_success on success and a status describing the error /// otherwise. dnnl_status_t DNNL_API dnnl_brgemm_pack_B_generate( diff --git a/include/oneapi/dnnl/dnnl_ukernel.hpp b/include/oneapi/dnnl/dnnl_ukernel.hpp index 4ac0b93c56f..32cc8155696 100644 --- a/include/oneapi/dnnl/dnnl_ukernel.hpp +++ b/include/oneapi/dnnl/dnnl_ukernel.hpp @@ -52,7 +52,8 @@ struct handle_traits { /// @} dnnl_api_utils -/// @addtogroup dnnl_api_ukernel +/// @addtogroup dnnl_api_ukernel Ukernels +/// Collection of ukernels /// @{ /// ukernel namespace @@ -61,6 +62,7 @@ namespace ukernel { #ifdef DNNL_EXPERIMENTAL_UKERNEL /// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel +/// BRGeMM ukernel routines /// @{ struct brgemm : public handle { @@ -282,8 +284,8 @@ struct brgemm_pack_B : public handle { /// Executes a BRGeMM ukernel packing tensor B object. /// - /// @param in_ptr Pointer to an input buffer. - /// @param out_ptr Pointer to an output buffer. + /// @param in Pointer to an input buffer. + /// @param out Pointer to an output buffer. 
void execute(const void *in, void *out) const { dnnl_status_t status = dnnl_brgemm_pack_B_execute(get(), in, out); if (status != dnnl_success) From 0301384531cdc01e2a1b0bf803841265b174b88d Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 21 May 2024 16:44:07 -0700 Subject: [PATCH 167/187] doc: random fixes --- doc/Doxyfile.in | 2 +- doc/performance_considerations/dispatcher_control.md | 2 +- doc/primitives/rnn.md | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index ee54ae50d08..39aef31e039 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1962,7 +1962,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_UKERNEL +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_SPARSE DNNL_EXPERIMENTAL_UKERNEL # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/doc/performance_considerations/dispatcher_control.md b/doc/performance_considerations/dispatcher_control.md index 05f196d0629..86c67a872ec 100644 --- a/doc/performance_considerations/dispatcher_control.md +++ b/doc/performance_considerations/dispatcher_control.md @@ -40,7 +40,7 @@ still take effect. 
| \ | AVX10_1_512_AMX or AVX512_CORE_AMX | Intel AVX10.1/512 with float16, Intel DL Boost and bfloat16 support and Intel Advanced Matrix Extensions (Intel AMX) with 8-bit integer and bfloat16 support | | \ | AVX2_VNNI_2 | Intel AVX2 with Intel Deep Learning Boost (Intel DL Boost) with 8-bit integer, float16 and bfloat16 support | | \ | AVX10_1_512_AMX_FP16 or AVX512_CORE_AMX_FP16 | Intel AVX10.1/512 with float16, Intel DL Boost and bfloat16 support and Intel Advanced Matrix Extensions (Intel AMX) with 8-bit integer, bfloat16 and float16 support | -| \ | **DEFAULT** | **No restrictions on the above ISAs, but excludes the below ISAs with preview support in the library (default) | +| \ | **DEFAULT** | **No restrictions on the above ISAs, but excludes the below ISAs with preview support in the library (default)** | @note The ISAs are partially ordered: * SSE41 < AVX < AVX2 < AVX2_VNNI < AVX2_VNNI_2, diff --git a/doc/primitives/rnn.md b/doc/primitives/rnn.md index 58c7a1ee928..85ead747aa9 100644 --- a/doc/primitives/rnn.md +++ b/doc/primitives/rnn.md @@ -275,7 +275,7 @@ h_t &= u_t * h_{t-1, l} + (1 - u_t) * o_t Note that for all tensors with a dimension depending on the gate number, except the bias, we implicitly require the order of these gates to be `u`, `r`, and `o`. For the \bias tensor, we implicitly require the order of the gates to be -`u`, `r`, `o`, and `u'`. +`u`, `r`, `o`, and `u\'`. @note If you need to replace u_t by (1-u_t) when computing h_t, you can achieve this by multiplying \f$W_u\f$, \f$U_u\f$ and \f$B_u\f$ by \f$-1\f$. @@ -342,7 +342,7 @@ h_t &= \tilde u_t * h_{t-1, l} + (1 - \tilde u_t) * o_t Note that for all tensors with a dimension depending on the gate number, except the bias, we implicitly require the order of these gates to be `u`, `r`, and `o`. For the \bias tensor, we implicitly require the order of the gates to be -`u`, `r`, `o`, and `u'`. +`u`, `r`, `o`, and `u\'`. 
## Considerations for Training @@ -432,10 +432,10 @@ The following table summarizes the data layouts supported by the RNN primitive. Propagation | Input/Output Data | Recurrent Data | Layer and Iteration Weights | Peephole Weights and Bias | Projection LSTM Weights --------------------|----------------------|----------------------|-------------------------------|---------------------------|------------------------ -Forward / Backward | #dnnl_format_tag_any | #dnnl_format_tag_any | #dnnl_format_tag_any | #dnnl_ldgo | #dnnl_format_tag_any -Forward | #dnnl_ntc, #dnnl_tnc | #dnnl_ldnc | #dnnl_ldigo | #dnnl_ldgo | #dnnl_ldio -Backward | #dnnl_ntc, #dnnl_tnc | #dnnl_ldnc | #dnnl_ldigo, #dnnl_ldgoi(gpu) | #dnnl_ldgo | #dnnl_ldoi +-------------------|----------------------|----------------------|--------------------------------|---------------------------|------------------------ +Forward / Backward | #dnnl_format_tag_any | #dnnl_format_tag_any | #dnnl_format_tag_any | #dnnl_ldgo | #dnnl_format_tag_any +Forward | #dnnl_ntc, #dnnl_tnc | #dnnl_ldnc | #dnnl_ldigo | #dnnl_ldgo | #dnnl_ldio +Backward | #dnnl_ntc, #dnnl_tnc | #dnnl_ldnc | #dnnl_ldigo, #dnnl_ldgoi (GPU) | #dnnl_ldgo | #dnnl_ldoi While an RNN primitive can be created with memory formats specified explicitly, the performance is likely to be sub-optimal. 
When using `any`, it From 80b41a0582cdef8a79002ad483776978d50dcbd4 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 21 May 2024 16:44:17 -0700 Subject: [PATCH 168/187] include: fix autogenerated doc for graph api --- include/oneapi/dnnl/dnnl_graph.h | 7 ++++++- include/oneapi/dnnl/dnnl_graph.hpp | 15 +++++++++------ include/oneapi/dnnl/dnnl_graph_ocl.h | 5 +++++ include/oneapi/dnnl/dnnl_graph_ocl.hpp | 16 +++++++++++----- include/oneapi/dnnl/dnnl_graph_sycl.h | 7 ++++++- include/oneapi/dnnl/dnnl_graph_sycl.hpp | 14 ++++++++++---- include/oneapi/dnnl/dnnl_graph_types.h | 7 ++++++- 7 files changed, 53 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dnnl/dnnl_graph.h b/include/oneapi/dnnl/dnnl_graph.h index a5160dea351..a2db234f984 100644 --- a/include/oneapi/dnnl/dnnl_graph.h +++ b/include/oneapi/dnnl/dnnl_graph.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,9 @@ extern "C" { #endif +/// @addtogroup dnnl_api +/// @{ + /// @addtogroup dnnl_graph_api /// @{ @@ -729,6 +732,8 @@ dnnl_status_t DNNL_API dnnl_graph_get_constant_tensor_cache_capacity( /// @} dnnl_graph_api +/// @} dnnl_api + #ifdef __cplusplus } #endif diff --git a/include/oneapi/dnnl/dnnl_graph.hpp b/include/oneapi/dnnl/dnnl_graph.hpp index 4161a46b277..3726f3e65af 100644 --- a/include/oneapi/dnnl/dnnl_graph.hpp +++ b/include/oneapi/dnnl/dnnl_graph.hpp @@ -29,10 +29,13 @@ /// @addtogroup dnnl_api /// @{ +namespace dnnl { + /// @addtogroup dnnl_graph_api Graph API +/// oneDNN Graph API /// @{ -namespace dnnl { +/// oneDNN Graph namespace namespace graph { /// @cond DO_NOT_DOCUMENT_THIS @@ -118,7 +121,6 @@ using req = typename std::enable_if::type; /// @addtogroup dnnl_graph_api_status Status /// Definitions of status values returned by the library functions. -/// /// @{ /// Status values returned by the library functions. @@ -147,7 +149,7 @@ enum class status { invalid_data_type = dnnl_invalid_data_type, }; -/// @} dnnl_api_status +/// @} dnnl_graph_api_status /// @addtogroup dnnl_graph_api_allocator Allocator /// @@ -1543,9 +1545,12 @@ inline size_t get_constant_tensor_cache_capacity(engine::kind kind) { return size; } -/// @} dnnl_graph_constant_tensor_cache +/// @} dnnl_graph_api_constant_tensor_cache } // namespace graph + +/// @} dnnl_graph_api + } // namespace dnnl /// @cond DO_NOT_DOCUMENT_THIS @@ -1562,8 +1567,6 @@ namespace dnnl = ::dnnl; /// @endcond -/// @} dnnl_graph_api - /// @} dnnl_api #endif diff --git a/include/oneapi/dnnl/dnnl_graph_ocl.h b/include/oneapi/dnnl/dnnl_graph_ocl.h index 5183c8c9b8f..81530fc7a37 100644 --- a/include/oneapi/dnnl/dnnl_graph_ocl.h +++ b/include/oneapi/dnnl/dnnl_graph_ocl.h @@ -32,6 +32,9 @@ extern "C" { #endif +/// @addtogroup dnnl_api +/// @{ + /// @addtogroup dnnl_graph_api /// @{ @@ -137,6 +140,8 @@ dnnl_status_t DNNL_API dnnl_graph_ocl_interop_compiled_partition_execute( /// @} dnnl_graph_api +/// @} 
dnnl_api + #ifdef __cplusplus } #endif diff --git a/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/include/oneapi/dnnl/dnnl_graph_ocl.hpp index ba5b34a2440..b893ff95f83 100644 --- a/include/oneapi/dnnl/dnnl_graph_ocl.hpp +++ b/include/oneapi/dnnl/dnnl_graph_ocl.hpp @@ -27,19 +27,22 @@ #include "oneapi/dnnl/dnnl_ocl.hpp" /// @endcond -/// @addtogroup dnnl_graph_api +/// @addtogroup dnnl_api /// @{ namespace dnnl { + +/// @addtogroup dnnl_graph_api +/// @{ + namespace graph { /// @addtogroup dnnl_graph_api_interop Runtime interoperability API /// API extensions to interact with the underlying run-time. /// @{ -/// @addtogroup dnnl_graph_api_ocl_interop OpenCL interoperability API API -/// extensions to interact with the underlying OpenCL run-time. -/// +/// @addtogroup dnnl_graph_api_ocl_interop OpenCL interoperability API +/// API extensions to interact with the underlying OpenCL run-time. /// @{ /// OpenCL interoperability namespace @@ -140,8 +143,11 @@ inline cl_event execute(compiled_partition &c_partition, stream &astream, /// @} dnnl_graph_api_interop } // namespace graph -} // namespace dnnl /// @} dnnl_graph_api +} // namespace dnnl + +/// @} dnnl_api + #endif diff --git a/include/oneapi/dnnl/dnnl_graph_sycl.h b/include/oneapi/dnnl/dnnl_graph_sycl.h index cf247624fea..518b94e4739 100644 --- a/include/oneapi/dnnl/dnnl_graph_sycl.h +++ b/include/oneapi/dnnl/dnnl_graph_sycl.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,9 @@ extern "C" { #endif +/// @addtogroup dnnl_api +/// @{ + /// @addtogroup dnnl_graph_api /// @{ @@ -87,6 +90,8 @@ dnnl_status_t DNNL_API dnnl_graph_sycl_interop_compiled_partition_execute( /// @} dnnl_graph_api +/// @} dnnl_api + #ifdef __cplusplus } #endif diff --git a/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/include/oneapi/dnnl/dnnl_graph_sycl.hpp index b0824b4f366..5569b8b852b 100644 --- a/include/oneapi/dnnl/dnnl_graph_sycl.hpp +++ b/include/oneapi/dnnl/dnnl_graph_sycl.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,10 +32,14 @@ #include "oneapi/dnnl/dnnl_graph_sycl.h" /// @endcond -/// @addtogroup dnnl_graph_api +/// @addtogroup dnnl_api /// @{ namespace dnnl { + +/// @addtogroup dnnl_graph_api +/// @{ + namespace graph { /// @addtogroup dnnl_graph_api_interop Runtime interoperability API @@ -44,7 +48,6 @@ namespace graph { /// @addtogroup dnnl_graph_api_sycl_interop SYCL interoperability API /// API extensions to interact with the underlying SYCL run-time. 
-/// /// @{ /// SYCL interoperability namespace @@ -117,8 +120,11 @@ inline sycl::event execute(compiled_partition &c_partition, stream &astream, /// @} dnnl_graph_api_interop } // namespace graph -} // namespace dnnl /// @} dnnl_graph_api +} // namespace dnnl + +/// @} dnnl_api + #endif diff --git a/include/oneapi/dnnl/dnnl_graph_types.h b/include/oneapi/dnnl/dnnl_graph_types.h index e77e27706c0..f1e3cd60af7 100644 --- a/include/oneapi/dnnl/dnnl_graph_types.h +++ b/include/oneapi/dnnl/dnnl_graph_types.h @@ -31,6 +31,9 @@ extern "C" { #include "oneapi/dnnl/dnnl_common_types.h" /// @endcond +/// @addtogroup dnnl_api +/// @{ + /// @addtogroup dnnl_graph_api /// @{ @@ -162,7 +165,7 @@ typedef struct dnnl_graph_graph *dnnl_graph_graph_t; /// A constant graph handle. typedef const struct dnnl_graph_graph *const_dnnl_graph_graph_t; -/// @} +/// @} dnnl_graph_api_graph /// @addtogroup dnnl_graph_api_op /// @{ @@ -459,6 +462,8 @@ typedef const struct dnnl_graph_tensor *const_dnnl_graph_tensor_t; /// @} dnnl_graph_api +/// @} dnnl_api + #ifdef __cplusplus } #endif From abcaee12ad9dfe6e43cc6c5bf16d6cc464639ce0 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 21 May 2024 16:10:37 -0700 Subject: [PATCH 169/187] benchdnn: enable testing targets on Linux by default DNNL_BUILD_FOR_CI is no longer needed on Linux to use ctest with benchdnn targets. 
--- tests/benchdnn/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchdnn/CMakeLists.txt b/tests/benchdnn/CMakeLists.txt index 1291c66c858..29f8ab1f096 100644 --- a/tests/benchdnn/CMakeLists.txt +++ b/tests/benchdnn/CMakeLists.txt @@ -85,7 +85,7 @@ function(register_benchdnn_test engine driver test_file) set(cmd "--mode=${tm} ${mode_modifier} -v1 --engine=${engine} --${driver} --batch=${test_file}") set(benchdnn_target ${target_name}_${engine}) - if(DNNL_BUILD_FOR_CI) + if(NOT WIN32 OR DNNL_BUILD_FOR_CI) string(REPLACE " " ";" cmd "benchdnn ${cmd}") add_dnnl_test(${benchdnn_target} ${cmd}) else() From 092480a9fee7a265a414616f33ae7037db08cbf9 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 21 May 2024 16:11:14 -0700 Subject: [PATCH 170/187] doc: extend instructions on benchdnn validation --- CONTRIBUTING.md | 16 +++++++++++++--- tests/benchdnn/doc/benchdnn_general_info.md | 16 ++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f72e7268895..ea8c718f80e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -137,7 +137,17 @@ following information: oneDNN uses gtests for lightweight functional testing and benchdnn for performance and functional testing. -Be sure to extend the existing tests when fixing an issue. +Verify the modified code is covered by existing tests. If not, update the +coverage to validate the change and submit it as a part of the PR. -Developing new benchdnn tests can be hard, so it is a good idea to start with -gtests first. +Use the following command to run tests selected by a build configuration: +``` sh + ctest +``` + +To modify the coverage, use the +[`ONEDNN_TEST_SET`](https://oneapi-src.github.io/oneDNN/dev_guide_build_options.html#onednn-test-set) +build option. + +More details on how to run benchdnn can be found in +[benchdnn documentation](tests/benchdnn/doc/benchdnn_general_info.md#running-tests).
diff --git a/tests/benchdnn/doc/benchdnn_general_info.md b/tests/benchdnn/doc/benchdnn_general_info.md index 78ac514211d..99a3f6901fe 100644 --- a/tests/benchdnn/doc/benchdnn_general_info.md +++ b/tests/benchdnn/doc/benchdnn_general_info.md @@ -16,18 +16,18 @@ Returns `1` if any submitted tests returned status `FAILED` or `UNIMPLEMENTED`, ## Running Tests -oneDNN comes with its own testing infrastructure enabled through CMake. Tests -can be executed via the command: +oneDNN comes with its own testing infrastructure enabled through CMake. +If the project is built with `DNNL_BUILD_TESTS` set to `TRUE`, then CMake will +add test targets automatically. +Targets can be triggered by the following command: ``` sh - make test_ + ctest [-R ".*benchdnn.*"] ``` -This instructs CMake to build a deployable project and run the specific test. - -These tests target specific oneDNN features and are based on benchdnn -configurable executions. +, where the content in brackets is optional and limits testing to benchdnn only. +The pattern can be extended further for finer granularity. The available tests can be found in the oneDNN directory: -tests/benchdnn/inputs//.
+tests/benchdnn/inputs/ ## Glossary From fe4f317f95212ad3de30e4adef8f01f02183ffef Mon Sep 17 00:00:00 2001 From: Kealan Barbieri Date: Thu, 16 May 2024 11:50:58 -0700 Subject: [PATCH 171/187] gpu: intel: jit: gemm: update from upstream --- .../jit/gemm/gen_gemm_kernel_generator.cpp | 218 +-- .../jit/gemm/gen_gemm_kernel_generator.hpp | 17 +- src/gpu/intel/jit/gemm/kernel.db | 1457 +++++++++-------- src/gpu/intel/jit/gemm/kernel_selector.cpp | 2 +- src/gpu/intel/jit/gemm/strategy_parser.cpp | 2 +- src/gpu/intel/jit/gemm/utils.hpp | 12 + 6 files changed, 910 insertions(+), 798 deletions(-) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp index 7999736bc68..b9543264d9a 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.cpp @@ -47,12 +47,6 @@ class need_vflag : public std::runtime_error { need_vflag() : std::runtime_error("Need virtual flag registers") {} }; -class stub_exception : public std::runtime_error { -public: - stub_exception() - : std::runtime_error("Functionality not yet implemented") {} -}; - class hw_unsupported_exception : public std::runtime_error { public: hw_unsupported_exception() @@ -63,10 +57,6 @@ class hw_unsupported_exception : public std::runtime_error { throw hw_unsupported_exception(); } -[[noreturn]] static void stub() { - throw stub_exception(); -} - // Forward declarations. 
static inline void safeReleaseRanges(GRFMultirange &ranges, CommonState &state); static inline void kLoopModifiedFlagAP(GEMMState &state); @@ -79,6 +69,7 @@ static inline Immediate cast(Type T, U val) { switch (T) { case Type::f16: return half(val); case Type::f32: return float(val); + case Type::f64: return double(val); case Type::u8: return uint8_t(val); case Type::s8: return int8_t(val); case Type::u16: return uint16_t(val); @@ -106,10 +97,6 @@ bool Type::isSubsetOf(Type T) const { return (bits() < T.bits()); } -void Type::subByteCheck() const { - if (isInt4()) stub(); -} - constexpr bool operator==(const RegData &rd, int i) { return false; } @@ -175,6 +162,8 @@ static inline bool hasNativeAtomicAdd(HW hw, Type T, return true; else if (T == Type::f32) return floatAtomics && (hw >= HW::XeHP); + else if (T == Type::f64) + return floatAtomics && (hw >= HW::XeHPC); else return false; } @@ -3371,7 +3360,7 @@ bool gemm_kernel_generator_t::getSubblocks(Type T, vector &sublayout, const vector &layout, bool column, int x1, int x2, bool overrunOK, const MatrixAddressing &atype, - const MatrixAddressingStrategy &astrategy) { + const MatrixAddressingStrategy &astrategy, bool decoalesce) { auto RegisterBlock::*nq = column ? &RegisterBlock::nc : &RegisterBlock::nr; auto RegisterBlock::*offsetQ = column ? &RegisterBlock::offsetC : &RegisterBlock::offsetR; @@ -3390,6 +3379,7 @@ bool gemm_kernel_generator_t::getSubblocks(Type T, status << "Could not make subblock." << status_stream::endl; return false; } + if (decoalesce) subblock.offsetAddr = 0; sublayout.push_back(subblock); } } @@ -4778,13 +4768,15 @@ InstructionModifier gemm_kernel_generator_t::registerBlockMasking( mod |= ~pflag; else mod |= pflag; - if (hw >= HW::XeHPC) { - if (block.flagAll) mod |= all; - if (block.flagAny) mod |= any; - } else if (block.flagAll) - mod |= (block.simdSize > 8) ? all16h : all8h; - else if (block.flagAny) - mod |= (block.simdSize > 8) ? 
any16h : any8h; + if (block.simdSize > 1) { + if (hw >= HW::XeHPC) { + if (block.flagAll) mod |= all; + if (block.flagAny) mod |= any; + } else if (block.flagAll) + mod |= (block.simdSize > 8) ? all16h : all8h; + else if (block.flagAny) + mod |= (block.simdSize > 8) ? any16h : any8h; + } } else if (outFlag) *outFlag = FlagRegister(); @@ -5230,6 +5222,11 @@ void gemm_kernel_generator_t::atomicAddMatrixBlock(Type T, const GRF &src, atomic(AtomicOp::fadd, mod, scattered_dword(), astrategy.base, addr[hoff], curSrc); break; + case Type::f64: + atomic(AtomicOp::fadd_64b, mod, + scattered_qword(), astrategy.base, + addr[hoff], curSrc); + break; case Type::u64: case Type::s64: atomic(AtomicOp::add, mod, scattered_qword(), @@ -5754,6 +5751,7 @@ LDMultiples gemm_kernel_generator_t::createLDMultiples(bool a64, LDMultiples result; result.range = r; result.a64 = a64; + result.count = nregs * simd; return result; } @@ -5776,8 +5774,8 @@ Subregister gemm_kernel_generator_t::findLDMultiple( static inline void releaseLDMultiples( LDMultiples &multiples, CommonState &state) { - state.ra.safeRelease(multiples.range); - multiples.a64 = false; + state.ra.release(multiples.range); + multiples = LDMultiples {}; } // Ugly helpers handling address shifts. constexpr if would clean this all up. @@ -5859,12 +5857,9 @@ static inline bool canRelAddr(const RegisterBlock &blockSrc, } static inline int block2DWidthAlignment(Type T, const RegisterBlock &block, - const MatrixAddressing &atype, const MatrixAddressingStrategy &astrategy) { // Block 2D width must be DW-aligned, but generally use QW alignment for better performance for reads. - return ((astrategy.noExtraPad || block.writable || atype.alignment % 8) - ? 4 - : 8); + return ((astrategy.noExtraPad || block.writable) ? 
4 : 8); } static inline int block2DBaseAlignment(HW hw, int stepping) { @@ -5885,7 +5880,7 @@ void gemm_kernel_generator_t::setupAddr(Type T, const GRFRange &addr, bool a64 = astrategy.base.getModel() == ModelA64; auto ensureLDMultiples = [&](int n) { - if (ldMultiples.range.isInvalid()) { + if (ldMultiples.count < n) { ldMultiples = createLDMultiples(a64, n, bld, strategy, state); if (ldMultiples.range.isInvalid()) throw out_of_registers_exception(); @@ -6086,7 +6081,7 @@ void gemm_kernel_generator_t::setupAddr(Type T, const GRFRange &addr, if (doBaseAdjust && !astrategy.address2D) stub(); Subregister baStorage, baseAdjust, baseAdjustElems; - int widthAlign = block2DWidthAlignment(T, block, atype, astrategy); + int widthAlign = block2DWidthAlignment(T, block, astrategy); if (!astrategy.address2D) mov(4, addr[0].ud(4)(1), 0u); @@ -6843,7 +6838,6 @@ void gemm_kernel_generator_t::remaskLayout(Type T, int index, bool column, } static bool needsRemask(Type T, bool column, const RegisterBlock &block, - const MatrixAddressing &atype, const MatrixAddressingStrategy &astrategy, bool ignoreMasks = false) { if (!ignoreMasks) if (column ? 
!block.remainderC : !block.remainderR) return false; @@ -6855,8 +6849,8 @@ static bool needsRemask(Type T, bool column, const RegisterBlock &block, int maskGranularity = block.ebytes; if (block.ebytes >= 16) maskGranularity = 4; if (block2DRemask) - maskGranularity = std::max(maskGranularity, - block2DWidthAlignment(T, block, atype, astrategy)); + maskGranularity = std::max( + maskGranularity, block2DWidthAlignment(T, block, astrategy)); if (ignoreMasks && !(block2DRemask && astrategy.address2D)) maskGranularity = 256; @@ -6864,11 +6858,10 @@ static bool needsRemask(Type T, bool column, const RegisterBlock &block, } static bool needsRemask(Type T, bool column, - const vector &layout, const MatrixAddressing &atype, + const vector &layout, const MatrixAddressingStrategy &astrategy, bool ignoreMasks = false) { for (auto &block : layout) - if (needsRemask(T, column, block, atype, astrategy, ignoreMasks)) - return true; + if (needsRemask(T, column, block, astrategy, ignoreMasks)) return true; return false; } @@ -9273,8 +9266,8 @@ bool gemm_kernel_generator_t::gemmPrepMaskedAB( recalc = true; } // Avoid access patterns that require double masking, unless enabled. 
- if (isBlock2D(strategy.A.accessType) || strategy.unroll[LoopM] == 1 - || strategy.doubleMasking) + if (isBlock2D(strategy.A.accessType) + || strategy.allowDoubleMasking(LoopM)) noop(); else if (!isRegisterColMajor(problem.Ta_ext, problem.A, strategy.A)) { transposeAccessType(strategy.A); @@ -9294,8 +9287,8 @@ bool gemm_kernel_generator_t::gemmPrepMaskedAB( trimKChain(hw, strategy.kb_load, problem, strategy); recalc = true; } - if (isBlock2D(strategy.B.accessType) || strategy.unroll[LoopN] == 1 - || strategy.doubleMasking) + if (isBlock2D(strategy.B.accessType) + || strategy.allowDoubleMasking(LoopN)) noop(); else if (isRegisterColMajor(problem.Tb_ext, problem.B, strategy.B)) { transposeAccessType(strategy.B); @@ -10112,6 +10105,7 @@ void gemm_kernel_generator_t::accumulateSum(bool column, Type Tsrc, if (x0 % reduce || x1 % reduce) stub(); GRFRange temp; + Subregister imm; for (int y = y0; y < y1; y += yinc) { for (int x = x0; x < x1;) { @@ -10182,9 +10176,18 @@ void gemm_kernel_generator_t::accumulateSum(bool column, Type Tsrc, dp4a(ne, dst, dst, srcDP4A, state.all1s); else if (yinc == 1) add(ne, dst, srcBase(4), dst); - else - dp4a(ne, dst, dst, srcDP4A, - 0x01010101 & ((1 << (yinc * 8)) - 1)); + else { + if (hw == HW::XeHPC) { + // Workaround, some issue with dp4a with immediates + // TODO: hoist immediate out of inner-loop + if (imm.isInvalid()) + imm = state.ra.alloc_sub(Tdst.ngen()); + mov(1, imm, 0x01010101 & ((1 << (yinc * 8)) - 1)); + dp4a(ne, dst, dst, srcDP4A, imm); + } else + dp4a(ne, dst, dst, srcDP4A, + 0x01010101 & ((1 << (yinc * 8)) - 1)); + } } else if (hReduce && blockSrc->crosspack == 1) { if (Tsrc.isSigned()) dp4a(ne, dst, dst, srcDP4A, state.all1s); @@ -10202,6 +10205,7 @@ void gemm_kernel_generator_t::accumulateSum(bool column, Type Tsrc, } state.ra.safeRelease(temp); + state.ra.safeRelease(imm); } template @@ -10643,19 +10647,20 @@ bool gemm_kernel_generator_t::gemmConvertC(Type Tnew, template void 
gemm_kernel_generator_t::gemmAlphaScale(GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state, bool cxCombine) { - auto Ts = problem.Ts; + auto Tacc = state.Tacc; auto &alpha = problem.alpha; auto valphar = state.inputs.alpha_real; if (alpha == -1) { - map(hw, Ts.real(), state.C_regs[0], state.C_regs[0], strategy, + map(hw, Tacc.real(), state.C_regs[0], state.C_regs[0], strategy, [&](int esize, GRF acc, GRF _) { mov(esize, acc, -acc); }); } else if (alpha != 1) { - map(hw, Ts.real(), state.C_regs[0], state.C_regs[0], strategy, + map(hw, Tacc.real(), state.C_regs[0], state.C_regs[0], strategy, [&](int esize, GRF acc, GRF _) { - alpha.fixed() ? mul(esize, acc, acc, cast(Ts.real(), alpha)) - : mul(esize, acc, acc, - valphar.getRegAvoiding(hw, acc)); + alpha.fixed() + ? mul(esize, acc, acc, cast(Tacc.real(), alpha)) + : mul(esize, acc, acc, + valphar.getRegAvoiding(hw, acc)); }); } @@ -14073,7 +14078,7 @@ bool gemm_kernel_generator_t::kLoopSetup(const GEMMProblem &problem, Ai_addrsK.resize(1); success = getSubblocks(Ta_ext, Ai_layoutK[h], Ai_layoutRem, true, h, h + 1, state.Ai_strategy.padded, state.Ai, - state.Ai_strategy); + state.Ai_strategy, true); } if (!success) { @@ -14112,7 +14117,7 @@ bool gemm_kernel_generator_t::kLoopSetup(const GEMMProblem &problem, Bi_addrsK.resize(1); success = getSubblocks(Tb_ext, Bi_layoutK[h], Bi_layoutRem, false, h, h + 1, state.Bi_strategy.padded, state.Bi, - state.Bi_strategy); + state.Bi_strategy, true); } if (!success) { @@ -14496,11 +14501,11 @@ void gemm_kernel_generator_t::kLoopActivateSLMRemainder(bool active, bool asIfMaskedAi = Ai_lateKRem && state.Ai_strategy.padded; bool asIfMaskedBi = Bi_lateKRem && state.Bi_strategy.padded; slmRemaskA = slmA && mayAccessAllK && !Ai_remIncrCopy - && needsRemask(Ta_ext, true, state.Ai_layoutRem, state.Ai, - state.Ai_strategy, asIfMaskedAi); + && needsRemask(Ta_ext, true, state.Ai_layoutRem, state.Ai_strategy, + asIfMaskedAi); slmRemaskB = slmB && mayAccessAllK && 
!Bi_remIncrCopy - && needsRemask(Tb_ext, false, state.Bi_layoutRem, state.Bi, - state.Bi_strategy, asIfMaskedBi); + && needsRemask(Tb_ext, false, state.Bi_layoutRem, state.Bi_strategy, + asIfMaskedBi); } static inline void kLoopModifiedFlagAP(GEMMState &state) { @@ -15165,8 +15170,10 @@ void gemm_kernel_generator_t::kLoop(KLoop type, const GEMMProblem &problem, int last = unrollK; if (hasFlags(state.A_layout)) last = std::min(last, ka_loadMain); if (hasFlags(state.B_layout)) last = std::min(last, kb_loadMain); - if (hasFlags(state.Ap_layout)) last = std::min(last, ka_pfStride); - if (hasFlags(state.Bp_layout)) last = std::min(last, kb_pfStride); + if (hasFlags(state.Ap_layout)) + last = std::min(last, 1 + (strategy.prefetchA - 1) % ka_pfStride); + if (hasFlags(state.Bp_layout)) + last = std::min(last, 1 + (strategy.prefetchB - 1) % kb_pfStride); if (hasFlags(state.Ai_layout) || hasFlags(state.Bi_layout)) { last = std::min(last, unrollKSLM); if (lookaheadSLMReload % unrollKSLM != 0) @@ -15352,11 +15359,11 @@ void gemm_kernel_generator_t::kLoop(KLoop type, const GEMMProblem &problem, // A/B remasking in k dimension, during remainder handling. bool remaskA = !slmA && readA && (minOPCount > 1) - && needsRemask(Ta_load, true, state.A_layoutRem, problem.A, - strategy.A, state.A_lateKRem); + && needsRemask(Ta_load, true, state.A_layoutRem, strategy.A, + state.A_lateKRem); bool remaskB = !slmB && readB && (minOPCount > 1) - && needsRemask(Tb_load, false, state.B_layoutRem, problem.B, - strategy.B, state.B_lateKRem); + && needsRemask(Tb_load, false, state.B_layoutRem, strategy.B, + state.B_lateKRem); if (Ta.isInteger() && Tb.isInteger() && !calcASums && !calcBSums) { // Only need to remask one operand for integer A/B. Choose the smaller one. 
@@ -15618,16 +15625,15 @@ void gemm_kernel_generator_t::kLoop(KLoop type, const GEMMProblem &problem, } if (slmDequantize2DB) { if (bo2D) - gemmRepack2DOffsetData(Tb_ext, problem.Tbo, state.Tao_int, + gemmRepack2DOffsetData(Tb_ext, problem.Tbo, state.Tbo_int, state.B_offsetLayout, state.Br_offsetLayout, state.B_offsetRegs, state.Br_offsetRegs, problem, strategy, state); - if (bs2D) { + if (bs2D) gemmRepack2DQuantizationData(problem.Tb_scale, state.Tb_scaleOp, state.B_scaleLayout, state.Br_scaleLayout, state.B_scaleRegs, state.Br_scaleRegs, problem, strategy, state); - } } }); @@ -16279,37 +16285,30 @@ bool gemm_kernel_generator_t::gemmAccumulateCSetup( for (LoopType loop : {LoopM, LoopN, LoopK}) state.remaindersCoop[loop] = state.remainders[loop]; - if ((slmA || (strategy.prefetchA && strategy.cooperativePF)) && remM_A) - switch (state.effCoopA) { + auto calcMNRemCoop = [&](CoopSplit split, bool isM) { + auto loopX = isM ? LoopM : LoopN; + auto loopY = isM ? LoopN : LoopM; + switch (split) { + default: return state.remainders[loopX]; + case CoopSplit::FullK: return state.remaindersWG[loopX]; case CoopSplit::MN: { - state.remaindersCoop[LoopM] = state.ra.alloc_sub(); - int32_t chunkM = unrollM / strategy.wg[LoopN]; - emad(1 | sat, state.remaindersCoop[LoopM], - state.remainders[LoopM], -state.lidN.w(), chunkM, + auto rem = state.ra.alloc_sub(); + int32_t chunk = strategy.unroll[loopX] / strategy.wg[loopY]; + auto lid = isM ? 
state.lidN : state.lidM; + emad(1 | sat, rem, state.remainders[loopX], -lid.w(), chunk, strategy, state); - break; + return rem; } - case CoopSplit::FullK: - state.remaindersCoop[LoopM] = state.remaindersWG[LoopM]; - break; - default: break; } + }; - if ((slmB || (strategy.prefetchB && strategy.cooperativePF)) && remN_B) - switch (state.effCoopB) { - case CoopSplit::MN: { - state.remaindersCoop[LoopN] = state.ra.alloc_sub(); - int32_t chunkN = unrollN / strategy.wg[LoopM]; - emad(1 | sat, state.remaindersCoop[LoopN], - state.remainders[LoopN], -state.lidM.w(), chunkN, - strategy, state); - break; - } - case CoopSplit::FullK: - state.remaindersCoop[LoopN] = state.remaindersWG[LoopN]; - break; - default: break; - } + if ((slmA || (strategy.prefetchA && strategy.cooperativePF)) && remM_A) { + state.remaindersCoop[LoopM] = calcMNRemCoop(state.effCoopA, true); + } + + if ((slmB || (strategy.prefetchB && strategy.cooperativePF)) && remN_B) { + state.remaindersCoop[LoopN] = calcMNRemCoop(state.effCoopB, false); + } // Prepare layouts for prefetch. 
bool remM_Cp = remM_C && strategy.C.base.isStateless(); @@ -16972,30 +16971,22 @@ bool gemm_kernel_generator_t::gemmAccumulateCSetup( auto assignAllMasks = [&]() { return assignMasks(state.A_layout, LoopM, LoopK, masks, strategy, state) - && assignMasks( - state.A_layoutAlt, LoopM, LoopK, masks, strategy, state) && assignMasks(state.A_offsetLayout, LoopM, LoopK, masks, strategy, state) && assignMasks(state.A_scaleLayout, LoopM, LoopK, masks, strategy, state) && assignMasks(state.Ap_layout, LoopM, LoopK, A_cmasks, strategy, state) - && assignMasks(state.Ap_layoutAlt, LoopM, LoopK, A_cmasks, - strategy, state) && assignMasks(state.Ai_layout, LoopM, LoopNone, A_cmasks, strategy, state) && assignMasks( state.B_layout, LoopK, LoopN, masks, strategy, state) - && assignMasks( - state.B_layoutAlt, LoopK, LoopN, masks, strategy, state) && assignMasks(state.B_offsetLayout, LoopK, LoopN, masks, strategy, state) && assignMasks(state.B_scaleLayout, LoopK, LoopN, masks, strategy, state) && assignMasks(state.Bp_layout, LoopK, LoopN, B_cmasks, strategy, state) - && assignMasks(state.Bp_layoutAlt, LoopK, LoopN, B_cmasks, - strategy, state) && assignMasks(state.Bi_layout, LoopNone, LoopN, B_cmasks, strategy, state); }; @@ -19386,8 +19377,9 @@ void gemm_kernel_generator_t::gemmInitState(GEMMProblem &problem, state.lda = state.inputs.lda; state.ldb = state.inputs.ldb; - if (GRF::bytes(hw) == 64 && strategy.doubleMasking) { - if (!isRegisterColMajor(Ta_ext, problem.A, strategy.A)) { + if (GRF::bytes(hw) == 64) { + if (!isRegisterColMajor(Ta_ext, problem.A, strategy.A) + && strategy.allowDoubleMasking(LoopM)) { int ka = strategy.slmA ? strategy.unrollKSLM : strategy.ka_load; int effAlign = isBlocklike(strategy.A.accessType) ? 
problem.A.alignment @@ -19396,7 +19388,8 @@ void gemm_kernel_generator_t::gemmInitState(GEMMProblem &problem, * (std::min(4, effAlign) / std::min(4, Ta_ext.paddedSize()))); } - if (isRegisterColMajor(Tb_ext, problem.B, strategy.B)) { + if (isRegisterColMajor(Tb_ext, problem.B, strategy.B) + && strategy.allowDoubleMasking(LoopN)) { int kb = strategy.slmB ? strategy.unrollKSLM : strategy.kb_load; int effAlign = isBlocklike(strategy.B.accessType) ? problem.B.alignment @@ -22591,6 +22584,13 @@ void GEMMStrategy::preflight(HW hw, const GEMMProblem &problem) { if (hw < HW::Gen12LP && isIGEMM(Ta, Tb, Tc)) fmaSIMD = 32; } + // Force wider SIMD for 4x24 ZGEMM (otherwise uses too many flag registers) + if (GRF::bytes(hw) == 32 && (Tc == Type::f64) && unroll[LoopN] > 16) { + C.smode = ScatterSIMD::Wide; + } + + doubleWA |= (Tc_real == Type::f64) && (hw <= HW::Gen9); + slmFenceWARWA |= (hw >= HW::XeHPG); if (problem.batch != BatchMode::None) { @@ -22628,6 +22628,8 @@ void GEMMStrategy::preflight(HW hw, const GEMMProblem &problem) { // Priority: k chaining > extra C registers > r0 header storage. // 64-bit emulation > r0 header storage. 
if (hw <= HW::Gen9) kChain = 1; + if (AccumulatorRegister::count(hw, GRFs, problem.Tc.real().ngen()) == 0) + kChain = 1; cAccumulators &= (kChain == 1); bool emulateNeedsAcc = emulate.emulate64 || emulate.emulateDWxDW; @@ -22640,7 +22642,7 @@ void GEMMStrategy::preflight(HW hw, const GEMMProblem &problem) { // - mixed hf/f is max SIMD 8 on Gen9 // - mixed hf/f is not allowed on Gen12 // - mixed bf/f is max SIMD 8 on ATS+ - if ((Tc_real == Type::f32) + if ((hw == HW::Gen9) && (Tc_real == Type::f32) && (Ta_real != Type::f32 || Tb_real != Type::f32)) fmaSIMD = std::min(fmaSIMD, GRF::bytes(hw) >> 2); @@ -27220,8 +27222,7 @@ bool gemm_kernel_generator_t::copyRegisters(Type Ts, Type Td, } auto allocTemp = [&]() { - if (preswizzle && copyTemp.isInvalid()) - copyTemp = state.ra.alloc_range(2); + if (copyTemp.isInvalid()) copyTemp = state.ra.alloc_range(2); }; int srcM, srcN; @@ -27266,6 +27267,10 @@ bool gemm_kernel_generator_t::copyRegisters(Type Ts, Type Td, sblock.offsetC + eoffC + dOffC, dst, delems, dblockPtr, qCX); + // Limit due to powers of 2 instruction exec size + selems = rounddown_pow2(selems); + delems = rounddown_pow2(delems); + auto scrosspack = sblock.crosspack; auto dcrosspack = dblockPtr->crosspack; @@ -27318,8 +27323,8 @@ bool gemm_kernel_generator_t::copyRegisters(Type Ts, Type Td, nelems_real = std::min(nelems_real, nes_real); // Special case: mixed mode packed downconversion limited to 1 GRF. - bool src_f8 = Ts_real.isF8(); - bool dst_f8 = Td_real.isF8(); + bool src_f8 = Ts.isF8(); + bool dst_f8 = Td.isF8(); bool f8_align = src_f8 ^ dst_f8; // Check if separate conversions are needed due to size changes. 
@@ -27391,6 +27396,7 @@ bool gemm_kernel_generator_t::copyRegisters(Type Ts, Type Td, if (bfHfCvt) sregConverted = sreg.reinterpret( 0, ngen::DataType::f)(scrosspack); + if (f8_align || bfHfCvt) allocTemp(); if ((byteAlign || bfHfCvt) && Td_real != Type::u16) nelems_real = std::min( diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp index 0372132c4c3..ceff2997542 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp @@ -67,6 +67,7 @@ class Type { invalid = 0, f16 = 0x01000201, f32 = 0x01010402, + f64 = 0x01020803, u4 = 0x21840100, s4 = 0x21850100, u8 = 0x01840100, @@ -102,7 +103,7 @@ class Type { return (val == Type::u8) || (val == Type::s8); } constexpr bool isF8() const { - return (val == Type::bf8 || val == Type::hf8); + return (val == Type::bf8) || (val == Type::hf8); } constexpr bool isSigned() const { return (uint32_t(val) & 0x810000) != 0x800000; @@ -118,8 +119,9 @@ class Type { return paddedSize(); } constexpr int perByte() const { return isInt4() ? 2 : 1; } - - void subByteCheck() const; + void subByteCheck() const { + if (isInt4()) stub(); + } constexpr Type arithmetic() const { return (val == tf32) ? 
Type(f32) : real(); @@ -1324,6 +1326,10 @@ struct GEMMStrategy : public GEMMStrategyPOD { bool checkAdd32Rem() const { return checkAdd32 && emulate.emulate64; } + bool allowDoubleMasking(LoopType loop) const { + return doubleMasking || unroll[loop] == 1; + } + bool registerOutput() const { return C.base.getModel() == ngen::ModelInvalid; } @@ -1351,6 +1357,7 @@ struct GEMMStrategy : public GEMMStrategyPOD { struct LDMultiples { ngen::GRFRange range; bool a64 = false; + int count = 0; }; using LDIncrements = std::vector>; @@ -2122,7 +2129,7 @@ class gemm_kernel_generator_t : public jit_generator { bool getSubblocks(Type T, std::vector &sublayout, const std::vector &layout, bool column, int x1, int x2, bool overrunOK, const MatrixAddressing &atype, - const MatrixAddressingStrategy &astrategy); + const MatrixAddressingStrategy &astrategy, bool decoalesce = false); bool getSubblocks(Type T, std::vector &sublayout, std::vector *subaddrs, std::vector *indices, const std::vector &layout, @@ -2992,6 +2999,7 @@ inline char precisionChar(Type T) { case Type::bf8: return 'Q'; case Type::hf8: return 'q'; case Type::f32: return 'S'; + case Type::f64: return 'D'; case Type::u4: return 'f'; case Type::s4: return 'F'; case Type::u8: return 'o'; @@ -3014,6 +3022,7 @@ static inline Type charPrecision(char c) { case 'Q': return Type::bf8; case 'q': return Type::hf8; case 'S': return Type::f32; + case 'D': return Type::f64; case 'f': return Type::u4; case 'F': return Type::s4; case 'o': return Type::u8; diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index f40a645ddb2..4f301fc3a82 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -15,12 +15,22 @@ *******************************************************************************/ /*@kcatalog@*/ -kcatalog::FlatCatalog<1024> _CATALOG_ -{1, 8380, 1024, { +kcatalog::FlatCatalog<1109> _CATALOG_ +{1, 8309, 1109, { {{'9', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 as8x2 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab l4 acb nmk", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 as16 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, {{'9', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8x2 ab2x2 as l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'9', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "s"}, "ab1 ab2 ab k8 acb bm4032 bn4032 bk1536", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4032, 4032, 1536}, {4096, 4096, 1536}, {48, 8, 8}, {1, 16, 1}, 1, (WGType) 0, 0, 0, 0, {128, 64, 8}, {true, true, true}}, {'W', 1, {384}}}, +{{'9', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "s"}, "ab1 ab2 ab k8 acb bm1008 bn2016 bk1008", {8, (LoopType) 255, 128, 
{(LoopType) 0, (LoopType) 1, (LoopType) 255}, {1008, 2016, 1008}, {4096, 4096, 1008}, {24, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 8}, {true, true, true}}, {'W', 1, {1e+06}}}, +{{'9', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1024, 1024, -1}, {1, 1, 1}, ""}, "ab8 as16 ab acb nmk bk1024", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 8, 16}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {128}}}, +{{'9', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4/2 as4 ab k8 cs bk1024", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {256}}}, +{{'9', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, ""}, "ab2x2 as4x2 ab acb bk1024", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {32}}}, +{{'9', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab2 ab k8 cs bk1024", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {256}}}, +{{'9', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4 as4 ab k8 cs bk512", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 512}, {4096, 4096, 512}, {16, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, 
{true, true, true}}, {'W', 1, {256}}}, +{{'9', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {512, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4x2 as16 ab acb bk512", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 512}, {4096, 4096, 512}, {16, 8, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {128}}}, +{{'9', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 1024, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4x2 as16 ab acb bk512", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 512}, {4096, 4096, 512}, {16, 8, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {128}}}, +{{'9', "gemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4 ab2 as k8 cs bk1024", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {256}}}, {{'9', "gemm", {"H", "H", "H"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 ab8x2 ab k16 acb bm8192 bn4096 bk1536", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 4096, 1536}, {4096, 4096, 1536}, {32, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'9', "gemm", {"H", "H", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4 as k8 acb bm8192 bn4096 bk1536", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 4096, 1536}, {4096, 4096, 1536}, {16, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {32, 64, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 as8 ab k16 l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, @@ -83,6 +93,12 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'C', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2 ab8 ab l4 cab1 wg 4x4 int sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8x2 ab16 ab l4 cb1 wg 8x2 vnc nmk sr", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 16}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, {{'C', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab4 as l4 cb1 wg 8x2 int sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"F", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, 
false, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"H", "H", "H"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 ab4x2 ab k8 vnc bm8192 bn4096 bk1536", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 4096, 1536}, {4096, 4096, 1536}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'C', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "ab4x2 ab32/8 ab k64 l4 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 64}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1024, 32, -1}, {1, 1, 1}, ""}, "ab4x2 as8x2 ab l4 int nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 4, 16}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {128}}}, @@ -96,6 +112,10 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'C', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8x2 ab16 ab l4 cb1 wg 8x2 vnc nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 16}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, {{'C', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab2x2 ab k16 l4 vnc", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 32, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'C', "gemm", {"H", "H", "S"}, 
{"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 ab4 as l4 cb1 wg 8x2 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, 
{false, false, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16x2 ab8x2 ab int bm8192", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb4 sb8 sb l4 int k32 cab1 wg 4x4 ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, {{'C', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {2048, 8, -1}, {1, 1, 1}, "xyz"}, "sb8x2 su16x2 sb l4 ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {128}}}, @@ -104,11 +124,13 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'C', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb4 sb l4 int k16 cab1 wg 4x4", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, {{'C', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8x2 sb8x2 sb l4 vnc k32 cab1 wg 4x4 ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, 
false}}, {'W', 1, {256}}}, {{'C', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb4 sb l4 int k32 cab1 wg 4x4 fn nmk ek", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {16, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb2x2 sb2x2 ab wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'C', "gemm", {"O", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4 sb4 ab wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {4096, 4096, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 ab2x2 ab int bm8192 bn4096 bk1536", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 4096, 1536}, {4096, 4096, 1536}, {16, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab k8 int bm4608 bn4608 bk1536", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4608, 4608, 1536}, {4096, 4096, 1536}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {128, 64, 
4}, {true, true, true}}, {'W', 1, {1e+06}}}, {{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4x2 as4x2 ab int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {64}}}, {{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2 ab32 ab ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 8, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {128}}}, -{{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "ab4x2 ab16/8 ab k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 12, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {384}}}, +{{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qps"}, "ab4x2 ab16/8 ab k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 12, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {384}}}, {{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 32, -1}, {1, 1, 1}, ""}, "ab2x2 ab8 as cb1 wg 8x2 int nmk", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 1536}, {4096, 4096, 1536}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 1024, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4096, 
-1, -1}, {1, 1, 1}, ""}, "ab8 ab8 ab int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {32, 8, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {256}}}, {{'C', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, -1, -1}, {1, 1, 1}, ""}, "ab16 ab32/16x2 ab ca1 wg 2x8 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {32}}}, @@ -123,775 +145,838 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'C', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 31, -1}, {1, 1, 1}, ""}, "ab8x2 as8x2 as cab1 wg 4x4 int bk1024", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {128}}}, {{'C', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16 ab32 ab ca1 wg 2x8 int ek", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {32}}}, {{'C', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "ab16 ab32 ab ca1 wg 2x8 int ek kb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 3, 2048, 0, {4, 4, 4}, {true, true, true}}, {'W', 1, {32}}}, -{{'C', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "zqp"}, "ab16/8 ab4x2 su k32 int", {8, (LoopType) 0, 
128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {12, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {384}}}, -{{'C', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qz"}, "ab8 ab4x2 su k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {12, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {1e+06}}}, +{{'C', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "zqps"}, "ab16/8 ab4x2 su k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {12, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {384}}}, +{{'C', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qsz"}, "ab8 ab4x2 su k32 int", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {12, 32, 32}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {true, true, false}}, {'W', 1, {1e+06}}}, {{'D', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "smqp"}, "ab16x3 ab16x3 ab fs sc bo acb bk2048", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 1024}, {32, 32, 16}, {4, 4, 1}, 2, (WGType) 1, 0, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'D', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "dsmqp"}, "ab16 ab16 ab fs wg 4x4 bo acb bk4096", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 
8192, 1024}, {32, 48, 16}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1536}}}, {{'D', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Dsmqp"}, "ab16 ab16 ab fs wg 8x4 bo acb bk4096", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 1024}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1536}}}, {{'D', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyn"}, "sb16 sb16 ab cab1 wg 4x4 fn nmk cs pab", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {256}}}, -{{'D', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb1x4 sb1x4 sb l4 cs di nmk fn pab", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 4}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, "xyv"}, "sb4x2 sb4x2 ab cs di wg 2x2x16 kr kb bk64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 64}, {8192, 8192, 64}, {8, 8, 8}, {2, 2, 16}, 1, (WGType) 0, 7, 0, 4096, {2, 2, 4}, {false, false, true}}, {'W', 1, {64}}}, +{{'D', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb1x4 sb1x4 sb l4 cs nmk fn pab", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 4}, {8, 2, 1}, 1, 
(WGType) 0, 1, 0, 0, {2, 2, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, "xyv"}, "sb4x2 sb4x2 ab cs wg 2x2x16 kr kb bk64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 64}, {8192, 8192, 64}, {8, 8, 8}, {2, 2, 16}, 1, (WGType) 0, 7, 0, 4096, {2, 2, 4}, {false, false, true}}, {'W', 1, {64}}}, {{'D', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xynqp"}, "sb16 sb16 ab ca2 wg 1x4 fn nmk cs pab", {8, (LoopType) 1, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 8, 32}, {1, 4, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {256}}}, -{{'D', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 su16 sb l4 cab1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 3072, 0, {2, 2, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 su16 sb l4 cab1 wg 2x8 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 3072, 0, {2, 2, 4}, {false, false, false}}, {'W', 1, {128}}}, {{'D', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyn"}, "sb16 sb16 as cab1 wg 4x4 cs pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {8, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {256}}}, 
+{{'D', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb4 sb4 sb cs cab1x2 wg 4x4 hi kc4 bm6144 bn6144 bk1536", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {6144, 6144, 1536}, {8192, 8192, 1024}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 0, 4096, 0, {128, 128, 8}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 1024, -1}, {1, 1, 1}, "xyz"}, "sb8 sb16 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, "xyz"}, "sb16 sb8 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {32}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb4 sb8 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb2x2 sb8x2 sb ca1 wg 2x8 cs sf", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {16, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, 
{-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb2 sb4 sb cab2 wg 4x4 ar sn cs tb hi bm8192 bn8192 bk1536/1024", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 4096, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb2 sb4 sb cab2 wg 4x4 ar sn cs tb hi bm8192 bn8192 bk1536/1024", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb4/2x2 sb2 sb cb1 wg 8x2 cs nmk hi bk1024", {8, (LoopType) 1, 128, {(LoopType) 145, (LoopType) 255, (LoopType) 255}, {262144, 262144, 1024}, {8192, 8192, 1024}, {16, 16, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 2048, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {512, 1024, -1}, {1, 1, 1}, "xyz"}, "sb4 sb8 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {32, 1024, -1}, {1, 1, 1}, "xyz"}, "sb16 sb8 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {4, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {16}}}, +{{'D', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb4 sb cab1 wg 4x4 k8 cs hi bk1024", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {131072, 262144, 1024}, {8192, 8192, 1024}, {8, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 1024, -1}, {1, 1, 1}, "xyz"}, "sb8 sb8 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {8, 8, 8}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb8/4 sb1 as cb1 wg 8x2 ar cs fn hi bm8192 bn8192 bk1024", {8, (LoopType) 1, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {8192, 8192, 1024}, {8192, 8192, 1024}, {8, 32, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 4096, 0, {8, 8, 8}, {false, false, true}}, {'W', 1, {256}}}, {{'D', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Dsmqp"}, "ab16 ab16 ab fs wg 8x4 bo acb bk4096", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 1024}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1536}}}, {{'D', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "dsmqp"}, "ab16 ab16 ab fs wg 4x4 bo acb bk4096", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 1024}, {32, 48, 16}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1536}}}, {{'D', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "smqp"}, "ab16x3 ab16x3 ab fs sc bo acb bk2048", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 1024}, {32, 32, 16}, {4, 4, 1}, 2, (WGType) 1, 0, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, -1, -1}, {1, 1, 1}, "p"}, "ab8 ab16 ab l4 ca1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {512}}}, -{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyz"}, "sb4 su16x2 sb l4 ca1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {64}}}, -{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {65, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16/8 as16 ab l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {512}}}, -{{'D', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sb16 sb16 ab wg 8x8 cab3 ks16 af dw vav di bo ar bk0 sn grf256 sys l4 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 1024}, {32, 32, 48}, {8, 8, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {1024}}}, -{{'D', "gemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb16 sb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "ab16 ab16 ab wg 8x8 cab3 ks16 af dw vav di bo ar bk0 grf256 sys l4 np", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 1024}, {32, 32, 48}, {8, 8, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, -1, -1}, {1, 1, 1}, "p"}, "ab8 ab16 ab l4 ca1 wg 2x8 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {512}}}, +{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyz"}, "sb4 su16x2 sb l4 ca1 wg 2x8 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {65, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab16/8 as16 ab l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {512}}}, +{{'D', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
"xyI"}, "sb16 sb16 ab wg 8x8 cab3 ks16 af dw vav bo ar bk0 sn grf256 sys l4 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 1024}, {32, 32, 48}, {8, 8, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {1024}}}, +{{'D', "gemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb16 sb l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "ab16 ab16 ab wg 8x8 cab3 ks16 af dw vav bo ar bk0 grf256 sys l4 np", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 1024}, {32, 32, 48}, {8, 8, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'D', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {32, 32, -1}, {-1, -1, -1}, {32, 32, -1}, {1, 1, 1}, "xy"}, "sb16 sb16 ab cab2 wg 2x4 cs pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 8, 32}, {2, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {2, 2, 2}, {false, false, true}}, {'W', 1, {128}}}, -{{'D', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "su8 su32 sb l4 cab1 wg 4x4 cs di hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 1, -1}, {-1, -1, 
-1}, {-1, 1, -1}, {1, 1, 1}, "xyzv"}, "su8 sb32x2 sb wg 2x1x16 kr l4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 1, 64}, {2, 1, 16}, 1, (WGType) 0, 5, 0, 2048, {2, 2, 2}, {false, false, false}}, {'W', 1, {32}}}, -{{'D', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sb32 sb16 ab wg 8x4 cab3 ks32 af dw vav di bo ar bk0 sm sn grf256 sys pab l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 1024}, {16, 32, 96}, {8, 4, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'D', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "su8 su32 sb l4 cab1 wg 4x4 cs hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "xyzv"}, "su8 sb32x2 sb wg 2x1x16 kr l4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 1024}, {8192, 8192, 1024}, {32, 1, 64}, {2, 1, 16}, 1, (WGType) 0, 5, 0, 2048, {2, 2, 2}, {false, false, false}}, {'W', 1, {32}}}, +{{'D', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sb32 sb16 ab wg 8x4 cab3 ks32 af dw vav bo ar bk0 sm sn grf256 sys pab l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 1024}, {16, 32, 96}, {8, 4, 1}, 1, (WGType) 1, 1, 49152, 0, {2, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'D', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as4/1x2 ab4/1x2 ab l4 ca1 wg 2x8 cs nmk hi bk2048", {8, (LoopType) 1, 128, {(LoopType) 145, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 1024}, {32, 32, 8}, {2, 8, 1}, 1, (WGType) 1, 1, 1024, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, -{{'D', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "ab16x2 ab16x2 ab wg 8x4 cab3 ks16 af dw vav di bo ar bk0 sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 1024}, {16, 32, 96}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'D', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "ab16x2 ab16x2 ab wg 8x4 cab3 ks16 af dw vav bo ar bk0 sn grf256 sys l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 1024}, {16, 32, 96}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'D', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "sm"}, "ab16x3 ab16x3 ab fs sc bo acb bk4096", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 4096}, {8192, 8192, 1024}, {32, 32, 32}, {4, 4, 1}, 2, (WGType) 1, 0, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'D', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "dsm"}, "ab16 ab16 ab fs wg 4x4 bo acb bk8192", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 1024}, {32, 48, 32}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {true, 
true, true}}, {'W', 1, {1536}}}, {{'D', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Dsm"}, "ab16 ab16 ab fs wg 8x4 bo acb bk8192", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 1024}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1536}}}, -{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyznqp"}, "sb8 sb32 sb l4 cab1 wg 4x4 cs di pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8x2 su16x2 sb l4 ca1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'D', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8x2 sb8x2 sb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, 
-{{'D', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyznqp"}, "sb8 sb32 sb l4 cab1 wg 4x4 cs pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8x2 su16x2 sb l4 ca1 wg 2x8 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 2048, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8x2 sb8x2 sb l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 3072, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, {{'D', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, "xyzqp"}, "su16 sb64x2 sb wg 2x1x16 l4 cs kr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 2048}, {8192, 8192, 1024}, {32, 1, 128}, {2, 1, 16}, 1, (WGType) 0, 5, 0, 2048, {1, 1, 4}, {false, false, false}}, {'W', 1, {32}}}, -{{'D', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzq"}, "sb2x2 sb4x2 sb ca1x2 wg 2x8 cs di hi bk1536/1536", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 262144, 1536}, {8192, 8192, 1536}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 1, 0, 2048, 0, {128, 64, 4}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4x2 sb2x2 ab cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {16, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {false, false, true}}, {'W', 1, {1e+06}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 1024}, {1024, -1, -1}, {1, 1, 1}, 
"xyzv"}, "sb16 sb32 sb cab1 wg 2x8 cs di bm8192 bn8192 bk2048 kb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 3, 6144, 0, {4, 4, 4}, {false, false, true}}, {'W', 1, {32}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1024, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb32 sb cab1 wg 2x8 cs di bm8192 bn8192 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {32}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb1x2 su4/2x2 sb ca1 wg 2x8 cs di hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {1048576, 131072, 2048}, {8192, 8192, 1024}, {64, 8, 8}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs di bm2048 bn2048 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {2048, 2048, 2048}, {8192, 8192, 1024}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 31, -1}, {1, 1, 1}, "xyzn"}, "sb8 su32 sb cab1 wg 4x4 cs di bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {8192, 8192, 1024}, {16, 4, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 10240, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {1, 1, 1}, "xyz"}, "sb4 sb8 sb cab1 wg 4x4 cs di bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {8192, 8192, 1024}, {32, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyznqp"}, "sb4/2x2 sb4x2 sb cs di hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 262144, 2048}, {8192, 8192, 1024}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, "xyv"}, "sb2x2 sb2x2 ab wg 2x2x16 kb kr cs di ar bk64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 64}, {8192, 8192, 64}, {8, 8, 4}, {2, 2, 16}, 1, (WGType) 0, 7, 0, 4096, {4, 4, 4}, {false, false, true}}, {'W', 1, {64}}}, -{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb4x3 sb4x3 sb cs di cab1 
wg 4x4 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 12}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzv"}, "sb16 sb32 sb cab1 wg 2x8 cs di kb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 3, 6144, 0, {4, 4, 4}, {false, false, true}}, {'W', 1, {32}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb32 sb cab1 wg 2x8 cs di", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {32}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb8 su cab1 wg 4x4 cs di hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 
255}, {262144, 262144, 2048}, {8192, 8192, 1024}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "su4/2x2 sb1x2 su cb1 wg 8x2 cs di fn hi bk2048", {8, (LoopType) 1, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {131072, 1048576, 2048}, {8192, 8192, 1024}, {8, 64, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, -{{'D', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb4 sb cab1 wg 4x4 cs di bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {8192, 8192, 1024}, {8, 32, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {1, 1, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzq"}, "sb2x2 sb4x2 sb ca1x2 wg 2x8 cs hi bk1536/1536", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 262144, 1536}, {8192, 8192, 1536}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 1, 0, 2048, 0, {128, 64, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sb4x2 sb2x2 ab cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, 
{8192, 8192, 1024}, {16, 32, 8}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 128, 4}, {false, false, true}}, {'W', 1, {1e+06}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 1024}, {1024, -1, -1}, {1, 1, 1}, "xyzv"}, "sb16 sb32 sb cab1 wg 2x8 cs bm8192 bn8192 bk2048 kb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 3, 6144, 0, {4, 4, 4}, {false, false, true}}, {'W', 1, {32}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1024, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb32 sb cab1 wg 2x8 cs bm8192 bn8192 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {32}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb1x2 su4/2x2 sb ca1 wg 2x8 cs hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {1048576, 131072, 2048}, {8192, 8192, 1024}, {64, 8, 8}, {2, 8, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs bm2048 bn2048 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {2048, 2048, 2048}, {8192, 8192, 1024}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 31, -1}, {1, 1, 1}, "xyzn"}, "sb8 su32 sb cab1 wg 4x4 cs bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 
2048}, {8192, 8192, 1024}, {16, 4, 32}, {4, 4, 1}, 1, (WGType) 1, 1, 10240, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb4 sb8 sb cab1 wg 4x4 cs bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {8192, 8192, 1024}, {32, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyznqp"}, "sb4/2x2 sb4x2 sb cs hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 262144, 2048}, {8192, 8192, 1024}, {32, 16, 8}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {128, 128, -1}, {1, 1, 1}, "xyv"}, "sb2x2 sb2x2 ab wg 2x2x16 kb kr cs ar bk64", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 64}, {8192, 8192, 64}, {8, 8, 4}, {2, 2, 16}, 1, (WGType) 0, 7, 
0, 4096, {4, 4, 4}, {false, false, true}}, {'W', 1, {64}}}, +{{'D', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "sb4x3 sb4x3 sb cs cab1 wg 4x4 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 1024}, {16, 16, 12}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb16 sb cab1 wg 4x4 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {64}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzv"}, "sb16 sb32 sb cab1 wg 2x8 cs kb", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 3, 6144, 0, {4, 4, 4}, {false, false, true}}, {'W', 1, {32}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb16 sb32 sb cab1 wg 2x8 cs", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 1536}, {8192, 8192, 1024}, {8, 4, 32}, {2, 8, 1}, 1, (WGType) 1, 1, 6144, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {32}}}, +{{'D', "gemm", {"S", "S", "S"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb8 su cab1 wg 4x4 cs hi bk2048", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 2048}, {8192, 8192, 1024}, {16, 16, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzn"}, "su4/2x2 sb1x2 su cb1 wg 8x2 cs fn hi bk2048", {8, (LoopType) 1, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {131072, 1048576, 2048}, {8192, 8192, 1024}, {8, 64, 8}, {8, 2, 1}, 1, (WGType) 1, 1, 4096, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {512}}}, +{{'D', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb8 sb4 sb cab1 wg 4x4 cs bm4096 bn4096 bk2048", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {8192, 8192, 1024}, {8, 32, 8}, {4, 4, 1}, 1, (WGType) 1, 1, 5120, 0, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, {{'E', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "smqp"}, "ab16x3 ab16x3 ab fs sc bo acb bk2048 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsmqp"}, "sB16 sB16 sb fs wg 8x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, 
{{'E', "gemm", {"B", "B", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsmqp"}, "sB16 sB16 sb fs wg 4x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 8x8 cab4x2 ks32 xaf dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {579626, 1.30873e+06, 0, 0, 0, 0, 6.16503, 6.88343, 4.54054, 13.1904, 0.0528026, 0.0528026, 0, 1, 1.21396, 1.2014, 5.22092e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.13381e+06, 546572, 120155, 128063, 0, 0, 6.3128, 5.12909, 2.74316, 9.09192, 0.0675974, 0.111561, 0.0339879, 0.995238, 1.214, 1.20175, 2.29193e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 aB wg 2x16 cab4 ks64 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, 
{1.13494e+06, 565723, 0, 0, 0, 0, 6.58651, 4.63582, 2.77482, 9.77739, 0.0825648, 0.0364345, 0.0773913, 0.987187, 1.21508, 1.20179, 4.02242e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x4 kr cab4 ks32 af dw vav di bo bk0 sn sys l4 sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22911e+06, 773838, 50096.1, 48822.3, 0, 0, 6.20328, 5.09858, 3.43021, 11.3071, 0.103231, 0.141451, 0.0232521, 1, 1.21322, 1.20202, -3.99631e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 aB wg 4x8 cab4 ks64 af dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17572e+06, 607273, 0, 0, 0, 0, 6.45464, 5.31845, 3.15766, 10.158, 0.132027, 0.0429529, 0.158416, 0.988589, 1.21634, 1.20182, 5.20966e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.1289e+06, 539201, 101500, 109853, 0, 0, 6.29672, 5.16729, 3.52331, 11.4759, 0.112275, 0.0279401, 0.126749, 1, 1.21183, 1.2015, 2.69946e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {4, 4, 1}, "I"}, "aB16 aB64 aB wg 2x16 cab4x2 ks64 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 32768, 16777216}, {8192, 8192, 16777216}, {8, 2, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12082e+06, 548504, 0, 0, 0, 0, 8.01234, 4.39347, 4.40929, 17.0427, 0.211072, 0.304088, 0.114402, 0.99056, 1.22213, 1.20282, -6.02264e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 4x4x2 kr cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08756e+06, 415262, 180278, 297725, 0, 0, 9.94822, 7.29012, 5.54688, 14.1945, 0.0334137, 0.00595066, 0.0429716, 0.949768, 1.29493, 1.19083, 5.7545e-13}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.4135e+06, 253984, 0, 0, 0, 0, 10.236, 15.0824, 3.09469, 9.93698, 0.171264, 0.0287612, 0.217312, 1, 1.20862, 1.20257, -4.41257e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16x2 aB wg 2x2x4 kr cb4 ks16 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 2, 4}, 1, (WGType) 1, 261, 
2048, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.50656e+06, 288717, 59976.4, 18666.8, 0, 0, 13.9745, 17.2273, 2.34663, 2.40522, 0.247882, 0.118927, 0.409146, 0.910436, 1.20845, 1.20215, -2.62774e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x4x2 kr cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 4, 2}, 1, (WGType) 1, 261, 20480, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45489e+06, 278946, 176072, 54942.6, 0, 0, 9.31537, 5.56658, 3.88467, 12.3081, 0.135467, 0.0387979, 0.120639, 0.678997, 1.20712, 1.20311, -5.84273e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 2x1x16 kr af vav di bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {32, 1, 32}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {973927, 416501, 13164.2, 11370.8, 0, 0, 5.87904, 29.1737, 30.2783, 139.342, 4.6982, 0.134066, 0.752167, 0.5, 1.20337, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x1x32 kr af vav di bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {987579, 567586, 10256.4, 8671.09, 0, 0, 8.07511, 22.9319, 75.9137, 484.32, 4.37867, 3.86832, 0.0693596, 0.371804, 3.32803, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, 
"aB16 aB16 aB wg 2x1x32 kr af vav di bk0 sys l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {985434, 567175, 10319.9, 8706.7, 0, 0, 8.10527, 23.0294, 76.4585, 483.706, 4.32676, 3.62297, 0.219537, 0.406914, 3.35396, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse di bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34416e+06, 356498, 0, 0, 0, 0, 12.152, 9.68292, 6.37855, 17.5729, 0.129656, 0.12344, 0.0165283, 0.852645, 1.1782, 1.02886, 6.51238e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 8x8 cab4x2 ks32 xaf dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {579626, 1.30873e+06, 0, 0, 0, 0, 6.16503, 6.88343, 4.54054, 13.1904, 0.0528026, 0.0528026, 0, 1, 1.21396, 1.2014, 5.22092e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.13381e+06, 546572, 120155, 128063, 0, 0, 
6.3128, 5.12909, 2.74316, 9.09192, 0.0675974, 0.111561, 0.0339879, 0.995238, 1.214, 1.20175, 2.29193e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 aB wg 2x16 cab4 ks64 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13494e+06, 565723, 0, 0, 0, 0, 6.58651, 4.63582, 2.77482, 9.77739, 0.0825648, 0.0364345, 0.0773913, 0.987187, 1.21508, 1.20179, 4.02242e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x4 kr cab4 ks32 af dw vav bo bk0 sn sys l4 sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22911e+06, 773838, 50096.1, 48822.3, 0, 0, 6.20328, 5.09858, 3.43021, 11.3071, 0.103231, 0.141451, 0.0232521, 1, 1.21322, 1.20202, -3.99631e-16}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 aB wg 4x8 cab4 ks64 af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17572e+06, 607273, 0, 0, 0, 0, 6.45464, 5.31845, 3.15766, 10.158, 0.132027, 0.0429529, 0.158416, 0.988589, 1.21634, 1.20182, 5.20966e-16}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8x2 kr cab4 ks32 
af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.1289e+06, 539201, 101500, 109853, 0, 0, 6.29672, 5.16729, 3.52331, 11.4759, 0.112275, 0.0279401, 0.126749, 1, 1.21183, 1.2015, 2.69946e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB64 aB wg 2x16 cab4x2 ks64 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 32768, 16777216}, {8192, 8192, 16777216}, {8, 2, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12082e+06, 548504, 0, 0, 0, 0, 8.01234, 4.39347, 4.40929, 17.0427, 0.211072, 0.304088, 0.114402, 0.99056, 1.22213, 1.20282, -6.02264e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 4x4x2 kr cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08756e+06, 415262, 180278, 297725, 0, 0, 9.94822, 7.29012, 5.54688, 14.1945, 0.0334137, 0.00595066, 0.0429716, 0.949768, 1.29493, 1.19083, 5.7545e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, 
{1.4135e+06, 253984, 0, 0, 0, 0, 10.236, 15.0824, 3.09469, 9.93698, 0.171264, 0.0287612, 0.217312, 1, 1.20862, 1.20257, -4.41257e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16x2 aB wg 2x2x4 kr cb4 ks16 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 2, 4}, 1, (WGType) 1, 261, 2048, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.50656e+06, 288717, 59976.4, 18666.8, 0, 0, 13.9745, 17.2273, 2.34663, 2.40522, 0.247882, 0.118927, 0.409146, 0.910436, 1.20845, 1.20215, -2.62774e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x4x2 kr cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 4, 2}, 1, (WGType) 1, 261, 20480, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45489e+06, 278946, 176072, 54942.6, 0, 0, 9.31537, 5.56658, 3.88467, 12.3081, 0.135467, 0.0387979, 0.120639, 0.678997, 1.20712, 1.20311, -5.84273e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 2x1x16 kr af vav bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {32, 1, 32}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {973927, 416501, 13164.2, 11370.8, 0, 0, 5.87904, 29.1737, 30.2783, 139.342, 4.6982, 0.134066, 0.752167, 0.5, 1.20337, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x1x32 kr 
af vav bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {987579, 567586, 10256.4, 8671.09, 0, 0, 8.07511, 22.9319, 75.9137, 484.32, 4.37867, 3.86832, 0.0693596, 0.371804, 3.32803, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 2x1x32 kr af vav bk0 sys l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {985434, 567175, 10319.9, 8706.7, 0, 0, 8.10527, 23.0294, 76.4585, 483.706, 4.32676, 3.62297, 0.219537, 0.406914, 3.35396, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34416e+06, 356498, 0, 0, 0, 0, 12.152, 9.68292, 6.37855, 17.5729, 0.129656, 0.12344, 0.0165283, 0.852645, 1.1782, 1.02886, 6.51238e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {3072, -1, -1}, {-1, 16, -1}, {3072, -1, -1}, {-1, 16, -1}, {4, 4, 1}, "xy"}, "sB2x2 sB2x2 aB wg 2x4x4 akr fg 0.1875 kc2 cab4x2 ks4 nse sr bm0 bk0 sn pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {2, 4, 4}, 1, (WGType) 1, 197381, 9216, 9216, {4, 4, 4}, {false, false, true}}, {'E', 17, {971705, 340654, 92160.5, 101990, 0, 0, 5.92923, 26.228, 2.5963, 9.84795, 0.206227, 0.121448, 0.0558817, 
0.0676342, 1.29184, 1.05904, 5.1067e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 8x4 cab4x2 ks16 xaf st dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.08792e+06, 567270, 0, 0, 0, 0, 5.72096, 5.41042, 6.54953, 18.1672, 0.017198, 0.00716878, 0.0131277, 0.81547, 1.46216, 1.16767, 2.15216e-12}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB16 sB16 aB wg 4x8 cab3x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.12984e+06, 584334, 0, 0, 0, 0, 5.52191, 5.62337, 6.60649, 17.5355, 0.0194272, 0.00902898, 0.0139238, 0.977022, 1.42548, 1.17566, 1.22702e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB16 sB16 aB wg 4x8 cab3x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.12984e+06, 584334, 0, 0, 0, 0, 5.52191, 5.62337, 6.60649, 17.5355, 0.0194272, 0.00902898, 0.0139238, 0.977022, 1.42548, 1.17566, 1.22702e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab4x2 ks32 xaf dw vav bo sr bk0 
dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.08172e+06, 516945, 0, 0, 0, 0, 5.56094, 5.32487, 6.43486, 16.8147, 0.0232929, 0.0282432, 0.0105684, 0.710233, 1.39477, 1.17707, 1.37161e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpxy"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {951775, 167661, 97516.6, 346695, 0, 0, 5.67445, 5.93222, 5.64271, 14.6548, 0.0227123, 0.0115144, 0.0220849, 0.96727, 1.42164, 1.17598, 1.83708e-12}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568975, 1.35348e+06, 0, 0, 0, 0, 5.67121, 5.37584, 6.51139, 18.2712, 0.0196859, 0.0196859, 0, 1, 1.35089, 1.18161, 7.86876e-13}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16x2 sB16x2 aB wg 4x8 cab4 ks16 af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 
17, {1.04074e+06, 498949, 0, 0, 0, 0, 5.38936, 5.23734, 5.98628, 16.2827, 0.0274674, 0.010762, 0.0237025, 0.945984, 1.24132, 1.19515, 1.88597e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568975, 1.35348e+06, 0, 0, 0, 0, 5.67121, 5.37584, 6.51139, 18.2712, 0.0196859, 0.0196859, 0, 1, 1.35089, 1.18161, 7.86876e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16x2 sB16x2 aB wg 4x8 cab4 ks16 af dw vav bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.04074e+06, 498949, 0, 0, 0, 0, 5.38936, 5.23734, 5.98628, 16.2827, 0.0274674, 0.010762, 0.0237025, 0.945984, 1.24132, 1.19515, 1.88597e-13}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {64, 64, 64}, {-1, -1, -1}, {64, 64, 64}, {1, 1, 1}, "V"}, "aB8 aB8 aB wg 4x8 kc8 cab4 ks8 nse bo sr bk0 sn l4 dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x4x2 kr cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 
2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0639e+06, 312359, 108225, 317555, 0, 0, 6.87331, 10.2335, 5.5851, 14.778, 0.0424248, 0.0190285, 0.0359952, 0.964204, 1.21752, 1.1995, 8.43029e-14}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 11.0509, 6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "qpI"}, "aB16x2 aB16x2 aB wg 4x4x4 kr cab4 ks16 af dw vav di bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07651e+06, 700330, 90979.3, 100713, 0, 0, 6.47642, 9.87486, 3.09175, 10.7037, 0.0709771, 0.0751283, 0.0190257, 1, 1.2051, 1.20185, 2.15295e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {548489, 1.09288e+06, 0, 0, 0, 0, 8.2211, 16.0374, 5.20066, 13.9736, 0.103914, 0.103914, 0, 0.941962, 1.20843, 1.20104, 
5.39326e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x4 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 32768, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.38131e+06, 366456, 82852.3, 21186.1, 0, 0, 6.64448, 9.91401, 4.10931, 12.3642, 0.130607, 0.0865804, 0.0693461, 0.733554, 1.20619, 1.20357, -9.06876e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {563029, 1.05131e+06, 0, 0, 0, 0, 14.5376, 15.5776, 3.59843, 13.665, 0.178851, 0.178851, 0, 1, 1.20856, 1.20159, 4.80437e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 261, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17272e+06, 810649, 39929, 35554.5, 0, 0, 6.81841, 10.181, 8.36637, 18.728, 0.14854, -0.00111841, 0.1893, 0.658952, 1.20602, 1.20229, -2.64245e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x4 cab4x2 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 
255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26456e+06, 286183, 0, 0, 0, 0, 5.81633, 8.57816, 4.66116, 13.8076, 0.137423, 0.0613044, 0.0898491, 0.892134, 1.20738, 1.20318, -9.09531e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26179e+06, 282811, 0, 0, 0, 0, 5.93672, 11.9594, 3.04212, 9.63196, 0.237524, 0.1208, 0.174179, 0.963212, 1.20937, 1.20459, -2.01852e-14}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav di bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 263, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17459e+06, 810717, 40417.6, 35592.5, 0, 0, 6.84312, 10.1734, 8.3327, 18.7157, 0.14854, 0.0581618, 0.129903, 0.809611, 1.20546, 1.20576, -2.14223e-14}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {573184, 1.12993e+06, 0, 0, 0, 0, 10.2158, 17.4327, 5.79394, 15.404, 0.0613861, 0.0613861, 0, 1, 1.20766, 1.20072, 1.08665e-14}}}, 
-{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8x2 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {4, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00723e+06, 612324, 154076, 132849, 0, 0, 10.3295, 30.9647, 2.10267, 7.80307, 0.209647, 0.110383, 0.122486, 0.861777, 1.20961, 1.20151, -1.19174e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x2 kr cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05502e+06, 480976, 121442, 126775, 0, 0, 16.2326, 9.45215, 3.4583, 14.1256, 0.129892, 0.0648441, 0.140379, 1, 1.20694, 1.20234, -2.45794e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse di bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22155e+06, 318136, 0, 0, 0, 0, 11.9818, 10.4458, 6.43557, 17.6607, 0.129335, 0.122263, 0.016479, 0.835522, 1.17418, 1.03038, 5.72485e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x4x2 kr cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, 
{262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0639e+06, 312359, 108225, 317555, 0, 0, 6.87331, 10.2335, 5.5851, 14.778, 0.0424248, 0.0190285, 0.0359952, 0.964204, 1.21752, 1.1995, 8.43029e-14}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 11.0509, 6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "qpI"}, "aB16x2 aB16x2 aB wg 4x4x4 kr cab4 ks16 af dw vav bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07651e+06, 700330, 90979.3, 100713, 0, 0, 6.47642, 9.87486, 3.09175, 10.7037, 0.0709771, 0.0751283, 0.0190257, 1, 1.2051, 1.20185, 2.15295e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {548489, 1.09288e+06, 0, 0, 0, 0, 8.2211, 16.0374, 5.20066, 13.9736, 0.103914, 0.103914, 0, 0.941962, 1.20843, 1.20104, 5.39326e-15}}}, 
+{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x4 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 32768, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.38131e+06, 366456, 82852.3, 21186.1, 0, 0, 6.64448, 9.91401, 4.10931, 12.3642, 0.130607, 0.0865804, 0.0693461, 0.733554, 1.20619, 1.20357, -9.06876e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {563029, 1.05131e+06, 0, 0, 0, 0, 14.5376, 15.5776, 3.59843, 13.665, 0.178851, 0.178851, 0, 1, 1.20856, 1.20159, 4.80437e-16}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 261, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17272e+06, 810649, 39929, 35554.5, 0, 0, 6.81841, 10.181, 8.36637, 18.728, 0.14854, -0.00111841, 0.1893, 0.658952, 1.20602, 1.20229, -2.64245e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x4 cab4x2 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 
16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26456e+06, 286183, 0, 0, 0, 0, 5.81633, 8.57816, 4.66116, 13.8076, 0.137423, 0.0613044, 0.0898491, 0.892134, 1.20738, 1.20318, -9.09531e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26179e+06, 282811, 0, 0, 0, 0, 5.93672, 11.9594, 3.04212, 9.63196, 0.237524, 0.1208, 0.174179, 0.963212, 1.20937, 1.20459, -2.01852e-14}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 263, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17459e+06, 810717, 40417.6, 35592.5, 0, 0, 6.84312, 10.1734, 8.3327, 18.7157, 0.14854, 0.0581618, 0.129903, 0.809611, 1.20546, 1.20576, -2.14223e-14}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {573184, 1.12993e+06, 0, 0, 0, 0, 10.2158, 17.4327, 5.79394, 15.404, 0.0613861, 0.0613861, 0, 1, 1.20766, 1.20072, 1.08665e-14}}}, +{{'E', "gemm", {"B", "B", "S"}, 
{"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8x2 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {4, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00723e+06, 612324, 154076, 132849, 0, 0, 10.3295, 30.9647, 2.10267, 7.80307, 0.209647, 0.110383, 0.122486, 0.861777, 1.20961, 1.20151, -1.19174e-16}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x2 kr cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05502e+06, 480976, 121442, 126775, 0, 0, 16.2326, 9.45215, 3.4583, 14.1256, 0.129892, 0.0648441, 0.140379, 1, 1.20694, 1.20234, -2.45794e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22155e+06, 318136, 0, 0, 0, 0, 11.9818, 10.4458, 6.43557, 17.6607, 0.129335, 0.122263, 0.016479, 0.835522, 1.17418, 1.03038, 5.72485e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab4x2 ks16 xaf dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 
16777216}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {982395, 479263, 0, 0, 0, 0, 6.10212, 6.1884, 6.59917, 17.8536, 0.0214419, 0.00652312, 0.0168029, 0.617745, 1.33872, 1.18408, 8.9311e-13}}}, {{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {993932, 437006, 0, 0, 0, 0, 5.67036, 6.48396, 6.52602, 16.8605, 0.0283748, -0.00314213, 0.0387446, 0.666054, 1.28983, 1.18846, 4.68723e-13}}}, -{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {581508, 1.13635e+06, 0, 0, 0, 0, 6.25209, 6.61747, 6.57388, 18.2853, 0.0236389, 0.0236389, 0, 0.993208, 1.26513, 1.19125, 2.88352e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {581508, 1.13635e+06, 0, 0, 0, 0, 6.25209, 6.61747, 6.57388, 18.2853, 0.0236389, 0.0236389, 0, 0.993208, 1.26513, 1.19125, 2.88352e-13}}}, {{'E', "gemm", {"B", 
"B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {981474, 437790, 0, 0, 0, 0, 6.24369, 6.37532, 6.49232, 16.9191, 0.0283921, 0.0130048, 0.0211151, 0.927229, 1.298, 1.18778, 5.89935e-13}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.09148e+06, 534838, 0, 0, 0, 0, 5.22674, 5.22482, 6.50064, 16.4865, 0.0218546, 0.00768442, 0.0199595, 0.840879, 1.41374, 1.18184, 1.21865e-12}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.0738e+06, 487395, 0, 0, 0, 0, 4.60875, 4.65483, 5.61489, 14.464, 0.0516162, 0.0102235, 0.0545566, 0.938533, 1.22067, 1.20222, 3.04423e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4x2 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 
2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01536e+06, 414965, 119042, 135320, 0, 0, 5.19959, 5.46422, 2.88908, 9.79435, 0.0641788, 0.0186409, 0.0711063, 1, 1.21239, 1.20226, -1.69857e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 8x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {566031, 1.06674e+06, 0, 0, 0, 0, 6.12801, 5.51734, 3.1656, 13.6348, 0.0846202, 0.0846202, 0, 1, 1.21348, 1.20076, 2.80227e-14}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08813e+06, 495379, 47055.6, 54115.6, 0, 0, 5.21195, 5.22722, 4.82032, 12.6476, 0.0791895, 0.0740791, 0.0454014, 1, 1.20835, 1.20258, -4.00248e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.10031e+06, 465178, 0, 0, 0, 0, 4.66255, 5.56717, 3.95779, 12.18, 0.0860145, 0.0695545, 0.0336857, 
1, 1.21111, 1.20136, 5.42319e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03324e+06, 427799, 95570.9, 111482, 0, 0, 4.55809, 4.55425, 3.71723, 13.5832, 0.0957927, 0.0609845, 0.068625, 1, 1.22017, 1.20222, -1.64858e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.33233e+06, 224556, 0, 0, 0, 0, 4.71443, 4.70675, 3.65673, 10.5957, 0.105005, 0.0590905, 0.0614575, 1, 1.21201, 1.20236, -1.02685e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "sB32 sB32 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08885e+06, 504080, 42551.8, 41211.4, 0, 0, 5.20644, 5.35098, 6.20633, 21.1454, 0.123602, -0.0149575, 0.229245, 0.801592, 1.21004, 1.2021, -1.72813e-17}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn 
grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08668e+06, 501544, 0, 0, 0, 0, 5.24563, 5.63352, 6.20576, 15.4182, 0.0341144, 0.0170101, 0.0263965, 1, 1.29682, 1.18968, 3.46159e-13}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.28342e+06, 207114, 0, 0, 0, 0, 5.20558, 6.84739, 3.90187, 10.8408, 0.114743, 0.00818414, 0.107703, 0.865881, 1.21171, 1.20195, 3.61977e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "pI"}, "aS16 aB16 aB wg 2x4x2 kr cb4 ks64 af dw vav di bo bk0 sn grf256 sys l4 sr dm kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 4, 2}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19793e+06, 500273, 115963, 112594, 
0, 0, 10.1537, 5.40221, 6.58163, 20.9665, 0.132062, 0.0725799, 0.0889674, 0.68881, 1.21039, 1.20113, 2.7482e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.2525e+06, 130075, 26693.4, 3242.11, 0, 0, 6.67838, 6.83444, 9.45633, 9.46946, 0.911558, 0.265361, 0.494508, 0.553686, 1.20562, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aS16x2 aB16 aB wg 2x1x32 kr af vav di bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0401e+06, 594545, 10211.6, 8497.89, 0, 0, 6.80094, 7.17184, 23.6622, 26.6115, 0.965044, 0.461764, 0.520482, 0.787753, 1.20703, 0, 0}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB4 aB4 aB wg 4x8 kc4 cab4 ks8 nse di bo sr bk0 sm sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45772e+06, 391624, 0, 0, 0, 0, 14.2935, 9.26312, 6.37711, 17.5216, 0.129265, 0.120659, 0.0220404, 0.831546, 1.19531, 1.02838, 6.53531e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, 
(LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.09148e+06, 534838, 0, 0, 0, 0, 5.22674, 5.22482, 6.50064, 16.4865, 0.0218546, 0.00768442, 0.0199595, 0.840879, 1.41374, 1.18184, 1.21865e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.0738e+06, 487395, 0, 0, 0, 0, 4.60875, 4.65483, 5.61489, 14.464, 0.0516162, 0.0102235, 0.0545566, 0.938533, 1.22067, 1.20222, 3.04423e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4x2 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01536e+06, 414965, 119042, 135320, 0, 0, 5.19959, 5.46422, 2.88908, 9.79435, 0.0641788, 0.0186409, 0.0711063, 1, 1.21239, 1.20226, -1.69857e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 8x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {566031, 1.06674e+06, 0, 
0, 0, 0, 6.12801, 5.51734, 3.1656, 13.6348, 0.0846202, 0.0846202, 0, 1, 1.21348, 1.20076, 2.80227e-14}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08813e+06, 495379, 47055.6, 54115.6, 0, 0, 5.21195, 5.22722, 4.82032, 12.6476, 0.0791895, 0.0740791, 0.0454014, 1, 1.20835, 1.20258, -4.00248e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.10031e+06, 465178, 0, 0, 0, 0, 4.66255, 5.56717, 3.95779, 12.18, 0.0860145, 0.0695545, 0.0336857, 1, 1.21111, 1.20136, 5.42319e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03324e+06, 427799, 95570.9, 111482, 0, 0, 4.55809, 4.55425, 3.71723, 13.5832, 0.0957927, 0.0609845, 0.068625, 1, 1.22017, 1.20222, -1.64858e-16}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 
sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.33233e+06, 224556, 0, 0, 0, 0, 4.71443, 4.70675, 3.65673, 10.5957, 0.105005, 0.0590905, 0.0614575, 1, 1.21201, 1.20236, -1.02685e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vIxy"}, "sB32 sB32 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08885e+06, 504080, 42551.8, 41211.4, 0, 0, 5.20644, 5.35098, 6.20633, 21.1454, 0.123602, -0.0149575, 0.229245, 0.801592, 1.21004, 1.2021, -1.72813e-17}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08668e+06, 501544, 0, 0, 0, 0, 5.24563, 5.63352, 6.20576, 15.4182, 0.0341144, 0.0170101, 0.0263965, 1, 1.29682, 1.18968, 3.46159e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 
0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.28342e+06, 207114, 0, 0, 0, 0, 5.20558, 6.84739, 3.90187, 10.8408, 0.114743, 0.00818414, 0.107703, 0.865881, 1.21171, 1.20195, 3.61977e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "pI"}, "aS16 aB16 aB wg 2x4x2 kr cb4 ks64 af dw vav bo bk0 sn grf256 sys l4 sr dm kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 4, 2}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19793e+06, 500273, 115963, 112594, 0, 0, 10.1537, 5.40221, 6.58163, 20.9665, 0.132062, 0.0725799, 0.0889674, 0.68881, 1.21039, 1.20113, 2.7482e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.2525e+06, 130075, 26693.4, 3242.11, 0, 0, 6.67838, 6.83444, 9.45633, 9.46946, 0.911558, 0.265361, 0.494508, 0.553686, 1.20562, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 
1, 1}, "vI"}, "aS16x2 aB16 aB wg 2x1x32 kr af vav bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0401e+06, 594545, 10211.6, 8497.89, 0, 0, 6.80094, 7.17184, 23.6622, 26.6115, 0.965044, 0.461764, 0.520482, 0.787753, 1.20703, 0, 0}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB4 aB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45772e+06, 391624, 0, 0, 0, 0, 14.2935, 9.26312, 6.37711, 17.5216, 0.129265, 0.120659, 0.0220404, 0.831546, 1.19531, 1.02838, 6.53531e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {1024, -1, 4096}, {-1, 4, -1}, {-1, -1, 4096}, {-1, 8, -1}, {64, 4, 1}, "Inxy"}, "sB64 sS16 aS wg 2x1x32 ikr af vav sr bk0 bm0 sys pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {2, 1, 32}, 1, (WGType) 0, 4357, 0, 512, {64, 4, 4}, {false, false, true}}, {'E', 17, {837495, 318944, 13236, 12430, 0, 0, 5.7248, 7.42498, 17.2818, 78.3022, 0.407807, 0.385393, 0.018969, 0.514679, 1.20665, 1.20131, -5.91244e-16}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "Inpqxy"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.11178e+06, 
622060, 0, 0, 0, 0, 5.62948, 5.39308, 6.34783, 18.0651, 0.0188949, 0.00722523, 0.0160939, 0.792179, 1.4111, 1.16673, 1.76659e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "Inpqsxy"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.11178e+06, 622060, 0, 0, 0, 0, 5.62948, 5.39308, 6.34783, 18.0651, 0.0188949, 0.00722523, 0.0160939, 0.792179, 1.4111, 1.16673, 1.76659e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 4, 1}, "xyI"}, "sS16x2 sB16 aB wg 16x2 cb4x2 ks16 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {16, 4, 4}, {false, false, true}}, {'E', 17, {1.07596e+06, 547281, 0, 0, 0, 0, 5.3341, 5.22645, 6.47481, 17.5871, 0.0187721, 0.00841977, 0.0135396, 0.854887, 1.37769, 1.17708, 1.45209e-12}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB32 sB16 aB wg 8x4 cab3 ks32 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05966e+06, 526716, 0, 0, 0, 0, 5.25431, 5.59994, 6.71912, 17.4322, 0.0194231, 0.00993451, 0.0135184, 0.980104, 1.41981, 1.17834, 1.31956e-12}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {596002, 1.33567e+06, 0, 0, 0, 0, 5.53572, 5.47044, 6.55621, 18.2618, 0.0211424, 0.0211424, 0, 1, 1.31489, 1.18381, 6.85524e-13}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB32 sB16 aB wg 8x4 cab3 ks32 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05966e+06, 526716, 0, 0, 0, 0, 5.25431, 5.59994, 6.71912, 17.4322, 0.0194231, 0.00993451, 0.0135184, 0.980104, 1.41981, 1.17834, 1.31956e-12}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {596002, 1.33567e+06, 0, 0, 0, 0, 5.53572, 5.47044, 6.55621, 18.2618, 0.0211424, 0.0211424, 0, 1, 1.31489, 1.18381, 6.85524e-13}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf st dw vav bo sr bk0 dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 
1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06751e+06, 204301, 68125.3, 363183, 0, 0, 5.23091, 5.8341, 5.50916, 14.4411, 0.0246809, 0.0146692, 0.0199046, 0.894464, 1.4112, 1.17796, 1.27893e-12}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ip"}, "aB32 aB16 aB wg 2x4x4 kr ca3 ks64 af dw vav bo sr bk0 sm dm sys grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14066e+06, 578017, 75917, 84236.6, 0, 0, 6.02279, 6.09029, 4.49028, 11.4256, 0.0521067, 0.0432382, 0.0332794, 0.97984, 1.21403, 1.20108, 7.3813e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.31167e+06, 785275, 0, 0, 0, 0, 7.11381, 8.75643, 6.11098, 15.9972, 0.0503546, 0.0303966, 0.0484271, 0.842682, 1.20649, 1.2023, -1.8357e-15}}}, -{{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.35147e+06, 357789, 0, 0, 0, 0, 11.5708, 11.8958, 6.40012, 17.218, 0.14396, 0.139657, 0.0130437, 0.882761, 1.16324, 1.0488, 3.12079e-12}}}, -{{'E', "gemm", {"B", "F", "S"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, -{{'E', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, -{{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, 
(WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.31167e+06, 785275, 0, 0, 0, 0, 7.11381, 8.75643, 6.11098, 15.9972, 0.0503546, 0.0303966, 0.0484271, 0.842682, 1.20649, 1.2023, -1.8357e-15}}}, +{{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.35147e+06, 357789, 0, 0, 0, 0, 11.5708, 11.8958, 6.40012, 17.218, 0.14396, 0.139657, 0.0130437, 0.882761, 1.16324, 1.0488, 3.12079e-12}}}, +{{'E', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
""}, "aB4x2 aB4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 8}, {true, true, true}}, {'W', 1, {64}}}, +{{'E', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, +{{'E', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, +{{'E', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aS4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 256}, {8192, 8192, 256}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, +{{'E', "gemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, +{{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, 
(WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, {{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, -{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, +{{'E', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 
255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, {{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'E', "gemm", {"F", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyzI"}, "sB32 sB32 sB wg 2x1x16 akr fg 0.25 nse sr sb32 bk0 bm0 pab sys", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 512, {8, 4, 4}, 
{false, false, false}}, {'W', 1, {64}}}, -{{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, +{{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, {{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS8 sB wg 2x1x16 akr fg 0.5 kc4 nse sr sb32 bk0 bm0 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 525061, 0, 2048, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, 
{false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, -{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, -{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, -{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05172e+06, 557389, 0, 0, 0, 0, 2.23646, 5.1076, 6.24833, 17.6243, 0.0206356, 0.0106181, 0.0095341, 0.748815, 1.31282, 1.18539, 7.88099e-13}}}, +{{'E', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, +{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, +{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 
5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, {{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {4, 4, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, +{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05172e+06, 557389, 0, 0, 0, 0, 
2.23646, 5.1076, 6.24833, 17.6243, 0.0206356, 0.0106181, 0.0095341, 0.748815, 1.31282, 1.18539, 7.88099e-13}}}, +{{'E', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"F", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, {{'E', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "smqp"}, "ab16x3 ab16x3 ab fs sc bo acb bk2048 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 2048}, {8192, 8192, 2048}, {32, 32, 16}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"H", "H", "S"}, 
{"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsmqp"}, "sB16 sB16 sb fs wg 8x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {8, 4, 1}, 1, (WGType) 1, 0, 61440, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"H", "H", "S"}, {"A2#8,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsmqp"}, "sB16 sB16 sb fs wg 4x4 bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 4096}, {8192, 8192, 4096}, {32, 48, 16}, {4, 4, 1}, 1, (WGType) 1, 0, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, -{{'E', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8/4 aB8 aB wg 8x4 kc8 cab4 ks8 nse di bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 8x8 cab4x2 ks32 xaf dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {577632, 1.31042e+06, 0, 0, 0, 0, 6.15959, 6.87283, 4.53581, 13.1549, 0.0527369, 0.0527369, 0, 1, 1.2126, 1.20117, 9.25713e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 sn grf256 sys l4 sr 
dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.12932e+06, 544915, 123721, 128797, 0, 0, 6.30189, 5.1396, 2.74585, 9.06516, 0.067585, 0.0230792, 0.085076, 0.989844, 1.26214, 1.19529, 2.62781e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 aB wg 2x16 cab4 ks64 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14032e+06, 564642, 0, 0, 0, 0, 6.56444, 4.63397, 2.77135, 9.77858, 0.0825907, 0.035527, 0.0779663, 0.969555, 1.21564, 1.20197, 9.03249e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x4 kr cab4 ks32 af dw vav di bo bk0 sn sys l4 sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.21829e+06, 775644, 52726.6, 48444.6, 0, 0, 6.17887, 5.08004, 3.42121, 11.3469, 0.103226, 0.115399, 0.0351692, 1, 1.24138, 1.19941, 1.1955e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 aB wg 4x8 cab4 ks64 af dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.16856e+06, 607673, 0, 0, 0, 0, 
6.45483, 5.30858, 3.13856, 10.1588, 0.13211, -0.0608739, 0.253501, 0.955862, 1.21927, 1.20201, -3.92067e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.13297e+06, 539423, 98100.1, 109567, 0, 0, 6.29758, 5.15781, 3.48286, 11.4885, 0.111863, 0.047075, 0.101661, 1, 1.21423, 1.20138, 2.63863e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB64 aB wg 2x16 cab4x2 ks64 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 32768, 16777216}, {8192, 8192, 16777216}, {8, 2, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.11663e+06, 549008, 0, 0, 0, 0, 7.99056, 4.41464, 4.40063, 17.071, 0.210664, 0.0956988, 0.192101, 1, 1.22479, 1.20258, -6.10193e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 4x4x2 kr cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06669e+06, 417697, 188659, 296717, 0, 0, 9.98714, 7.29662, 5.5625, 14.2025, 0.0333489, 0.0231727, 0.0262804, 0.997553, 1.35082, 1.18563, 1.89826e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 
aB16 aB wg 2x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.41774e+06, 253424, 0, 0, 0, 0, 10.239, 15.1219, 3.1142, 9.95356, 0.171272, 0.0208162, 0.21871, 0.99885, 1.21298, 1.20221, -2.86036e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16x2 aB wg 2x2x4 kr cb4 ks16 af dw vav di bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 2, 4}, 1, (WGType) 1, 261, 2048, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.49998e+06, 289148, 60010.6, 18632.6, 0, 0, 13.9528, 17.1172, 2.34028, 2.34028, 0.247876, 0.154424, 0.382193, 0.91448, 1.20938, 1.20173, 6.46366e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x4x2 kr cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 4, 2}, 1, (WGType) 1, 261, 20480, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46359e+06, 278844, 172499, 54829, 0, 0, 9.30779, 5.55711, 3.90031, 12.3179, 0.135468, 0.0309154, 0.125808, 0.588163, 1.2112, 1.20207, -2.05782e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 2x1x16 kr af vav di bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {32, 1, 32}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, 
{'E', 17, {968115, 416537, 13403.9, 11376.9, 0, 0, 5.84714, 29.1929, 30.0901, 139.255, 4.71428, 0.110628, 0.752025, 0.5, 1.20255, 0, 0}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x1x32 kr af vav di bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {990414, 567348, 10101, 8687.74, 0, 0, 8.04883, 22.9903, 77.7377, 483.846, 4.54742, 3.65227, 0.142562, 0.349467, 3.2483, 0, 0}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 2x1x32 kr af vav di bk0 sys l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {983163, 567602, 10327.5, 8699.11, 0, 0, 8.06398, 23.0086, 75.5768, 483.562, 4.49549, 3.68522, 0.162808, 0.357516, 3.27235, 0, 0}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse di bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34934e+06, 356229, 0, 0, 0, 0, 12.1849, 9.67542, 6.39456, 17.6021, 0.130818, 0.124693, 0.0163365, 0.855674, 1.1991, 1.01518, 9.01296e-12}}}, +{{'E', "gemm", {"H", "H", "H"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8/4 aB8 aB wg 8x4 kc8 cab4 ks8 nse bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, 
(LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 8x8 cab4x2 ks32 xaf dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {577632, 1.31042e+06, 0, 0, 0, 0, 6.15959, 6.87283, 4.53581, 13.1549, 0.0527369, 0.0527369, 0, 1, 1.2126, 1.20117, 9.25713e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.12932e+06, 544915, 123721, 128797, 0, 0, 6.30189, 5.1396, 2.74585, 9.06516, 0.067585, 0.0230792, 0.085076, 0.989844, 1.26214, 1.19529, 2.62781e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB32 aB wg 2x16 cab4 ks64 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14032e+06, 564642, 0, 0, 0, 0, 6.56444, 4.63397, 2.77135, 9.77858, 0.0825907, 0.035527, 0.0779663, 0.969555, 1.21564, 1.20197, 9.03249e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x4 kr cab4 ks32 af dw vav bo bk0 sn sys l4 sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.21829e+06, 775644, 52726.6, 48444.6, 0, 0, 6.17887, 5.08004, 3.42121, 11.3469, 0.103226, 0.115399, 0.0351692, 1, 1.24138, 1.19941, 1.1955e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB16 aB wg 4x8 cab4 ks64 af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.16856e+06, 607673, 0, 0, 0, 0, 6.45483, 5.30858, 3.13856, 10.1588, 0.13211, -0.0608739, 0.253501, 0.955862, 1.21927, 1.20201, -3.92067e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8x2 kr cab4 ks32 af dw vav bo bk0 sn grf256 sys l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.13297e+06, 539423, 98100.1, 109567, 0, 0, 6.29758, 5.15781, 3.48286, 11.4885, 0.111863, 0.047075, 0.101661, 1, 1.21423, 1.20138, 2.63863e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB64 aB wg 2x16 cab4x2 ks64 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 32768, 16777216}, {8192, 8192, 16777216}, {8, 2, 64}, {2, 16, 1}, 1, 
(WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.11663e+06, 549008, 0, 0, 0, 0, 7.99056, 4.41464, 4.40063, 17.071, 0.210664, 0.0956988, 0.192101, 1, 1.22479, 1.20258, -6.10193e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 4x4x2 kr cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06669e+06, 417697, 188659, 296717, 0, 0, 9.98714, 7.29662, 5.5625, 14.2025, 0.0333489, 0.0231727, 0.0262804, 0.997553, 1.35082, 1.18563, 1.89826e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.41774e+06, 253424, 0, 0, 0, 0, 10.239, 15.1219, 3.1142, 9.95356, 0.171272, 0.0208162, 0.21871, 0.99885, 1.21298, 1.20221, -2.86036e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16x2 aB wg 2x2x4 kr cb4 ks16 af dw vav bo bk0 sn grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {2, 2, 4}, 1, (WGType) 1, 261, 2048, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.49998e+06, 289148, 60010.6, 18632.6, 0, 0, 13.9528, 17.1172, 2.34028, 2.34028, 0.247876, 0.154424, 0.382193, 0.91448, 1.20938, 1.20173, 6.46366e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x4x2 kr cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 4, 2}, 1, (WGType) 1, 261, 20480, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46359e+06, 278844, 172499, 54829, 0, 0, 9.30779, 5.55711, 3.90031, 12.3179, 0.135468, 0.0309154, 0.125808, 0.588163, 1.2112, 1.20207, -2.05782e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB32 aB wg 2x1x16 kr af vav bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {32, 1, 32}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {968115, 416537, 13403.9, 11376.9, 0, 0, 5.84714, 29.1929, 30.0901, 139.255, 4.71428, 0.110628, 0.752025, 0.5, 1.20255, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 2x1x32 kr af vav bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {990414, 567348, 10101, 8687.74, 0, 0, 8.04883, 22.9903, 77.7377, 483.846, 4.54742, 3.65227, 0.142562, 0.349467, 3.2483, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 2x1x32 kr af vav bk0 sys l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {983163, 
567602, 10327.5, 8699.11, 0, 0, 8.06398, 23.0086, 75.5768, 483.562, 4.49549, 3.68522, 0.162808, 0.357516, 3.27235, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34934e+06, 356229, 0, 0, 0, 0, 12.1849, 9.67542, 6.39456, 17.6021, 0.130818, 0.124693, 0.0163365, 0.855674, 1.1991, 1.01518, 9.01296e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {3072, -1, -1}, {-1, 16, -1}, {3072, -1, -1}, {-1, 16, -1}, {4, 4, 1}, "xy"}, "sB2x2 sB2x2 aB wg 2x4x4 akr fg 0.1875 kc2 cab4x2 ks4 nse sr bm0 bk0 sn pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {2, 4, 4}, 1, (WGType) 1, 197381, 9216, 9216, {4, 4, 4}, {false, false, true}}, {'E', 17, {993770, 339888, 82818.2, 102321, 0, 0, 5.92669, 26.2821, 2.60334, 9.78266, 0.202651, 0.12328, 0.0554409, 0.094767, 1.33433, 1.06696, 5.87551e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 8x4 cab4x2 ks16 xaf st dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.08421e+06, 567300, 0, 0, 0, 0, 5.72582, 5.41621, 6.5602, 18.1823, 0.0171802, 0.00649357, 0.0137923, 0.748807, 1.55527, 1.15042, 2.94264e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB16 
sB16 aB wg 4x8 cab3x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.13571e+06, 583999, 0, 0, 0, 0, 5.51731, 5.62733, 6.63464, 17.5494, 0.0194115, 0.00899104, 0.0142425, 0.950599, 1.50472, 1.17739, 2.4916e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB16 sB16 aB wg 4x8 cab3x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.13571e+06, 583999, 0, 0, 0, 0, 5.51731, 5.62733, 6.63464, 17.5494, 0.0194115, 0.00899104, 0.0142425, 0.950599, 1.50472, 1.17739, 2.4916e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab4x2 ks32 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.0878e+06, 516397, 0, 0, 0, 0, 5.57335, 5.3253, 6.42346, 16.8151, 0.0232576, 0.0155179, 0.01274, 0.807047, 1.46838, 1.16337, 1.98005e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpxy"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 
261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {961366, 166672, 94208.5, 346931, 0, 0, 5.68945, 5.93633, 5.6386, 14.6711, 0.022562, 0.0117897, 0.0224195, 0.973005, 1.51181, 1.16546, 3.21814e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {572492, 1.35365e+06, 0, 0, 0, 0, 5.65906, 5.37186, 6.53579, 18.2267, 0.0196679, 0.0196679, 0, 1, 1.44388, 1.17216, 1.57451e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16x2 sB16x2 aB wg 4x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03854e+06, 499303, 0, 0, 0, 0, 5.39147, 5.24663, 6.00784, 16.2676, 0.0273688, 0.01059, 0.0237607, 0.949573, 1.33386, 1.18647, 7.31943e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {572492, 1.35365e+06, 0, 0, 0, 0, 5.65906, 5.37186, 6.53579, 18.2267, 0.0196679, 0.0196679, 0, 1, 1.44388, 1.17216, 1.57451e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16x2 sB16x2 aB wg 4x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03854e+06, 499303, 0, 0, 0, 0, 5.39147, 5.24663, 6.00784, 16.2676, 0.0273688, 0.01059, 0.0237607, 0.949573, 1.33386, 1.18647, 7.31943e-13}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {64, 64, 64}, {-1, -1, -1}, {64, 64, 64}, {1, 1, 1}, "V"}, "aB8 aB8 aB wg 4x8 kc8 cab4 ks8 nse bo sr bk0 sn l4 dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB8 aB wg 8x4 kc8 cab4 ks8 nse di bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x4x2 kr cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07733e+06, 311210, 97136.7, 318812, 0, 0, 6.84531, 10.2509, 5.59093, 14.7738, 0.0422813, 0.0257461, 0.0301233, 0.979167, 1.29285, 1.19251, 4.9428e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {568679, 1.11539e+06, 0, 0, 0, 0, 8.57882, 11.0683, 6.15089, 15.6211, 0.0531227, 0.0531227, 0, 1, 1.20724, 1.20171, 2.80388e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "qpI"}, "aB16x2 aB16x2 aB wg 4x4x4 kr cab4 ks16 af dw vav di bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07076e+06, 700555, 91664.9, 100642, 0, 0, 6.46487, 9.88089, 3.12169, 10.6898, 0.0710486, 0.0726906, 0.0195202, 1, 1.2056, 1.20186, 4.94821e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {549980, 1.09293e+06, 0, 0, 0, 0, 8.24445, 16.052, 5.21936, 14.0172, 0.104056, 0.104056, 0, 0.927309, 1.21031, 1.20229, -3.26446e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x4 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 
4}, 1, (WGType) 1, 261, 32768, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.37431e+06, 366902, 84576.9, 21099.9, 0, 0, 6.62681, 9.9022, 4.09505, 12.3611, 0.130677, 0.124146, 0.06546, 0.645556, 1.2071, 1.20156, 1.54005e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {563998, 1.05034e+06, 0, 0, 0, 0, 14.5229, 15.5849, 3.5838, 13.7015, 0.178533, 0.178533, 0, 1, 1.20753, 1.20157, 1.35023e-17}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 261, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17156e+06, 810490, 40614.6, 35571, 0, 0, 6.83315, 10.1518, 8.36302, 18.7103, 0.148382, 0.031612, 0.153252, 0.799695, 1.2072, 1.20176, -3.40823e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x4 cab4x2 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.25687e+06, 286505, 0, 0, 0, 0, 5.80146, 8.50854, 4.6718, 13.7597, 0.13751, 0.0574509, 0.0903835, 0.891908, 1.21163, 1.20152, 2.71443e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26051e+06, 282866, 0, 0, 0, 0, 5.93413, 11.8843, 3.04053, 9.63044, 0.237344, 0.0914411, 0.191295, 0.942713, 1.21014, 1.20249, -5.81423e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav di bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 263, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20049e+06, 808861, 36402.1, 35833.7, 0, 0, 6.82428, 10.1509, 8.32306, 18.8232, 0.148622, 0.0439741, 0.141327, 0.778051, 1.20854, 1.20178, 2.91878e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {571150, 1.13043e+06, 0, 0, 0, 0, 10.2194, 17.4328, 5.7629, 15.4125, 0.0613411, 0.0613411, 0, 1, 1.20744, 1.20181, 2.57881e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8x2 kr cab4 ks16 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {4, 8, 2}, 1, (WGType) 1, 261, 
36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {992438, 612374, 164538, 132831, 0, 0, 10.2863, 30.8207, 2.10206, 7.8021, 0.209568, 0.0962382, 0.130917, 0.862782, 1.20949, 1.20158, -4.19713e-17}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x2 kr cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.04787e+06, 483213, 129702, 125274, 0, 0, 16.2392, 9.44468, 3.47675, 14.3852, 0.129889, 0.00194463, 0.209385, 0.956022, 1.21048, 1.20181, 6.72822e-17}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse di bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22683e+06, 317772, 0, 0, 0, 0, 11.9975, 10.4327, 6.41073, 17.69, 0.130587, 0.124094, 0.0159742, 0.849448, 1.19435, 1.02497, 8.14606e-12}}}, +{{'E', "gemm", {"H", "H", "H"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB8 aB wg 8x4 kc8 cab4 ks8 nse bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x4x2 kr cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, 
{(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07733e+06, 311210, 97136.7, 318812, 0, 0, 6.84531, 10.2509, 5.59093, 14.7738, 0.0422813, 0.0257461, 0.0301233, 0.979167, 1.29285, 1.19251, 4.9428e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {568679, 1.11539e+06, 0, 0, 0, 0, 8.57882, 11.0683, 6.15089, 15.6211, 0.0531227, 0.0531227, 0, 1, 1.20724, 1.20171, 2.80388e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "qpI"}, "aB16x2 aB16x2 aB wg 4x4x4 kr cab4 ks16 af dw vav bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07076e+06, 700555, 91664.9, 100642, 0, 0, 6.46487, 9.88089, 3.12169, 10.6898, 0.0710486, 0.0726906, 0.0195202, 1, 1.2056, 1.20186, 4.94821e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {549980, 1.09293e+06, 0, 0, 0, 0, 8.24445, 16.052, 5.21936, 14.0172, 0.104056, 
0.104056, 0, 0.927309, 1.21031, 1.20229, -3.26446e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x4 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 32768, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.37431e+06, 366902, 84576.9, 21099.9, 0, 0, 6.62681, 9.9022, 4.09505, 12.3611, 0.130677, 0.124146, 0.06546, 0.645556, 1.2071, 1.20156, 1.54005e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {563998, 1.05034e+06, 0, 0, 0, 0, 14.5229, 15.5849, 3.5838, 13.7015, 0.178533, 0.178533, 0, 1, 1.20753, 1.20157, 1.35023e-17}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 261, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17156e+06, 810490, 40614.6, 35571, 0, 0, 6.83315, 10.1518, 8.36302, 18.7103, 0.148382, 0.031612, 0.153252, 0.799695, 1.2072, 1.20176, -3.40823e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x4 cab4x2 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, 
(LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.25687e+06, 286505, 0, 0, 0, 0, 5.80146, 8.50854, 4.6718, 13.7597, 0.13751, 0.0574509, 0.0903835, 0.891908, 1.21163, 1.20152, 2.71443e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26051e+06, 282866, 0, 0, 0, 0, 5.93413, 11.8843, 3.04053, 9.63044, 0.237344, 0.0914411, 0.191295, 0.942713, 1.21014, 1.20249, -5.81423e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aB16 aB16 aB wg 4x2x8 kr cab4 ks16 af dw vav bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {4, 2, 8}, 1, (WGType) 1, 263, 65536, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20049e+06, 808861, 36402.1, 35833.7, 0, 0, 6.82428, 10.1509, 8.32306, 18.8232, 0.148622, 0.0439741, 0.141327, 0.778051, 1.20854, 1.20178, 2.91878e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 40960, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {571150, 1.13043e+06, 0, 0, 0, 0, 10.2194, 17.4328, 5.7629, 15.4125, 0.0613411, 0.0613411, 0, 1, 1.20744, 1.20181, 
2.57881e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8x2 kr cab4 ks16 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 16}, {4, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {992438, 612374, 164538, 132831, 0, 0, 10.2863, 30.8207, 2.10206, 7.8021, 0.209568, 0.0962382, 0.130917, 0.862782, 1.20949, 1.20158, -4.19713e-17}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 2x8x2 kr cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 16}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.04787e+06, 483213, 129702, 125274, 0, 0, 16.2392, 9.44468, 3.47675, 14.3852, 0.129889, 0.00194463, 0.209385, 0.956022, 1.21048, 1.20181, 6.72822e-17}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB2x2 aB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.22683e+06, 317772, 0, 0, 0, 0, 11.9975, 10.4327, 6.41073, 17.69, 0.130587, 0.124094, 0.0159742, 0.849448, 1.19435, 1.02497, 8.14606e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab4x2 ks16 xaf dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 
255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {979393, 479807, 0, 0, 0, 0, 6.09588, 6.17619, 6.59461, 17.8681, 0.0214328, 0.00474331, 0.0185975, 0.676442, 1.42917, 1.17445, 1.63534e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {986809, 437370, 0, 0, 0, 0, 5.68695, 6.48375, 6.53936, 16.9925, 0.0282225, 0.0226525, 0.0239336, 0.736827, 1.37762, 1.17817, 1.10401e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {585301, 1.13594e+06, 0, 0, 0, 0, 6.24472, 6.60826, 6.57509, 18.329, 0.0236345, 0.0236345, 0, 0.985563, 1.32741, 1.18438, 6.65402e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 grf256 sys l4 sr pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {585301, 1.13594e+06, 0, 0, 0, 0, 6.24472, 6.60826, 6.57509, 18.329, 0.0236345, 0.0236345, 0, 0.985563, 1.32741, 
1.18438, 6.65402e-13}}}, {{'E', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB16 aB wg 4x8 cab3 ks32 xaf dw vav bo sr bk0 grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {982154, 437724, 0, 0, 0, 0, 6.2565, 6.37164, 6.49676, 16.9924, 0.0283206, 0.0139661, 0.0208344, 0.966273, 1.36049, 1.17681, 1.17759e-12}}}, -{{'E', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8/4 aS8 aB wg 4x8 kc8 ca4 ks8 nse di bo sr bk0 sm dm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.0836e+06, 535346, 0, 0, 0, 0, 5.23021, 5.23405, 6.51087, 16.4718, 0.0217916, 0.0114948, 0.0166387, 0.885878, 1.48291, 1.16519, 1.92343e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 
4}, {false, false, true}}, {'E', 17, {1.06855e+06, 487726, 0, 0, 0, 0, 4.61972, 4.65428, 5.62471, 14.4914, 0.0515614, 0.0277546, 0.0342646, 0.995874, 1.21142, 1.20166, 4.60295e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4x2 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00925e+06, 416770, 125268, 134008, 0, 0, 5.20487, 5.45943, 2.99579, 9.74749, 0.064019, 0.0148854, 0.0709916, 1, 1.26814, 1.19451, 2.48286e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 8x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {577238, 1.06536e+06, 0, 0, 0, 0, 6.1297, 5.50253, 3.15556, 13.6879, 0.0845357, 0.0845357, 0, 1, 1.21247, 1.20135, 6.92574e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08018e+06, 496164, 49541.4, 53882.6, 0, 0, 5.19797, 5.23637, 4.86973, 12.6255, 0.0790981, 0.00367296, 0.116228, 0.944192, 1.24641, 1.19957, 1.6214e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.10783e+06, 465033, 0, 0, 0, 0, 4.65887, 5.56569, 3.95764, 12.1666, 0.0862301, 0.0694934, 0.0295468, 1, 1.21388, 1.20186, 3.63753e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01594e+06, 428512, 105061, 111208, 0, 0, 4.57657, 4.56121, 3.76336, 13.6659, 0.0951637, 0.0550303, 0.0735522, 1, 1.21613, 1.20157, 3.14445e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.33132e+06, 224343, 0, 0, 0, 0, 4.74023, 4.70375, 3.6736, 10.5877, 0.105009, 0.0387103, 0.0712581, 0.996133, 1.21448, 1.20153, 2.71117e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "sB32 sB32 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 
16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.09603e+06, 503460, 39429.3, 41466.7, 0, 0, 5.21052, 5.34778, 6.21144, 21.1016, 0.123605, 0.0781543, 0.0963408, 1, 1.21338, 1.20115, 1.0006e-14}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08886e+06, 501207, 0, 0, 0, 0, 5.25936, 5.63221, 6.18774, 15.4045, 0.0340166, 0.0129824, 0.0294272, 1, 1.34352, 1.18285, 7.68453e-13}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27453e+06, 207401, 0, 0, 0, 0, 5.20901, 6.85109, 3.83997, 10.8185, 0.114739, 0.0329578, 0.0902767, 0.944924, 1.21429, 1.202, -4.89835e-16}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.36326e+06, 242779, 0, 0, 0, 0, 5.88843, 23.0453, 1.43253, 1.41056, 0.411444, 0.0655792, 0.343105, 0.89732, 1.20931, 0, 0}}}, 
-{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "pI"}, "aS16 aB16 aB wg 2x4x2 kr cb4 ks64 af dw vav di bo bk0 sn grf256 sys l4 sr dm kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 4, 2}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19957e+06, 499856, 112998, 113101, 0, 0, 10.1775, 5.39975, 6.57215, 20.8122, 0.132268, 0.0683795, 0.0912759, 0.550172, 1.2103, 1.20101, 8.94364e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.25025e+06, 129886, 28070.5, 3255.7, 0, 0, 6.67857, 6.83906, 9.62635, 9.48964, 0.911406, 0.250273, 0.495172, 0.551189, 1.20592, 0, 0}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aS16x2 aB16 aB wg 2x1x32 kr af vav di bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0371e+06, 594770, 10558.7, 8467.85, 0, 0, 6.80094, 7.18147, 24.637, 28.0116, 0.965377, 0.436856, 0.531691, 0.788518, 1.20508, 0, 0}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB4 aB4 aB wg 4x8 kc4 cab4 ks8 nse di bo sr bk0 sm sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 
8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45189e+06, 390082, 0, 0, 0, 0, 14.2605, 9.2455, 6.35145, 17.5546, 0.130786, 0.123001, 0.0212675, 0.845631, 1.20876, 1.00031, 1.19283e-11}}}, +{{'E', "gemm", {"H", "H", "H"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8/4 aS8 aB wg 4x8 kc8 ca4 ks8 nse bo sr bk0 sm dm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 8192, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.0836e+06, 535346, 0, 0, 0, 0, 5.23021, 5.23405, 6.51087, 16.4718, 0.0217916, 0.0114948, 0.0166387, 0.885878, 1.48291, 1.16519, 1.92343e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06855e+06, 487726, 0, 0, 0, 0, 4.61972, 4.65428, 5.62471, 14.4914, 0.0515614, 0.0277546, 0.0342646, 0.995874, 1.21142, 1.20166, 4.60295e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4x2 
kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00925e+06, 416770, 125268, 134008, 0, 0, 5.20487, 5.45943, 2.99579, 9.74749, 0.064019, 0.0148854, 0.0709916, 1, 1.26814, 1.19451, 2.48286e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 8x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {577238, 1.06536e+06, 0, 0, 0, 0, 6.1297, 5.50253, 3.15556, 13.6879, 0.0845357, 0.0845357, 0, 1, 1.21247, 1.20135, 6.92574e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08018e+06, 496164, 49541.4, 53882.6, 0, 0, 5.19797, 5.23637, 4.86973, 12.6255, 0.0790981, 0.00367296, 0.116228, 0.944192, 1.24641, 1.19957, 1.6214e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, 
false, true}}, {'E', 17, {1.10783e+06, 465033, 0, 0, 0, 0, 4.65887, 5.56569, 3.95764, 12.1666, 0.0862301, 0.0694934, 0.0295468, 1, 1.21388, 1.20186, 3.63753e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.01594e+06, 428512, 105061, 111208, 0, 0, 4.57657, 4.56121, 3.76336, 13.6659, 0.0951637, 0.0550303, 0.0735522, 1, 1.21613, 1.20157, 3.14445e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.33132e+06, 224343, 0, 0, 0, 0, 4.74023, 4.70375, 3.6736, 10.5877, 0.105009, 0.0387103, 0.0712581, 0.996133, 1.21448, 1.20153, 2.71117e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vIxy"}, "sB32 sB32 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 32}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.09603e+06, 503460, 39429.3, 41466.7, 0, 0, 5.21052, 5.34778, 6.21144, 21.1016, 0.123605, 0.0781543, 0.0963408, 1, 1.21338, 1.20115, 1.0006e-14}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.08886e+06, 501207, 0, 0, 0, 0, 5.25936, 5.63221, 6.18774, 15.4045, 0.0340166, 0.0129824, 0.0294272, 1, 1.34352, 1.18285, 7.68453e-13}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27453e+06, 207401, 0, 0, 0, 0, 5.20901, 6.85109, 3.83997, 10.8185, 0.114739, 0.0329578, 0.0902767, 0.944924, 1.21429, 1.202, -4.89835e-16}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.36326e+06, 242779, 0, 0, 0, 0, 5.88843, 23.0453, 1.43253, 1.41056, 0.411444, 0.0655792, 0.343105, 0.89732, 1.20931, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "pI"}, "aS16 aB16 aB wg 2x4x2 kr cb4 ks64 af dw vav bo bk0 sn grf256 sys l4 sr dm kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 4, 2}, 1, 
(WGType) 1, 261, 65536, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19957e+06, 499856, 112998, 113101, 0, 0, 10.1775, 5.39975, 6.57215, 20.8122, 0.132268, 0.0683795, 0.0912759, 0.550172, 1.2103, 1.20101, 8.94364e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.25025e+06, 129886, 28070.5, 3255.7, 0, 0, 6.67857, 6.83906, 9.62635, 9.48964, 0.911406, 0.250273, 0.495172, 0.551189, 1.20592, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aS16x2 aB16 aB wg 2x1x32 kr af vav bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0371e+06, 594770, 10558.7, 8467.85, 0, 0, 6.80094, 7.18147, 24.637, 28.0116, 0.965377, 0.436856, 0.531691, 0.788518, 1.20508, 0, 0}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB4 aB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.45189e+06, 390082, 0, 0, 0, 0, 14.2605, 9.2455, 6.35145, 17.5546, 0.130786, 0.123001, 0.0212675, 0.845631, 1.20876, 1.00031, 1.19283e-11}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1024, -1, 4096}, {-1, 4, -1}, {-1, -1, 4096}, {-1, 8, -1}, 
{64, 4, 1}, "Inxy"}, "sB64 sS16 aS wg 2x1x32 ikr af vav sr bk0 bm0 sys pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {2, 1, 32}, 1, (WGType) 0, 4357, 0, 512, {64, 4, 4}, {false, false, true}}, {'E', 17, {834613, 319024, 13483.2, 12404.8, 0, 0, 5.7325, 7.41925, 17.4593, 78.4906, 0.407648, 0.377702, 0.0220867, 0.472262, 1.20351, 1.20138, 3.75682e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "Inpqxy"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.117e+06, 621749, 0, 0, 0, 0, 5.63531, 5.38748, 6.34314, 18.0359, 0.0187448, 0.0158313, 0.0117899, 0.815598, 1.50444, 1.15596, 2.96809e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {4, 4, 1}, "Inpqsxy"}, "sB16 sB16 aB wg 8x4 cab4 ks16 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 655360, 16777216}, {524288, 655360, 16777216}, {32, 40, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 53248, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.117e+06, 621749, 0, 0, 0, 0, 5.63531, 5.38748, 6.34314, 18.0359, 0.0187448, 0.0158313, 0.0117899, 0.815598, 1.50444, 1.15596, 2.96809e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 4, 1}, "xyI"}, "sS16x2 sB16 aB wg 16x2 cb4x2 ks16 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 16}, {16, 2, 1}, 1, (WGType) 1, 257, 
16384, 0, {16, 4, 4}, {false, false, true}}, {'E', 17, {1.08445e+06, 546785, 0, 0, 0, 0, 5.33037, 5.24024, 6.46841, 17.5866, 0.0186739, 0.00499822, 0.0174057, 0.790315, 1.46202, 1.17542, 3.02139e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB32 sB16 aB wg 8x4 cab3 ks32 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05615e+06, 527155, 0, 0, 0, 0, 5.24242, 5.61507, 6.74453, 17.432, 0.0191776, 0.0115211, 0.0120279, 1, 1.50506, 1.17144, 2.14593e-12}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {599248, 1.33458e+06, 0, 0, 0, 0, 5.52852, 5.45748, 6.54024, 18.2766, 0.0211426, 0.0211426, 0, 1, 1.40808, 1.175, 1.4855e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB32 sB16 aB wg 8x4 cab3 ks32 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05615e+06, 527155, 0, 0, 0, 0, 5.24242, 5.61507, 6.74453, 17.432, 0.0191776, 0.0115211, 0.0120279, 1, 1.50506, 1.17144, 2.14593e-12}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 xaf dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {599248, 1.33458e+06, 0, 0, 0, 0, 5.52852, 5.45748, 6.54024, 18.2766, 0.0211426, 0.0211426, 0, 1, 1.40808, 1.175, 1.4855e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB16 sB32x2 aB wg 2x8x2 kr ca4x2 ks64 xaf st dw vav bo sr bk0 dm grf256 sys", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06367e+06, 203220, 74026.4, 363426, 0, 0, 5.22592, 5.83255, 5.53361, 14.4283, 0.0245863, -0.000813833, 0.0346436, 0.773746, 1.49124, 1.1734, 2.5839e-12}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ip"}, "aB32 aB16 aB wg 2x4x4 kr ca3 ks64 af dw vav bo sr bk0 sm dm sys grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13288e+06, 578431, 81868.2, 83815, 0, 0, 6.01706, 6.08456, 4.49408, 11.3129, 0.0520799, 0.0402399, 0.0372522, 0.940886, 1.2079, 1.2015, 4.21515e-15}}}, -{{'E', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8 aB8/4 aB wg 4x8 kc8 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 
8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.30894e+06, 785550, 0, 0, 0, 0, 7.11935, 8.75656, 6.13129, 16.041, 0.0504259, 0.0420693, 0.0674256, 0.73758, 1.20696, 1.20187, 1.15297e-15}}}, -{{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse di bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34765e+06, 357923, 0, 0, 0, 0, 11.5919, 11.8886, 6.4384, 17.1449, 0.145301, 0.14134, 0.0125881, 0.886981, 1.17153, 1.00812, 9.37293e-12}}}, -{{'E', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 4x16 cab4 ks32 
af dw vav di bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, +{{'E', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8 aB8/4 aB wg 4x8 kc8 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.30894e+06, 785550, 0, 0, 0, 0, 7.11935, 8.75656, 6.13129, 16.041, 0.0504259, 0.0420693, 0.0674256, 0.73758, 1.20696, 1.20187, 1.15297e-15}}}, +{{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34765e+06, 357923, 0, 0, 0, 0, 11.5919, 11.8886, 6.4384, 17.1449, 0.145301, 0.14134, 0.0125881, 0.886981, 1.17153, 1.00812, 9.37293e-12}}}, +{{'E', "gemm", {"H", "O", "S"}, {"T", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, +{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, {{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {560656, 1.11604e+06, 0, 0, 0, 0, 8.60856, 
11.0509, 6.14909, 15.6609, 0.0531671, 0.0531671, 0, 1, 1.21193, 1.20139, 7.41618e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, +{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 
1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, +{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, {{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'E', "gemm", {"O", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 
4}, {false, false, false}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, +{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, {{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, 
{2, 2, 4}, {false, false, true}}, {'E', 17, {1.01594e+06, 428512, 105061, 111208, 0, 0, 4.57657, 4.56121, 3.76336, 13.6659, 0.0951637, 0.0550303, 0.0735522, 1, 1.21613, 1.20157, 3.14445e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {1, 1, 1}, "I"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 
0.333333, 1.20221, 0, 0}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, {{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 
0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsm"}, "sB16 sB16 sb fs wg 4x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {4, 4, 1}, 1, (WGType) 1, 256, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsm"}, "sB16 sB16 sb fs wg 8x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 256, 61440, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "sm"}, "ab16x3 ab16x3 ab fs sc bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 4096}, {8192, 8192, 4096}, {32, 32, 32}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16x2 aB wg 8x4 nse di bo sr sb32 bk0 grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB32 sB32 aB wg 4x8x2 kr cab4 ks32 af dw vav di bo bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 
2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.00712e+06, 406000, 172581, 272859, 0, 0, 3.12891, 3.34068, 4.87654, 14.094, 0.0252127, 0.0218325, 0.0113697, 0.982774, 1.22754, 1.20144, 7.00834e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav di bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {552020, 1.12529e+06, 0, 0, 0, 0, 5.16852, 2.80811, 6.1605, 15.5799, 0.0215997, 0.0215997, 0, 1, 1.2111, 1.20564, -3.21736e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB32 sB64 aB wg 4x16 cab4 ks64 af dw vav di bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {8192, 8192, 16777216}, {32, 4, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {545420, 1.0938e+06, 0, 0, 0, 0, 3.27178, 5.27297, 4.83132, 13.78, 0.0374045, 0.0374045, 0, 1, 1.21335, 1.20085, 1.04884e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB128 sB32 aB wg 4x16 cab4 ks128 af dw vav di bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {558963, 1.08579e+06, 0, 0, 0, 0, 4.41143, 3.44507, 2.93976, 12.2012, 0.0459434, 0.0459434, 0, 1, 1.2227, 1.20171, 
7.0997e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "sB8 aS16x2 aB wg 2x2x8 kr ca4 ks16 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {2, 2, 8}, 1, (WGType) 1, 261, 32768, 8192, {1, 1, 4}, {false, true, true}}, {'E', 17, {995462, 445130, 67258.2, 63170.1, 0, 0, 3.07902, 3.72917, 3.04058, 12.2691, 0.0676881, 0.00390551, 0.0986196, 0.737096, 1.29596, 1.0591, 3.4655e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB128 sB32 aB wg 2x16 cab4 ks128 af dw vav di bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06759e+06, 469635, 0, 0, 0, 0, 4.35669, 3.29013, 2.80884, 10.6334, 0.0840327, -0.117777, 0.237041, 0.791787, 1.22717, 1.20241, 4.66387e-16}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "sB16 aS16x2 aB wg 2x4x4 kr ca4x2 ks16 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 8192, 16384, {1, 1, 4}, {false, true, true}}, {'E', 17, {1.01044e+06, 423368, 46294, 47299.6, 0, 0, 3.99441, 3.47806, 6.43467, 20.6609, 0.11319, -0.00927457, 0.22023, 0.935341, 1.20336, 1.03247, 4.0724e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "sB64 sB32 aB wg 2x16 cab4 ks128 af dw vav di bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, 
(LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06794e+06, 471134, 0, 0, 0, 0, 4.47378, 3.21765, 2.79237, 10.5871, 0.0837144, -0.00244281, 0.11683, 0.989653, 1.22114, 1.20202, 2.17019e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qI"}, "sB32 sB32 aB wg 8x4x2 kr cab4 ks32 af dw vav di bo bk0 sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.01215e+06, 439063, 187718, 269395, 0, 0, 4.42767, 3.28808, 4.96732, 13.9022, 0.0276486, 0.0437776, 0.0290471, 0.756816, 1.21069, 1.20543, -2.44339e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32 sB32 aB wg 2x8 cab4 ks32 af dw vav di bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.30659e+06, 218350, 0, 0, 0, 0, 3.0037, 5.69885, 3.5694, 10.4219, 0.0863226, -0.00310343, 0.129888, 0.997265, 1.21109, 1.20141, 4.79304e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB32x2 sB32x2 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.05126e+06, 420841, 100681, 134635, 0, 0, 8.79641, 
2.65858, 3.45759, 14.5304, 0.0641485, 0.0439817, 0.0926801, 0.889725, 1.21117, 1.20103, 5.79406e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, ""}, "aB4x2 aB8x2 aB wg 2x1x4 kr nse di bo sb64 bk0 grf256 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {2097152, 16384, 16777216}, {8192, 8192, 16777216}, {128, 1, 8}, {2, 1, 4}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.2101e+06, 94342.6, 74144.7, 10646.1, 0, 0, 2.98791, 52.3853, 0.173033, 0.165709, 0.310447, 0.16497, 0.290758, 0.789018, 1.2386, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16 aB8 aB wg 2x1x16 kr nse di bo sb64 bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 16}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.1196e+06, 561885, 13601.6, 11097.3, 0, 0, 3.96406, 24.24, 23.7773, 24.1473, 0.989971, 0.408827, 1.42672, 0.959858, 1.0887, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16 aB4 aB wg 2x1x32 kr nse di bo sb64 bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19884e+06, 735512, 9993.29, 8835.1, 0, 0, 5.27858, 22.7597, 42.6848, 49.005, 2.19826, 2.24466, 1.01836, 0.938948, 1.0929, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB16 aB4 aB wg 2x1x32 kr nse di bo sb64 bk0 l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, 
{131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.18625e+06, 737315, 10372.2, 8793.1, 0, 0, 5.28209, 22.7531, 41.279, 48.1094, 2.1982, 2.22264, 1.03136, 0.940299, 1.0929, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB8 aB wg 8x4 cab4 ks16 nse di bo sr bk0 sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.1687e+06, 680034, 0, 0, 0, 0, 3.04718, 3.55084, 6.45873, 17.7424, 0.0351353, 0.0322876, 0.00620525, 0.874922, 1.23355, 1.02944, 3.75965e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"A4", "B4", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16x2 aB wg 8x4 nse bo sr sb32 bk0 grf256", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sB32 sB32 aB wg 4x8x2 kr cab4 ks32 af dw vav bo bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 8, 2}, 1, (WGType) 1, 261, 49152, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.00712e+06, 406000, 172581, 272859, 0, 0, 3.12891, 3.34068, 4.87654, 14.094, 0.0252127, 0.0218325, 0.0113697, 0.982774, 1.22754, 1.20144, 7.00834e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 
aB wg 8x8 cab4 ks32 af dw vav bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {552020, 1.12529e+06, 0, 0, 0, 0, 5.16852, 2.80811, 6.1605, 15.5799, 0.0215997, 0.0215997, 0, 1, 1.2111, 1.20564, -3.21736e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sB32 sB64 aB wg 4x16 cab4 ks64 af dw vav bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {8192, 8192, 16777216}, {32, 4, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {545420, 1.0938e+06, 0, 0, 0, 0, 3.27178, 5.27297, 4.83132, 13.78, 0.0374045, 0.0374045, 0, 1, 1.21335, 1.20085, 1.04884e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sB128 sB32 aB wg 4x16 cab4 ks128 af dw vav bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {558963, 1.08579e+06, 0, 0, 0, 0, 4.41143, 3.44507, 2.93976, 12.2012, 0.0459434, 0.0459434, 0, 1, 1.2227, 1.20171, 7.0997e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "x"}, "sB8 aS16x2 aB wg 2x2x8 kr ca4 ks16 nse bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {2, 2, 8}, 1, (WGType) 1, 261, 32768, 8192, {1, 1, 4}, {false, true, true}}, {'E', 17, {995462, 445130, 67258.2, 63170.1, 0, 0, 
3.07902, 3.72917, 3.04058, 12.2691, 0.0676881, 0.00390551, 0.0986196, 0.737096, 1.29596, 1.0591, 3.4655e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sB128 sB32 aB wg 2x16 cab4 ks128 af dw vav bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06759e+06, 469635, 0, 0, 0, 0, 4.35669, 3.29013, 2.80884, 10.6334, 0.0840327, -0.117777, 0.237041, 0.791787, 1.22717, 1.20241, 4.66387e-16}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "x"}, "sB16 aS16x2 aB wg 2x4x4 kr ca4x2 ks16 nse bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 8192, 16384, {1, 1, 4}, {false, true, true}}, {'E', 17, {1.01044e+06, 423368, 46294, 47299.6, 0, 0, 3.99441, 3.47806, 6.43467, 20.6609, 0.11319, -0.00927457, 0.22023, 0.935341, 1.20336, 1.03247, 4.0724e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sB64 sB32 aB wg 2x16 cab4 ks128 af dw vav bo bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.06794e+06, 471134, 0, 0, 0, 0, 4.47378, 3.21765, 2.79237, 10.5871, 0.0837144, -0.00244281, 0.11683, 0.989653, 1.22114, 1.20202, 2.17019e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qIxy"}, "sB32 sB32 aB wg 
8x4x2 kr cab4 ks32 af dw vav bo bk0 sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.01215e+06, 439063, 187718, 269395, 0, 0, 4.42767, 3.28808, 4.96732, 13.9022, 0.0276486, 0.0437776, 0.0290471, 0.756816, 1.21069, 1.20543, -2.44339e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 2x8 cab4 ks32 af dw vav bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {8192, 8192, 16777216}, {32, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 10240, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.30659e+06, 218350, 0, 0, 0, 0, 3.0037, 5.69885, 3.5694, 10.4219, 0.0863226, -0.00310343, 0.129888, 0.997265, 1.21109, 1.20141, 4.79304e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32x2 sB32x2 aB wg 2x8x2 kr cab4 ks32 af dw vav bo bk0 sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.05126e+06, 420841, 100681, 134635, 0, 0, 8.79641, 2.65858, 3.45759, 14.5304, 0.0641485, 0.0439817, 0.0926801, 0.889725, 1.21117, 1.20103, 5.79406e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, ""}, "aB4x2 aB8x2 aB wg 2x1x4 kr nse bo sb64 bk0 grf256 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {2097152, 16384, 16777216}, {8192, 8192, 16777216}, {128, 1, 8}, {2, 1, 4}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, 
true, true}}, {'E', 17, {1.2101e+06, 94342.6, 74144.7, 10646.1, 0, 0, 2.98791, 52.3853, 0.173033, 0.165709, 0.310447, 0.16497, 0.290758, 0.789018, 1.2386, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16 aB8 aB wg 2x1x16 kr nse bo sb64 bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 16}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.1196e+06, 561885, 13601.6, 11097.3, 0, 0, 3.96406, 24.24, 23.7773, 24.1473, 0.989971, 0.408827, 1.42672, 0.959858, 1.0887, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16 aB4 aB wg 2x1x32 kr nse bo sb64 bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 261, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19884e+06, 735512, 9993.29, 8835.1, 0, 0, 5.27858, 22.7597, 42.6848, 49.005, 2.19826, 2.24466, 1.01836, 0.938948, 1.0929, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB16 aB4 aB wg 2x1x32 kr nse bo sb64 bk0 l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.18625e+06, 737315, 10372.2, 8793.1, 0, 0, 5.28209, 22.7531, 41.279, 48.1094, 2.1982, 2.22264, 1.03136, 0.940299, 1.0929, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB8 aB wg 8x4 cab4 ks16 nse bo sr bk0 sn grf256 dm l4", {8, (LoopType) 0, 256, 
{(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.1687e+06, 680034, 0, 0, 0, 0, 3.04718, 3.55084, 6.45873, 17.7424, 0.0351353, 0.0322876, 0.00620525, 0.874922, 1.23355, 1.02944, 3.75965e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 64, 1}, "Inpqxy"}, "sB32 sB32x2 aB wg 2x16 ca3 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {64, 64, 4}, {false, false, true}}, {'E', 17, {1.00154e+06, 513574, 0, 0, 0, 0, 2.73866, 2.65797, 6.51583, 18.2287, 0.00996636, 0.00691341, 0.00686787, 0.986643, 1.39382, 1.18598, 1.13183e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpqxy"}, "sB32 sB32 aB wg 2x16 ca4 ks128 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {991698, 514197, 0, 0, 0, 0, 2.9311, 2.62991, 6.73613, 18.2732, 0.00996362, -0.00212772, 0.0186261, 0.740865, 1.40306, 1.17934, 9.21266e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Inpxy"}, "sB64 sB32 aB wg 4x8 cab3 ks64 xaf dw vav bo sr bk0 dm grf256 sys l4 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 
0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Inpsxy"}, "sB64 sB32 aB wg 4x8 cab3 ks64 xaf dw vav bo sr bk0 dm grf256 sys l4 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpqxy"}, "sB32 sB32 aB wg 2x8x2 kr ca3 ks64 af dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {905358, -142888, 156169, 708291, 0, 0, 2.80808, 2.70632, 5.86735, 16.3366, 0.0116269, 0.00613968, 0.00837311, 0.898385, 1.42173, 1.18156, 1.07791e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Inpxy"}, "sB32 sB64x2 aB wg 4x8 ca4 ks128 xaf st dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.00948e+06, 476755, 0, 0, 0, 0, 3.40086, 2.5695, 6.47179, 16.9, 0.0122444, 0.00343553, 0.0130855, 0.923057, 1.31761, 1.18319, 6.35596e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {4, 4, 1}, "IPnxy"}, "sB64 sB32 aB wg 4x8 cab3 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4x2 aB8/4x2 aB wg 4x4x4 kr cab4 ks16 nse di bo bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 49152, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {969115, 583269, 188510, 252220, 0, 0, 3.42328, 4.82691, 4.54757, 13.174, 0.0460941, 0.043499, 0.00642526, 0.922341, 1.28419, 1.04574, 3.16587e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8 aB wg 4x1x8 kr cb4 ks16 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 1, 8}, 1, (WGType) 1, 261, 16384, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {925117, 381536, 217388, 254033, 0, 0, 3.37631, 5.16271, 2.19276, 9.90428, 0.0485492, 0.0129792, 0.042163, 0.629897, 1.27615, 1.05045, 4.02882e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32x2 aB32x2 aB wg 8x8 cab4 ks32 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 
24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {563627, 1.13673e+06, 0, 0, 0, 0, 4.3956, 9.25108, 5.16051, 13.8631, 0.0539423, 0.0539423, 0, 1, 1.21113, 1.20077, 9.36221e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "aB8 aB4x2 aB wg 2x1x16 kr cb4 ks8 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 16384, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {970194, 442926, 172506, 217024, 0, 0, 3.49819, 5.23457, -1.65975, 6.81699, 0.0602844, -0.018389, 0.0876532, 0.612555, 1.26532, 1.03934, 6.07546e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 8x4 cab4 ks32 af dw vav di bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.34117e+06, 329996, 0, 0, 0, 0, 4.49293, 4.23853, 6.05585, 15.6037, 0.057776, 0.0380034, 0.0257597, 1, 1.20975, 1.20161, 4.0607e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8 aB wg 2x1x16 kr cb4 ks8 nse di bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 16384, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.01421e+06, 482423, 94878.6, 99927.2, 0, 0, 4.48997, 5.17153, 11.1876, 24.4444, 0.0850567, 0.0178826, 0.114954, 1, 1.20024, 1.03, 1.20479e-11}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 
1, 1}, "I"}, "aB32 aB32 aB wg 8x8 cab4 ks32 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 12288, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {549881, 1.11668e+06, 0, 0, 0, 0, 7.90656, 17.6481, 3.58236, 14.012, 0.170192, 0.170192, 0, 1, 1.21497, 1.20301, -7.65714e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB8 aB wg 2x1x32 kr cb4 ks8 nse di bo bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 32}, 1, (WGType) 1, 261, 16384, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.09509e+06, 695591, 49047.5, 50168.2, 0, 0, 4.64557, 5.83422, 18.8435, 32.7117, 0.231845, 0.0587003, 0.203916, 0.602232, 1.09526, 1.03678, 7.22866e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4 aB8 aB wg 2x1x32 kr cb4 ks8 nse di bo bk0 l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 32}, 1, (WGType) 1, 263, 16384, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.09085e+06, 696199, 49629.3, 50141.4, 0, 0, 4.65663, 5.85851, 19.1566, 32.7463, 0.231622, 0.0567543, 0.20655, 0.602515, 1.09796, 1.03662, 8.00337e-12}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, 
{1.09363e+06, 526547, 0, 0, 0, 0, 8.12314, 11.9414, 3.64859, 10.586, 0.206556, 0.126187, 0.252045, 1, 1.21209, 1.20195, 3.97516e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB32 aB32 aB wg 8x4x2 kr cab4 ks32 af dw vav di bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {945748, 636946, 250304, 203124, 0, 0, 3.01255, 4.96355, 4.8833, 13.6742, 0.0356961, 0.0146557, 0.0376148, 0.565879, 1.2069, 1.20121, 4.67581e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 2x8x2 kr cab4 ks32 af dw vav di bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.14223e+06, 581772, 120357, 128475, 0, 0, 11.2353, 4.4467, 3.13043, 14.0343, 0.0905754, -0.00383606, 0.142187, 0.90184, 1.21011, 1.20096, 8.87549e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 aB wg 4x4 cab4 ks16 nse di bo sr bk0 grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.325e+06, 271215, 0, 0, 0, 0, 3.1067, 4.51079, 6.43656, 17.4888, 0.0350908, 0.0307137, 0.00851217, 0.777143, 1.26441, 1.04115, 3.50353e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "IPnsxy"}, "sB64 
sB32 aB wg 4x8 cab3 ks64 xaf dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.07695e+06, 540770, 0, 0, 0, 0, 3.25146, 2.5002, 6.59555, 17.6112, 0.0108004, 0.00702661, 0.00787872, 0.935703, 1.34469, 1.18472, 4.82174e-13}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4x2 aB8/4x2 aB wg 4x4x4 kr cab4 ks16 nse bo bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {4, 4, 4}, 1, (WGType) 1, 261, 49152, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {969115, 583269, 188510, 252220, 0, 0, 3.42328, 4.82691, 4.54757, 13.174, 0.0460941, 0.043499, 0.00642526, 0.922341, 1.28419, 1.04574, 3.16587e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8 aB wg 4x1x8 kr cb4 ks16 nse bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 1, 8}, 1, (WGType) 1, 261, 16384, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {925117, 381536, 217388, 254033, 0, 0, 3.37631, 5.16271, 2.19276, 9.90428, 0.0485492, 0.0129792, 0.042163, 0.629897, 1.27615, 1.05045, 4.02882e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32x2 aB32x2 aB wg 8x8 cab4 ks32 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 
17, {563627, 1.13673e+06, 0, 0, 0, 0, 4.3956, 9.25108, 5.16051, 13.8631, 0.0539423, 0.0539423, 0, 1, 1.21113, 1.20077, 9.36221e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "aB8 aB4x2 aB wg 2x1x16 kr cb4 ks8 nse bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 16384, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {970194, 442926, 172506, 217024, 0, 0, 3.49819, 5.23457, -1.65975, 6.81699, 0.0602844, -0.018389, 0.0876532, 0.612555, 1.26532, 1.03934, 6.07546e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sys l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.34117e+06, 329996, 0, 0, 0, 0, 4.49293, 4.23853, 6.05585, 15.6037, 0.057776, 0.0380034, 0.0257597, 1, 1.20975, 1.20161, 4.0607e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8 aB wg 2x1x16 kr cb4 ks8 nse bo bk0 grf256 l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 16384, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.01421e+06, 482423, 94878.6, 99927.2, 0, 0, 4.48997, 5.17153, 11.1876, 24.4444, 0.0850567, 0.0178826, 0.114954, 1, 1.20024, 1.03, 1.20479e-11}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 8x8 cab4 ks32 af dw vav bo 
bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 12288, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {549881, 1.11668e+06, 0, 0, 0, 0, 7.90656, 17.6481, 3.58236, 14.012, 0.170192, 0.170192, 0, 1, 1.21497, 1.20301, -7.65714e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB8 aB wg 2x1x32 kr cb4 ks8 nse bo bk0 l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 32}, 1, (WGType) 1, 261, 16384, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.09509e+06, 695591, 49047.5, 50168.2, 0, 0, 4.64557, 5.83422, 18.8435, 32.7117, 0.231845, 0.0587003, 0.203916, 0.602232, 1.09526, 1.03678, 7.22866e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4 aB8 aB wg 2x1x32 kr cb4 ks8 nse bo bk0 l4 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 32}, 1, (WGType) 1, 263, 16384, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.09085e+06, 696199, 49629.3, 50141.4, 0, 0, 4.65663, 5.85851, 19.1566, 32.7463, 0.231622, 0.0567543, 0.20655, 0.602515, 1.09796, 1.03662, 8.00337e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.09363e+06, 526547, 0, 0, 0, 0, 8.12314, 11.9414, 3.64859, 10.586, 0.206556, 
0.126187, 0.252045, 1, 1.21209, 1.20195, 3.97516e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB32 aB32 aB wg 8x4x2 kr cab4 ks32 af dw vav bo bk0 sys sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {945748, 636946, 250304, 203124, 0, 0, 3.01255, 4.96355, 4.8833, 13.6742, 0.0356961, 0.0146557, 0.0376148, 0.565879, 1.2069, 1.20121, 4.67581e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32 aB32 aB wg 2x8x2 kr cab4 ks32 af dw vav bo bk0 grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 32768, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.14223e+06, 581772, 120357, 128475, 0, 0, 11.2353, 4.4467, 3.13043, 14.0343, 0.0905754, -0.00383606, 0.142187, 0.90184, 1.21011, 1.20096, 8.87549e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 aB wg 4x4 cab4 ks16 nse bo sr bk0 grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.325e+06, 271215, 0, 0, 0, 0, 3.1067, 4.51079, 6.43656, 17.4888, 0.0350908, 0.0307137, 0.00851217, 0.777143, 1.26441, 1.04115, 3.50353e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Inxy"}, "sB32 sB32 aB wg 8x4 cab4 ks32 xaf st dw vav bo sr bk0 grf256 sys pab l4", {8, (LoopType) 0, 256, 
{(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.01391e+06, 509797, 0, 0, 0, 0, 3.18295, 3.81981, 6.57362, 18.2308, 0.0132914, 0.00669773, 0.00903364, 0.924525, 1.25888, 1.19572, 2.0617e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB32 sB32 aB wg 4x4x2 kr cab4 ks32 xaf st dw vav bo sr bk0 grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {588062, -204062, 489793, 781400, 0, 0, 3.40801, 3.41401, 5.96958, 15.8543, 0.0153168, 0.00865183, 0.0102862, 0.874684, 1.30067, 1.19121, 4.28268e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB32 aB wg 4x8 cab4 ks64 xaf dw vav bo sr bk0 grf256 sys pab l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.03222e+06, 489032, 0, 0, 0, 0, 3.6293, 4.30606, 6.4398, 16.8446, 0.0180904, 0.00975268, 0.0127029, 0.887231, 1.21983, 1.20127, -3.55483e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08485e+06, 534090, 0, 0, 0, 0, 
2.26038, 2.28822, 6.54241, 16.5807, 0.0108941, -0.00616224, 0.023865, 0.743702, 1.35093, 1.1815, 6.50109e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 8x8 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {546628, 1.16417e+06, 0, 0, 0, 0, 2.69496, 2.82552, 5.90182, 15.4398, 0.0165818, 0.0165818, 0, 1, 1.23827, 1.20187, -6.19745e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB128 sB64 aB wg 8x4 cab4 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.18851e+06, 476513, 0, 0, 0, 0, 2.34744, 2.36664, 5.65933, 14.5041, 0.0251992, 0.0179545, 0.0233066, 0.991699, 1.2182, 1.2025, -6.17732e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.02187e+06, 412759, 115515, 134546, 0, 0, 2.32791, 2.76242, 2.88333, 9.82242, 0.0318178, 0.00610933, 0.0401331, 1, 1.21551, 1.20136, 3.9947e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, 
"sB64 sB64 aB wg 8x8 cab4 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {560263, 1.07097e+06, 0, 0, 0, 0, 3.06127, 2.86759, 3.07389, 13.5153, 0.0419574, 0.0419574, 0, 1, 1.22194, 1.20059, 1.54207e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 2x4x4 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08038e+06, 496577, 52296.1, 54199.9, 0, 0, 2.32365, 2.33229, 4.84206, 12.7203, 0.0382543, 0.00737813, 0.0544927, 0.996035, 1.22057, 1.19876, 8.36382e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB128 aB wg 8x4 cab4 ks128 af dw vav di bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.09557e+06, 462553, 0, 0, 0, 0, 2.17329, 2.8532, 3.94421, 12.162, 0.0422784, 0.00389671, 0.0410451, 0.844249, 1.21956, 1.20074, 1.36525e-14}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB128 sB128 aB wg 4x4x2 kr cab4 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {4, 4, 2}, 1, (WGType) 
1, 261, 65536, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.00381e+06, 424665, 113187, 111684, 0, 0, 2.20755, 2.21331, 3.79495, 13.7684, 0.0456296, -0.0460188, 0.120625, 0.679484, 1.22531, 1.20124, 8.07533e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 2x8 cab4 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.32689e+06, 222623, 0, 0, 0, 0, 2.31002, 2.30138, 3.66802, 10.5821, 0.0515592, -0.0216913, 0.0738923, 0.87394, 1.219, 1.20127, 6.74603e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "sB64 sB64 aB wg 2x4x4 kr cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08393e+06, 500607, 42364.6, 41193.8, 0, 0, 2.32944, 2.71544, 6.37895, 21.2114, 0.0615537, -0.0454186, 0.139862, 0.925759, 1.21673, 1.20122, 6.12627e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.07668e+06, 500480, 0, 0, 0, 0, 2.2985, 2.85765, 6.19196, 15.4234, 0.0169503, 0.00108194, 0.0249639, 0.993166, 1.24039, 1.19939, 4.69831e-14}}}, 
-{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "sB64 sB64 aB wg 4x4 cab3 ks128 af dw vav di bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.26622e+06, 205878, 0, 0, 0, 0, 2.41748, 3.44852, 3.85081, 10.8513, 0.0565124, 0.0196373, 0.0513865, 0.975821, 1.22132, 1.20149, 6.09696e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB64 aB32 aB wg 4x4 cab3 ks128 af vav di bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.35303e+06, 235808, 0, 0, 0, 0, 3.04897, 11.8664, 1.44909, 1.44909, 0.203641, 0.052135, 0.168812, 0.965466, 1.22155, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 sB32 aB wg 2x4x2 kr cb4 ks128 af dw vav di bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 128}, {2, 4, 2}, 1, (WGType) 1, 261, 65536, 16384, {1, 1, 4}, {true, false, true}}, {'E', 17, {1.09647e+06, 406349, 125402, 112986, 0, 0, 4.76469, 3.00307, 6.68777, 20.562, 0.0550678, 0.0352072, 0.0473407, 0.991295, 1.21719, 1.20108, 5.11504e-15}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB32 aB wg 2x1x8 kr af vav di bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, 
{131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 32}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.24264e+06, 124211, 25865.7, 3392.63, 0, 0, 3.01435, 3.34443, 10.1625, 10.1426, 0.40946, 0.110641, 0.265553, 0.590246, 1.21134, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aS16x2 aB32 aB wg 2x1x32 kr af vav di bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 32}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03023e+06, 570479, 10646.8, 8815.77, 0, 0, 2.93605, 3.56278, 35.057, 37.7768, 0.464873, 0.0651331, 0.429985, 0.718768, 1.20923, 0, 0}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2 aB8x2 aB wg 8x8 cab4 ks16 nse di bo sr bk0 sm sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {582554, 1.48982e+06, 0, 0, 0, 0, 3.76758, 3.77334, 6.62494, 17.6039, 0.0363451, 0.0363451, 0, 1, 1.20147, 1.02529, 3.01473e-12}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08485e+06, 534090, 0, 0, 0, 0, 2.26038, 2.28822, 6.54241, 16.5807, 0.0108941, -0.00616224, 0.023865, 0.743702, 1.35093, 1.1815, 6.50109e-13}}}, +{{'E', "gemm", {"O", 
"O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 8x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {546628, 1.16417e+06, 0, 0, 0, 0, 2.69496, 2.82552, 5.90182, 15.4398, 0.0165818, 0.0165818, 0, 1, 1.23827, 1.20187, -6.19745e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB128 sB64 aB wg 8x4 cab4 ks128 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.18851e+06, 476513, 0, 0, 0, 0, 2.34744, 2.36664, 5.65933, 14.5041, 0.0251992, 0.0179545, 0.0233066, 0.991699, 1.2182, 1.2025, -6.17732e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 4x4x2 kr cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.02187e+06, 412759, 115515, 134546, 0, 0, 2.32791, 2.76242, 2.88333, 9.82242, 0.0318178, 0.00610933, 0.0401331, 1, 1.21551, 1.20136, 3.9947e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 8x8 cab4 ks128 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, 
(LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {560263, 1.07097e+06, 0, 0, 0, 0, 3.06127, 2.86759, 3.07389, 13.5153, 0.0419574, 0.0419574, 0, 1, 1.22194, 1.20059, 1.54207e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB32 aB wg 2x4x4 kr cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr kd", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08038e+06, 496577, 52296.1, 54199.9, 0, 0, 2.32365, 2.33229, 4.84206, 12.7203, 0.0382543, 0.00737813, 0.0544927, 0.996035, 1.22057, 1.19876, 8.36382e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB128 aB wg 8x4 cab4 ks128 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.09557e+06, 462553, 0, 0, 0, 0, 2.17329, 2.8532, 3.94421, 12.162, 0.0422784, 0.00389671, 0.0410451, 0.844249, 1.21956, 1.20074, 1.36525e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB128 sB128 aB wg 4x4x2 kr cab4 ks128 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 128}, {4, 4, 2}, 1, (WGType) 1, 261, 65536, 32768, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.00381e+06, 424665, 113187, 111684, 0, 0, 2.20755, 2.21331, 3.79495, 
13.7684, 0.0456296, -0.0460188, 0.120625, 0.679484, 1.22531, 1.20124, 8.07533e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 aB wg 2x8 cab4 ks128 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.32689e+06, 222623, 0, 0, 0, 0, 2.31002, 2.30138, 3.66802, 10.5821, 0.0515592, -0.0216913, 0.0738923, 0.87394, 1.219, 1.20127, 6.74603e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vIxy"}, "sB64 sB64 aB wg 2x4x4 kr cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab kb l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 4, 4}, 1, (WGType) 1, 263, 49152, 16384, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.08393e+06, 500607, 42364.6, 41193.8, 0, 0, 2.32944, 2.71544, 6.37895, 21.2114, 0.0615537, -0.0454186, 0.139862, 0.925759, 1.21673, 1.20122, 6.12627e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB32 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.07668e+06, 500480, 0, 0, 0, 0, 2.2985, 2.85765, 6.19196, 15.4234, 0.0169503, 0.00108194, 0.0249639, 0.993166, 1.24039, 1.19939, 4.69831e-14}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB64 sB64 
aB wg 4x4 cab3 ks128 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {1, 1, 4}, {false, false, true}}, {'E', 17, {1.26622e+06, 205878, 0, 0, 0, 0, 2.41748, 3.44852, 3.85081, 10.8513, 0.0565124, 0.0196373, 0.0513865, 0.975821, 1.22132, 1.20149, 6.09696e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB64 aB32 aB wg 4x4 cab3 ks128 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 128}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.35303e+06, 235808, 0, 0, 0, 0, 3.04897, 11.8664, 1.44909, 1.44909, 0.203641, 0.052135, 0.168812, 0.965466, 1.22155, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Iy"}, "aS16 sB32 aB wg 2x4x2 kr cb4 ks128 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 16777216}, {8, 16, 128}, {2, 4, 2}, 1, (WGType) 1, 261, 65536, 16384, {1, 1, 4}, {true, false, true}}, {'E', 17, {1.09647e+06, 406349, 125402, 112986, 0, 0, 4.76469, 3.00307, 6.68777, 20.562, 0.0550678, 0.0352072, 0.0473407, 0.991295, 1.21719, 1.20108, 5.11504e-15}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB32 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 32}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, 
{1.24264e+06, 124211, 25865.7, 3392.63, 0, 0, 3.01435, 3.34443, 10.1625, 10.1426, 0.40946, 0.110641, 0.265553, 0.590246, 1.21134, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "vI"}, "aS16x2 aB32 aB wg 2x1x32 kr af vav bo bk0 sys kb l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 32}, {2, 1, 32}, 1, (WGType) 0, 263, 0, 2048, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03023e+06, 570479, 10646.8, 8815.77, 0, 0, 2.93605, 3.56278, 35.057, 37.7768, 0.464873, 0.0651331, 0.429985, 0.718768, 1.20923, 0, 0}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2 aB8x2 aB wg 8x8 cab4 ks16 nse bo sr bk0 sm sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {582554, 1.48982e+06, 0, 0, 0, 0, 3.76758, 3.77334, 6.62494, 17.6039, 0.0363451, 0.0363451, 0, 1, 1.20147, 1.02529, 3.01473e-12}}}, {{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 4, 1}, "xyI"}, "sS32x2 sB32 aB wg 16x2 cb4x2 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {64, 4, 4}, {false, false, true}}, {'E', 17, {1.06789e+06, 533234, 0, 0, 0, 0, 2.49913, 2.42261, 6.47452, 17.5868, 0.00865422, 0.00164569, 0.0106404, 0.852414, 1.37422, 1.18475, 8.41776e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipxy"}, "sB64 sB32 aB wg 8x4 cab3 
ks64 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.0553e+06, 523707, 0, 0, 0, 0, 2.27443, 2.47386, 6.75043, 17.4507, 0.00973723, 0.0173445, 0.00450943, 0.936605, 1.39613, 1.18223, 6.42735e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs di sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 4096, 0, {1, 1, 4}, {false, false, false}}, {'E', 17, {1.01946e+06, 85192.9, 0, 0, 0, 0, 3.69873, 4.09617, 6.42674, 17.041, 0.0424222, 0.0270009, 0.0195008, 0.698122, 1.40723, 1.13886, 7.68905e-13}}}, -{{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB16 aB wg 4x8 cab4 ks16 nse di bo sr bk0 sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.31114e+06, 828815, 0, 0, 0, 0, 4.12868, 4.53677, 6.35113, 17.6714, 0.04015, 0.0278237, 0.024414, 0.810338, 1.20421, 1.02447, 3.99205e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 nse di sb64 bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 4}, {4, 8, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse di bo bk0 sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3546e+06, 315336, 0, 0, 0, 0, 19.529, 12.4603, 5.98766, 15.7596, 0.125914, 0.121288, 0.0101254, 0.873111, 1.32691, 1.13232, 2.47813e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 4x4x2 kr kc8 cab4x2 ks8 nse di hi bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 8}, {4, 4, 2}, 1, (WGType) 1, 261, 40960, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.01037e+06, 330255, 116967, 182860, 0, 0, 17.1106, 16.5232, 4.44267, 11.7098, 0.141417, 0.103027, 0.0331418, 0.58131, 1.36247, 1.11242, 2.61462e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB8 aB wg 4x4x4 kr kc4 cab4 ks8 nse di hi bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 4, 4}, 1, (WGType) 1, 261, 49152, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06097e+06, 556945, 66027.4, 91054.2, 0, 0, 18.5093, 14.0722, 2.66696, 7.68563, 0.176615, 0.0866836, 0.0998873, 0.702223, 1.31234, 1.09234, 7.41931e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aS4x2 aB wg 2x4x4 kr kc4 ca4x2 ks8 nse di hi bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 
16384, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08458e+06, 216717, 84835.8, 36815.4, 0, 0, 18.3579, 12.9505, 1.89546, 7.76547, 0.219793, 0.153769, 0.0577075, 0.933834, 1.27783, 0.949068, 7.31161e-11}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB4 aB wg 2x8x4 kr kc4 cab4x2 ks8 nse di bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 261, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12674e+06, 648871, 55478.5, 61257.5, 0, 0, 19.0863, 11.8242, 3.43093, 10.4549, 0.259657, 0.247568, 0.0568037, 1, 1.22429, 1.06723, 4.08595e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse di bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.31235e+06, 288775, 0, 0, 0, 0, 18.7973, 21.3706, 5.54761, 14.4861, 0.147428, 0.107901, 0.033153, 1, 1.33996, 1.1511, 2.16816e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 cab4x2 ks8 nse di bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.33992e+06, 296845, 0, 0, 0, 0, 20.3993, 13.23, 5.72152, 14.4818, 0.143213, 0.116468, 0.028474, 0.9246, 1.33587, 1.1524, 2.2794e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, 
""}, "aB8x2 aB8x2 aB wg 4x4 kc8 cab4x2 ks8 nse di bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.42251e+06, 151979, 0, 0, 0, 0, 18.9169, 13.1915, 5.54022, 14.3571, 0.153986, 0.0983515, 0.0423043, 0.945168, 1.35424, 1.15241, 2.28656e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8 kc4 cab4x2 ks8 nse di bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3331e+06, 140159, 0, 0, 0, 0, 19.7793, 12.1262, 4.33379, 12.2776, 0.235893, 0.114077, 0.0969446, 0.891475, 1.27279, 1.13859, 7.22824e-13}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8x2 aB wg 4x1 kc4 nse di bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 16384, 16777216}, {8192, 8192, 16777216}, {32, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16919e+06, 27880.4, 0, 0, 0, 0, 11.5008, 22.3297, 0.230151, 0.23026, 1.23566, 0.719548, 0.513589, 0.5, 1.11024, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 4x1x8 kr kc4 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 4}, {4, 1, 8}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16027e+06, 252146, 49096.8, 13045.4, 0, 0, 11.7305, 45.2153, 8.44143, 8.40403, 1.34796, 
0.539253, 1.01482, 0.832486, 1.15634, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 nse di bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1956e+06, 262989, 42928.1, 11139.1, 0, 0, 12.5624, 26.0997, 13.5639, 13.2709, 2.46022, 1.15535, 1.74003, 0.862105, 1.04038, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 nse di bo sb64 bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 263, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20007e+06, 262941, 42185.5, 11144.5, 0, 0, 12.5473, 25.954, 13.4924, 13.1994, 2.45843, 1.17019, 1.73564, 0.861639, 1.04437, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aB4x2 aB8/4x2 aB wg 8x4 kc4 cb4 ks8 nse di bo bk0 sn grf256 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08605e+06, 604168, 0, 0, 0, 0, 29.905, 14.9021, 6.30949, 17.3852, 0.127384, 0.123773, 0.010991, 0.905338, 1.1857, 1.02549, 5.95169e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4/2x2 aB4x2 aB wg 8x4 kc4 nse di sb64 bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 
8192, 16777216}, {32, 32, 4}, {8, 4, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {989969, 519612, 0, 0, 0, 0, 16.7886, 17.9252, 6.45578, 17.4626, 0.12653, 0.124499, 0.0102408, 0.914174, 1.18982, 1.01788, 6.88478e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14372e+06, 268176, 0, 0, 0, 0, 19.0558, 21.4548, 6.11653, 15.9061, 0.125914, 0.1165, 0.0129836, 0.766057, 1.30565, 1.10714, 3.63242e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/2x2 aB8/2x2 aB wg 4x4x4 kr kc8 cab4 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {985735, 579142, 89571.4, 111952, 0, 0, 19.2371, 19.292, 3.04924, 10.2567, 0.134813, 0.127874, 0.0154638, 0.964384, 1.32465, 1.09359, 5.97777e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4 aB4 aB wg 2x4x4 kr kc4 cab4 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 49152, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {965437, 378870, 128663, 136758, 0, 0, 20.0729, 19.6239, 4.64349, 11.2569, 0.159785, 0.104953, 0.0557092, 0.629327, 1.32709, 1.10857, 3.56295e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4 aB4 aB wg 2x2x8 kr kc4 cab4 ks8/4 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 261, 65536, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03793e+06, 442598, 84792.3, 92330.5, 0, 0, 18.8298, 19.2546, 5.24556, 13.5387, 0.221282, 0.145021, 0.0472079, 0.11012, 1.24889, 1.03634, 1.74167e-11}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 cb4 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 32768, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22266e+06, 300480, 71990.6, 29344.4, 0, 0, 17.7499, 19.4931, 3.79453, 10.2325, 0.753585, 0.13462, 0.0863731, 0, 1.00212, 1.03517, -2.40598e-13}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 cb4 ks8 nse di bo bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 16}, 1, (WGType) 1, 263, 32768, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20037e+06, 301140, 72741.6, 29330.7, 0, 0, 17.7375, 19.5246, 3.79028, 10.2894, 0.755373, 0.133988, 0.0869263, 0, 1.00095, 1.00147, -3.84892e-15}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, 
{4, 4, 4}, {true, true, true}}, {'E', 17, {1.14681e+06, 252797, 0, 0, 0, 0, 18.7556, 22.0632, 5.55765, 14.7217, 0.129576, 0.0903504, 0.0430783, 0.608364, 1.37895, 1.18499, 2.72061e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 cab4x2 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1555e+06, 250247, 0, 0, 0, 0, 20.2189, 17.5095, 5.50136, 14.7401, 0.130247, 0.121949, 0.0272943, 0.822585, 1.38372, 1.17114, 2.4229e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14264e+06, 242222, 0, 0, 0, 0, 11.8247, 19.2893, 4.69171, 13.5115, 0.206087, 0.141717, 0.0799846, 1, 1.23922, 1.11159, 2.37456e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x4 kc4 cab4x2 ks8 nse di bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16938e+06, 118796, 0, 0, 0, 0, 12.0084, 17.0436, 4.73424, 13.6962, 0.223144, 0.129865, 0.0961405, 0.885772, 1.23839, 1.03623, 1.12784e-11}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "enpqw"}, "aB8x2 aS4x2 aP wg 4x8 
kc4 ca4 ks8 nse di bk0 sm grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.128279, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 aP wg 4x8 kc4 ca4 ks8 nse di bk0 sm grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.12828, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse di bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3433e+06, 325408, 0, 0, 0, 0, 13.5129, 12.9096, 5.98721, 15.7372, 0.12661, 0.0960571, 0.0304219, 0.583089, 1.34377, 1.11974, 3.39207e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aS4x2 aB wg 2x8x2 kr kc4 ca4x2 ks8 nse di bo bk0 sm sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 8}, {2, 8, 2}, 1, (WGType) 1, 261, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22693e+06, 251321, 210418, 106612, 0, 0, 13.5399, 
13.9651, 4.39951, 13.0611, 0.147223, 0.115872, 0.0300123, 0.916152, 1.36645, 1.0844, 1.44051e-11}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qnpe"}, "aB4 aS4 aP wg 2x4x8 kr kc4 ca4 ks8 nse di bo bk0 sm sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {2, 4, 8}, 1, (WGType) 1, 261, 65536, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06485e+06, 693034, 195987, 231547, 0, 0, 18.6107, 13.8914, 4.94462, 26.3695, 0.161527, 0.132206, 0.0318929, 0.806319, 1.34391, 1.09114, 1.06794e-11}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aS4x2 aB wg 2x4x4 kr kc4 ca4x2 ks8 nse di bo bk0 sm sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.35407e+06, 307675, 87230.9, 35239.5, 0, 0, 13.0052, 12.6903, 2.93583, 6.59897, 0.275248, 0.216252, 0.0489705, 1, 1.24302, 1.11488, 3.02106e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8x4 kr kc4 cab4 ks8 nse di bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 261, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07577e+06, 617108, 56822.5, 58479.9, 0, 0, 11.7402, 11.6903, 3.28451, 10.3784, 0.334271, 0.377107, 0.0165214, 1, 1.1634, 1.05007, 2.92522e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4x2 aB wg 2x8x4 kr kc4 cab4 ks8 nse di bo bk0 
sm sn kb sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 263, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06847e+06, 617445, 58297, 58439, 0, 0, 11.7324, 11.6979, 3.27931, 10.3848, 0.334181, 0.377289, 0.0161202, 1, 1.16193, 1.05447, 2.55089e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 4x8 kc8 cab4x2 ks8 nse di bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.33091e+06, 294791, 0, 0, 0, 0, 12.113, 12.1053, 5.76463, 14.438, 0.174666, 0.153877, 0.0284394, 1, 1.21846, 1.09172, 2.58579e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8 kc4 cab4x2 ks8 nse di bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37172e+06, 143581, 0, 0, 0, 0, 12.0273, 11.939, 4.30531, 12.11, 0.300404, 0.190268, 0.0847494, 0.961414, 1.14371, 1.0774, 1.20649e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8 aB8x2 aB wg 4x1 kc8 nse di bo bk0 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18688e+06, 30123, 0, 0, 0, 0, 11.8183, 1.08352, -3.02547, -3.02137, 3.45176, 1.04334, 0.610591, 0, 1.00207, 0, 
0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB4x2 aB wg 4x1x4 kr kc4 nse di bo bk0 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {4, 1, 4}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.095e+06, 114618, 67456.5, 7026.97, 0, 0, 14.2166, 26.1172, 4.27831, 4.3586, 5.636, 1.92292, 0.879608, 0.833333, 1.00143, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4 aB4x2 aB wg 2x1x16 kr kc4 nse di bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16669e+06, 252491, 41900.2, 11020.1, 0, 0, 15.0751, 15.6212, 13.7121, 12.5973, 6.13369, 4.14404, 0.329035, 0.50991, 1.00145, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aS4 aB4x2 aB wg 2x1x16 kr kc4 nse di bo sb64 bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 263, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.15281e+06, 253100, 56985.6, 10434.6, 0, 0, 15.075, 15.6287, 30.852, 29.2407, 6.13546, 4.08401, 0.367041, 0.496243, 1.00145, 0, 0}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aS4x2 aB8x2 aP wg 8x4 kc4 cb4 ks8 nse di bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, 
{4, 4, 4}, {true, true, true}}, {'E', 17, {981989, 555240, 0, 0, 0, 0, 15.2859, 19.3328, 6.46524, 20.9306, 0.131163, 0.116872, 0.0205996, 0.805001, 1.19768, 1.00218, 8.63883e-12}}}, -{{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB8/4x2 aB wg 8x4 kc4 cb4 ks8 nse di bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {984519, 554553, 0, 0, 0, 0, 15.6433, 19.3034, 6.32057, 17.386, 0.131163, 0.116872, 0.020599, 0.810127, 1.20102, 1.02803, 6.01287e-12}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 16x4 cab4 ks16 af dw vav di hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 64, 16}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {555289, 1.12387e+06, 0, 0, 0, 0, 11.0812, 12.062, 6.31783, 16.9321, 0.033502, 0.033502, 0, 0.924238, 1.20788, 1.20316, -1.03588e-14}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xypIn"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav di hi bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.01712e+06, 501677, 0, 0, 0, 0, 11.2352, 11.0061, 6.00586, 15.9727, 0.0368901, 0.0329381, 0.0158743, 0.872583, 1.26733, 1.18539, 7.97223e-13}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x8 cab4 ks32 af dw vav di hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {538723, 893443, 0, 0, 0, 0, 10.7801, 9.8182, 5.03684, 13.5344, 0.0761836, 0.0761836, 0, 0.79624, 1.20514, 1.20124, -4.36232e-15}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav di hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {554386, 926318, 0, 0, 0, 0, 11.2535, 11.3159, 6.30666, 17.7295, 0.0362105, 0.0362105, 0, 0.820846, 1.20543, 1.20182, -7.27096e-15}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyIn"}, "sB16 sB32 aB wg 8x8 cab4 ks32 af dw vav di hi bk0 sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {880413, 426415, 0, 0, 0, 0, 11.314, 12.5284, 6.07049, 16.1179, 0.0387733, 0.0216213, 0.024149, 0.726888, 1.2321, 1.19468, 2.81285e-13}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x4 cab4 ks16 af dw vav di hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 
24576, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {873052, 300083, 0, 0, 0, 0, 11.3512, 11.8331, 6.13255, 15.5383, 0.0704048, 0.0182551, 0.0658259, 0.497503, 1.20284, 1.20066, 6.06715e-15}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 16x4 cab4 ks16 af dw vav di hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 64, 16}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {550200, 1.13694e+06, 0, 0, 0, 0, 12.19, 11.8291, 6.29315, 16.9118, 0.0357158, 0.0357158, 0, 0.84371, 1.20301, 1.20198, 5.41083e-16}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav di hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {980496, 425866, 0, 0, 0, 0, 11.0307, 11.0614, 6.30278, 16.1363, 0.0345007, 0.0181085, 0.026025, 0.869399, 1.25124, 1.18998, 4.50928e-13}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB32 aB wg 16x4 cab4 ks32 af dw vav di hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {131072, 524288, 16777216}, {8192, 8192, 16777216}, {8, 32, 32}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {546206, 910844, 0, 0, 0, 0, 10.0561, 10.0714, 6.1261, 16.2049, 0.0537128, 0.0537128, 0, 0.921267, 1.20585, 1.20112, -2.2528e-15}}}, -{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB16 aB wg 2x8x2 kr cab4 ks32 af dw vav di hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 65536, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {903008, 291386, 111362, 166961, 0, 0, 10.979, 11.0174, 4.70886, 12.1312, 0.0687579, 0.0438927, 0.045381, 0.911378, 1.20424, 1.20048, 2.76292e-14}}}, -{{'F', "gemm", {"B", "B", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 aB vav di sys grf256 af hi pt wg 4x8 sb256 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, -{{'F', "gemm", {"B", "B", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 aB vav sb256 wg 4x8 di bo pt sys bk0 sr br", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {256}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@72 am32+m32@64 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {901925, 700635, 0, 0, 9.16685e+06, 1.23699e+07, 0.721771, 0.719501, 0.918422, 1.55461, 0.00404125, 
0.00404125, 0, 0.990031, 1.64922, 1.16161, 1.79221e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {887798, 736266, 0, 0, 8.25754e+06, 1.06742e+07, 0.730652, 0.777015, 0.882231, 1.50445, 0.00406892, 0.00406892, 0, 0.972567, 1.60737, 1.13182, 3.10708e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m16@64 am32+m32@72 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {904313, 557140, 0, 0, 6.41761e+06, 7.86432e+06, 0.630762, 0.876288, 0.890116, 1.496, 0.00449959, 0.00449959, 0, 0.908549, 1.99433, 1.14102, 2.53933e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 aB wg 8x2x2 xaf vav di hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.08493e+06, 292142, 0, 0, 7.6546e+06, 9.12589e+06, 0.650233, 1.15949, 0.898431, 1.61747, 0.00551188, 0.000223805, 0.00523812, 0.479798, 1.50717, 1.16902, 1.70322e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 am16x2 aB wg 4x4x2 kr cb4 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 16384, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07727e+06, 5507, -10646.8, 762877, 0, 0, 0.529053, 1.20619, 0.913416, 1.83814, 0.00530696, 0.00530696, 0, 1, 1.45979, 1.04002, 4.167e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {997384, 523666, 0, 0, 0, 0, 0.725909, 1.44913, 0.969352, 1.7371, 0.00684326, 0.00684326, 0, 0.948745, 1.28965, 1.02213, 2.98663e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 aB wg 4x4x2 kr cb3 ks64 xaf st vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.39811e+06, -511740, -162098, 776498, 3.66838e+06, 3.61759e+06, 0.615352, 0.834481, 0.947487, 1.59755, 0.00699936, 0.00120041, 0.00602363, 0.481222, 1.38965, 1.12497, 1.73138e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@48 am32x2+m32@16 aB wg 4x4x2 kr af vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.0551e+06, 178202, 11243.3, 367637, 0, 0, 0.503711, 0.807112, 0.950782, 1.87168, 0.00738357, 0.00738357, 0, 1, 1.30902, 1.0037, 2.17431e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 aB wg 4x8 af vav di hi pt sr br sb256 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {884175, 432893, 0, 0, 0, 0, 0.691705, 0.930636, 0.67952, 1.28623, 0.00834359, 0.00834359, 0, 0.829417, 1.2862, 0.998202, 1.83773e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 aB wg 4x4 cb3x2 ks64 xaf vav di hi pt sr br bk0 sn nb 0x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.20728e+06, 220586, 0, 0, 3.33332e+06, 3.75194e+06, 0.652532, 1.23869, 0.91158, 1.56406, 0.0117214, 0.00109919, 0.0112405, 0.60464, 1.3107, 0.968174, 2.71041e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@40 am32+m32@32 aB wg 2x8x2 kr xaf st vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, 
{1.05877e+06, 219793, -594.893, 277484, 0, 0, 0.525177, 1.26692, 0.792718, 2.22147, 0.0096798, 0.0096798, 0, 0.987109, 1.45813, 0.98519, 3.76481e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 am32+m16@32 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886608, 390320, 0, 0, 0, 0, 0.857697, 1.69524, 0.882366, 1.49272, 0.0152973, 0.0152973, 0, 0.87518, 1.13598, 0.985664, 7.22943e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.32904e+06, -190080, -59958.5, 292338, 3.36691e+06, 2.61734e+06, 0.669227, 0.829673, 0.945076, 1.6481, 0.0148001, 0.00579724, 0.00959387, 0.821019, 1.23241, -0.89576, 6.77612e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3 ks32 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06604e+06, 422653, -3313.33, 140939, 0, 0, 0.706171, 0.923488, 1.55544, 3.37632, 0.0138008, 0.0138008, 0, 0.955024, 1.30322, 0.993942, 1.51997e-12}}}, -{{'F', "gemm", {"B", "B", 
"S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {892820, 387589, 0, 0, 0, 0, 1.37414, 1.37091, 1.57321, 2.49804, 0.0230095, 0.0230095, 0, 0.798198, 1.0699, 0.943911, 8.62443e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 aB wg 2x4x4 kr af vav di hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14845e+06, -147110, -26044.3, 242040, 2.51494e+06, 1.99066e+06, 0.589278, 0.947466, 0.896827, 1.473, 0.0220317, 0.0143856, 0.00994993, 0.785111, 1.09932, 0.536897, 2.83611e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 am16x2+m64@16 aB wg 2x4x4 kr af vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00506e+06, 345733, -956.611, 109569, 0, 0, 0.545862, 1.10699, 2.27023, 4.75849, 0.0196336, 0.0196336, 0, 0.943309, 1.17941, 0.979339, 1.80419e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 aB wg 4x8 af vav di hi pt sr br sb64 bk0 sn grf256 
sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {889098, 373289, 0, 0, 0, 0, 1.28342, 2.80624, 2.15509, 3.83409, 0.037508, 0.037508, 0, 0.905233, 1.05006, -0.205455, 1.1953e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 aB wg 2x4x4 kr xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14345e+06, -137745, -33413, 227017, 2.82624e+06, 1.87433e+06, 0.82029, 0.877149, 1.66039, 2.54748, 0.0298959, 0.00933916, 0.0216032, 0.865203, 1.23765, 0.941145, 3.68181e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m64@16 am32+m64@32 aB wg 2x4x4 kr xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {956963, 344746, 4218.91, 97361.9, 0, 0, 0.820239, 0.969908, 3.18837, 7.05539, 0.0297591, 0.0297591, 0, 0.916253, 1.11397, 0.761398, 3.4084e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 am32 aB wg 2x8 af vav di li nmk pt sr br sb64 bk0 sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, 
true}}, {'E', 17, {1.04885e+06, 107041, 0, 0, 2.47316e+06, 3.29318e+06, 0.654999, 3.81135, 0.843479, 1.3591, 0.038736, 0.023268, 0.0232375, 0.751197, 1.17223, 0.496182, 1.38208e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 xaf vav di hi pt sr br bk0 sn dm grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04424e+06, 973248, 0, 0, 7.84302e+06, 1.21242e+07, 0.953302, 1.18133, 1.01052, 1.70291, 0.00479485, 0.00479485, 0, 0.800073, 1.59939, 1.10436, 6.87712e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys kv afb rr l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05252e+06, 860723, 0, 0, 4.41549e+06, 6.58637e+06, 0.871936, 1.44298, 0.913188, 1.80988, 0.00617766, 0.00617766, 0, 0.994839, 1.55547, 1.04563, 3.62354e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.81493e+06, -570976, -358468, 965905, 3.85024e+06, 4.4073e+06, 0.888133, 0.947947, 0.979375, 1.62192, 0.00765437, 0.00116269, 
0.00661508, 0.746058, 1.42628, 1.00722, 4.56276e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07957e+06, 604291, 6300.71, 377494, 0, 0, 0.936182, 1.0353, 0.966747, 1.93797, 0.0075457, 0.0075457, 0, 0.983368, 1.39707, 0.974702, 4.83491e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 aB wg 4x8 cab3x2 ks64 af vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.50672e+06, 832915, 0, 0, 0, 0, 0.987646, 1.02217, 0.93956, 1.85826, 0.00865513, 0.00865513, 0, 0.919371, 1.38478, 0.970416, 3.20276e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 4x4x2 kr cab4x2 ks32 af vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 49152, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73798e+06, -437907, -318331, 822139, 2.89178e+06, 3.01466e+06, 0.809057, 1.16853, 0.915011, 1.60895, 0.0117213, 0.00153425, 0.0104969, 0.446714, 1.33828, 0.977686, 2.82538e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.6828e+06, -385192, -295917, 726406, 2.7435e+06, 2.62144e+06, 0.822119, 0.992845, 0.985335, 1.93178, 0.0162175, 0.0015617, 0.0148216, 0.580448, 1.2832, 0.963286, 2.7298e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 1x4x8 kr cab3 ks32 xaf vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, 
{524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.64155e+06, -103333, -66679.8, 190802, 3.23584e+06, 2.12992e+06, 0.833578, 0.885824, 0.842945, 1.56023, 0.0285113, 0.00669822, 0.0219822, 0.893832, 1.23518, -0.453662, 6.29633e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "Ip"}, "aB16+m32@32 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19025e+06, -94418.5, -16051.4, 154897, 2.84262e+06, 1.62529e+06, 1.82485, 0.600489, 0.594593, 1.14879, 0.0290928, 0.0261561, 0.0155745, 1, 1.18676, 0.246189, 2.30089e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav di hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 
17, {1.07095e+06, 921756, 0, 0, 5.66559e+06, 9.26515e+06, 0.874939, 1.19488, 1.04455, 1.6478, 0.00468774, 0.00468774, 0, 0.993164, 1.66254, 1.1552, 3.26236e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 160, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 16777216}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, true}}, {'E', 17, {722541, 469395, 0, 0, 0, 0, 0.464722, 19.0441, 0.942118, 2.23237, 0.0530335, 0.0530335, 0, 0.0814462, 1.17896, -0.305624, 2.12632e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879752, 350276, 0, 0, 0, 0, 0.455042, 18.3558, 0.734906, 2.00265, 0.0538244, 0.0538244, 0, 0.0478715, 1.11789, 0.996902, 3.19089e-13}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB64 sB32 aB wg 8x4 cab3 ks64 xaf dw vav bo sr bk0 sm dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.0553e+06, 523707, 0, 0, 0, 0, 2.27443, 2.47386, 6.75043, 17.4507, 0.00973723, 0.0173445, 0.00450943, 0.936605, 1.39613, 1.18223, 6.42735e-13}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyz"}, "sb32 sb32 sb l4 cab1 wg 4x4 cs sr", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 4096, 0, {1, 1, 4}, {false, false, false}}, {'E', 17, {1.01946e+06, 85192.9, 0, 0, 0, 0, 3.69873, 4.09617, 6.42674, 17.041, 0.0424222, 0.0270009, 0.0195008, 0.698122, 1.40723, 1.13886, 7.68905e-13}}}, +{{'E', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8 aB16 aB wg 4x8 cab4 ks16 nse bo sr bk0 sn grf256 dm l4", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.31114e+06, 828815, 0, 0, 0, 0, 4.12868, 4.53677, 6.35113, 17.6714, 0.04015, 0.0278237, 0.024414, 0.810338, 1.20421, 1.02447, 3.99205e-12}}}, +{{'E', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"O", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 
aB4x2 aB wg 4x8 kc4 nse sb64 bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 4}, {4, 8, 1}, 1, (WGType) 0, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse bo bk0 sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3546e+06, 315336, 0, 0, 0, 0, 19.529, 12.4603, 5.98766, 15.7596, 0.125914, 0.121288, 0.0101254, 0.873111, 1.32691, 1.13232, 2.47813e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 4x4x2 kr kc8 cab4x2 ks8 nse hi bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 8}, {4, 4, 2}, 1, (WGType) 1, 261, 40960, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.01037e+06, 330255, 116967, 182860, 0, 0, 17.1106, 16.5232, 4.44267, 11.7098, 0.141417, 0.103027, 0.0331418, 0.58131, 1.36247, 1.11242, 2.61462e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB8 aB wg 4x4x4 kr kc4 cab4 ks8 nse hi bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 4, 4}, 1, (WGType) 1, 261, 49152, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06097e+06, 556945, 66027.4, 91054.2, 0, 0, 18.5093, 14.0722, 2.66696, 7.68563, 0.176615, 0.0866836, 0.0998873, 0.702223, 1.31234, 1.09234, 7.41931e-12}}}, +{{'E', "gemm", {"S", "S", 
"S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aS4x2 aB wg 2x4x4 kr kc4 ca4x2 ks8 nse hi bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08458e+06, 216717, 84835.8, 36815.4, 0, 0, 18.3579, 12.9505, 1.89546, 7.76547, 0.219793, 0.153769, 0.0577075, 0.933834, 1.27783, 0.949068, 7.31161e-11}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aB4 aB wg 2x8x4 kr kc4 cab4x2 ks8 nse bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 261, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12674e+06, 648871, 55478.5, 61257.5, 0, 0, 19.0863, 11.8242, 3.43093, 10.4549, 0.259657, 0.247568, 0.0568037, 1, 1.22429, 1.06723, 4.08595e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.31235e+06, 288775, 0, 0, 0, 0, 18.7973, 21.3706, 5.54761, 14.4861, 0.147428, 0.107901, 0.033153, 1, 1.33996, 1.1511, 2.16816e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 cab4x2 ks8 nse bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 
16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.33992e+06, 296845, 0, 0, 0, 0, 20.3993, 13.23, 5.72152, 14.4818, 0.143213, 0.116468, 0.028474, 0.9246, 1.33587, 1.1524, 2.2794e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2 aB8x2 aB wg 4x4 kc8 cab4x2 ks8 nse bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.42251e+06, 151979, 0, 0, 0, 0, 18.9169, 13.1915, 5.54022, 14.3571, 0.153986, 0.0983515, 0.0423043, 0.945168, 1.35424, 1.15241, 2.28656e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8 kc4 cab4x2 ks8 nse bo bk0 sn sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3331e+06, 140159, 0, 0, 0, 0, 19.7793, 12.1262, 4.33379, 12.2776, 0.235893, 0.114077, 0.0969446, 0.891475, 1.27279, 1.13859, 7.22824e-13}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB8x2 aB wg 4x1 kc4 nse bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 16384, 16777216}, {8192, 8192, 16777216}, {32, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16919e+06, 27880.4, 0, 0, 0, 0, 11.5008, 22.3297, 0.230151, 0.23026, 1.23566, 0.719548, 0.513589, 0.5, 1.11024, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 4x1x8 kr kc4 nse bo bk0 sr", {8, 
(LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 4}, {4, 1, 8}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16027e+06, 252146, 49096.8, 13045.4, 0, 0, 11.7305, 45.2153, 8.44143, 8.40403, 1.34796, 0.539253, 1.01482, 0.832486, 1.15634, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 nse bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1956e+06, 262989, 42928.1, 11139.1, 0, 0, 12.5624, 26.0997, 13.5639, 13.2709, 2.46022, 1.15535, 1.74003, 0.862105, 1.04038, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 nse bo sb64 bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 263, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20007e+06, 262941, 42185.5, 11144.5, 0, 0, 12.5473, 25.954, 13.4924, 13.1994, 2.45843, 1.17019, 1.73564, 0.861639, 1.04437, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aB4x2 aB8/4x2 aB wg 8x4 kc4 cb4 ks8 nse bo bk0 sn grf256 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08605e+06, 604168, 0, 0, 0, 0, 29.905, 14.9021, 6.30949, 17.3852, 0.127384, 0.123773, 0.010991, 0.905338, 1.1857, 1.02549, 5.95169e-12}}}, +{{'E', 
"gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {2048, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4/2x2 aB4x2 aB wg 8x4 kc4 nse sb64 bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 4}, {8, 4, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {989969, 519612, 0, 0, 0, 0, 16.7886, 17.9252, 6.45578, 17.4626, 0.12653, 0.124499, 0.0102408, 0.914174, 1.18982, 1.01788, 6.88478e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14372e+06, 268176, 0, 0, 0, 0, 19.0558, 21.4548, 6.11653, 15.9061, 0.125914, 0.1165, 0.0129836, 0.766057, 1.30565, 1.10714, 3.63242e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/2x2 aB8/2x2 aB wg 4x4x4 kr kc8 cab4 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {4, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {985735, 579142, 89571.4, 111952, 0, 0, 19.2371, 19.292, 3.04924, 10.2567, 0.134813, 0.127874, 0.0154638, 0.964384, 1.32465, 1.09359, 5.97777e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4 aB4 aB wg 2x4x4 kr kc4 cab4 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 4, 4}, 1, 
(WGType) 1, 261, 49152, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {965437, 378870, 128663, 136758, 0, 0, 20.0729, 19.6239, 4.64349, 11.2569, 0.159785, 0.104953, 0.0557092, 0.629327, 1.32709, 1.10857, 3.56295e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4 aB4 aB wg 2x2x8 kr kc4 cab4 ks8/4 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 261, 65536, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03793e+06, 442598, 84792.3, 92330.5, 0, 0, 18.8298, 19.2546, 5.24556, 13.5387, 0.221282, 0.145021, 0.0472079, 0.11012, 1.24889, 1.03634, 1.74167e-11}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 cb4 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 16}, 1, (WGType) 1, 261, 32768, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22266e+06, 300480, 71990.6, 29344.4, 0, 0, 17.7499, 19.4931, 3.79453, 10.2325, 0.753585, 0.13462, 0.0863731, 0, 1.00212, 1.03517, -2.40598e-13}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4 aB wg 2x1x16 kr kc4 cb4 ks8 nse bo bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {2, 1, 16}, 1, (WGType) 1, 263, 32768, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20037e+06, 301140, 72741.6, 29330.7, 0, 0, 17.7375, 19.5246, 3.79028, 10.2894, 0.755373, 0.133988, 0.0869263, 0, 1.00095, 1.00147, -3.84892e-15}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14681e+06, 252797, 0, 0, 0, 0, 18.7556, 22.0632, 5.55765, 14.7217, 0.129576, 0.0903504, 0.0430783, 0.608364, 1.37895, 1.18499, 2.72061e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x8 kc4 cab4x2 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1555e+06, 250247, 0, 0, 0, 0, 20.2189, 17.5095, 5.50136, 14.7401, 0.130247, 0.121949, 0.0272943, 0.822585, 1.38372, 1.17114, 2.4229e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 8x4 kc4 cab4x2 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 12288, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.14264e+06, 242222, 0, 0, 0, 0, 11.8247, 19.2893, 4.69171, 13.5115, 0.206087, 0.141717, 0.0799846, 1, 1.23922, 1.11159, 2.37456e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 4x4 kc4 cab4x2 ks8 nse bo bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 8}, {4, 4, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16938e+06, 118796, 0, 0, 0, 0, 
12.0084, 17.0436, 4.73424, 13.6962, 0.223144, 0.129865, 0.0961405, 0.885772, 1.23839, 1.03623, 1.12784e-11}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "enpqw"}, "aB8x2 aS4x2 aP wg 4x8 kc4 ca4 ks8 nse bk0 sm grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.128279, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 aP wg 4x8 kc4 ca4 ks8 nse bk0 sm grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.12828, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 8x4 kc8 cab4x2 ks8 nse bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.3433e+06, 325408, 0, 0, 0, 0, 13.5129, 12.9096, 5.98721, 15.7372, 0.12661, 0.0960571, 0.0304219, 0.583089, 1.34377, 1.11974, 3.39207e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4 aS4x2 aB wg 2x8x2 kr kc4 ca4x2 ks8 nse bo bk0 sm sr dm", {8, (LoopType) 
0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 8}, {2, 8, 2}, 1, (WGType) 1, 261, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22693e+06, 251321, 210418, 106612, 0, 0, 13.5399, 13.9651, 4.39951, 13.0611, 0.147223, 0.115872, 0.0300123, 0.916152, 1.36645, 1.0844, 1.44051e-11}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qnpe"}, "aB4 aS4 aP wg 2x4x8 kr kc4 ca4 ks8 nse bo bk0 sm sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {2, 4, 8}, 1, (WGType) 1, 261, 65536, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06485e+06, 693034, 195987, 231547, 0, 0, 18.6107, 13.8914, 4.94462, 26.3695, 0.161527, 0.132206, 0.0318929, 0.806319, 1.34391, 1.09114, 1.06794e-11}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aS4x2 aB wg 2x4x4 kr kc4 ca4x2 ks8 nse bo bk0 sm sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.35407e+06, 307675, 87230.9, 35239.5, 0, 0, 13.0052, 12.6903, 2.93583, 6.59897, 0.275248, 0.216252, 0.0489705, 1, 1.24302, 1.11488, 3.02106e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8x4 kr kc4 cab4 ks8 nse bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 261, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07577e+06, 617108, 56822.5, 58479.9, 0, 0, 11.7402, 11.6903, 3.28451, 10.3784, 
0.334271, 0.377107, 0.0165214, 1, 1.1634, 1.05007, 2.92522e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aB4x2 aB4x2 aB wg 2x8x4 kr kc4 cab4 ks8 nse bo bk0 sm sn kb sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 4}, 1, (WGType) 1, 263, 32768, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06847e+06, 617445, 58297, 58439, 0, 0, 11.7324, 11.6979, 3.27931, 10.3848, 0.334181, 0.377289, 0.0161202, 1, 1.16193, 1.05447, 2.55089e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8 aB8 aB wg 4x8 kc8 cab4x2 ks8 nse bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.33091e+06, 294791, 0, 0, 0, 0, 12.113, 12.1053, 5.76463, 14.438, 0.174666, 0.153877, 0.0284394, 1, 1.21846, 1.09172, 2.58579e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB wg 2x8 kc4 cab4x2 ks8 nse bo bk0 sm sn sr dm", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 8}, {2, 8, 1}, 1, (WGType) 1, 257, 8192, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37172e+06, 143581, 0, 0, 0, 0, 12.0273, 11.939, 4.30531, 12.11, 0.300404, 0.190268, 0.0847494, 0.961414, 1.14371, 1.0774, 1.20649e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8 aB8x2 aB wg 4x1 kc8 nse bo bk0 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 
16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 8}, {4, 1, 1}, 1, (WGType) 0, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18688e+06, 30123, 0, 0, 0, 0, 11.8183, 1.08352, -3.02547, -3.02137, 3.45176, 1.04334, 0.610591, 0, 1.00207, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB4x2 aB wg 4x1x4 kr kc4 nse bo bk0 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {4, 1, 4}, 1, (WGType) 0, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.095e+06, 114618, 67456.5, 7026.97, 0, 0, 14.2166, 26.1172, 4.27831, 4.3586, 5.636, 1.92292, 0.879608, 0.833333, 1.00143, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4 aB4x2 aB wg 2x1x16 kr kc4 nse bo sb64 bk0 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 261, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16669e+06, 252491, 41900.2, 11020.1, 0, 0, 15.0751, 15.6212, 13.7121, 12.5973, 6.13369, 4.14404, 0.329035, 0.50991, 1.00145, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aS4 aB4x2 aB wg 2x1x16 kr kc4 nse bo sb64 bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 263, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.15281e+06, 253100, 56985.6, 10434.6, 0, 0, 15.075, 15.6287, 30.852, 29.2407, 6.13546, 4.08401, 0.367041, 0.496243, 1.00145, 0, 0}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, 
"aS4x2 aB8x2 aP wg 8x4 kc4 cb4 ks8 nse bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {981989, 555240, 0, 0, 0, 0, 15.2859, 19.3328, 6.46524, 20.9306, 0.131163, 0.116872, 0.0205996, 0.805001, 1.19768, 1.00218, 8.63883e-12}}}, +{{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB8/4x2 aB wg 8x4 kc4 cb4 ks8 nse bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {984519, 554553, 0, 0, 0, 0, 15.6433, 19.3034, 6.32057, 17.386, 0.131163, 0.116872, 0.020599, 0.810127, 1.20102, 1.02803, 6.01287e-12}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 16x4 cab4 ks16 af dw vav hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 64, 16}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {555289, 1.12387e+06, 0, 0, 0, 0, 11.0812, 12.062, 6.31783, 16.9321, 0.033502, 0.033502, 0, 0.924238, 1.20788, 1.20316, -1.03588e-14}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xypIn"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.01712e+06, 501677, 
0, 0, 0, 0, 11.2352, 11.0061, 6.00586, 15.9727, 0.0368901, 0.0329381, 0.0158743, 0.872583, 1.26733, 1.18539, 7.97223e-13}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {538723, 893443, 0, 0, 0, 0, 10.7801, 9.8182, 5.03684, 13.5344, 0.0761836, 0.0761836, 0, 0.79624, 1.20514, 1.20124, -4.36232e-15}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {554386, 926318, 0, 0, 0, 0, 11.2535, 11.3159, 6.30666, 17.7295, 0.0362105, 0.0362105, 0, 0.820846, 1.20543, 1.20182, -7.27096e-15}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyIn"}, "sB16 sB32 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {880413, 426415, 0, 0, 0, 0, 11.314, 12.5284, 6.07049, 16.1179, 0.0387733, 0.0216213, 0.024149, 0.726888, 1.2321, 1.19468, 2.81285e-13}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x4 
cab4 ks16 af dw vav hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {873052, 300083, 0, 0, 0, 0, 11.3512, 11.8331, 6.13255, 15.5383, 0.0704048, 0.0182551, 0.0658259, 0.497503, 1.20284, 1.20066, 6.06715e-15}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 16x4 cab4 ks16 af dw vav hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 64, 16}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {550200, 1.13694e+06, 0, 0, 0, 0, 12.19, 11.8291, 6.29315, 16.9118, 0.0357158, 0.0357158, 0, 0.84371, 1.20301, 1.20198, 5.41083e-16}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {980496, 425866, 0, 0, 0, 0, 11.0307, 11.0614, 6.30278, 16.1363, 0.0345007, 0.0181085, 0.026025, 0.869399, 1.25124, 1.18998, 4.50928e-13}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB32 aB wg 16x4 cab4 ks32 af dw vav hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {131072, 524288, 16777216}, {8192, 8192, 16777216}, {8, 32, 32}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, 
true}}, {'E', 17, {546206, 910844, 0, 0, 0, 0, 10.0561, 10.0714, 6.1261, 16.2049, 0.0537128, 0.0537128, 0, 0.921267, 1.20585, 1.20112, -2.2528e-15}}}, +{{'E', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB32 sB16 aB wg 2x8x2 kr cab4 ks32 af dw vav hi bk0 sm sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 65536, 32768, {4, 4, 4}, {false, false, true}}, {'E', 17, {903008, 291386, 111362, 166961, 0, 0, 10.979, 11.0174, 4.70886, 12.1312, 0.0687579, 0.0438927, 0.045381, 0.911378, 1.20424, 1.20048, 2.76292e-14}}}, +{{'F', "gemm", {"B", "B", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 aB vav sys grf256 af hi pt wg 4x8 sb256 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, +{{'F', "gemm", {"B", "B", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 aB vav sb256 wg 4x8 bo pt sys bk0 sr br", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {256}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqps"}, "av16+m32@72 am32+m32@64 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 
32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {901925, 700635, 0, 0, 9.16685e+06, 1.23699e+07, 0.721771, 0.719501, 0.918422, 1.55461, 0.00404125, 0.00404125, 0, 0.990031, 1.64922, 1.16161, 1.79221e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {887798, 736266, 0, 0, 8.25754e+06, 1.06742e+07, 0.730652, 0.777015, 0.882231, 1.50445, 0.00406892, 0.00406892, 0, 0.972567, 1.60737, 1.13182, 3.10708e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIps"}, "av64+m16@64 am32+m32@72 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {904313, 557140, 0, 0, 6.41761e+06, 7.86432e+06, 0.630762, 0.876288, 0.890116, 1.496, 0.00449959, 0.00449959, 0, 0.908549, 1.99433, 1.14102, 2.53933e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 aB wg 8x2x2 xaf vav hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.08493e+06, 292142, 0, 0, 7.6546e+06, 9.12589e+06, 0.650233, 1.15949, 0.898431, 
1.61747, 0.00551188, 0.000223805, 0.00523812, 0.479798, 1.50717, 1.16902, 1.70322e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 am16x2 aB wg 4x4x2 kr cb4 ks16 xaf st vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 16384, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07727e+06, 5507, -10646.8, 762877, 0, 0, 0.529053, 1.20619, 0.913416, 1.83814, 0.00530696, 0.00530696, 0, 1, 1.45979, 1.04002, 4.167e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 aB wg 8x4 cb4x2 ks32 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {997384, 523666, 0, 0, 0, 0, 0.725909, 1.44913, 0.969352, 1.7371, 0.00684326, 0.00684326, 0, 0.948745, 1.28965, 1.02213, 2.98663e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 aB wg 4x4x2 kr cb3 ks64 xaf st vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.39811e+06, -511740, -162098, 776498, 3.66838e+06, 3.61759e+06, 0.615352, 0.834481, 0.947487, 1.59755, 0.00699936, 0.00120041, 0.00602363, 0.481222, 1.38965, 1.12497, 1.73138e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@48 am32x2+m32@16 aB wg 4x4x2 kr af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.0551e+06, 178202, 11243.3, 367637, 0, 0, 0.503711, 0.807112, 0.950782, 1.87168, 0.00738357, 0.00738357, 0, 1, 1.30902, 1.0037, 2.17431e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 aB wg 4x8 af vav hi pt sr br sb256 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {884175, 432893, 0, 0, 0, 0, 0.691705, 0.930636, 0.67952, 1.28623, 0.00834359, 0.00834359, 0, 0.829417, 1.2862, 0.998202, 1.83773e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 aB wg 4x4 cb3x2 ks64 xaf vav hi pt sr br bk0 sn nb 0x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.20728e+06, 220586, 0, 0, 3.33332e+06, 3.75194e+06, 0.652532, 1.23869, 0.91158, 1.56406, 0.0117214, 0.00109919, 0.0112405, 0.60464, 1.3107, 0.968174, 2.71041e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@40 am32+m32@32 aB wg 2x8x2 kr xaf st vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, 
{1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05877e+06, 219793, -594.893, 277484, 0, 0, 0.525177, 1.26692, 0.792718, 2.22147, 0.0096798, 0.0096798, 0, 0.987109, 1.45813, 0.98519, 3.76481e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 am32+m16@32 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886608, 390320, 0, 0, 0, 0, 0.857697, 1.69524, 0.882366, 1.49272, 0.0152973, 0.0152973, 0, 0.87518, 1.13598, 0.985664, 7.22943e-13}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.32904e+06, -190080, -59958.5, 292338, 3.36691e+06, 2.61734e+06, 0.669227, 0.829673, 0.945076, 1.6481, 0.0148001, 0.00579724, 0.00959387, 0.821019, 1.23241, -0.89576, 6.77612e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3 ks32 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06604e+06, 422653, -3313.33, 
140939, 0, 0, 0.706171, 0.923488, 1.55544, 3.37632, 0.0138008, 0.0138008, 0, 0.955024, 1.30322, 0.993942, 1.51997e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {892820, 387589, 0, 0, 0, 0, 1.37414, 1.37091, 1.57321, 2.49804, 0.0230095, 0.0230095, 0, 0.798198, 1.0699, 0.943911, 8.62443e-13}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 aB wg 2x4x4 kr af vav hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14845e+06, -147110, -26044.3, 242040, 2.51494e+06, 1.99066e+06, 0.589278, 0.947466, 0.896827, 1.473, 0.0220317, 0.0143856, 0.00994993, 0.785111, 1.09932, 0.536897, 2.83611e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 am16x2+m64@16 aB wg 2x4x4 kr af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00506e+06, 345733, -956.611, 109569, 0, 0, 0.545862, 1.10699, 2.27023, 4.75849, 0.0196336, 0.0196336, 0, 0.943309, 1.17941, 0.979339, 1.80419e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 aB wg 4x8 af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {889098, 373289, 0, 0, 0, 0, 1.28342, 2.80624, 2.15509, 3.83409, 0.037508, 0.037508, 0, 0.905233, 1.05006, -0.205455, 1.1953e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 aB wg 2x4x4 kr xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14345e+06, -137745, -33413, 227017, 2.82624e+06, 1.87433e+06, 0.82029, 0.877149, 1.66039, 2.54748, 0.0298959, 0.00933916, 0.0216032, 0.865203, 1.23765, 0.941145, 3.68181e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m64@16 am32+m64@32 aB wg 2x4x4 kr xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {956963, 344746, 4218.91, 97361.9, 0, 0, 0.820239, 0.969908, 3.18837, 7.05539, 0.0297591, 0.0297591, 0, 0.916253, 1.11397, 0.761398, 3.4084e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 am32 aB wg 2x8 af vav li nmk pt sr br sb64 bk0 sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 
255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04885e+06, 107041, 0, 0, 2.47316e+06, 3.29318e+06, 0.654999, 3.81135, 0.843479, 1.3591, 0.038736, 0.023268, 0.0232375, 0.751197, 1.17223, 0.496182, 1.38208e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 xaf vav hi pt sr br bk0 sn dm grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04424e+06, 973248, 0, 0, 7.84302e+06, 1.21242e+07, 0.953302, 1.18133, 1.01052, 1.70291, 0.00479485, 0.00479485, 0, 0.800073, 1.59939, 1.10436, 6.87712e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys kv afb rr l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05252e+06, 860723, 0, 0, 4.41549e+06, 6.58637e+06, 0.871936, 1.44298, 0.913188, 1.80988, 0.00617766, 0.00617766, 0, 0.994839, 1.55547, 1.04563, 3.62354e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.81493e+06, 
-570976, -358468, 965905, 3.85024e+06, 4.4073e+06, 0.888133, 0.947947, 0.979375, 1.62192, 0.00765437, 0.00116269, 0.00661508, 0.746058, 1.42628, 1.00722, 4.56276e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07957e+06, 604291, 6300.71, 377494, 0, 0, 0.936182, 1.0353, 0.966747, 1.93797, 0.0075457, 0.0075457, 0, 0.983368, 1.39707, 0.974702, 4.83491e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 aB wg 4x8 cab3x2 ks64 af vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.50672e+06, 832915, 0, 0, 0, 0, 0.987646, 1.02217, 0.93956, 1.85826, 0.00865513, 0.00865513, 0, 0.919371, 1.38478, 0.970416, 3.20276e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 4x4x2 kr cab4x2 ks32 af vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 49152, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73798e+06, -437907, -318331, 822139, 2.89178e+06, 3.01466e+06, 0.809057, 1.16853, 0.915011, 1.60895, 0.0117213, 0.00153425, 0.0104969, 0.446714, 1.33828, 0.977686, 2.82538e-12}}}, +{{'F', 
"gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.6828e+06, -385192, -295917, 726406, 2.7435e+06, 2.62144e+06, 0.822119, 0.992845, 0.985335, 1.93178, 0.0162175, 0.0015617, 0.0148216, 0.580448, 1.2832, 0.963286, 2.7298e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 1x4x8 kr cab3 ks32 xaf vav hi pt sr br bk0 sn dm 
grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.64155e+06, -103333, -66679.8, 190802, 3.23584e+06, 2.12992e+06, 0.833578, 0.885824, 0.842945, 1.56023, 0.0285113, 0.00669822, 0.0219822, 0.893832, 1.23518, -0.453662, 6.29633e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "Ip"}, "aB16+m32@32 aB64 aB wg 1x4x8 kr af vav li pt sr br sb64 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19025e+06, -94418.5, -16051.4, 154897, 2.84262e+06, 1.62529e+06, 1.82485, 0.600489, 0.594593, 1.14879, 0.0290928, 0.0261561, 0.0155745, 1, 1.18676, 0.246189, 2.30089e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ips"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 
48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07095e+06, 921756, 0, 0, 5.66559e+06, 9.26515e+06, 0.874939, 1.19488, 1.04455, 1.6478, 0.00468774, 0.00468774, 0, 0.993164, 1.66254, 1.1552, 3.26236e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 160, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 16777216}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, true}}, {'E', 17, {722541, 469395, 0, 0, 0, 0, 0.464722, 19.0441, 0.942118, 2.23237, 0.0530335, 0.0530335, 0, 0.0814462, 1.17896, -0.305624, 2.12632e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879752, 350276, 0, 0, 0, 0, 0.455042, 18.3558, 0.734906, 2.00265, 0.0538244, 0.0538244, 0, 0.0478715, 1.11789, 0.996902, 3.19089e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sr br sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {892682, 628982, 0, 0, 3.18669e+06, 6.10304e+06, 1.35529, 2.79397, 0.639478, 1.11864, 0.062606, 0.062606, 0, 1, 1.00571, 1.0015, -7.07015e-15}}}, {{'F', 
"gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sr br sb32 bk0 sm grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.27607e+06, 520636, 0, 0, 3.24403e+06, 3.02285e+06, 1.78044, 2.04816, 0.439647, 0.861601, 0.0628166, 0.0628166, 0, 1, 1.00681, 0.891913, 8.19094e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03592e+06, 704672, 0, 0, 7.32365e+06, 1.08544e+07, 0.894904, 1.09998, 0.983005, 1.70679, 0.00421397, 0.00421397, 0, 1, 1.62696, 1.15394, 2.31737e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00518e+06, 647557, 0, 0, 5.6361e+06, 8.82278e+06, 0.824036, 1.52239, 1.05594, 1.7661, 0.00544276, 0.00544276, 0, 0.821492, 1.58157, 1.1307, 1.25438e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 
grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01546e+06, 551460, 0, 0, 5.50994e+06, 5.39853e+06, 0.786066, 1.60403, 1.03564, 1.72987, 0.00627267, 0.00627267, 0, 0.945912, 1.4299, 1.12176, 2.119e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 aB wg 4x4x2 kr cb3 ks32 xaf vav di hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34327e+06, -452800, -134815, 707026, 3.90758e+06, 4.02227e+06, 0.83768, 0.940982, 0.959085, 1.58238, 0.00734203, 0.000556818, 0.00668236, 0.749254, 1.38444, 1.02403, 3.33022e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 av16 aB wg 4x4x2 kr cb3x2 ks32 xaf st vav di hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02449e+06, 270457, 23073.2, 372805, 0, 0, 0.830225, 1.05507, 0.948323, 1.96684, 0.00705564, 0.00705564, 0, 0.97946, 1.39837, 1.01868, 2.73855e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 aB wg 4x8 af vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, 
{32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {884302, 427442, 0, 0, 0, 0, 1.10721, 1.21304, 0.68084, 1.29522, 0.00895717, 0.00895717, 0, 0.917109, 1.35302, 0.98768, 3.54927e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 aB wg 4x4x2 kr af vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {970705, -384977, 57542, 601914, 2.51904e+06, 2.77955e+06, 0.70559, 1.27931, 0.712808, 1.42004, 0.0108625, 0.000797412, 0.0106601, 0.747036, 1.45242, 0.89344, 7.5701e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@16 av32 aB wg 4x2x4 kr cb3 ks32 xaf vav di hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07678e+06, 379073, -5781.23, 196518, 0, 0, 0.631348, 1.26482, 1.13283, 2.56929, 0.00963654, 0.00963654, 0, 0.94834, 1.42474, 1.01278, 1.42516e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {882279, 402227, 0, 0, 0, 0, 1.096, 2.25336, 0.87212, 1.48589, 0.0146719, 0.0146719, 0, 0.967546, 
1.27381, 0.958332, 1.758e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 aB wg 4x2x4 kr af vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.108e+06, -159098, -1506.45, 263377, 3.35462e+06, 2.46088e+06, 0.968705, 0.943372, 0.747695, 1.27763, 0.0163497, 0.00954244, 0.00740148, 0.567584, 1.32154, 0.910281, 9.36606e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 av16 aB wg 4x2x4 kr cb3 ks32 xaf st vav di hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05959e+06, 435181, -437.468, 139770, 0, 0, 1.12543, 1.44694, 1.58504, 3.36927, 0.0165758, 0.0165758, 0, 0.994817, 1.15331, 0.994279, 8.64602e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 aB wg 4x2x4 kr xaf vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13295e+06, -131068, -24845.2, 227188, 2.85082e+06, 2.12828e+06, 0.759763, 1.40657, 0.884702, 1.49164, 0.0269937, 0.0148542, 0.0126213, 0.40094, 1.20735, 0.590554, 2.76112e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@16 at16+m64@16 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {998062, 346108, 2022.24, 105839, 0, 0, 1.11771, 1.68117, 2.09896, 4.51037, 0.0255242, 0.0255242, 0, 0.920669, 1.11411, 0.955328, 1.38947e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@48 at16x2+m32@32 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {963808, 336536, 3636.04, 97057.3, 0, 0, 1.849, 1.53405, 3.14892, 7.01495, 0.0368708, 0.0368708, 0, 0.924505, 1.07141, -0.467636, 1.90285e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav di li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {774823, -159604, 298303, 251381, 2.48054e+06, 3.06135e+06, 0.49199, 4.0514, 0.750455, 1.55245, 0.0179351, 0.0189282, 0.0113946, 1, 1.20887, 0.775438, 7.38219e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav di hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04733e+06, 850139, 0, 0, 7.14342e+06, 1.02482e+07, 1.35439, 1.1268, 0.968002, 1.56332, 0.0049908, 0.0049908, 0, 1, 1.5967, 1.09547, 2.65254e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.62752e+06, -1.34654e+06, -263082, 1.70692e+06, 5.27729e+06, 7.17619e+06, 1.02404, 1.43681, 1.0099, 1.67277, 0.00669992, 0.000934616, 0.00604576, 0.813201, 1.39102, 1.03493, 5.39508e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07446e+06, 778165, 0, 0, 0, 0, 1.04453, 2.06434, 0.993405, 1.67185, 0.00809777, 0.00809777, 0, 0.966922, 1.35225, 0.983739, 2.64748e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, 
(WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66893e+06, -477403, -298914, 842863, 4.01408e+06, 4.51379e+06, 1.32266, 0.980356, 0.957919, 1.60267, 0.00874929, 0.000443986, 0.00826981, 0.601212, 1.33967, 0.974327, 4.12771e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06612e+06, 307476, 2776.1, 398222, 0, 0, 1.55249, 0.974457, 0.947265, 1.9437, 0.00891094, 0.00891094, 0, 0.999859, 1.35378, 0.955966, 4.71609e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08109e+06, 676708, 0, 0, 0, 0, 1.46721, 1.49766, 0.911352, 1.83399, 0.0123018, 0.0123018, 0, 0.930092, 1.22199, 0.994137, 1.11339e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53714e+06, -226698, -110555, 381984, 
4.01408e+06, 3.08838e+06, 1.02014, 1.2588, 0.99644, 1.73352, 0.0120789, 0.00866919, 0.00421095, 0.619534, 1.33753, 0.955156, 5.93572e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav di hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08374e+06, 622037, -3355.41, 197096, 0, 0, 1.60574, 1.79337, 1.16087, 2.52873, 0.0170908, 0.0170908, 0, 0.954666, 1.0389, 0.191897, 4.8148e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08079e+06, 618844, 0, 0, 0, 0, 1.25013, 3.49017, 1.047, 2.28189, 0.0251551, 0.0251551, 0, 0.955754, 1.00322, 1.00053, 1.67181e-14}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.59894e+06, -165886, -123352, 328698, 3.3538e+06, 2.68698e+06, 1.29378, 0.935395, 0.914424, 1.63719, 0.0161479, 0.0115282, 0.00624899, 0.60152, 1.23882, 0.978417, 2.1418e-12}}}, 
-{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07496e+06, 610685, -736.443, 135973, 0, 0, 2.10737, 1.71053, 1.57349, 3.40087, 0.0274909, 0.0274909, 0, 0.932509, 1.00516, 0.683467, 2.27771e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46165e+06, -119762, -43325.5, 189248, 3.71917e+06, 2.31834e+06, 0.816729, 1.39844, 0.648548, 1.38561, 0.0254865, 0.0233101, 0.00872739, 0.939824, 1.2192, -0.0765997, 2.67484e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07937e+06, 601693, -1889.77, 111799, 0, 0, 1.25642, 1.87167, 2.39915, 4.97536, 0.0272706, 0.0272706, 0, 0.945164, 1.20671, 0.942435, 2.33159e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, 
"#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav di hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.0392e+06, 894929, 0, 0, 0, 0, 1.62399, 5.20002, 2.09792, 4.09594, 0.063358, 0.063358, 0, 0.952898, 1.11995, 0.921533, 1.32456e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav di li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.4553e+06, -444405, -228349, 795645, 2.85082e+06, 2.41664e+06, 0.798343, 8.5823, 0.957266, 1.78402, 0.0570677, 0.00285529, 0.0528963, 0.711108, 1.00269, 0.593861, 3.05276e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, 
(LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03388e+06, 205982, 0, 0, 2.69353e+06, 2.9311e+06, 5.87694, 0.62071, 0.59549, 1.16678, 0.0302158, 0.00216455, 0.029364, 0.590076, 1.35965, 0.82526, 6.19021e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.12446e+06, -88323.8, -10316.2, 139507, 2.89997e+06, 1.7449e+06, 0.820255, 0.848685, 0.849377, 1.53444, 0.0324984, 0.0285972, 0.0106938, 0.747769, 1.17424, 0.248229, 3.53369e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03592e+06, 704672, 0, 0, 7.32365e+06, 1.08544e+07, 0.894904, 1.09998, 0.983005, 1.70679, 0.00421397, 0.00421397, 0, 1, 1.62696, 1.15394, 2.31737e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIs"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, 
{true, true, true}}, {'E', 17, {1.00518e+06, 647557, 0, 0, 5.6361e+06, 8.82278e+06, 0.824036, 1.52239, 1.05594, 1.7661, 0.00544276, 0.00544276, 0, 0.821492, 1.58157, 1.1307, 1.25438e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01546e+06, 551460, 0, 0, 5.50994e+06, 5.39853e+06, 0.786066, 1.60403, 1.03564, 1.72987, 0.00627267, 0.00627267, 0, 0.945912, 1.4299, 1.12176, 2.119e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 aB wg 4x4x2 kr cb3 ks32 xaf vav hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34327e+06, -452800, -134815, 707026, 3.90758e+06, 4.02227e+06, 0.83768, 0.940982, 0.959085, 1.58238, 0.00734203, 0.000556818, 0.00668236, 0.749254, 1.38444, 1.02403, 3.33022e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 av16 aB wg 4x4x2 kr cb3x2 ks32 xaf st vav hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02449e+06, 270457, 23073.2, 372805, 0, 0, 0.830225, 1.05507, 0.948323, 1.96684, 0.00705564, 0.00705564, 0, 0.97946, 
1.39837, 1.01868, 2.73855e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 aB wg 4x8 af vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {884302, 427442, 0, 0, 0, 0, 1.10721, 1.21304, 0.68084, 1.29522, 0.00895717, 0.00895717, 0, 0.917109, 1.35302, 0.98768, 3.54927e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 aB wg 4x4x2 kr af vav hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {970705, -384977, 57542, 601914, 2.51904e+06, 2.77955e+06, 0.70559, 1.27931, 0.712808, 1.42004, 0.0108625, 0.000797412, 0.0106601, 0.747036, 1.45242, 0.89344, 7.5701e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@16 av32 aB wg 4x2x4 kr cb3 ks32 xaf vav hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07678e+06, 379073, -5781.23, 196518, 0, 0, 0.631348, 1.26482, 1.13283, 2.56929, 0.00963654, 0.00963654, 0, 0.94834, 1.42474, 1.01278, 1.42516e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 aB 
wg 4x8 xaf vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {882279, 402227, 0, 0, 0, 0, 1.096, 2.25336, 0.87212, 1.48589, 0.0146719, 0.0146719, 0, 0.967546, 1.27381, 0.958332, 1.758e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 aB wg 4x2x4 kr af vav hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.108e+06, -159098, -1506.45, 263377, 3.35462e+06, 2.46088e+06, 0.968705, 0.943372, 0.747695, 1.27763, 0.0163497, 0.00954244, 0.00740148, 0.567584, 1.32154, 0.910281, 9.36606e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 av16 aB wg 4x2x4 kr cb3 ks32 xaf st vav hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05959e+06, 435181, -437.468, 139770, 0, 0, 1.12543, 1.44694, 1.58504, 3.36927, 0.0165758, 0.0165758, 0, 0.994817, 1.15331, 0.994279, 8.64602e-13}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 aB wg 4x2x4 kr xaf vav hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, 
{4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13295e+06, -131068, -24845.2, 227188, 2.85082e+06, 2.12828e+06, 0.759763, 1.40657, 0.884702, 1.49164, 0.0269937, 0.0148542, 0.0126213, 0.40094, 1.20735, 0.590554, 2.76112e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@16 at16+m64@16 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {998062, 346108, 2022.24, 105839, 0, 0, 1.11771, 1.68117, 2.09896, 4.51037, 0.0255242, 0.0255242, 0, 0.920669, 1.11411, 0.955328, 1.38947e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@48 at16x2+m32@32 aB wg 2x4x4 kr xaf st vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {963808, 336536, 3636.04, 97057.3, 0, 0, 1.849, 1.53405, 3.14892, 7.01495, 0.0368708, 0.0368708, 0, 0.924505, 1.07141, -0.467636, 1.90285e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {774823, -159604, 298303, 251381, 2.48054e+06, 3.06135e+06, 0.49199, 4.0514, 0.750455, 1.55245, 
0.0179351, 0.0189282, 0.0113946, 1, 1.20887, 0.775438, 7.38219e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04733e+06, 850139, 0, 0, 7.14342e+06, 1.02482e+07, 1.35439, 1.1268, 0.968002, 1.56332, 0.0049908, 0.0049908, 0, 1, 1.5967, 1.09547, 2.65254e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.62752e+06, -1.34654e+06, -263082, 1.70692e+06, 5.27729e+06, 7.17619e+06, 1.02404, 1.43681, 1.0099, 1.67277, 0.00669992, 0.000934616, 0.00604576, 0.813201, 1.39102, 1.03493, 5.39508e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07446e+06, 778165, 0, 0, 0, 0, 1.04453, 2.06434, 0.993405, 1.67185, 0.00809777, 0.00809777, 0, 0.966922, 1.35225, 0.983739, 2.64748e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66893e+06, -477403, -298914, 842863, 4.01408e+06, 4.51379e+06, 1.32266, 0.980356, 0.957919, 1.60267, 0.00874929, 0.000443986, 0.00826981, 0.601212, 1.33967, 0.974327, 4.12771e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06612e+06, 307476, 2776.1, 398222, 0, 0, 1.55249, 0.974457, 0.947265, 1.9437, 0.00891094, 0.00891094, 0, 0.999859, 1.35378, 0.955966, 4.71609e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08109e+06, 676708, 0, 0, 0, 0, 1.46721, 1.49766, 0.911352, 1.83399, 0.0123018, 0.0123018, 0, 0.930092, 1.22199, 0.994137, 1.11339e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 
xaf vav hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53714e+06, -226698, -110555, 381984, 4.01408e+06, 3.08838e+06, 1.02014, 1.2588, 0.99644, 1.73352, 0.0120789, 0.00866919, 0.00421095, 0.619534, 1.33753, 0.955156, 5.93572e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08374e+06, 622037, -3355.41, 197096, 0, 0, 1.60574, 1.79337, 1.16087, 2.52873, 0.0170908, 0.0170908, 0, 0.954666, 1.0389, 0.191897, 4.8148e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08079e+06, 618844, 0, 0, 0, 0, 1.25013, 3.49017, 1.047, 2.28189, 0.0251551, 0.0251551, 0, 0.955754, 1.00322, 1.00053, 1.67181e-14}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 
16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.59894e+06, -165886, -123352, 328698, 3.3538e+06, 2.68698e+06, 1.29378, 0.935395, 0.914424, 1.63719, 0.0161479, 0.0115282, 0.00624899, 0.60152, 1.23882, 0.978417, 2.1418e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07496e+06, 610685, -736.443, 135973, 0, 0, 2.10737, 1.71053, 1.57349, 3.40087, 0.0274909, 0.0274909, 0, 0.932509, 1.00516, 0.683467, 2.27771e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.46165e+06, -119762, -43325.5, 189248, 3.71917e+06, 2.31834e+06, 0.816729, 1.39844, 0.648548, 1.38561, 0.0254865, 0.0233101, 0.00872739, 0.939824, 1.2192, -0.0765997, 2.67484e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, 
{true, true, true}}, {'E', 17, {1.07937e+06, 601693, -1889.77, 111799, 0, 0, 1.25642, 1.87167, 2.39915, 4.97536, 0.0272706, 0.0272706, 0, 0.945164, 1.20671, 0.942435, 2.33159e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.0392e+06, 894929, 0, 0, 0, 0, 1.62399, 5.20002, 2.09792, 4.09594, 0.063358, 0.063358, 0, 0.952898, 1.11995, 0.921533, 1.32456e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.4553e+06, -444405, -228349, 795645, 2.85082e+06, 2.41664e+06, 0.798343, 8.5823, 0.957266, 1.78402, 0.0570677, 0.00285529, 0.0528963, 0.711108, 1.00269, 
0.593861, 3.05276e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03388e+06, 205982, 0, 0, 2.69353e+06, 2.9311e+06, 5.87694, 0.62071, 0.59549, 1.16678, 0.0302158, 0.00216455, 0.029364, 0.590076, 1.35965, 0.82526, 6.19021e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.12446e+06, -88323.8, -10316.2, 139507, 2.89997e+06, 1.7449e+06, 0.820255, 0.848685, 0.849377, 1.53444, 0.0324984, 0.0285972, 0.0106938, 0.747769, 1.17424, 0.248229, 3.53369e-11}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "i"}, "aB4+m8@16 aB8+m8@16 aB wg 2x8 kc8 nse hi pt sr br sb32 bk0 sm grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 8}, {2, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13559e+06, 329144, 0, 0, 3.98131e+06, 1.02646e+07, 1.8086, 0.959722, 0.825272, 1.32219, 0.0625979, -3.40781e-07, 0.0627435, 0.508371, 1.004, 1.00112, -3.56531e-15}}}, {{'F', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, 
"aB8x2+B8@12 aB8+B16@12 aB wg 8x4 kc8 nse hi pt sr br sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {890697, 660867, 0, 0, 3.01466e+06, 6.75021e+06, 2.14558, 4.35317, 0.646535, 1.04541, 0.0629187, 0.0629187, 0, 1, 1.00416, 0.912565, 4.53331e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {891672, 716622, 0, 0, 9.15866e+06, 1.23699e+07, 1.01186, 0.784937, 0.920647, 1.54228, 0.00412477, 0.00412477, 0, 1, 1.49446, 1.10147, 4.62125e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {878563, 640029, 0, 0, 7.94624e+06, 1.00516e+07, 0.792598, 0.747562, 0.882257, 1.4892, 0.00427013, 0.00427013, 0, 0.943254, 1.61821, 1.15931, 2.48177e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m32@32 am32+m64@48 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, 
{524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {874741, 575243, 0, 0, 5.8327e+06, 8.40499e+06, 0.789453, 0.905132, 0.884603, 1.48669, 0.00505136, 0.00505136, 0, 0.980586, 1.4651, 1.12795, 2.59575e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.1314e+06, -878271, -20025.1, 1.16609e+06, 4.7276e+06, 6.41434e+06, 0.491471, 0.826693, 0.899462, 1.61955, 0.00543978, 0.000623448, 0.00517972, 0.502855, 1.47936, 1.13884, 2.33468e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.0377e+06, -70021.4, 17987.1, 761277, 0, 0, 0.51355, 0.842872, 0.925527, 1.68866, 0.00607608, 0.00607608, 0, 1, 1.3891, 1.07829, 1.97293e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.00078e+06, 522321, 0, 
0, 0, 0, 0.682391, 1.20192, 1.01335, 1.54654, 0.0071952, 0.0071952, 0, 0.953151, 1.34794, 1.05729, 2.20536e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03784e+06, -450885, 26588.1, 694717, 3.98295e+06, 3.92397e+06, 0.81751, 0.78514, 0.845116, 1.57539, 0.00816675, 0.000557843, 0.00774396, 0.425496, 1.24902, 0.987854, 2.88769e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@32 am32+m16@32 aB wg 4x4x2 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03655e+06, 184468, 20950.5, 378409, 0, 0, 0.710712, 0.679749, 0.896112, 1.86296, 0.00760741, 0.00760741, 0, 1, 1.31797, 1.02569, 2.04733e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 aB wg 8x4 af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885956, 430908, 0, 0, 0, 0, 0.914642, 0.833252, 0.684654, 1.25859, 0.00868088, 0.00868088, 0, 0.977653, 1.23981, 0.98687, 1.9126e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 4x2x4 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12707e+06, -214752, -7383.8, 333698, 3.94035e+06, 3.08838e+06, 0.484581, 0.643672, 0.911706, 1.72451, 0.00993042, 0.0077074, 0.00295794, 0.390742, 1.38294, 0.925243, 9.30327e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 aB wg 8x4 af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {894564, 394448, 0, 0, 0, 0, 0.740088, 1.32909, 0.850093, 1.46728, 0.0134182, 0.0134182, 0, 0.841095, 1.18393, 0.950667, 1.90558e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 2x2x8 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12909e+06, -145288, -3837.8, 206075, 4.36634e+06, 2.87539e+06, 0.472236, 0.412666, 0.84861, 1.70891, 0.0122108, 0.0128924, 0.00189719, 0.642098, 1.34608, 0.926981, 8.76031e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m32@48 aB wg 4x4x2 kr xaf vav di hi pt sr br 
sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {982924, 237479, 9738.81, 215541, 0, 0, 0.699103, 0.69007, 1.44207, 3.02178, 0.0135621, 0.0135621, 0, 1, 1.29716, 0.969497, 2.36412e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884081, 380764, 0, 0, 0, 0, 1.36831, 1.27261, 1.55496, 2.49777, 0.0278439, 0.0278439, 0, 0.839345, 1.01284, 0.740188, 3.17551e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 4x2x4 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 2, 
4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01144e+06, 342078, -2010.24, 109121, 0, 0, 0.570066, 0.953291, 2.16852, 4.49654, 0.0218051, 0.0218051, 0, 0.914819, 1.12368, 0.962575, 2.39589e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 aB wg 2x2x8 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15274e+06, -78649.5, -7526.38, 127813, 3.10477e+06, 2.06438e+06, 0.435838, 0.461228, 0.861245, 1.55509, 0.0239119, 0.0234452, 0.00542143, 0.84336, 1.27869, 0.990632, 3.60161e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {960630, 347905, 5419.42, 94045.1, 0, 0, 0.805695, 0.777097, 3.17542, 7.16008, 0.0298069, 0.0298069, 0, 0.998823, 1.13902, 0.941603, 2.02756e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav di li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03147e+06, 232150, 0, 0, 2.25853e+06, 2.39288e+06, 0.636799, 3.56799, 1.54923, 2.28375, 
0.0371495, 0.00331608, 0.0414442, 0.906021, 1.00388, 1.00126, 2.00587e-14}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05957e+06, 973070, 0, 0, 7.52845e+06, 1.03301e+07, 0.801208, 0.831268, 0.985204, 1.59017, 0.00445847, 0.00445847, 0, 0.989412, 1.66875, 1.16553, 2.2562e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07221e+06, 859869, 0, 0, 5.8327e+06, 7.83974e+06, 0.766638, 1.10644, 1.03215, 1.48351, 0.00498767, 0.00498767, 0, 0.940961, 1.6724, 1.16768, 2.0947e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {997268, 779168, 0, 0, 4.36634e+06, 5.75078e+06, 0.740136, 1.4932, 1.0067, 1.5422, 0.00624117, 0.00624117, 0, 0.962734, 1.45555, 1.07753, 2.97255e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, 
-1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79219e+06, -531031, -361044, 940628, 4.07142e+06, 4.00589e+06, 0.824906, 0.816235, 1.02815, 1.50776, 0.00733025, 0.00114441, 0.00653515, 0.764373, 1.47826, 1.10061, 2.80374e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00274e+06, 774511, 0, 0, 0, 0, 0.955103, 0.935315, 0.976805, 1.72673, 0.00884678, 0.00884678, 0, 1, 1.29366, 0.922321, 3.85241e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73971e+06, -258868, -158599, 451327, 3.80928e+06, 3.05316e+06, 0.482671, 0.706304, 1.03317, 1.54125, 0.00920456, 0.00608899, 0.00408001, 0.912767, 1.36748, 1.00539, 5.47259e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, 
"at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.11084e+06, 1.12066e+06, -4115.24, 188026, 0, 0, 0.763818, 1.33134, 1.06246, 2.16315, 0.0116126, 0.0116126, 0, 0.839402, 1.31397, 1.00293, 1.95132e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {965920, 744570, 0, 0, 0, 0, 0.791602, 1.36895, 0.948542, 2.16931, 0.0146708, 0.0146708, 0, 0.923434, 1.19189, 0.953727, 1.73617e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88588e+06, -200788, -193135, 423330, 3.44883e+06, 2.47398e+06, 0.572193, 0.527027, 0.933872, 1.59818, 0.0129759, 0.0095749, 0.00498263, 0.687385, 1.38427, 0.929972, 5.58992e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, 
{(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06355e+06, 786133, -174.663, 137254, 0, 0, 0.678583, 0.661981, 1.59904, 3.40955, 0.0149449, 0.0149449, 0, 0.904272, 1.23465, 1.00013, 1.46897e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {994587, 706072, 0, 0, 0, 0, 1.16172, 1.12925, 1.50732, 3.11789, 0.0225454, 0.0225454, 0, 0.932581, 1.112, -1.70688, 2.19465e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63784e+06, -161637, -133730, 320507, 2.49037e+06, 1.72196e+06, 0.553135, 0.77437, 0.93673, 1.73418, 0.0199906, 0.0144326, 0.00833266, 0.7979, 1.32142, 0.974789, 3.83166e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, 
(WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.56172e+06, 526043, -33531.5, 75691, 0, 0, 0.616943, 0.690186, 3.77748, 8.29004, 0.0313301, 0.0313301, 0, 0.830451, 1.08692, 0.965356, 1.18354e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.28894e+06, -131687, -76296.7, 251879, 2.71155e+06, 2.04882e+06, 0.531564, 1.43928, 1.43076, 2.58493, 0.0282574, 0.0178971, 0.0205543, 1, 1.23116, -0.0703731, 1.35882e-11}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19242e+06, -90063.1, -14616.2, 
144232, 2.71974e+06, 1.70476e+06, 1.08584, 0.412189, 0.608623, 1.21556, 0.0200475, 0.0220263, 0.0153697, 1, 1.02408, 0.835925, 1.49508e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqps"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {891672, 716622, 0, 0, 9.15866e+06, 1.23699e+07, 1.01186, 0.784937, 0.920647, 1.54228, 0.00412477, 0.00412477, 0, 1, 1.49446, 1.10147, 4.62125e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {878563, 640029, 0, 0, 7.94624e+06, 1.00516e+07, 0.792598, 0.747562, 0.882257, 1.4892, 0.00427013, 0.00427013, 0, 0.943254, 1.61821, 1.15931, 2.48177e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at16x2+m32@32 am32+m64@48 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {874741, 575243, 0, 0, 5.8327e+06, 8.40499e+06, 0.789453, 0.905132, 0.884603, 1.48669, 0.00505136, 0.00505136, 0, 0.980586, 1.4651, 1.12795, 2.59575e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.1314e+06, -878271, -20025.1, 1.16609e+06, 4.7276e+06, 6.41434e+06, 0.491471, 0.826693, 0.899462, 1.61955, 0.00543978, 0.000623448, 0.00517972, 0.502855, 1.47936, 1.13884, 2.33468e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.0377e+06, -70021.4, 17987.1, 761277, 0, 0, 0.51355, 0.842872, 0.925527, 1.68866, 0.00607608, 0.00607608, 0, 1, 1.3891, 1.07829, 1.97293e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 aB wg 16x2 cb3x2 ks32 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.00078e+06, 522321, 0, 0, 0, 0, 0.682391, 1.20192, 1.01335, 1.54654, 0.0071952, 0.0071952, 0, 0.953151, 1.34794, 1.05729, 2.20536e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav hi pt sr br sb64 
bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03784e+06, -450885, 26588.1, 694717, 3.98295e+06, 3.92397e+06, 0.81751, 0.78514, 0.845116, 1.57539, 0.00816675, 0.000557843, 0.00774396, 0.425496, 1.24902, 0.987854, 2.88769e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@32 am32+m16@32 aB wg 4x4x2 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03655e+06, 184468, 20950.5, 378409, 0, 0, 0.710712, 0.679749, 0.896112, 1.86296, 0.00760741, 0.00760741, 0, 1, 1.31797, 1.02569, 2.04733e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 aB wg 8x4 af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885956, 430908, 0, 0, 0, 0, 0.914642, 0.833252, 0.684654, 1.25859, 0.00868088, 0.00868088, 0, 0.977653, 1.23981, 0.98687, 1.9126e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 4x2x4 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 2, 4}, 1, 
(WGType) 1, 445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12707e+06, -214752, -7383.8, 333698, 3.94035e+06, 3.08838e+06, 0.484581, 0.643672, 0.911706, 1.72451, 0.00993042, 0.0077074, 0.00295794, 0.390742, 1.38294, 0.925243, 9.30327e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 aB wg 8x4 af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {894564, 394448, 0, 0, 0, 0, 0.740088, 1.32909, 0.850093, 1.46728, 0.0134182, 0.0134182, 0, 0.841095, 1.18393, 0.950667, 1.90558e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 2x2x8 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12909e+06, -145288, -3837.8, 206075, 4.36634e+06, 2.87539e+06, 0.472236, 0.412666, 0.84861, 1.70891, 0.0122108, 0.0128924, 0.00189719, 0.642098, 1.34608, 0.926981, 8.76031e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m32@48 aB wg 4x4x2 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {982924, 237479, 9738.81, 215541, 0, 0, 0.699103, 0.69007, 1.44207, 3.02178, 
0.0135621, 0.0135621, 0, 1, 1.29716, 0.969497, 2.36412e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 aB wg 4x8 xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884081, 380764, 0, 0, 0, 0, 1.36831, 1.27261, 1.55496, 2.49777, 0.0278439, 0.0278439, 0, 0.839345, 1.01284, 0.740188, 3.17551e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 4x2x4 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01144e+06, 342078, -2010.24, 109121, 0, 0, 0.570066, 0.953291, 2.16852, 4.49654, 0.0218051, 0.0218051, 0, 0.914819, 1.12368, 0.962575, 2.39589e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 aB wg 2x2x8 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15274e+06, -78649.5, -7526.38, 127813, 3.10477e+06, 2.06438e+06, 0.435838, 0.461228, 0.861245, 1.55509, 0.0239119, 0.0234452, 0.00542143, 0.84336, 1.27869, 0.990632, 3.60161e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 2x4x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {960630, 347905, 5419.42, 94045.1, 0, 0, 0.805695, 0.777097, 3.17542, 7.16008, 0.0298069, 0.0298069, 0, 0.998823, 1.13902, 0.941603, 2.02756e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03147e+06, 232150, 0, 0, 2.25853e+06, 2.39288e+06, 0.636799, 3.56799, 1.54923, 2.28375, 0.0371495, 0.00331608, 0.0414442, 0.906021, 1.00388, 1.00126, 2.00587e-14}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05957e+06, 973070, 0, 0, 7.52845e+06, 1.03301e+07, 0.801208, 0.831268, 0.985204, 1.59017, 0.00445847, 0.00445847, 0, 0.989412, 1.66875, 1.16553, 2.2562e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIps"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07221e+06, 859869, 0, 0, 5.8327e+06, 7.83974e+06, 0.766638, 1.10644, 1.03215, 1.48351, 0.00498767, 0.00498767, 0, 0.940961, 1.6724, 1.16768, 2.0947e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {997268, 779168, 0, 0, 4.36634e+06, 5.75078e+06, 0.740136, 1.4932, 1.0067, 1.5422, 0.00624117, 0.00624117, 0, 0.962734, 1.45555, 1.07753, 2.97255e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 
445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79219e+06, -531031, -361044, 940628, 4.07142e+06, 4.00589e+06, 0.824906, 0.816235, 1.02815, 1.50776, 0.00733025, 0.00114441, 0.00653515, 0.764373, 1.47826, 1.10061, 2.80374e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00274e+06, 774511, 0, 0, 0, 0, 0.955103, 0.935315, 0.976805, 1.72673, 0.00884678, 0.00884678, 0, 1, 1.29366, 0.922321, 3.85241e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.73971e+06, -258868, -158599, 451327, 3.80928e+06, 3.05316e+06, 0.482671, 0.706304, 1.03317, 1.54125, 0.00920456, 0.00608899, 0.00408001, 0.912767, 1.36748, 1.00539, 5.47259e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.11084e+06, 1.12066e+06, -4115.24, 188026, 0, 0, 
0.763818, 1.33134, 1.06246, 2.16315, 0.0116126, 0.0116126, 0, 0.839402, 1.31397, 1.00293, 1.95132e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {965920, 744570, 0, 0, 0, 0, 0.791602, 1.36895, 0.948542, 2.16931, 0.0146708, 0.0146708, 0, 0.923434, 1.19189, 0.953727, 1.73617e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88588e+06, -200788, -193135, 423330, 3.44883e+06, 2.47398e+06, 0.572193, 0.527027, 0.933872, 1.59818, 0.0129759, 0.0095749, 0.00498263, 0.687385, 1.38427, 0.929972, 5.58992e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06355e+06, 786133, -174.663, 137254, 0, 0, 0.678583, 0.661981, 1.59904, 3.40955, 0.0149449, 0.0149449, 0, 0.904272, 1.23465, 1.00013, 1.46897e-12}}}, +{{'F', "gemm", {"B", 
"B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {994587, 706072, 0, 0, 0, 0, 1.16172, 1.12925, 1.50732, 3.11789, 0.0225454, 0.0225454, 0, 0.932581, 1.112, -1.70688, 2.19465e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63784e+06, -161637, -133730, 320507, 2.49037e+06, 1.72196e+06, 0.553135, 0.77437, 0.93673, 1.73418, 0.0199906, 0.0144326, 0.00833266, 0.7979, 1.32142, 0.974789, 3.83166e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, 
"at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.56172e+06, 526043, -33531.5, 75691, 0, 0, 0.616943, 0.690186, 3.77748, 8.29004, 0.0313301, 0.0313301, 0, 0.830451, 1.08692, 0.965356, 1.18354e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.28894e+06, -131687, -76296.7, 251879, 2.71155e+06, 2.04882e+06, 0.531564, 1.43928, 1.43076, 2.58493, 0.0282574, 0.0178971, 0.0205543, 1, 1.23116, -0.0703731, 1.35882e-11}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19242e+06, -90063.1, -14616.2, 144232, 2.71974e+06, 1.70476e+06, 1.08584, 0.412189, 0.608623, 1.21556, 0.0200475, 0.0220263, 0.0153697, 1, 1.02408, 0.835925, 1.49508e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 
225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 
12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m16@12 aS8+m16@12 aB wg 8x4 kc8 nse hi pt sr br sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {874480, 814710, 0, 0, 3.14573e+06, 1.04694e+07, 1.59572, 1.73308, 0.796063, 1.18049, 0.0626559, 0.0626559, 0, 1, 1.00457, 1.00258, -3.94402e-14}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs di sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {705599, 1.02394e+06, 0, 0, 0, 0, 0.719208, 0.662079, 1.08787, 2.05052, 0.00435156, 0.00435156, 0, 0.998842, 1.73144, 1.10326, 2.79532e-12}}}, -{{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.2896e+06, 327092, 0, 0, 0, 0, 1.64753, 1.69722, 1.01172, 1.48812, 0.0145767, 0.000763122, 0.0157325, 0.871871, 1.01157, 1.00431, 1.22757e-13}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs sys grf256 af wg 8x4 bo 
sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {705599, 1.02394e+06, 0, 0, 0, 0, 0.719208, 0.662079, 1.08787, 2.05052, 0.00435156, 0.00435156, 0, 0.998842, 1.73144, 1.10326, 2.79532e-12}}}, +{{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Is"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.2896e+06, 327092, 0, 0, 0, 0, 1.64753, 1.69722, 1.01172, 1.48812, 0.0145767, 0.000763122, 0.0157325, 0.871871, 1.01157, 1.00431, 1.22757e-13}}}, {{'F', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sr sb32 bk0 sm sn grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {888082, 626619, 0, 0, 1.5319e+07, 2.15941e+07, 3.12178, 1.38894, 4.6861, 5.03272, 0.0626843, 0.0626843, 0, 1, 1.00285, 1.00112, -2.5704e-15}}}, -{{'F', "gemm", {"B", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 
4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, -{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, {{'F', "gemm", {"B", 
"F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 
bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, +{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, {{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 
16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 
0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+B8@16 aB nse grf256 wg 4x8 bo pt kc8 sb256 bk0 br sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, true}}, {'W', 1, {1024}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 aS8x2+S8@24 aB wg 4x8 kc8 nse hi pt sb256 bk0 sn grf256 br sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {915642, 488057, 0, 0, 0, 0, 2.62682, 4.67056, 1.01353, 1.76192, 0.0687398, 0.0687398, 0, 0.998364, 1.80644, 1.08579, 3.08664e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32+B16@32 aS16+S8@32 aB wg 4x8 kc16 nse hi pt sb256 bk0 sn grf256 br sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {887309, 574758, 0, 0, 0, 0, 4.77569, 4.82861, 0.536993, 1.65054, 0.0889844, 0.0889844, 0, 1, 1.65309, 1.06232, 1.19911e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am16+B8@16 am/S8x2+S16@8 aB wg 4x8 kc8 nse hi pt sb256 bk0 sn grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 
16777216}, {16, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {874030, 476421, 0, 0, 0, 0, 4.8312, 6.40891, 1.05734, 2.94303, 0.18704, 0.18704, 0, 0.861733, 1.30529, 0.905412, 1.30557e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am16+S16@32 am/S16x2+S32@16 aB wg 4x8 kc16 nse hi pt sb256 bk0 sn grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {131072, 65536, 16777216}, {8192, 8192, 16777216}, {8, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {878381, 502381, 0, 0, 0, 0, 3.51216, 4.32155, 2.1606, 5.68106, 0.153809, 0.153809, 0, 0.992787, 1.53401, 1.00819, 5.99333e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8+B8@32 am16+S32@32 aB wg 2x8 kc8 nse hi pt sb256 bk0 grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.07599e+06, 227353, 0, 0, 0, 0, 2.32632, 10.6671, 1.95612, 9.18304, 0.317695, 0.0293572, 0.283039, 0.966881, 1.48893, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8x2+S32@56 am32x2+S8@32 aB wg 2x8 kc8 nse hi pt sb256 bk0 grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 32}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.07715e+06, 250361, 0, 0, 0, 0, 2.59315, 7.24109, 5.29152, 18.9828, 0.384253, 0.0758976, 0.344607, 0.971505, 1.50646, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS16+S1,8@24 aB wg 2x4x4 kr kc8 nse hi pt br sr sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.18501e+06, -243475, -23032.8, 379112, 3.1703e+06, 0, 1.97161, 2.22356, 1.44309, 2.66453, 0.0699686, 0.0453516, 0.0318017, 0.918119, 1.74896, 1.09407, 3.48966e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@20 aS8+S8@16 aB wg 2x4x4 kr kc8 nse hi pt br sr sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 0}, {32, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 16384, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.24656e+06, -164669, -48002.1, 283386, 2.77709e+06, 0, 1.98606, 2.80806, 1.04589, 2.3437, 0.0784483, 0.0455955, 0.0428373, 0.974247, 1.873, 1.30826, 1.26458e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+B32@8 am/S8+S32@8 aB wg 2x4x4 kr kc8 nse hi pt br sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 0}, {16, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 8192, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.2169e+06, -138561, -51393, 265750, 2.82624e+06, 0, 3.0771, 2.47131, 0.605585, 1.74042, 0.122303, 0.0704165, 0.0648163, 0.869056, 1.56192, 1.18982, 4.27875e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 63, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 am/S32+S8@16 aB wg 8x2x2 kr kc8 nse nmk li pt br sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 225, 
(LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 0}, {32, 8, 32}, {8, 2, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.05699e+06, -428258, 173.365, 680582, 2.47398e+06, 0, 2.18296, 8.55503, 0.998614, 1.95224, 0.116745, 0.0199768, 0.0971294, 0.407718, 1.56937, 1.07565, 2.15851e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8x2+S8@16 am/S16x2+S32@8 aB wg 8x1x8 kr kc8 nse nmk li pt br sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {131072, 65536, 16777216}, {8192, 8192, 0}, {8, 4, 16}, {8, 1, 8}, 1, (WGType) 1, 413, 0, 2048, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.1402e+06, -71205.9, -21903.7, 142433, 2.55017e+06, 0, 1.66009, 9.05773, 1.11937, 2.68092, 0.156119, 0.215721, 0.084158, 0.95334, 1.51236, 1.10163, 1.46309e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, ""}, "am8x2+S8@24 aB16+S16@32 aB wg 8x1x8 kr kc8 nse nmk li pt br sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 0}, {8, 1, 16}, {8, 1, 8}, 1, (WGType) 1, 413, 0, 512, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.16128e+06, -65247.7, -22289, 130631, 2.53133e+06, 0, 1.64855, 28.7631, 2.52926, 8.66628, 0.473162, 0.41267, 0.237065, 0.973421, 1.33231, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@16 am/S8+S1,16@16 aB wg 1x16x2 kr kc8 nse li pt br sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 8}, {1, 16, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {976429, -406266, 45745.6, 629685, 
2.86228e+06, 0, 7.99903, 1.64031, 1.12683, 2.08743, 0.110815, 0.0136219, 0.0976198, 0.772308, 1.44929, 1.07416, 1.34478e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "aB8x2+S16@24 aS16x2+S1,8@16 aS wg 1x8x4 kr kc8 nse li pt sr sb256 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {65536, 262144, 16777216}, {8192, 8192, 0}, {4, 16, 16}, {1, 8, 4}, 1, (WGType) 1, 413, 0, 4096, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.21325e+06, -117438, -61556.4, 217477, 2.2528e+06, 0, 12.0563, 1.81403, 1.31532, 3.89594, 0.568058, 0.26898, 0.145357, 0, 1.19505, 0.748462, 6.62819e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "am32+S32@16 at8+S1,8@16 aS wg 1x4x8 kr kc8 nse li pt sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {8192, 8192, 0}, {1, 16, 32}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 512, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.16219e+06, -57394.2, -25547.5, 110434, 2.43302e+06, 0, 33.3002, 1.74263, 6.19487, 15.3289, 1.83114, 0.655931, 0.435916, 0, 1.20699, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32/16+B32@24 aS8+S8@16 aB wg 1x4x8 kr kc8 nse hi pt br sr sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {32, 4, 32}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 4096, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.18047e+06, -71272.1, -20070.7, 138650, 2.71974e+06, 0, 2.17632, 2.94444, 0.603756, 1.71966, 0.167455, 0.134419, 0.0534319, 0.781359, 1.48835, 1.28544, 1.46591e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS8/4x2+S1,8@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 sn grf256 br sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {879168, 722368, 0, 0, 4.75054e+06, 0, 2.06787, 4.41992, 1.60573, 2.51717, 0.0641037, 0.0641037, 0, 1, 1.60856, 1.11787, 2.05124e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aS16+S1,8@32 aB wg 4x8 kc8 nse hi pt sb256 bk0 sn grf256 br sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {912372, 563212, 0, 0, 3.32595e+06, 0, 2.38855, 3.24194, 1.35317, 2.28502, 0.0657287, 0.0657287, 0, 1, 1.66986, 1.09282, 2.26599e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16/8+B16@24 aS16+S1,8@24 aB wg 4x4x2 kr kc16 nse hi pt br sr sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 413, 0, 65536, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.10777e+06, -506066, -12890.7, 799323, 3.24076e+06, 0, 2.01254, 3.23584, 1.4086, 2.31622, 0.0656034, 0.00300752, 0.0661711, 0.737749, 1.7214, 1.10676, 2.39035e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4+B16@16 aB8/4x2+B16@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 grf256 br sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 8}, {8, 
4, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {861097, 722621, 0, 0, 4.43187e+06, 0, 2.3239, 3.58269, 1.60841, 2.53024, 0.064521, 0.064521, 0, 0.999741, 1.56629, 1.08295, 1.63858e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16/8+B8@24 aB8x2+B8@8 aB wg 4x8 kc8 nse hi pt sb256 bk0 grf256 br sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {871542, 592164, 0, 0, 3.45702e+06, 0, 2.36848, 3.02961, 1.32902, 2.25867, 0.0662883, 0.0662883, 0, 0.994455, 1.63546, 1.12381, 1.69159e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@24 am8x2+B16@16 aB wg 8x4 kc8 nse hi pt br sr sb256 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {880192, 520025, 0, 0, 2.84099e+06, 0, 2.31648, 4.16414, 0.909713, 1.66985, 0.0692239, 0.0692239, 0, 1, 1.71165, 1.07463, 2.09285e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+B16@24 aB32+B8@24 aB wg 4x4x2 kr kc16 nse hi pt br sr sb64 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.11809e+06, -466011, -16563.9, 764493, 2.61325e+06, 0, 2.84324, 3.20443, 0.85062, 1.78271, 0.0761868, 0.00753119, 0.0759547, 0.644822, 1.67986, 1.12972, 2.44869e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", 
"T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB32+S32@8 aB wg 2x2x8 kr kc8 nse hi pt br sr sb256 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 0}, {16, 8, 32}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 4096, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.22853e+06, -82782.2, -23670.7, 148817, 3.12689e+06, 0, 2.3891, 3.33249, 0.600665, 1.68573, 0.163761, 0.082605, 0.0800501, 0.814058, 1.43822, 1.26394, 2.17885e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8x2+S8@16 aB8x2+S8@16 aB wg 8x1x8 kr kc8 nse nmk li pt br sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {131072, 65536, 16777216}, {8192, 8192, 0}, {8, 4, 8}, {8, 1, 8}, 1, (WGType) 1, 413, 0, 2048, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.11999e+06, -72570.5, -14923.8, 138124, 2.42483e+06, 0, 1.78133, 14.7744, 1.09287, 2.73154, 0.489395, 0.247154, 0.0835583, 0, 1.20981, 0.834056, 5.52179e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, ""}, "aB8x2+S16@16 aS8x2+S16@16 aB wg 8x1x4 kr kc8 nse nmk li pt br sr sb256 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 0}, {8, 1, 8}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 512, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.18155e+06, -101841, -56144.7, 192951, 2.19546e+06, 0, 1.84095, 49.5117, 4.87015, 18.3577, 2.19278, 0.799335, 0.551365, 0.333333, 1.22289, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S16@16 am16+S8@24 aS wg 1x8x8 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 
255, (LoopType) 2}, {16384, 131072, 16777216}, {8192, 8192, 0}, {1, 8, 16}, {1, 8, 8}, 1, (WGType) 1, 413, 0, 512, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.10592e+06, -66294.5, -13583.7, 130433, 2.4576e+06, 0, 41.6616, 1.75295, 2.80747, 13.7042, 3.09892, 0.798956, 0.29321, 0, 1.09357, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, ""}, "aB8+S16@8 aB16/8+B32@8 aB wg 2x8x2 kr kc8 nse li pt br sr sb256 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {131072, 524288, 16777216}, {8192, 8192, 0}, {8, 32, 16}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.03036e+06, -481494, 30842.2, 727737, 2.94093e+06, 0, 10.2738, 2.35775, 1.05763, 1.93135, 0.117162, 0.0193778, 0.131382, 0.981071, 1.41432, 1.12958, 6.8823e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aB32+B32@24 aB wg 4x2x4 kr kc8 nse hi pt sr br bk0 grf256 kv afb sb32", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 413, 0, 16384, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.2646e+06, -196909, -51005.6, 337453, 2.80986e+06, 0, 2.27659, 3.26787, 0.897445, 2.02238, 0.102658, 0.061305, 0.034087, 0.0702678, 1.56451, 1.36193, 2.83441e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32/16+B32@16 aB8x2+B8@8 aB wg 2x2x8 kr kc8 nse hi pt sr br bk0 grf256 kv afb sb32", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.23282e+06, -129218, -13345.6, 202286, 3.46522e+06, 0, 2.44022, 
2.28397, 0.985247, 2.22005, 0.137039, 0.0856075, 0.0371964, 0.0656057, 1.44605, 1.31939, 1.07507e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16x2+B8@16 am8x2+B32@24 aB wg 16x1x2 kr kc8 nse hi pt sr br bk0 grf256 kv afb sb32 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.08915e+06, -431335, -5997.05, 700679, 2.9737e+06, 0, 2.21771, 7.76121, 0.980592, 1.76723, 0.139327, 0.00941652, 0.0971052, 0.262078, 1.46315, 1.21861, 3.7446e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@40 aS16+S1,8@40 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm sn grf256 br sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {877644, 599578, 0, 0, 0, 0, 3.15342, 2.59702, 1.3226, 2.25765, 0.0670018, 0.0670018, 0, 1, 1.68766, 1.11233, 2.23688e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@40 am/S8x2+S32@24 aB wg 4x8 kc8 nse hi pt sb256 bk0 sm sn grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {883553, 504171, 0, 0, 0, 0, 3.77786, 2.16465, 0.612725, 1.79563, 0.0848513, 0.0848513, 0, 0.998512, 1.75631, 1.12347, 1.1711e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at32+S32@64 
am/S16+S32@64 aB wg 8x4 kc16 nse hi pt sb256 bk0 sm sn grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {865413, 570645, 0, 0, 0, 0, 2.86633, 4.96099, 1.2902, 3.23687, 0.123001, 0.123001, 0, 0.965442, 1.63164, 1.05986, 1.02296e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16+S32@64 am/S16x2+S32@48 aB wg 4x8 kc16 nse hi pt sb256 bk0 sm grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {131072, 32768, 16777216}, {8192, 8192, 16777216}, {8, 2, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {863028, 510360, 0, 0, 0, 0, 3.9041, 7.2614, 3.96106, 10.675, 0.250578, 0.250578, 0, 1, 1.51731, 1.03361, 3.84937e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16x2+S32@48 am16x2+S32@48 aB wg 2x16 kc16 nse hi pt sb256 bk0 grf256 br sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {881189, 505307, 0, 0, 0, 0, 7.06485, 7.28102, 5.42581, 19.0715, 0.482831, 0.482831, 0, 1, 1.34232, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@24 aS8+S1,8@24 aB nse wg 4x8 bo pt kc8 sm sn sb32 grf256 bk0 br sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {859973, 874478, 0, 0, 5.06266e+06, 0, 
3.29404, 2.37429, 1.4817, 2.42039, 0.0651324, 0.0651324, 0, 1, 1.62478, 1.08017, 2.64218e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@12 at8+S1,8@12 aB wg 4x4x2 kr kc8 nse hi pt br sr sb32 bk0 sm sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 8}, {4, 4, 2}, 1, (WGType) 1, 413, 0, 65536, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.25951e+06, -1.32422e+06, -91876, 1.6803e+06, 4.04111e+06, 0, 3.66458, 2.28958, 1.60107, 2.58557, 0.0653312, 0.00193227, 0.0644636, 0.609814, 1.66452, 1.13819, 2.10824e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS16x2+S1,8@16 at8+S1,8@32 aB wg 4x2x4 kr kc8 nse hi pt br sr sb32 bk0 sm sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 0}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.2613e+06, -288637, -37005, 436502, 3.16211e+06, 0, 2.96943, 2.18687, 1.4616, 2.61866, 0.0832218, 0.0600046, 0.0297515, 0.685298, 1.63098, 1.03251, 4.21843e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at8x2+S1,8@16 am/S8+S16@24 aB wg 8x1x4 kr kc8 nse nmk li pt br sr sb256 bk0 sm grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 4096, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.20635e+06, -115723, -66951.5, 223965, 2.56983e+06, 0, 1.66705, 14.0732, 0.141298, 2.83505, 0.169419, 0.184522, 0.10455, 1, 1.54672, 1.272, 2.05682e-13}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 1, -1}, {1, 1, 1}, ""}, "at8x2+S1,8@16 am8+S16@24 aB wg 8x1x4 kr kc8 nse nmk li pt br sr sb256 bk0 sm grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 16384, 16777216}, {8192, 8192, 0}, {16, 1, 8}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 1024, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.22796e+06, -97461.3, -70391.8, 202283, 2.28884e+06, 0, 1.64601, 50.7867, 1.43536, 8.53536, 0.609792, 0.200389, 0.445621, 0.98454, 1.34486, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, ""}, "at16+S1,16@8 am/S8+S1,8@8 aB wg 1x16x2 kr kc8 nse li pt br sr sb256 bk0 sm sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {1, 16, 2}, 1, (WGType) 1, 413, 0, 32768, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.06061e+06, -422040, -138.053, 673111, 2.44531e+06, 0, 21.8595, 1.74146, 1.02, 1.99062, 0.125827, 0.0373741, 0.118184, 0.973939, 1.46707, 1.1825, 1.18024e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "aB8+S8@8 at8+S1,8@8 aS wg 1x4x8 kr kc8 nse li pt sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {8192, 8192, 0}, {1, 16, 8}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 512, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.13093e+06, -53054.3, -19486.3, 101494, 2.50675e+06, 0, 32.1889, 1.67287, 6.89438, 17.2631, 0.697187, 0.183575, 0.653988, 1, 1.26574, 0, 0}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S1,8@24 am8+S1,8@32 aB wg 8x4 kc8 nse hi pt sr br bk0 sm sn grf256 sb32 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 
255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {890085, 527131, 0, 0, 0, 0, 4.69062, 2.67511, 0.899379, 1.66937, 0.0769989, 0.0769989, 0, 1, 1.69862, 1.05048, 2.36119e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@32 aS16+S1,8@32 aB wg 8x2x2 kr kc8 nse hi pt sr br bk0 sm sn grf256 kv afb sb32", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {8192, 8192, 0}, {16, 32, 16}, {8, 2, 2}, 1, (WGType) 1, 413, 0, 65536, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.12406e+06, -567411, -26695.9, 856136, 3.45784e+06, 0, 1.98785, 3.48264, 1.41613, 2.24603, 0.0703441, 0.00331692, 0.0659792, 0.597622, 1.7232, 1.14925, 3.0157e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S8@16 aS8+S1,8@24 aB wg 8x1x4 kr kc8 nse hi pt sr br bk0 sm sn grf256 kv afb sb32", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {131072, 524288, 16777216}, {8192, 8192, 0}, {8, 32, 8}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 16384, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.21584e+06, -238804, -30942.1, 366268, 2.7689e+06, 0, 1.96496, 3.24266, 0.99871, 1.67489, 0.0963645, 0.0221903, 0.0792587, 0.999858, 1.69956, 1.20732, 1.33481e-11}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@40 am32+S32@32 aB wg 2x4x4 kr kc8 nse hi pt sr br bk0 sm sn grf256 kv afb sb32 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 0}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 8192, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.30275e+06, -140576, -80892.2, 270400, 2.52314e+06, 0, 2.41886, 
1.96183, 0.657308, 1.79007, 0.118095, 0.0805585, 0.0575456, 0.781989, 1.6348, 1.22619, 7.4659e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S16@24 aS16+S16@32 aB wg 4x2x4 kr kc8 nse hi pt sr br bk0 sm sn grf256 kv afb sb32", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {131072, 131072, 16777216}, {8192, 8192, 0}, {8, 8, 16}, {4, 2, 4}, 1, (WGType) 1, 413, 0, 4096, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.2692e+06, -138155, -69946.2, 247166, 2.48218e+06, 0, 2.22535, 2.9076, -0.226323, 2.36118, 0.179437, 0.0955498, 0.0965902, 0.986833, 1.48781, 1.14074, 6.5259e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS16+S8@32 aS8+S1,8@32 aB wg 16x1x2 kr kc8 nse li nmk pt sr br bk0 sm sn grf256 sb32 kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {131072, 262144, 16777216}, {8192, 8192, 0}, {8, 16, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 16384, {8, 8, 8}, {true, true, true}}, {'E', 17, {1.08982e+06, -365235, -57570.3, 563017, 2.44122e+06, 0, 1.97404, 6.45479, 0.729323, 2.0721, 0.109576, 0.0188555, 0.121551, 0.995776, 1.6362, 1.09808, 8.84348e-12}}}, +{{'F', "gemm", {"D", "D", "D"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,8@16 aB8+B8@16 aS nse wg 8x4 bo pt kc8 sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {863507, 843091, 0, 0, 0, 0, 2.99578, 2.42156, 4.42824, 4.74906, 0.0650645, 0.0650645, 0, 0.993213, 1.59693, 1.16972, 1.1178e-11}}}, +{{'F', "gemm", {"F", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 
1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "gemm", {"F", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, {{'F', "gemm", {"F", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, -{{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, 
{524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, -{{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 
0.984218, 1.40315, 0.791237, 5.90572e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "BIp"}, "aB16+m32@48 am32+S32@64 aB wg 4x8 xaf st vav hi pt ca4x2 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {953955, 796880, 0, 0, 6.54705e+06, 1.02728e+07, 0.868475, 0.996885, 0.927331, 1.55842, 0.00423299, 0.00423299, 0, 0.997407, 1.58612, 1.11537, 2.43106e-12}}}, -{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, -{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"F", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, {{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "IB"}, "aB64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr dm", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {1, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.21597e+06, 165524, 28763.4, 9411.33, 0, 0, 0.278476, 0.599765, 0.920572, 5.21577, 0.021695, 0.0325353, 0.0148497, 1, 1.30279, 0.785158, 1.47381e-11}}}, {{'F', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "BIp"}, "aB16+m64@48 am32+m32@64 aB wg 8x4 xaf ca4x2 vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb dm", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 32768, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {986359, 1.14807e+06, 0, 0, 5.29203e+06, 1.04448e+07, 0.958606, 0.919165, 0.917099, 1.4058, 0.00482064, 0.00482064, 0, 0.992145, 1.42432, 1.05623, 3.49665e-12}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, -{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 
0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "gemm", {"F", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"F", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, -{{'F', "gemm", {"H", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, -{{'F', "gemm", {"H", "F", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, 
(LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 
8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, -{{'F', "gemm", {"H", "H", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 aB vav di sys grf256 af hi pt wg 4x8 sb256 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, -{{'F', "gemm", {"H", "H", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 aB vav sb256 wg 4x8 di bo pt sys bk0 sr br", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {256}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@72 am32+m32@64 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {903365, 697556, 0, 0, 8.2903e+06, 1.21651e+07, 0.724506, 0.722081, 0.92287, 1.55416, 0.00402055, 0.00402055, 0, 0.997691, 1.6726, 1.18622, 5.18793e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m16@64 am32+m32@72 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {876646, 564122, 0, 0, 6.5151e+06, 7.83974e+06, 0.629669, 0.87362, 0.885543, 1.48097, 0.00440774, 0.00440774, 0, 1, 1.66234, 1.24996, 2.85794e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 aB wg 8x2x2 xaf vav di hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, 
{1.09587e+06, 290213, 0, 0, 7.65133e+06, 9.1177e+06, 0.649959, 1.16004, 0.89878, 1.61737, 0.0055749, -0.00026212, 0.00571368, 0.41686, 1.92252, 1.24755, 2.35066e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 am16x2 aB wg 4x4x2 kr cb4 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 16384, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02495e+06, 13797.9, 15430.1, 758509, 0, 0, 0.535333, 1.20812, 0.912657, 1.84068, 0.00529983, 0.00529983, 0, 1, 1.60581, 1.15873, 3.51036e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00706e+06, 522382, 0, 0, 0, 0, 0.725659, 1.44632, 0.970408, 1.74134, 0.0067111, 0.0067111, 0, 0.90349, 1.42986, 1.13348, 2.91269e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 aB wg 4x4x2 kr cb3 ks64 xaf st vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.39136e+06, -511133, -159173, 776441, 3.49225e+06, 3.79699e+06, 0.616898, 0.829845, 0.950983, 1.59577, 0.00679651, 0.000143485, 0.00679319, 0.412801, 1.47949, 1.21242, 
2.01548e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@48 am32x2+m32@16 aB wg 4x4x2 kr af vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06539e+06, 179591, 2980.61, 366479, 0, 0, 0.495831, 0.80952, 0.955144, 1.86869, 0.00743227, 0.00743227, 0, 1, 1.41657, 1.02535, 4.14899e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 aB wg 4x8 af vav di hi pt sr br sb256 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {872440, 433979, 0, 0, 0, 0, 0.692755, 0.929392, 0.682568, 1.28977, 0.00829318, 0.00829318, 0, 0.933146, 1.44966, 1.06633, 2.17433e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 aB wg 4x4 cb3x2 ks64 xaf vav di hi pt sr br bk0 sn nb 0x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.202e+06, 220946, 0, 0, 3.22519e+06, 3.72408e+06, 0.649739, 1.23543, 0.910187, 1.56677, 0.0118941, 0.00175498, 0.01085, 0.587028, 1.3097, 0.977204, 4.11281e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@40 am32+m32@32 aB wg 2x8x2 kr xaf 
st vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06324e+06, 220443, 364.907, 276934, 0, 0, 0.524524, 1.25881, 0.793843, 2.21167, 0.00974309, 0.00974309, 0, 0.984682, 1.55809, 1.03396, 4.08729e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 am32+m16@32 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883175, 391500, 0, 0, 0, 0, 0.858496, 1.69057, 0.878366, 1.49214, 0.0153564, 0.0153564, 0, 0.862765, 1.22512, 0.990538, 1.20433e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3 ks32 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, 
{32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07984e+06, 421819, -3423.5, 140776, 0, 0, 0.704327, 0.923358, 1.5625, 3.37748, 0.0138366, 0.0138366, 0, 0.98715, 1.39019, 0.984829, 3.35716e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {888453, 388680, 0, 0, 0, 0, 1.36981, 1.36706, 1.57389, 2.50026, 0.0230892, 0.0230892, 0, 0.779666, 1.13216, 0.970824, 1.11098e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 aB wg 2x4x4 kr af vav di hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.11866e+06, -145727, -15120.1, 241492, 2.46743e+06, 1.84648e+06, 0.59095, 0.943411, 0.892557, 1.46629, 0.0222611, 0.0141188, 0.0101768, 0.737216, 1.20804, 0.716031, 2.22642e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 am16x2+m64@16 aB wg 2x4x4 kr af vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01077e+06, 345281, -849.568, 109530, 0, 0, 0.536426, 1.11, 2.2752, 4.75922, 
0.0199271, 0.0199271, 0, 0.901814, 1.25549, 0.986701, 2.34156e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 aB wg 4x8 af vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {891563, 373077, 0, 0, 0, 0, 1.28395, 2.80375, 2.15367, 3.8278, 0.0375084, 0.0375084, 0, 0.91257, 1.09932, -0.409383, 1.56754e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 aB wg 2x4x4 kr xaf vav di hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.1218e+06, -136578, -30749.5, 226811, 2.85082e+06, 1.87597e+06, 0.817567, 0.877924, 1.64935, 2.53899, 0.0298413, 0.00967317, 0.0214053, 0.870582, 1.28882, 0.938989, 4.84082e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m64@16 am32+m64@32 aB wg 2x4x4 kr xaf vav di hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {965058, 344296, 950.182, 97490.4, 0, 0, 0.824255, 0.96663, 3.18959, 7.04116, 0.0300276, 0.0300276, 0, 0.888446, 1.17944, 0.622709, 5.98811e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, 
"ABI"}, "av16+m32@16 am32 aB wg 2x8 af vav di li nmk pt sr br sb64 bk0 sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04087e+06, 107650, 0, 0, 2.52723e+06, 3.24731e+06, 0.657844, 3.81434, 0.838331, 1.35005, 0.038394, 0.0231761, 0.0232745, 0.770273, 1.2243, 0.973306, 4.56074e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 xaf vav di hi pt sr br bk0 sn dm grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04604e+06, 973086, 0, 0, 7.53336e+06, 1.18866e+07, 0.951056, 1.1825, 1.0066, 1.69808, 0.00473475, 0.00473475, 0, 0.835362, 2.23999, 1.37476, 2.87047e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys kv afb rr l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.04992e+06, 862089, 0, 0, 4.41549e+06, 6.61914e+06, 0.869043, 1.44302, 0.910417, 1.82393, 0.00618576, 0.00618576, 0, 0.990298, 1.63647, 1.07524, 5.899e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, 
{524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79824e+06, -569026, -348921, 964550, 3.97312e+06, 4.26803e+06, 0.900273, 0.952519, 0.97609, 1.62415, 0.0075791, 0.00139129, 0.00649277, 0.622224, 1.5355, 1.13493, 2.82312e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07259e+06, 606160, 8708.3, 376521, 0, 0, 0.935803, 1.03785, 0.967359, 1.94136, 0.00756671, 0.00756671, 0, 0.887857, 1.52455, 1.05985, 4.0743e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 aB wg 4x8 cab3x2 ks64 af vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.71378e+06, 809360, 0, 0, 0, 0, 0.986084, 1.01074, 0.832813, 1.74844, 0.00859306, 0.00859306, 0, 0.738196, 1.53185, 1.01022, 3.30219e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 4x4x2 kr cab4x2 ks32 af vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 49152, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.74298e+06, -438399, 
-331091, 823020, 2.83443e+06, 3.02285e+06, 0.827087, 1.16983, 0.912475, 1.60769, 0.0117213, 0.00142707, 0.0105272, 0.504681, 1.41325, 0.945808, 5.78409e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69805e+06, -386506, -310168, 727551, 2.85082e+06, 2.46579e+06, 0.823076, 0.993902, 0.988626, 1.9249, 0.0161014, 0.00259962, 0.0143172, 0.708518, 1.38784, 0.977495, 3.17493e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, -{{'F', 
"gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 1x4x8 kr cab3 ks32 xaf vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63678e+06, -103227, -63581.8, 190716, 3.1785e+06, 2.12173e+06, 0.894659, 0.884893, 0.842745, 1.56296, 0.0283829, 0.00669699, 0.0220137, 0.907502, 1.26828, 0.579024, 2.5883e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.31212e+06, 162444, 0, 0, 2.20365e+06, 0, 0.717898, 4.15784, 0.784983, 1.4051, 0.0339977, 0.013915, 0.0254973, 0.96039, 1.47669, 1.00421, 5.83703e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "Ip"}, "aB16+m32@32 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17788e+06, -94434.5, -16004.3, 154932, 2.83443e+06, 1.63758e+06, 1.82204, 0.60134, 0.592597, 1.15461, 0.0290724, 0.0265558, 0.015314, 1, 1.23184, 0.449695, 1.93026e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32 aB32 aB wg 8x4 cab3 
ks32 af vav di hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 160, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 16777216}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, true}}, {'E', 17, {715095, 469673, 0, 0, 0, 0, 0.454681, 19.058, 0.94253, 2.23273, 0.0531575, 0.0531575, 0, 0.0832933, 1.2383, -1.03685, 3.66291e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 
4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "H", "S"}, {"A2#16,16", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16+B16@48 aB16+B16@48 aB vav sys grf256 af hi pt wg 4x8 sb256 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, +{{'F', "gemm", {"H", "H", "S"}, {"A2", "B16", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2+B16@80 aB16x2+B16@80 aB vav sb256 wg 4x8 bo pt sys bk0 sr br", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {64, 128, 4}, {true, true, true}}, {'W', 1, {256}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqps"}, "av16+m32@72 am32+m32@64 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {903365, 697556, 0, 0, 8.2903e+06, 1.21651e+07, 0.724506, 0.722081, 0.92287, 1.55416, 0.00402055, 0.00402055, 0, 0.997691, 1.6726, 1.18622, 5.18793e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 
16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIps"}, "av64+m16@64 am32+m32@72 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {876646, 564122, 0, 0, 6.5151e+06, 7.83974e+06, 0.629669, 0.87362, 0.885543, 1.48097, 0.00440774, 0.00440774, 0, 1, 1.66234, 1.24996, 2.85794e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "av64+m16@64 am16+m16@48 aB wg 8x2x2 xaf vav hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 2, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.09587e+06, 290213, 0, 0, 7.65133e+06, 9.1177e+06, 0.649959, 1.16004, 0.89878, 1.61737, 0.0055749, -0.00026212, 0.00571368, 0.41686, 1.92252, 1.24755, 2.35066e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 am16x2 aB wg 4x4x2 kr cb4 ks16 xaf st vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 16384, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02495e+06, 13797.9, 15430.1, 758509, 0, 0, 
0.535333, 1.20812, 0.912657, 1.84068, 0.00529983, 0.00529983, 0, 1, 1.60581, 1.15873, 3.51036e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am16 aB wg 8x4 cb4x2 ks32 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00706e+06, 522382, 0, 0, 0, 0, 0.725659, 1.44632, 0.970408, 1.74134, 0.0067111, 0.0067111, 0, 0.90349, 1.42986, 1.13348, 2.91269e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av64+m32@72 am64 aB wg 4x4x2 kr cb3 ks64 xaf st vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.39136e+06, -511133, -159173, 776441, 3.49225e+06, 3.79699e+06, 0.616898, 0.829845, 0.950983, 1.59577, 0.00679651, 0.000143485, 0.00679319, 0.412801, 1.47949, 1.21242, 2.01548e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@48 am32x2+m32@16 aB wg 4x4x2 kr af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06539e+06, 179591, 2980.61, 366479, 0, 0, 0.495831, 0.80952, 0.955144, 1.86869, 0.00743227, 0.00743227, 0, 1, 1.41657, 1.02535, 4.14899e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m64@64 am32+m64@64 aB wg 4x8 af vav hi pt sr br sb256 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {872440, 433979, 0, 0, 0, 0, 0.692755, 0.929392, 0.682568, 1.28977, 0.00829318, 0.00829318, 0, 0.933146, 1.44966, 1.06633, 2.17433e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am16x2 aB wg 4x4 cb3x2 ks64 xaf vav hi pt sr br bk0 sn nb 0x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.202e+06, 220946, 0, 0, 3.22519e+06, 3.72408e+06, 0.649739, 1.23543, 0.910187, 1.56677, 0.0118941, 0.00175498, 0.01085, 0.587028, 1.3097, 0.977204, 4.11281e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@40 am32+m32@32 aB wg 2x8x2 kr xaf st vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06324e+06, 220443, 364.907, 276934, 0, 0, 0.524524, 1.25881, 0.793843, 2.21167, 0.00974309, 0.00974309, 0, 0.984682, 1.55809, 1.03396, 4.08729e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 am32+m16@32 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, 
(LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883175, 391500, 0, 0, 0, 0, 0.858496, 1.69057, 0.878366, 1.49214, 0.0153564, 0.0153564, 0, 0.862765, 1.22512, 0.990538, 1.20433e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3 ks32 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.07984e+06, 421819, -3423.5, 140776, 0, 0, 0.704327, 0.923358, 1.5625, 3.37748, 0.0138366, 0.0138366, 0, 0.98715, 1.39019, 0.984829, 3.35716e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@64 am32x2+m64@32 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {888453, 388680, 0, 0, 
0, 0, 1.36981, 1.36706, 1.57389, 2.50026, 0.0230892, 0.0230892, 0, 0.779666, 1.13216, 0.970824, 1.11098e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@16 am64+m16@32 aB wg 2x4x4 kr af vav hi pt sr br sb64 bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.11866e+06, -145727, -15120.1, 241492, 2.46743e+06, 1.84648e+06, 0.59095, 0.943411, 0.892557, 1.46629, 0.0222611, 0.0141188, 0.0101768, 0.737216, 1.20804, 0.716031, 2.22642e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 am16x2+m64@16 aB wg 2x4x4 kr af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01077e+06, 345281, -849.568, 109530, 0, 0, 0.536426, 1.11, 2.2752, 4.75922, 0.0199271, 0.0199271, 0, 0.901814, 1.25549, 0.986701, 2.34156e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 am32+m64@64 aB wg 4x8 af vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {891563, 373077, 0, 0, 0, 0, 1.28395, 2.80375, 2.15367, 3.8278, 0.0375084, 0.0375084, 0, 0.91257, 1.09932, -0.409383, 1.56754e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@32 am32+m32@32 aB wg 2x4x4 kr xaf vav hi pt sr br sb64 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.1218e+06, -136578, -30749.5, 226811, 2.85082e+06, 1.87597e+06, 0.817567, 0.877924, 1.64935, 2.53899, 0.0298413, 0.00967317, 0.0214053, 0.870582, 1.28882, 0.938989, 4.84082e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m64@16 am32+m64@32 aB wg 2x4x4 kr xaf vav hi pt sr br sb64 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {965058, 344296, 950.182, 97490.4, 0, 0, 0.824255, 0.96663, 3.18959, 7.04116, 0.0300276, 0.0300276, 0, 0.888446, 1.17944, 0.622709, 5.98811e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 am32 aB wg 2x8 af vav li nmk pt sr br sb64 bk0 sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04087e+06, 107650, 0, 0, 2.52723e+06, 3.24731e+06, 0.657844, 3.81434, 0.838331, 1.35005, 0.038394, 0.0231761, 0.0232745, 0.770273, 1.2243, 0.973306, 4.56074e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 xaf vav hi pt sr br bk0 sn dm grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04604e+06, 973086, 0, 0, 7.53336e+06, 1.18866e+07, 0.951056, 1.1825, 1.0066, 1.69808, 0.00473475, 0.00473475, 0, 0.835362, 2.23999, 1.37476, 2.87047e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32 aB wg 4x8 cab4 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys kv afb rr l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.04992e+06, 862089, 0, 0, 4.41549e+06, 6.61914e+06, 0.869043, 1.44302, 0.910417, 1.82393, 0.00618576, 0.00618576, 0, 0.990298, 1.63647, 1.07524, 5.899e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.79824e+06, -569026, -348921, 964550, 3.97312e+06, 4.26803e+06, 0.900273, 0.952519, 0.97609, 1.62415, 0.0075791, 0.00139129, 0.00649277, 0.622224, 1.5355, 1.13493, 2.82312e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {2, 2, 
4}, {true, true, true}}, {'E', 17, {1.07259e+06, 606160, 8708.3, 376521, 0, 0, 0.935803, 1.03785, 0.967359, 1.94136, 0.00756671, 0.00756671, 0, 0.887857, 1.52455, 1.05985, 4.0743e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB64 aB wg 4x8 cab3x2 ks64 af vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.71378e+06, 809360, 0, 0, 0, 0, 0.986084, 1.01074, 0.832813, 1.74844, 0.00859306, 0.00859306, 0, 0.738196, 1.53185, 1.01022, 3.30219e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 4x4x2 kr cab4x2 ks32 af vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 49152, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.74298e+06, -438399, -331091, 823020, 2.83443e+06, 3.02285e+06, 0.827087, 1.16983, 0.912475, 1.60769, 0.0117213, 0.00142707, 0.0105272, 0.504681, 1.41325, 0.945808, 5.78409e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, 
+{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69805e+06, -386506, -310168, 727551, 2.85082e+06, 2.46579e+06, 0.823076, 0.993902, 0.988626, 1.9249, 0.0161014, 0.00259962, 0.0143172, 0.708518, 1.38784, 0.977495, 3.17493e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB32 aB wg 1x4x8 kr cab3 ks32 xaf vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63678e+06, -103227, -63581.8, 190716, 3.1785e+06, 2.12173e+06, 0.894659, 0.884893, 0.842745, 1.56296, 0.0283829, 0.00669699, 0.0220137, 0.907502, 1.26828, 0.579024, 2.5883e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "I"}, 
"aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.31212e+06, 162444, 0, 0, 2.20365e+06, 0, 0.717898, 4.15784, 0.784983, 1.4051, 0.0339977, 0.013915, 0.0254973, 0.96039, 1.47669, 1.00421, 5.83703e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "Ip"}, "aB16+m32@32 aB64 aB wg 1x4x8 kr af vav li pt sr br sb64 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17788e+06, -94434.5, -16004.3, 154932, 2.83443e+06, 1.63758e+06, 1.82204, 0.60134, 0.592597, 1.15461, 0.0290724, 0.0265558, 0.015314, 1, 1.23184, 0.449695, 1.93026e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ips"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {11000, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {64, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4", {16, (LoopType) 255, 128, {(LoopType) 160, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, 
{1048576, 65536, 16777216}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 257, 0, 0, {64, 8, 4}, {true, true, true}}, {'E', 17, {715095, 469673, 0, 0, 0, 0, 0.454681, 19.058, 0.94253, 2.23273, 0.0531575, 0.0531575, 0, 0.0832933, 1.2383, -1.03685, 3.66291e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "iv"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {882414, 629829, 0, 0, 2.86065e+06, 5.90643e+06, 1.12485, 2.37649, 0.641517, 1.11886, 0.0675119, 0.0675119, 0, 1, 1.00282, 1.00087, -2.48723e-15}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8+m16@20 aS8x2+m16@20 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {899247, 559339, 0, 0, 3.09248e+06, 3.03104e+06, 1.69164, 2.03658, 0.425236, 0.833504, 0.0686503, 0.0686503, 0, 1, 1.0052, 0.967023, 2.81088e-13}}}, 
-{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 aB wg 4x4x2 kr cb3 
ks32 xaf vav di hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34547e+06, -453771, -133582, 707431, 3.85843e+06, 4.096e+06, 0.838397, 0.942262, 0.957325, 1.57766, 0.00740113, 0.0010301, 0.00648511, 0.743417, 1.52474, 1.0944, 4.90834e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 av16 aB wg 4x4x2 kr cb3x2 ks32 xaf st vav di hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.0521e+06, 268865, 3540.64, 373906, 0, 0, 0.826581, 1.05467, 0.951328, 1.96908, 0.00706831, 0.00706831, 0, 1, 1.51882, 1.06722, 4.31716e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 aB wg 4x8 af vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883225, 427290, 0, 0, 0, 0, 1.11407, 1.21203, 0.677771, 1.29342, 0.00889462, 0.00889462, 0, 0.963151, 1.53208, 1.10128, 2.13999e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 aB wg 4x4x2 kr af vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, 
{32, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {979562, -385642, 51711.6, 602420, 2.47235e+06, 2.84426e+06, 0.69937, 1.28105, 0.710825, 1.4275, 0.0108655, 0.00130873, 0.010468, 0.846726, 1.54637, 0.967193, 7.32783e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@16 av32 aB wg 4x2x4 kr cb3 ks32 xaf vav di hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06276e+06, 380058, -538.669, 196123, 0, 0, 0.631592, 1.26582, 1.12427, 2.56131, 0.00965567, 0.00965567, 0, 0.98502, 1.40091, 0.996067, 3.99183e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 aB wg 4x8 xaf vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {894456, 401519, 0, 0, 0, 0, 1.09663, 2.26086, 0.873233, 1.4831, 0.0147437, 0.0147437, 0, 0.972414, 1.38406, 0.982763, 2.2604e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 aB wg 4x2x4 kr af vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.117e+06, -159631, -1790.36, 263388, 3.24403e+06, 2.48627e+06, 0.970315, 0.939915, 0.746986, 
1.27856, 0.016383, 0.00986423, 0.00728157, 0.639606, 1.39693, 0.908242, 1.20384e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 av16 aB wg 4x2x4 kr cb3 ks32 xaf st vav di hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05876e+06, 435391, -169.783, 139707, 0, 0, 1.13034, 1.44101, 1.59717, 3.3735, 0.0165162, 0.0165162, 0, 0.995866, 1.20318, 0.997611, 1.40306e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 aB wg 4x2x4 kr xaf vav di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14232e+06, -131424, -30012.5, 227440, 2.74432e+06, 2.18972e+06, 0.756622, 1.40904, 0.883778, 1.49413, 0.0268538, 0.0148448, 0.0126023, 0.415284, 1.26391, 0.590636, 3.31179e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@16 at16+m64@16 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00561e+06, 345184, -1729.23, 106245, 0, 0, 1.12096, 1.68087, 2.10926, 4.50625, 0.0258241, 0.0258241, 0, 0.896027, 1.15536, 0.976316, 1.76651e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@48 at16x2+m32@32 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {960557, 337602, 5086.78, 96767.1, 0, 0, 1.84687, 1.53092, 3.16337, 7.00593, 0.0372383, 0.0372383, 0, 0.907708, 1.13173, 0.809805, 4.10609e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav di li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {726421, -157896, 326311, 250406, 2.55754e+06, 3.09658e+06, 0.491778, 4.05366, 0.747811, 1.5515, 0.0177586, 0.0187752, 0.0114616, 1, 1.28009, 0.929609, 5.62845e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav di hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06016e+06, 848171, 0, 0, 7.38099e+06, 1.0453e+07, 1.36089, 1.12404, 0.968578, 1.56365, 0.00464109, 0.00464109, 0, 1, 1.59464, 1.11033, 6.56771e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, 
(LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.60943e+06, -1.34483e+06, -253630, 1.70607e+06, 5.342e+06, 7.41376e+06, 1.0168, 1.44036, 1.01244, 1.68138, 0.00675092, 0.0008524, 0.00611925, 0.798697, 1.5218, 1.14975, 5.05128e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08227e+06, 776908, 0, 0, 0, 0, 1.04569, 2.05877, 0.996213, 1.67003, 0.00827411, 0.00827411, 0, 0.961616, 1.45528, 1.11502, 2.00546e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav di hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63235e+06, -473591, -275698, 840536, 4.04275e+06, 4.34176e+06, 1.32077, 0.982863, 0.96265, 1.60044, 0.00888121, 0.000531032, 0.00822524, 0.585331, 1.48926, 0.995812, 5.75876e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav di hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 
524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09218e+06, 305991, -14323.3, 399143, 0, 0, 1.5323, 0.974695, 0.948184, 1.94112, 0.00890454, 0.00890454, 0, 1, 1.48844, 1.02749, 3.65821e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07062e+06, 676936, 0, 0, 0, 0, 1.46521, 1.49357, 0.913257, 1.83797, 0.0123376, 0.0123376, 0, 0.978477, 1.29593, 1.02611, 1.67976e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.51888e+06, -225905, -111399, 382282, 3.85024e+06, 3.13754e+06, 1.02462, 1.25826, 0.997766, 1.72944, 0.0116731, 0.00921103, 0.00406354, 0.785111, 1.49836, 0.998742, 6.32281e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav di hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, 
{1.07937e+06, 621635, 2230.81, 196698, 0, 0, 1.60744, 1.79127, 1.1599, 2.52429, 0.0169955, 0.0169955, 0, 0.96773, 1.11098, 0.946816, 1.02001e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09253e+06, 617958, 0, 0, 0, 0, 1.24883, 3.48717, 1.05505, 2.27656, 0.0251952, 0.0251952, 0, 0.980002, 1.05051, 0.285902, 7.08638e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {2.50658e+06, -199989, -350122, 337287, 3.39968e+06, 2.49856e+06, 0.784682, 0.339369, 0.497498, 1.59625, 0.016336, 0.0120673, 0.0057422, 0.609408, 1.33312, 0.989535, 3.60691e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06937e+06, 611760, 4194.1, 135343, 0, 0, 2.11403, 1.71129, 1.57142, 3.39186, 0.0275862, 0.0275862, 0, 0.920705, 1.00345, 0.413363, 
6.7934e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.47272e+06, -119971, -42490.2, 189240, 3.69459e+06, 2.31014e+06, 0.811882, 1.39447, 0.646674, 1.39008, 0.025457, 0.0234095, 0.00867396, 0.960377, 1.27279, 0.825982, 8.35402e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav di hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04437e+06, 894474, 0, 0, 0, 0, 1.61942, 5.20006, 2.09495, 4.09093, 0.063329, 0.063329, 0, 0.953584, 1.16518, 0.929804, 1.67951e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 
aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav di li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.42997e+06, -443650, -203140, 794602, 2.88358e+06, 2.22003e+06, 0.794365, 8.5854, 0.944109, 1.77614, 0.0570978, 0.00252013, 0.0532169, 0.706182, 1.0381, 0.381417, 6.1353e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs di hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 
256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13361e+06, -87922.6, -10020.2, 138891, 2.8672e+06, 1.83501e+06, 0.818937, 0.8499, 0.851117, 1.53666, 0.0324244, 0.0285585, 0.0107132, 0.756667, 1.09867, 0.84864, 1.283e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIs"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 
8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@32 av16 aB wg 4x4x2 kr cb3 ks32 xaf vav hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34547e+06, -453771, -133582, 707431, 3.85843e+06, 4.096e+06, 0.838397, 0.942262, 0.957325, 1.57766, 0.00740113, 0.0010301, 0.00648511, 0.743417, 1.52474, 1.0944, 4.90834e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@16 av16 aB wg 4x4x2 kr cb3x2 ks32 xaf st vav hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.0521e+06, 268865, 3540.64, 373906, 0, 0, 0.826581, 1.05467, 0.951328, 1.96908, 0.00706831, 0.00706831, 0, 1, 1.51882, 1.06722, 4.31716e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@48 at16x2+m64@48 aB wg 4x8 af vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883225, 427290, 0, 0, 0, 0, 1.11407, 1.21203, 0.677771, 1.29342, 0.00889462, 0.00889462, 0, 0.963151, 1.53208, 1.10128, 
2.13999e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 at64+m64@48 aB wg 4x4x2 kr af vav hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {979562, -385642, 51711.6, 602420, 2.47235e+06, 2.84426e+06, 0.69937, 1.28105, 0.710825, 1.4275, 0.0108655, 0.00130873, 0.010468, 0.846726, 1.54637, 0.967193, 7.32783e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@16 av32 aB wg 4x2x4 kr cb3 ks32 xaf vav hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06276e+06, 380058, -538.669, 196123, 0, 0, 0.631592, 1.26582, 1.12427, 2.56131, 0.00965567, 0.00965567, 0, 0.98502, 1.40091, 0.996067, 3.99183e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@64 at32+m32@64 aB wg 4x8 xaf vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {894456, 401519, 0, 0, 0, 0, 1.09663, 2.26086, 0.873233, 1.4831, 0.0147437, 0.0147437, 0, 0.972414, 1.38406, 0.982763, 2.2604e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@16 at16+m32@32 aB wg 4x2x4 kr af vav hi 
pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.117e+06, -159631, -1790.36, 263388, 3.24403e+06, 2.48627e+06, 0.970315, 0.939915, 0.746986, 1.27856, 0.016383, 0.00986423, 0.00728157, 0.639606, 1.39693, 0.908242, 1.20384e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m32@32 av16 aB wg 4x2x4 kr cb3 ks32 xaf st vav hi pt sr br bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05876e+06, 435391, -169.783, 139707, 0, 0, 1.13034, 1.44101, 1.59717, 3.3735, 0.0165162, 0.0165162, 0, 0.995866, 1.20318, 0.997611, 1.40306e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m16@32 at32x2+m64@16 aB wg 4x2x4 kr xaf vav hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.14232e+06, -131424, -30012.5, 227440, 2.74432e+06, 2.18972e+06, 0.756622, 1.40904, 0.883778, 1.49413, 0.0268538, 0.0148448, 0.0126023, 0.415284, 1.26391, 0.590636, 3.31179e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16+m16@16 at16+m64@16 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, 
{262144, 262144, 16777216}, {16, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00561e+06, 345184, -1729.23, 106245, 0, 0, 1.12096, 1.68087, 2.10926, 4.50625, 0.0258241, 0.0258241, 0, 0.896027, 1.15536, 0.976316, 1.76651e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m16@48 at16x2+m32@32 aB wg 2x4x4 kr xaf st vav hi pt sr br sb64 bk0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {960557, 337602, 5086.78, 96767.1, 0, 0, 1.84687, 1.53092, 3.16337, 7.00593, 0.0372383, 0.0372383, 0, 0.907708, 1.13173, 0.809805, 4.10609e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 8, 1}, "ABI"}, "av16+m16@32 at32x2+m16@16 aB wg 4x2x2 kr af vav li nmk pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {4, 2, 2}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {726421, -157896, 326311, 250406, 2.55754e+06, 3.09658e+06, 0.491778, 4.05366, 0.747811, 1.5515, 0.0177586, 0.0187752, 0.0114616, 1, 1.28009, 0.929609, 5.62845e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABIpq"}, "aB16+m16@48 at32+m16@48 aB wg 4x8 cab3 ks32 xaf vav hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06016e+06, 848171, 0, 0, 7.38099e+06, 
1.0453e+07, 1.36089, 1.12404, 0.968578, 1.56365, 0.00464109, 0.00464109, 0, 1, 1.59464, 1.11033, 6.56771e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB16 at16+m16@32 aB wg 8x2x2 kr cab3 ks16 xaf st vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 2, 2}, 1, (WGType) 1, 445, 36864, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.60943e+06, -1.34483e+06, -253630, 1.70607e+06, 5.342e+06, 7.41376e+06, 1.0168, 1.44036, 1.01244, 1.68138, 0.00675092, 0.0008524, 0.00611925, 0.798697, 1.5218, 1.14975, 5.05128e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16+m32@48 at32+m32@48 aB wg 8x4 cab4 ks32 xaf st vav hi pt sr br bk0 sn nb 8x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08227e+06, 776908, 0, 0, 0, 0, 1.04569, 2.05877, 0.996213, 1.67003, 0.00827411, 0.00827411, 0, 0.961616, 1.45528, 1.11502, 2.00546e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m32@64 at32+m16@64 aB wg 4x4x2 kr cab3 ks32 xaf st vav hi pt sr br bk0 sn nb 4x4 grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.63235e+06, -473591, -275698, 840536, 4.04275e+06, 4.34176e+06, 1.32077, 0.982863, 0.96265, 1.60044, 0.00888121, 0.000531032, 0.00822524, 0.585331, 1.48926, 
0.995812, 5.75876e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16+m16@16 aB wg 4x4x2 kr cab4x2 ks16 af vav hi pt sr br bk0 sn nb 4x4 grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09218e+06, 305991, -14323.3, 399143, 0, 0, 1.5323, 0.974695, 0.948184, 1.94112, 0.00890454, 0.00890454, 0, 1, 1.48844, 1.02749, 3.65821e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 at32+m32@48 aB wg 4x8 cab4 ks64 xaf st vav hi pt sr br bk0 sn grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07062e+06, 676936, 0, 0, 0, 0, 1.46521, 1.49357, 0.913257, 1.83797, 0.0123376, 0.0123376, 0, 0.978477, 1.29593, 1.02611, 1.67976e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab4 ks16 xaf vav hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.51888e+06, -225905, -111399, 382282, 3.85024e+06, 3.13754e+06, 1.02462, 1.25826, 0.997766, 1.72944, 0.0116731, 0.00921103, 0.00406354, 0.785111, 1.49836, 0.998742, 6.32281e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{4, 4, 1}, "#BI"}, "aB16 at32 aB wg 4x2x4 kr cab2 ks32 xaf vav hi pt sr br bk0 nb 4x2 grf256 sys sn", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07937e+06, 621635, 2230.81, 196698, 0, 0, 1.60744, 1.79127, 1.1599, 2.52429, 0.0169955, 0.0169955, 0, 0.96773, 1.11098, 0.946816, 1.02001e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 4x8 cab4x2 ks64 xaf st vav hi pt sr br bk0 nb 4x8 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.09253e+06, 617958, 0, 0, 0, 0, 1.24883, 3.48717, 1.05505, 2.27656, 0.0251952, 0.0251952, 0, 0.980002, 1.05051, 0.285902, 7.08638e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 2x4x4 kr cab3x2 ks32 xaf vav hi pt sr br bk0 sn nb 2x4 grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {2.50658e+06, -199989, -350122, 337287, 3.39968e+06, 2.49856e+06, 0.784682, 0.339369, 0.497498, 1.59625, 0.016336, 0.0120673, 0.0057422, 0.609408, 1.33312, 0.989535, 3.60691e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 at16x2 aB wg 2x4x4 kr cab3 ks32 xaf vav hi pt sr br bk0 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 24576, 24576, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.06937e+06, 611760, 4194.1, 135343, 0, 0, 2.11403, 1.71129, 1.57142, 3.39186, 0.0275862, 0.0275862, 0, 0.920705, 1.00345, 0.413363, 6.7934e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 2x2x8 kr cab4x2 ks16 xaf vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 12288, 12288, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.47272e+06, -119971, -42490.2, 189240, 3.69459e+06, 2.31014e+06, 0.811882, 1.39447, 0.646674, 1.39008, 0.025457, 0.0234095, 0.00867396, 0.960377, 1.27279, 0.825982, 8.35402e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB16x2 aS16x2 aB wg 4x16 cab4 ks64 af vav hi pt sr br bk0 nb 4x16 sys", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 64}, {4, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, 
{true, true, true}}, {'E', 17, {1.04437e+06, 894474, 0, 0, 0, 0, 1.61942, 5.20006, 2.09495, 4.09093, 0.063329, 0.063329, 0, 0.953584, 1.16518, 0.929804, 1.67951e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m16@48 aS16 aB wg 16x1x2 kr cb4x2 ks16 xaf vav li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.42997e+06, -443650, -203140, 794602, 2.88358e+06, 2.22003e+06, 0.794365, 8.5854, 0.944109, 1.77614, 0.0570978, 0.00252013, 0.0532169, 0.706182, 1.0381, 0.381417, 6.1353e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 
0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m32@48 at64+m16@48 aB wg 2x2x8 kr xaf cs hi pt sr br sb64 bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.13361e+06, -87922.6, -10020.2, 138891, 2.8672e+06, 1.83501e+06, 0.818937, 0.8499, 0.851117, 1.53666, 0.0324244, 0.0285585, 0.0107132, 0.756667, 1.09867, 0.84864, 1.283e-11}}}, {{'F', "gemm", {"H", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB8x2+m8@12 aB8+m16@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {872740, 634178, 0, 0, 2.88358e+06, 5.61971e+06, 1.27345, 2.56135, 0.654382, 1.05183, 0.0667366, 0.0667366, 0, 1, 1.0025, 1.03224, -1.62343e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, 
"at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m32@32 am32+m64@48 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {879324, 573732, 0, 0, 5.95558e+06, 8.37222e+06, 0.790222, 0.904604, 0.883707, 1.48127, 0.0050314, 0.0050314, 0, 0.985529, 1.4878, 1.23641, 2.41747e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03255e+06, -856835, 35930.9, 1.15355e+06, 4.81608e+06, 6.38157e+06, 0.493684, 0.827212, 0.896741, 1.6147, 0.0053978, 6.82453e-05, 0.00570828, 0.483746, 1.57976, 1.26821, 2.39349e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.06691e+06, -73220.1, 2702.06, 763045, 0, 0, 0.515247, 0.843582, 0.928336, 1.6865, 0.00601563, 0.00601563, 0, 1, 1.59977, 1.18803, 2.51599e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@32 am32+m16@32 aB wg 4x4x2 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, 
{true, true, true}}, {'E', 17, {1.04651e+06, 183312, 12548.5, 379234, 0, 0, 0.711019, 0.679494, 0.902321, 1.86269, 0.00761273, 0.00761273, 0, 1, 1.42857, 1.0695, 5.07528e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 aB wg 8x4 af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {888807, 430310, 0, 0, 0, 0, 0.918491, 0.832983, 0.687151, 1.2534, 0.00870491, 0.00870491, 0, 0.902125, 1.38093, 1.03399, 2.98332e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 4x2x4 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.1758e+06, -218428, -13663.7, 334244, 4.15334e+06, 3.1703e+06, 0.488566, 0.645788, 0.915783, 1.73226, 0.00993899, 0.00846533, 0.00271336, 0.453691, 1.48336, 1.05987, 7.98819e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 aB wg 8x4 af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {894052, 395164, 0, 0, 0, 0, 0.74104, 1.32668, 0.846804, 1.46546, 0.0135248, 0.0135248, 0, 0.856275, 1.25895, 0.981331, 2.02221e-12}}}, -{{'F', "gemm", 
{"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 2x2x8 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15992e+06, -146486, -8001.88, 206226, 4.48102e+06, 2.75251e+06, 0.472911, 0.412364, 0.853569, 1.71413, 0.0122185, 0.0134426, 0.0018025, 0.763429, 1.44099, 1.00673, 1.03497e-11}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m32@48 aB wg 4x4x2 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {981967, 236184, 12886.5, 215670, 0, 0, 0.700293, 0.689307, 1.44479, 3.01999, 0.0137921, 0.0137921, 0, 1, 1.4102, 0.991937, 2.6828e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {887285, 380837, 0, 0, 0, 0, 1.36748, 1.28667, 1.55406, 2.48833, 0.0278373, 0.0278373, 0, 0.840837, 1.07072, 0.944443, 1.41313e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br 
sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 4x2x4 kr af vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.00688e+06, 342686, 1015.75, 108825, 0, 0, 0.570607, 0.952349, 2.17046, 4.50514, 0.0218756, 0.0218756, 0, 0.906267, 1.25626, 0.968231, 3.52507e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 aB wg 2x2x8 kr xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.13957e+06, -78439.8, -8785.04, 127858, 3.09821e+06, 2.03325e+06, 0.434987, 0.462575, 0.856765, 1.54622, 0.0239184, 0.0233867, 0.00546205, 0.851204, 1.33767, 0.923893, 9.98065e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 
131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {961255, 346939, 2950.44, 94397.8, 0, 0, 0.807965, 0.778577, 3.18224, 7.15796, 0.0300732, 0.0300732, 0, 0.978658, 1.2609, 0.937047, 3.10988e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav di li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03017e+06, 232218, 0, 0, 2.06356e+06, 2.32817e+06, 0.635122, 3.56777, 1.53227, 2.28154, 0.0372029, 0.00337686, 0.0415229, 0.905521, 1.06254, 0.957223, 9.43647e-13}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06632e+06, 970822, 0, 0, 6.83377e+06, 1.0453e+07, 0.802893, 0.831622, 0.987736, 1.60125, 0.0044336, 0.0044336, 0, 0.99801, 1.7879, 1.21352, 4.70293e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06174e+06, 862966, 0, 0, 
5.06266e+06, 7.90528e+06, 0.76759, 1.1074, 1.02291, 1.47362, 0.00497143, 0.00497143, 0, 0.968925, 1.63841, 1.28013, 3.04063e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {996043, 778753, 0, 0, 4.31555e+06, 5.99654e+06, 0.739151, 1.50326, 1.00767, 1.54244, 0.00616043, 0.00616043, 0, 0.997048, 1.53835, 1.17302, 3.4064e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.77071e+06, -527987, -353300, 939437, 4.14515e+06, 4.096e+06, 0.810631, 0.820929, 1.02923, 1.5067, 0.0071533, 0.000990959, 0.00658984, 0.834339, 1.70826, 1.24022, 4.35712e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00127e+06, 774753, 0, 0, 0, 0, 0.947461, 0.934863, 0.97605, 1.72158, 0.00894179, 0.00894179, 0, 0.970326, 1.50724, 1.0217, 
2.9666e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.78479e+06, -261349, -173232, 452033, 4.02227e+06, 3.01466e+06, 0.481975, 0.723918, 1.03793, 1.54262, 0.00927887, 0.00655604, 0.00397641, 0.95291, 1.53912, 1.14817, 5.51161e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.10265e+06, 1.12141e+06, -262.878, 187723, 0, 0, 0.762329, 1.33218, 1.05656, 2.16208, 0.011677, 0.011677, 0, 0.866398, 1.36265, 1.01013, 3.61321e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {972331, 751881, 0, 0, 0, 0, 0.799573, 1.37171, 0.93861, 2.1425, 0.0147472, 0.0147472, 0, 0.928068, 1.27036, 0.985348, 1.75902e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88533e+06, -200778, -198593, 423600, 3.39149e+06, 2.28557e+06, 0.572852, 0.529126, 0.934694, 1.58997, 0.012793, 0.00968393, 0.00496565, 0.873155, 1.53254, 0.953462, 7.13806e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.073e+06, 784464, -1329.54, 137590, 0, 0, 0.675201, 0.663483, 1.58515, 3.38345, 0.0150557, 0.0150557, 0, 0.919981, 1.2957, 0.987294, 2.89092e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00586e+06, 704834, 0, 0, 0, 0, 1.16313, 1.1353, 1.51168, 3.11912, 0.0225951, 0.0225951, 0, 0.923429, 1.15703, 0.85336, 2.75385e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", 
{16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66288e+06, -98375, -71966.2, 195025, 2.84262e+06, 2.06438e+06, 0.483255, 0.500833, 0.809319, 1.72387, 0.023295, 0.0176286, 0.00890789, 0.991568, 1.447, 0.919946, 9.68796e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav di hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05941e+06, 583908, 26225.8, 68801.4, 0, 0, 0.627148, 0.702832, 3.725, 8.29062, 0.0313015, 0.0313015, 0, 0.85187, 1.15432, 0.975457, 2.06896e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav di li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 
131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26276e+06, -130899, -63587.1, 251457, 2.73613e+06, 1.99803e+06, 0.535584, 1.4389, 1.4172, 2.56436, 0.0281714, 0.0174514, 0.0209074, 1, 1.28388, 0.655999, 5.90369e-12}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqps"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 
641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at16x2+m32@32 am32+m64@48 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {879324, 573732, 0, 0, 5.95558e+06, 8.37222e+06, 0.790222, 0.904604, 0.883707, 1.48127, 0.0050314, 0.0050314, 0, 0.985529, 1.4878, 1.23641, 2.41747e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03255e+06, -856835, 35930.9, 1.15355e+06, 4.81608e+06, 6.38157e+06, 0.493684, 0.827212, 0.896741, 1.6147, 0.0053978, 6.82453e-05, 0.00570828, 0.483746, 1.57976, 1.26821, 2.39349e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16+m32@32 am32+m32@48 aB wg 8x2x2 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.06691e+06, -73220.1, 2702.06, 763045, 0, 0, 0.515247, 0.843582, 0.928336, 1.6865, 0.00601563, 0.00601563, 0, 1, 1.59977, 1.18803, 2.51599e-12}}}, 
+{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@64 am16 aB wg 16x2 cb3x2 ks32 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@32 am32+m16@32 aB wg 4x4x2 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.04651e+06, 183312, 12548.5, 379234, 0, 0, 0.711019, 0.679494, 0.902321, 1.86269, 0.00761273, 0.00761273, 0, 1, 1.42857, 1.0695, 5.07528e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m64@64 am32+m16@64 aB wg 8x4 
af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {888807, 430310, 0, 0, 0, 0, 0.918491, 0.832983, 0.687151, 1.2534, 0.00870491, 0.00870491, 0, 0.902125, 1.38093, 1.03399, 2.98332e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 4x2x4 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.1758e+06, -218428, -13663.7, 334244, 4.15334e+06, 3.1703e+06, 0.488566, 0.645788, 0.915783, 1.73226, 0.00993899, 0.00846533, 0.00271336, 0.453691, 1.48336, 1.05987, 7.98819e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m64@48 aB wg 8x4 af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {894052, 395164, 0, 0, 0, 0, 0.74104, 1.32668, 0.846804, 1.46546, 0.0135248, 0.0135248, 0, 0.856275, 1.25895, 0.981331, 2.02221e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am32+m32@48 aB wg 2x2x8 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 
8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15992e+06, -146486, -8001.88, 206226, 4.48102e+06, 2.75251e+06, 0.472911, 0.412364, 0.853569, 1.71413, 0.0122185, 0.0134426, 0.0018025, 0.763429, 1.44099, 1.00673, 1.03497e-11}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am32+m32@48 aB wg 4x4x2 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {981967, 236184, 12886.5, 215670, 0, 0, 0.700293, 0.689307, 1.44479, 3.01999, 0.0137921, 0.0137921, 0, 1, 1.4102, 0.991937, 2.6828e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16+m32@48 am16+m64@48 aB wg 4x8 xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {887285, 380837, 0, 0, 0, 0, 1.36748, 1.28667, 1.55406, 2.48833, 0.0278373, 0.0278373, 0, 0.840837, 1.07072, 0.944443, 1.41313e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 
0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 4x2x4 kr af vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.00688e+06, 342686, 1015.75, 108825, 0, 0, 0.570607, 0.952349, 2.17046, 4.50514, 0.0218756, 0.0218756, 0, 0.906267, 1.25626, 0.968231, 3.52507e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@64 am32+m64@64 aB wg 2x2x8 kr xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.13957e+06, -78439.8, -8785.04, 127858, 3.09821e+06, 2.03325e+06, 0.434987, 0.462575, 0.856765, 1.54622, 0.0239184, 0.0233867, 0.00546205, 0.851204, 1.33767, 0.923893, 9.98065e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m16@48 am16x2+m32@32 aB wg 2x4x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {961255, 346939, 2950.44, 94397.8, 0, 0, 0.807965, 0.778577, 3.18224, 7.15796, 0.0300732, 0.0300732, 0, 0.978658, 1.2609, 0.937047, 3.10988e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "ABI"}, "at16+m64@16 am64+m64@64 aB wg 8x4 af vav li nmk pt sr br sb64 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 64}, {16, 4, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03017e+06, 232218, 0, 0, 2.06356e+06, 2.32817e+06, 0.635122, 3.56777, 1.53227, 2.28154, 0.0372029, 0.00337686, 0.0415229, 0.905521, 1.06254, 0.957223, 9.43647e-13}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at16+m32@48 aB16 aB wg 8x4 cb4x2 ks32 af vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06632e+06, 970822, 0, 0, 6.83377e+06, 1.0453e+07, 0.802893, 0.831622, 0.987736, 1.60125, 0.0044336, 0.0044336, 0, 0.99801, 1.7879, 1.21352, 4.70293e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIps"}, "at16+m32@48 aB16 aB wg 16x2 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 64}, {16, 96, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06174e+06, 862966, 0, 0, 5.06266e+06, 7.90528e+06, 0.76759, 1.1074, 1.02291, 1.47362, 0.00497143, 0.00497143, 0, 0.968925, 1.63841, 1.28013, 3.04063e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB16x2 aB wg 16x2 cb3 ks32 xaf st vav hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, 
(LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {996043, 778753, 0, 0, 4.31555e+06, 5.99654e+06, 0.739151, 1.50326, 1.00767, 1.54244, 0.00616043, 0.00616043, 0, 0.997048, 1.53835, 1.17302, 3.4064e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16x2+m32@32 aB16x2 aB wg 8x2x2 kr cb4 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 64, 32}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.77071e+06, -527987, -353300, 939437, 4.14515e+06, 4.096e+06, 0.810631, 0.820929, 1.02923, 1.5067, 0.0071533, 0.000990959, 0.00658984, 0.834339, 1.70826, 1.24022, 4.35712e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@48 aB32 aB wg 8x4 cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00127e+06, 774753, 0, 0, 0, 0, 0.947461, 0.934863, 0.97605, 1.72158, 0.00894179, 0.00894179, 0, 0.970326, 1.50724, 1.0217, 2.9666e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at16+m32@16 aB32 aB wg 8x1x4 kr cb4x2 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 
64, 32}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.78479e+06, -261349, -173232, 452033, 4.02227e+06, 3.01466e+06, 0.481975, 0.723918, 1.03793, 1.54262, 0.00927887, 0.00655604, 0.00397641, 0.95291, 1.53912, 1.14817, 5.51161e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at32+m32@32 aB16 aB wg 8x2x4 kr cb4 ks16 af vav hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.10265e+06, 1.12141e+06, -262.878, 187723, 0, 0, 0.762329, 1.33218, 1.05656, 2.16208, 0.011677, 0.011677, 0, 0.866398, 1.36265, 1.01013, 3.61321e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@64 aB32x2 aB wg 8x4 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {972331, 751881, 0, 0, 0, 0, 0.799573, 1.37171, 0.93861, 2.1425, 0.0147472, 0.0147472, 0, 0.928068, 1.27036, 0.985348, 1.75902e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@32 aB16x2 aB wg 4x2x4 kr cb4x2 ks32 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.88533e+06, -200778, -198593, 423600, 
3.39149e+06, 2.28557e+06, 0.572852, 0.529126, 0.934694, 1.58997, 0.012793, 0.00968393, 0.00496565, 0.873155, 1.53254, 0.953462, 7.13806e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m16@32 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.073e+06, 784464, -1329.54, 137590, 0, 0, 0.675201, 0.663483, 1.58515, 3.38345, 0.0150557, 0.0150557, 0, 0.919981, 1.2957, 0.987294, 2.89092e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m32@64 aB64 aB wg 4x8 cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.00586e+06, 704834, 0, 0, 0, 0, 1.16313, 1.1353, 1.51168, 3.11912, 0.0225951, 0.0225951, 0, 0.923429, 1.15703, 0.85336, 2.75385e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, 
+{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.66288e+06, -98375, -71966.2, 195025, 2.84262e+06, 2.06438e+06, 0.483255, 0.500833, 0.809319, 1.72387, 0.023295, 0.0176286, 0.00890789, 0.991568, 1.447, 0.919946, 9.68796e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16+m16@32 aB64+m32@32 aB wg 2x2x8 kr af vav hi pt sr br sb64 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.05941e+06, 583908, 26225.8, 68801.4, 0, 0, 0.627148, 0.702832, 3.725, 8.29062, 0.0313015, 0.0313015, 0, 0.85187, 1.15432, 0.975457, 2.06896e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at16x2+m64@16 aB32+m16@32 aB wg 4x2x4 kr af vav li nmk pt sr br sb64 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.26276e+06, -130899, -63587.1, 251457, 2.73613e+06, 1.99803e+06, 0.535584, 1.4389, 1.4172, 2.56436, 0.0281714, 0.0174514, 0.0209074, 1, 1.28388, 0.655999, 5.90369e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, 
-1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 
8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m16@12 aS8+m16@12 aB wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {878230, 813623, 0, 0, 3.31776e+06, 9.39622e+06, 1.5936, 1.75404, 0.798399, 1.185, 0.0656902, 0.0656902, 0, 1, 1.00344, 1.00135, -9.21479e-15}}}, -{{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17895e+06, 343529, 0, 0, 0, 0, 1.63411, 1.77325, 1.00531, 1.48275, 0.0145617, 0.000936039, 0.0155971, 0.877282, 1.01034, 1.0048, 9.67486e-14}}}, -{{'F', "gemm", 
{"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs di sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Is"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17895e+06, 343529, 0, 0, 0, 0, 1.63411, 1.77325, 1.00531, 1.48275, 0.0145617, 0.000936039, 0.0155971, 0.877282, 1.01034, 1.0048, 9.67486e-14}}}, +{{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, 
{262144, 1048576, 32}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {971283, 550381, 0, 0, 1.25911e+07, 8.13466e+06, 2.33425, 1.5484, 4.45361, 4.80392, 0.0689395, 0.0689395, 0, 1, 1.00725, 0.817916, 1.37423e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 
0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, 
{-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.0376e+06, 806829, 0, 0, 0, 0, 1.28219, 1.0295, 1.49381, 3.00966, 0.0179815, 0.0179815, 0, 0.929009, 1.36496, 0.83458, 3.84739e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav di hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", 
{16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, 
(WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, +{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 
0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, {{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af 
vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav di hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav di hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav di hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 
8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav di li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb64 bk0 dm sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+C32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 
0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 dm sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav di sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 aB vav di sys grf256 af hi pt wg 4x8 sb512 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m128@96 am64+m64@96 aB wg 4x8 xaf st vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886295, 679810, 0, 0, 6.14973e+06, 1.05103e+07, 0.387882, 0.34723, 0.844766, 1.28789, 0.00202312, 0.00202312, 0, 0.99971, 1.60161, 1.05949, 2.70909e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m64@128 am64x2+m64@128 aB wg 4x8 xaf vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, 
(WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {878684, 628439, 0, 0, 5.63446e+06, 8.38042e+06, 0.326785, 0.347891, 0.805503, 1.24882, 0.00204557, 0.00204557, 0, 1, 1.50403, 1.06152, 3.02972e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m128@64 aB wg 4x8 af vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 64}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {893468, 542180, 0, 0, 4.62356e+06, 6.62733e+06, 0.391937, 0.505587, 0.774621, 1.23145, 0.00235858, 0.00235858, 0, 0.996539, 1.5023, 1.05328, 2.52696e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m64@64 aB wg 8x4 ca4x2 ks64 xaf vav di hi pt sr br bk0 sn nb 8x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06557e+06, 554194, 0, 0, 0, 0, 0.351596, 0.90997, 0.947429, 1.57679, 0.00331983, 0.00331983, 0, 0.990955, 1.56412, 1.02425, 2.51352e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@128 am32x2 aB wg 4x4 cb4x2 ks32 xaf st vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 16384, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.19279e+06, 305937, 0, 0, 5.25107e+06, 8.03635e+06, 0.333064, 0.619053, 0.911588, 1.46748, 0.00298243, 
0.000462826, 0.0027584, 0.631652, 1.34002, 1.00593, 2.21481e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2 am64+m64@32 aB wg 2x8x2 kr ca3 ks64 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06148e+06, 261114, -1912.67, 378540, 0, 0, 0.3603, 0.488574, 0.861468, 1.70742, 0.00441825, 0.00441825, 0, 0.90841, 1.31036, 0.952495, 3.14617e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@64 am64x2 aB wg 2x4x4 kr ca4x2 ks32 af vav di hi pt sr br bk0 nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04568e+06, 374749, 8822.2, 183553, 0, 0, 0.252579, 0.585774, 0.808529, 2.12369, 0.00569297, 0.00569297, 0, 0.992042, 1.43825, 1.02555, 1.23985e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@96 am32+m64@96 aB wg 2x8x2 kr ca3 ks64 xaf st vav di hi pt sr br bk0 sn nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {982126, 273502, 6264.44, 223316, 0, 0, 0.441125, 0.619665, 0.81579, 2.34678, 0.00837284, 0.00837284, 0, 0.969819, 1.30492, 0.959381, 1.83513e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@96 am128+m128@96 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb128 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 128}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {999962, 392678, 594.486, 106516, 0, 0, 0.317007, 0.702686, 1.32687, 3.91881, 0.0109074, 0.0109074, 0, 0.946711, 1.26043, 0.987718, 1.10384e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am128+m128@64 aB wg 4x8 ca3 ks64 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {989932, 503264, 0, 0, 0, 0, 0.521085, 0.615326, 0.759821, 1.5027, 0.00469883, 0.00469883, 0, 0.99602, 1.31748, 0.979255, 1.00069e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am64x2+m64@64 aB wg 4x8 ca4 ks64 af vav di hi pt sr br bk0 sn nb 4x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {989597, 475337, 0, 0, 0, 0, 0.454855, 0.942129, 0.617706, 1.68393, 0.00734477, 0.00734477, 0, 0.963871, 1.2358, 0.932592, 1.33399e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@128 am64x2+m64@128 aB wg 2x16 ca4x2 ks64 af vav di hi pt sr br bk0 sn nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {961533, 455478, 0, 0, 0, 0, 0.642337, 0.7679, 0.660867, 2.13711, 0.0105946, 0.0105946, 0, 1, 1.30406, 0.902759, 1.47773e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 aB wg 2x16 ca3x2 ks128 af vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@32 am64 aB wg 2x8x2 kr cb4x2 ks64 xaf vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.4092e+06, -554772, -169517, 793747, 2.90734e+06, 3.16867e+06, 0.390299, 0.577147, 0.857166, 1.54399, 0.00361771, 0.000811875, 0.0033029, 0.98501, 1.49402, 0.997618, 1.88797e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m64@64 am64+m32@64 aB wg 4x2x4 kr xaf vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, 
{1.15591e+06, -238679, -20728.1, 351139, 2.5641e+06, 2.48218e+06, 0.273012, 0.460771, 0.775111, 1.33768, 0.00533606, 0.00407793, 0.00182134, 0.639264, 1.2796, 0.665315, 1.2356e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@128 am128+m128@128 aB wg 2x4x4 kr xaf vav di hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 128}, {32, 16, 128}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.16312e+06, -178711, -20881.1, 280021, 2.00049e+06, 1.8178e+06, 0.329846, 0.416682, 0.587935, 1.13731, 0.00689723, 0.005772, 0.00231223, 0.806174, 1.33228, 0.879709, 5.51613e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@96 am128+m64@96 aB wg 2x4x4 kr ca4x2 ks64 xaf vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 128}, {32, 8, 128}, {2, 4, 4}, 1, (WGType) 1, 445, 16384, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.41922e+06, -182504, -78300.4, 285831, 1.86778e+06, 1.47046e+06, 0.294783, 0.695972, 0.506752, 1.20455, 0.0108447, 0.00678984, 0.00498837, 0.844314, 1.33655, 0.794731, 4.36829e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@32 aB wg 1x4x8 kr ca4x2 ks64 xaf st vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.31936e+06, -96890.5, -30769.9, 145228, 2.34291e+06, 1.81862e+06, 
0.412414, 0.30833, 0.409741, 1.09695, 0.0174833, 0.0161637, 0.00408927, 0.766829, 1.28501, 0.936336, 4.72138e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av32+B32@48 am32x2 aB wg 8x2 af vav di li sr br sb256 bk0 sys ska rr wx4 kv pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 32}, {8, 2, 1}, 4, (WGType) 1, 297, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883594, 337014, 0, 0, 0, 0, 0.22229, 8.60539, 0.729527, 2.00832, 0.0271033, 0.0271033, 0, 0.0457007, 1.09409, 0.724113, 2.54861e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am128+m128@96 aB wg 1x4x8 kr ca3x2 ks64 af vav di hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 128}, {16, 32, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 3072, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.23048e+06, -125592, -17585.5, 179011, 2.12173e+06, 1.35168e+06, 0.920505, 0.255099, 0.653157, 1.21015, 0.0169894, 0.0198407, 0.00264754, 0.9339, 1.23674, 0.982957, 2.88698e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB32 aB32 aB wg 4x8 cab4 ks64 xaf vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 131072, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05829e+06, 1.03353e+06, 0, 0, 5.30842e+06, 9.18323e+06, 0.47066, 0.531818, 0.917115, 1.45352, 0.00220015, 0.00220015, 0, 0.975008, 1.63094, 0.992603, 5.47213e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32+m32@96 aB32 aB wg 4x8 cab3 ks64 xaf st vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 64}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 86016, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.06575e+06, 1.10881e+06, 0, 0, 4.41139e+06, 6.57818e+06, 0.42243, 0.679112, 0.918586, 1.50737, 0.00243376, 0.00243376, 0, 1, 1.95319, 1.10024, 2.294e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 aB wg 4x8 cab3 ks64 xaf vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 73728, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.06135e+06, 822813, 0, 0, 3.62578e+06, 4.59817e+06, 0.415771, 0.818848, 0.865328, 1.58139, 0.00337564, 0.00337564, 0, 1, 1.58683, 1.05039, 3.00347e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "aB32 aB32 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav di hi pt sr br bk0 sn nb 2x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 32}, {2, 8, 2}, 1, (WGType) 1, 261, 32768, 65536, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10897e+06, 485603, -29858, 432495, 0, 0, 0.55752, 0.565698, 0.750745, 1.79841, 0.00449927, 0.00449927, 0, 1, 1.66505, 1.02629, 1.8353e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m32@32 aB wg 2x8x2 kr cab3 ks64 xaf st vav di hi pt sr 
br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 36864, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0734e+06, 659616, -5392.59, 285969, 0, 0, 0.380518, 0.879017, 0.522545, 1.73036, 0.00639724, 0.00639724, 0, 0.981695, 1.43982, 0.952975, 3.88499e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x8x2 kr cab3 ks64 af vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03799e+06, 666762, 2486.11, 229962, 0, 0, 0.52045, 0.670568, 0.707588, 2.31307, 0.00907287, 0.00907287, 0, 1, 1.37325, 0.975844, 1.68113e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64+m64@64 aB32+m32@64 aB wg 4x8 cab4 ks64 xaf st vav di hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.04335e+06, 903066, 0, 0, 0, 0, 0.573291, 0.742252, 0.728779, 1.625, 0.0049232, 0.0049232, 0, 1, 1.41255, 0.970177, 1.84329e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2+m64@128 aB32x2+m64@128 aB wg 4x8 cab3 ks64 xaf st vav di hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, 
{32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.02689e+06, 960490, 0, 0, 0, 0, 0.510059, 1.04909, 0.673702, 1.67059, 0.00734958, 0.00734958, 0, 0.997915, 1.31262, 0.915348, 2.01189e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 aB wg 2x8x2 kr cab4 ks64 af vav di hi pt sr br bk0 sn nb 2x8 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 65536, 65536, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.89554e+06, -498803, -411042, 930415, 2.67387e+06, 3.20799e+06, 0.567677, 0.501271, 0.840676, 1.5063, 0.00427203, 0.000481846, 0.00404699, 0.832467, 1.44137, 0.955203, 3.92029e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32+m128@64 aB wg 2x4x4 kr cab4 ks64 af vav di hi pt sr br bk0 sn nb 2x4 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 32768, 32768, {1, 1, 4}, {true, true, true}}, {'E', 17, {2.01352e+06, -185065, -235490, 423906, 2.17907e+06, 2.28065e+06, 0.487223, 0.546183, 0.355732, 0.989325, 0.00871265, 0.00303217, 0.00595475, 0.892248, 1.28589, 0.854172, 3.98616e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x4x4 kr cab4 ks64 af vav di hi pt sr br bk0 sn nb 2x4 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {1, 1, 4}, {true, true, true}}, 
{'E', 17, {1.88418e+06, -153479, -204520, 376006, 2.17088e+06, 1.88498e+06, 0.367103, 0.802232, -0.0233093, 0.803253, 0.0120542, 0.00674447, 0.00621616, 0.958995, 1.34316, 0.655552, 9.35284e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aB32x2+m64@32 aB wg 1x4x8 kr cab4 ks64 xaf vav di hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.789e+06, -104206, -88473.3, 216093, 3.5799e+06, 1.90874e+06, 0.494892, 0.45351, -0.0429483, 0.861739, 0.0175893, 0.00752501, 0.010522, 0.988366, 1.40054, 0.929823, 4.81054e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 1, 1}, "#I"}, "aB32+m64@64 aB64 aB wg 2x8 af vav di li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 64}, {64, 2, 64}, {2, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {4, 1, 4}, {true, true, true}}, {'E', 17, {1.33537e+06, 174420, 0, 0, 1.79651e+06, 2.35274e+06, 0.35032, 2.21824, 0.289392, 0.954431, 0.0187001, 0.00782903, 0.0142837, 0.961014, 1.45471, 0.970912, 4.15697e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#Ip"}, "aB32+m64@64 aB128 aB wg 1x4x8 kr af vav di li pt sr br sb128 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.21993e+06, -94688.2, -20621.2, 162272, 2.16269e+06, 679936, 1.63701, 0.42146, 0.328977, 0.880663, 
0.0240187, 0.0160351, 0.0106171, 0.824292, 1.25588, 0.789446, 8.45484e-12}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af 
vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, 
{2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 
20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, +{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 aB vav sys grf256 af hi pt wg 4x8 sb512 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIps"}, "av32+m128@96 am64+m64@96 aB wg 4x8 xaf st vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886295, 
679810, 0, 0, 6.14973e+06, 1.05103e+07, 0.387882, 0.34723, 0.844766, 1.28789, 0.00202312, 0.00202312, 0, 0.99971, 1.60161, 1.05949, 2.70909e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m64@128 am64x2+m64@128 aB wg 4x8 xaf vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {878684, 628439, 0, 0, 5.63446e+06, 8.38042e+06, 0.326785, 0.347891, 0.805503, 1.24882, 0.00204557, 0.00204557, 0, 1, 1.50403, 1.06152, 3.02972e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IABs"}, "av32x2+m32@64 am64+m128@64 aB wg 4x8 af vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 64}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {893468, 542180, 0, 0, 4.62356e+06, 6.62733e+06, 0.391937, 0.505587, 0.774621, 1.23145, 0.00235858, 0.00235858, 0, 0.996539, 1.5023, 1.05328, 2.52696e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@64 am64+m64@64 aB wg 8x4 ca4x2 ks64 xaf vav hi pt sr br bk0 sn nb 8x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06557e+06, 554194, 0, 0, 0, 0, 0.351596, 0.90997, 0.947429, 1.57679, 0.00331983, 0.00331983, 0, 0.990955, 1.56412, 1.02425, 2.51352e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av64+m64@128 am32x2 aB wg 4x4 cb4x2 ks32 xaf st vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 4, 1}, 1, (WGType) 1, 441, 16384, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.19279e+06, 305937, 0, 0, 5.25107e+06, 8.03635e+06, 0.333064, 0.619053, 0.911588, 1.46748, 0.00298243, 0.000462826, 0.0027584, 0.631652, 1.34002, 1.00593, 2.21481e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2 am64+m64@32 aB wg 2x8x2 kr ca3 ks64 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06148e+06, 261114, -1912.67, 378540, 0, 0, 0.3603, 0.488574, 0.861468, 1.70742, 0.00441825, 0.00441825, 0, 0.90841, 1.31036, 0.952495, 3.14617e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@64 am64x2 aB wg 2x4x4 kr ca4x2 ks32 af vav hi pt sr br bk0 nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04568e+06, 374749, 8822.2, 183553, 0, 0, 0.252579, 0.585774, 0.808529, 2.12369, 0.00569297, 0.00569297, 0, 0.992042, 1.43825, 1.02555, 1.23985e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@96 am32+m64@96 aB wg 2x8x2 kr ca3 ks64 xaf st vav hi 
pt sr br bk0 sn nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 12288, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {982126, 273502, 6264.44, 223316, 0, 0, 0.441125, 0.619665, 0.81579, 2.34678, 0.00837284, 0.00837284, 0, 0.969819, 1.30492, 0.959381, 1.83513e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@96 am128+m128@96 aB wg 2x4x4 kr xaf st vav hi pt sr br sb128 bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 128}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {999962, 392678, 594.486, 106516, 0, 0, 0.317007, 0.702686, 1.32687, 3.91881, 0.0109074, 0.0109074, 0, 0.946711, 1.26043, 0.987718, 1.10384e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am128+m128@64 aB wg 4x8 ca3 ks64 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {989932, 503264, 0, 0, 0, 0, 0.521085, 0.615326, 0.759821, 1.5027, 0.00469883, 0.00469883, 0, 0.99602, 1.31748, 0.979255, 1.00069e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 am64x2+m64@64 aB wg 4x8 ca4 ks64 af vav hi pt sr br bk0 sn nb 4x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, 
(WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {989597, 475337, 0, 0, 0, 0, 0.454855, 0.942129, 0.617706, 1.68393, 0.00734477, 0.00734477, 0, 0.963871, 1.2358, 0.932592, 1.33399e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@128 am64x2+m64@128 aB wg 2x16 ca4x2 ks64 af vav hi pt sr br bk0 sn nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 16384, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {961533, 455478, 0, 0, 0, 0, 0.642337, 0.7679, 0.660867, 2.13711, 0.0105946, 0.0105946, 0, 1, 1.30406, 0.902759, 1.47773e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 aB wg 2x16 ca3x2 ks128 af vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@32 am64 aB wg 2x8x2 kr cb4x2 ks64 xaf vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 32768, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.4092e+06, -554772, -169517, 793747, 2.90734e+06, 3.16867e+06, 0.390299, 0.577147, 0.857166, 1.54399, 0.00361771, 0.000811875, 0.0033029, 0.98501, 1.49402, 0.997618, 
1.88797e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m64@64 am64+m32@64 aB wg 4x2x4 kr xaf vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.15591e+06, -238679, -20728.1, 351139, 2.5641e+06, 2.48218e+06, 0.273012, 0.460771, 0.775111, 1.33768, 0.00533606, 0.00407793, 0.00182134, 0.639264, 1.2796, 0.665315, 1.2356e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@128 am128+m128@128 aB wg 2x4x4 kr xaf vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 128}, {32, 16, 128}, {2, 4, 4}, 1, (WGType) 1, 445, 0, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.16312e+06, -178711, -20881.1, 280021, 2.00049e+06, 1.8178e+06, 0.329846, 0.416682, 0.587935, 1.13731, 0.00689723, 0.005772, 0.00231223, 0.806174, 1.33228, 0.879709, 5.51613e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@96 am128+m64@96 aB wg 2x4x4 kr ca4x2 ks64 xaf vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 128}, {32, 8, 128}, {2, 4, 4}, 1, (WGType) 1, 445, 16384, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.41922e+06, -182504, -78300.4, 285831, 1.86778e+06, 1.47046e+06, 0.294783, 0.695972, 0.506752, 1.20455, 0.0108447, 0.00678984, 0.00498837, 0.844314, 1.33655, 0.794731, 4.36829e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@32 aB wg 1x4x8 kr ca4x2 ks64 xaf st vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.31936e+06, -96890.5, -30769.9, 145228, 2.34291e+06, 1.81862e+06, 0.412414, 0.30833, 0.409741, 1.09695, 0.0174833, 0.0161637, 0.00408927, 0.766829, 1.28501, 0.936336, 4.72138e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av32+B32@48 am32x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 32}, {8, 2, 1}, 4, (WGType) 1, 297, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {883594, 337014, 0, 0, 0, 0, 0.22229, 8.60539, 0.729527, 2.00832, 0.0271033, 0.0271033, 0, 0.0457007, 1.09409, 0.724113, 2.54861e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am128+m128@96 aB wg 1x4x8 kr ca3x2 ks64 af vav hi pt sr br bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 128}, {16, 32, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 3072, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.23048e+06, -125592, -17585.5, 179011, 2.12173e+06, 1.35168e+06, 0.920505, 0.255099, 0.653157, 1.21015, 0.0169894, 0.0198407, 0.00264754, 0.9339, 1.23674, 0.982957, 2.88698e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#Ip"}, "aB32 aB32 aB wg 4x8 cab4 ks64 xaf vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb", 
{16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 131072, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05829e+06, 1.03353e+06, 0, 0, 5.30842e+06, 9.18323e+06, 0.47066, 0.531818, 0.917115, 1.45352, 0.00220015, 0.00220015, 0, 0.975008, 1.63094, 0.992603, 5.47213e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ips"}, "aB32+m32@96 aB32 aB wg 4x8 cab3 ks64 xaf st vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 64}, {64, 24, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 86016, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.06575e+06, 1.10881e+06, 0, 0, 4.41139e+06, 6.57818e+06, 0.42243, 0.679112, 0.918586, 1.50737, 0.00243376, 0.00243376, 0, 1, 1.95319, 1.10024, 2.294e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 aB wg 4x8 cab3 ks64 xaf vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 73728, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.06135e+06, 822813, 0, 0, 3.62578e+06, 4.59817e+06, 0.415771, 0.818848, 0.865328, 1.58139, 0.00337564, 0.00337564, 0, 1, 1.58683, 1.05039, 3.00347e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "aB32 aB32 aB wg 2x8x2 kr cab4x2 ks32 xaf st vav hi pt sr br bk0 sn nb 2x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 32}, 
{2, 8, 2}, 1, (WGType) 1, 261, 32768, 65536, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10897e+06, 485603, -29858, 432495, 0, 0, 0.55752, 0.565698, 0.750745, 1.79841, 0.00449927, 0.00449927, 0, 1, 1.66505, 1.02629, 1.8353e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m32@32 aB wg 2x8x2 kr cab3 ks64 xaf st vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 36864, 36864, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0734e+06, 659616, -5392.59, 285969, 0, 0, 0.380518, 0.879017, 0.522545, 1.73036, 0.00639724, 0.00639724, 0, 0.981695, 1.43982, 0.952975, 3.88499e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x8x2 kr cab3 ks64 af vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03799e+06, 666762, 2486.11, 229962, 0, 0, 0.52045, 0.670568, 0.707588, 2.31307, 0.00907287, 0.00907287, 0, 1, 1.37325, 0.975844, 1.68113e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64+m64@64 aB32+m32@64 aB wg 4x8 cab4 ks64 xaf st vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.04335e+06, 903066, 0, 0, 0, 0, 0.573291, 0.742252, 0.728779, 1.625, 
0.0049232, 0.0049232, 0, 1, 1.41255, 0.970177, 1.84329e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2+m64@128 aB32x2+m64@128 aB wg 4x8 cab3 ks64 xaf st vav hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.02689e+06, 960490, 0, 0, 0, 0, 0.510059, 1.04909, 0.673702, 1.67059, 0.00734958, 0.00734958, 0, 0.997915, 1.31262, 0.915348, 2.01189e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aB32 aB wg 2x8x2 kr cab4 ks64 af vav hi pt sr br bk0 sn nb 2x8 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 65536, 65536, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.89554e+06, -498803, -411042, 930415, 2.67387e+06, 3.20799e+06, 0.567677, 0.501271, 0.840676, 1.5063, 0.00427203, 0.000481846, 0.00404699, 0.832467, 1.44137, 0.955203, 3.92029e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aB32+m128@64 aB wg 2x4x4 kr cab4 ks64 af vav hi pt sr br bk0 sn nb 2x4 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 32768, 32768, {1, 1, 4}, {true, true, true}}, {'E', 17, {2.01352e+06, -185065, -235490, 423906, 2.17907e+06, 2.28065e+06, 0.487223, 0.546183, 0.355732, 0.989325, 0.00871265, 0.00303217, 0.00595475, 0.892248, 1.28589, 0.854172, 3.98616e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x4x4 kr cab4 ks64 af vav hi pt sr br bk0 sn nb 2x4 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.88418e+06, -153479, -204520, 376006, 2.17088e+06, 1.88498e+06, 0.367103, 0.802232, -0.0233093, 0.803253, 0.0120542, 0.00674447, 0.00621616, 0.958995, 1.34316, 0.655552, 9.35284e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aB32x2+m64@32 aB wg 1x4x8 kr cab4 ks64 xaf vav hi pt sr br bk0 sn dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.789e+06, -104206, -88473.3, 216093, 3.5799e+06, 1.90874e+06, 0.494892, 0.45351, -0.0429483, 0.861739, 0.0175893, 0.00752501, 0.010522, 0.988366, 1.40054, 0.929823, 4.81054e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB32+m64@64 aB64 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 64}, {64, 2, 64}, {2, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {4, 1, 4}, {true, true, true}}, {'E', 17, {1.33537e+06, 174420, 0, 0, 1.79651e+06, 2.35274e+06, 0.35032, 2.21824, 0.289392, 0.954431, 0.0187001, 0.00782903, 0.0142837, 0.961014, 1.45471, 0.970912, 4.15697e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#Ip"}, 
"aB32+m64@64 aB128 aB wg 1x4x8 kr af vav li pt sr br sb128 bk0 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.21993e+06, -94688.2, -20621.2, 162272, 2.16269e+06, 679936, 1.63701, 0.42146, 0.328977, 0.880663, 0.0240187, 0.0160351, 0.0106171, 0.824292, 1.25588, 0.789446, 8.45484e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "i"}, "av8 am16x2+m32@24 aB wg 4x8 ca3 ks32 nse hi pt sr br bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {970346, 653718, 0, 0, 3.67821e+06, 9.17504e+06, 0.766571, 0.880554, 0.876014, 1.54472, 0.0160172, 0.0160172, 0, 1, 1.11467, 0.986994, 1.14313e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#i"}, "aB32 aB32 aB wg 4x8 cab3 ks32 nse hi pt bk0 grf256 kv afb sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 49152, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.54615e+06, 742084, 0, 0, 3.95674e+06, 8.85555e+06, 0.656903, 1.11296, 0.865967, 1.49351, 0.0166951, 0.0166951, 0, 1, 1.09758, 0.989116, 8.61864e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB16 aB16 aB wg 4x8 cab3 ks32 nse hi pt bk0 grf256 kv afb sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 
1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.01957e+06, 906366, 0, 0, 4.13696e+06, 8.73267e+06, 1.51183, 1.36399, 0.870951, 1.51109, 0.0167461, 0.0167461, 0, 1, 1.09321, 0.977493, 1.29851e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32x2+m32@64 av32+m32@64 aB wg 4x8 cb3 ks32 xaf st vav di hi pt sr br bk0 nb 0x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {968994, 716920, 0, 0, 5.78437e+06, 9.22419e+06, 0.534082, 0.742409, 0.894677, 1.49963, 0.00222583, 0.00222583, 0, 0.887893, 1.57089, 1.08446, 1.91309e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32x2+m128@64 av64+m32@64 aB wg 4x4x2 kr cb3 ks64 xaf vav di hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.48013e+06, -879245, -212784, 1.19787e+06, 3.27516e+06, 5.10362e+06, 0.353883, 0.703019, 0.911615, 1.47833, 0.00312427, 0.000484667, 0.00281171, 0.593822, 1.43189, 1.02998, 1.76713e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@128 av64+m32@128 aB wg 4x8 cab4x2 ks64 af vav di hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04641e+06, 551646, 0, 0, 
0, 0, 0.775015, 0.878808, 0.742373, 1.49937, 0.00558315, 0.00558315, 0, 1, 1.29356, 0.970044, 1.0419e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@32 av32 aB wg 4x2x4 kr cb3x2 ks32 xaf st vav di hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 6144, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04345e+06, 378139, 1841.02, 195723, 0, 0, 0.492254, 1.08103, 0.714577, 2.08372, 0.00664973, 0.00664973, 0, 0.929186, 1.34886, 1.00044, 8.80626e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@32 av32+m32@32 aB wg 2x4x4 kr cb3 ks32 xaf vav di hi pt sr br bk0 nb 0x4 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06063e+06, 391330, -4561.58, 141436, 0, 0, 0.898973, 0.937592, 0.811295, 2.49893, 0.0100304, 0.0100304, 0, 0.938108, 1.25259, 0.981155, 9.62597e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IABg"}, "av32+m32@128 av32+m32@128 aB wg 4x8 cab3x2 ks64 xaf vav di hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02633e+06, 512948, 0, 0, 0, 0, 0.616223, 1.4606, 0.843356, 1.74802, 0.0095619, 0.0095619, 0, 1, 1.19005, 0.980175, 5.84833e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IABg"}, "av64+m32@128 av64+m32@128 aB wg 4x8 cab3x2 ks128 af vav di hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02641e+06, 481324, 0, 0, 0, 0, 0.93847, 1.32746, 0.702383, 2.19035, 0.0160592, 0.0160592, 0, 1, 1.04875, 0.330175, 3.7195e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av32+m32@64 aB wg 4x4x2 kr cab3x2 ks64 xaf st vav di hi pt sr br bk0 nb 4x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.44895e+06, -475742, -192864, 753196, 2.79429e+06, 3.21044e+06, 0.543815, 0.583688, 0.893765, 1.54599, 0.00436005, 0.000416302, 0.00416311, 0.659609, 1.31185, 0.99626, 1.25799e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av64+m32@64 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 nb 0x2 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.43195e+06, -220355, -88718.8, 353320, 2.49856e+06, 2.51167e+06, 0.377332, 0.75479, 0.906308, 1.29913, 0.0058945, 0.00410324, 0.00237673, 0.692439, 1.23717, 0.463634, 1.60992e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32 at32+m64@32 aB wg 
2x2x8 kr ca3x2 ks32 xaf vav di hi pt sr br bk0 nb 2x0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.33083e+06, -155638, -32206.7, 215344, 2.58048e+06, 2.38387e+06, 0.560445, 0.453657, 0.73036, 1.3036, 0.00873208, 0.0075434, 0.00252893, 0.878367, 1.27904, 0.492696, 1.21306e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@32 av32+m128@32 aB wg 2x2x8 kr cb4x2 ks64 af vav di hi pt sr br bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34069e+06, -116391, -31493.6, 173677, 2.22003e+06, 1.85139e+06, 0.42459, 0.964629, 0.713108, 1.06445, 0.0165022, 0.0154653, 0.00316435, 0.667006, 1.16044, 0.602135, 7.93972e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 at64+m32@64 aB wg 1x2x16 kr ca4x2 ks64 af vav di hi pt sr br bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {1, 2, 16}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.31786e+06, -82626.9, -17117, 112973, 3.34234e+06, 1.65478e+06, 0.813229, 0.748021, 0.644888, 1.29433, 0.0397427, 0.0319944, 0.00448936, 0, 1.14457, 0.953023, 3.67408e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIp"}, "aB32 at32+m64@32 aB wg 4x8 cab3x2 ks64 xaf vav di hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, 
(LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06084e+06, 849124, 0, 0, 5.06593e+06, 8.26573e+06, 0.813458, 0.757306, 0.923897, 1.44252, 0.00320502, 0.00320502, 0, 0.998673, 1.38807, 1.02748, 1.29467e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@128 aB wg 4x4x2 kr cab4x2 ks32 xaf vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05992e+06, -246452, -7356.34, 1.16386e+06, 0, 0, 0.552551, 0.934019, 0.885358, 1.68144, 0.00394818, 0.00394818, 0, 0.978321, 1.42873, 1.03457, 8.61849e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@96 at32+m32@96 aB wg 2x8x2 kr cab4 ks64 xaf st vav di hi pt sr br bk0 sn nb 2x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 65536, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20806e+06, 323026, -63628.4, 430630, 0, 0, 0.878466, 0.849418, 0.805392, 1.56666, 0.00632332, 0.00632332, 0, 0.885776, 1.24132, 0.950129, 2.21324e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32x2 at32x2+m32@64 aB wg 2x4x4 kr cab3 ks32 xaf vav di hi pt sr br bk0 sn nb 2x4 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, 
{64, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07555e+06, 518207, -1629.24, 196804, 0, 0, 0.762617, 0.80574, 0.491797, 2.04474, 0.00767, 0.00767, 0, 0.988971, 1.22544, 0.980506, 2.21112e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at64+m32@96 aB wg 1x8x4 kr cab4x2 ks64 af vav di hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 64}, {1, 8, 4}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04941e+06, 600870, 2948.13, 138773, 0, 0, 1.04105, 0.750519, 0.818337, 2.45508, 0.0113281, 0.0113281, 0, 0.986082, 1.20691, 0.985358, 1.01144e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@96 at32+m32@96 aB wg 4x4x2 kr cab4x2 ks32 xaf vav di hi pt sr br bk0 sn nb 4x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.66314e+06, -1.27279e+06, -300802, 1.65863e+06, 3.75931e+06, 5.86547e+06, 0.491219, 0.87616, 0.899685, 1.462, 0.00397407, 0.000371744, 0.00376005, 0.648719, 1.3183, 0.990919, 1.99015e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@96 aB wg 2x8x2 kr cab3 ks64 af vav di hi pt sr br bk0 sn nb 2x8 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 49152, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.59165e+06, 
-496791, -255103, 821170, 2.73285e+06, 3.18177e+06, 0.790684, 0.690631, 0.850764, 1.52446, 0.00584761, 0.000218975, 0.00563592, 0.617191, 1.54591, 0.999485, 7.93976e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@64 at32+m32@64 aB wg 2x4x4 kr cab4 ks32 xaf st vav di hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53804e+06, -244254, -110255, 404894, 2.415e+06, 3.12934e+06, 0.469656, 0.821322, 0.84661, 1.33465, 0.00730374, 0.00540283, 0.00285574, 0.877009, 1.23985, 0.959553, 2.29643e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@32 at32+m64@32 aB wg 1x4x8 kr cab4 ks32 xaf st vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.45404e+06, -177745, -43504.6, 259730, 2.40026e+06, 2.42483e+06, 0.652924, 0.534638, 0.729263, 1.27241, 0.00899012, 0.00717777, 0.00313284, 0.933371, 1.20556, 0.9698, 2.67914e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 
2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "I"}, "aB64+m32@96 aS32 aB wg 16x1x2 kr cb4x2 ks32 xaf vav di li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.58636e+06, -482491, -299697, 877643, 1.90054e+06, 1.73179e+06, 0.601985, 6.41237, 0.0513263, 1.10568, 0.0409021, 0.00376531, 0.0384593, 0.895641, 1.00114, 1.00052, -4.11446e-16}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB32x2+m32@64 at32x2+m32@64 aB wg 1x16 xaf st vav di li pt sr br sb128 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03539e+06, 223719, 0, 0, 1.75473e+06, 2.61161e+06, 5.72432, 0.487304, 0.347712, 0.902737, 0.0217428, 0.0032454, 0.0206116, 0.78847, 1.24817, 0.762289, 3.8041e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 4x8 cab4 ks64 af vav di hi pt bk0 grf256 sys l4 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.07428e+06, 870501, 0, 0, 0, 0, 0.501416, 1.12211, 0.858073, 1.52799, 0.00684843, 0.00684843, 0, 0.996251, 1.07336, 0.092233, 6.21635e-12}}}, -{{'F', "gemm", {"O", "O", 
"I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aS64 aB wg 4x8 cab4 ks128 af vav di hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.11515e+06, 925681, 0, 0, 0, 0, 0.830567, 3.60606, 0.636814, 1.59255, 0.0203905, 0.0203905, 0, 0.862215, 1.00371, 1.00069, -4.07711e-16}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 aB wg 2x16 cab4 ks128 af vav di hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05676e+06, 797092, 0, 0, 0, 0, 1.15513, 3.47579, 0.681838, 2.11327, 0.0385831, 0.0385831, 0, 0.774259, 1.0034, 1.00053, 5.87826e-17}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af vav di hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, -{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 aB wg 2x16 cab4 ks128 af vav di hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00981e+06, 799187, 0, 0, 0, 0, 1.93343, 4.24673, 2.12932, 5.7688, 0.146616, 0.146616, 0, 0.630819, 1.00163, 1.00055, -1.04571e-15}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32x2+m32@64 av32+m32@64 aB wg 4x8 cb3 ks32 xaf st vav hi pt sr br bk0 nb 0x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {968994, 716920, 0, 0, 5.78437e+06, 9.22419e+06, 0.534082, 0.742409, 0.894677, 1.49963, 0.00222583, 0.00222583, 0, 0.887893, 1.57089, 1.08446, 1.91309e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32x2+m128@64 av64+m32@64 aB wg 4x4x2 kr cb3 ks64 xaf vav hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.48013e+06, -879245, -212784, 1.19787e+06, 3.27516e+06, 5.10362e+06, 0.353883, 0.703019, 0.911615, 1.47833, 0.00312427, 0.000484667, 0.00281171, 0.593822, 1.43189, 1.02998, 1.76713e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@128 av64+m32@128 aB wg 4x8 cab4x2 ks64 af vav hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 
257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04641e+06, 551646, 0, 0, 0, 0, 0.775015, 0.878808, 0.742373, 1.49937, 0.00558315, 0.00558315, 0, 1, 1.29356, 0.970044, 1.0419e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m32@32 av32 aB wg 4x2x4 kr cb3x2 ks32 xaf st vav hi pt sr br bk0 nb 0x2 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 32}, {4, 2, 4}, 1, (WGType) 1, 261, 6144, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04345e+06, 378139, 1841.02, 195723, 0, 0, 0.492254, 1.08103, 0.714577, 2.08372, 0.00664973, 0.00664973, 0, 0.929186, 1.34886, 1.00044, 8.80626e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@32 av32+m32@32 aB wg 2x4x4 kr cb3 ks32 xaf vav hi pt sr br bk0 nb 0x4 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.06063e+06, 391330, -4561.58, 141436, 0, 0, 0.898973, 0.937592, 0.811295, 2.49893, 0.0100304, 0.0100304, 0, 0.938108, 1.25259, 0.981155, 9.62597e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IABg"}, "av32+m32@128 av32+m32@128 aB wg 4x8 cab3x2 ks64 xaf vav hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 36864, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02633e+06, 512948, 0, 0, 0, 0, 0.616223, 1.4606, 0.843356, 1.74802, 0.0095619, 0.0095619, 0, 1, 1.19005, 0.980175, 
5.84833e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IABg"}, "av64+m32@128 av64+m32@128 aB wg 4x8 cab3x2 ks128 af vav hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02641e+06, 481324, 0, 0, 0, 0, 0.93847, 1.32746, 0.702383, 2.19035, 0.0160592, 0.0160592, 0, 1, 1.04875, 0.330175, 3.7195e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av32+m32@64 aB wg 4x4x2 kr cab3x2 ks64 xaf st vav hi pt sr br bk0 nb 4x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.44895e+06, -475742, -192864, 753196, 2.79429e+06, 3.21044e+06, 0.543815, 0.583688, 0.893765, 1.54599, 0.00436005, 0.000416302, 0.00416311, 0.659609, 1.31185, 0.99626, 1.25799e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64+m32@64 av64+m32@64 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav hi pt sr br bk0 nb 0x2 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.43195e+06, -220355, -88718.8, 353320, 2.49856e+06, 2.51167e+06, 0.377332, 0.75479, 0.906308, 1.29913, 0.0058945, 0.00410324, 0.00237673, 0.692439, 1.23717, 0.463634, 1.60992e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32 at32+m64@32 aB wg 2x2x8 kr ca3x2 ks32 xaf vav hi pt sr br bk0 nb 2x0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.33083e+06, -155638, -32206.7, 215344, 2.58048e+06, 2.38387e+06, 0.560445, 0.453657, 0.73036, 1.3036, 0.00873208, 0.0075434, 0.00252893, 0.878367, 1.27904, 0.492696, 1.21306e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m32@32 av32+m128@32 aB wg 2x2x8 kr cb4x2 ks64 af vav hi pt sr br bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.34069e+06, -116391, -31493.6, 173677, 2.22003e+06, 1.85139e+06, 0.42459, 0.964629, 0.713108, 1.06445, 0.0165022, 0.0154653, 0.00316435, 0.667006, 1.16044, 0.602135, 7.93972e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32x2+m64@64 at64+m32@64 aB wg 1x2x16 kr ca4x2 ks64 af vav hi pt sr br bk0 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 64}, {32, 16, 64}, {1, 2, 16}, 1, (WGType) 1, 445, 8192, 8192, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.31786e+06, -82626.9, -17117, 112973, 3.34234e+06, 1.65478e+06, 0.813229, 0.748021, 0.644888, 1.29433, 0.0397427, 0.0319944, 0.00448936, 0, 1.14457, 0.953023, 3.67408e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIp"}, "aB32 at32+m64@32 aB wg 4x8 cab3x2 ks64 
xaf vav hi pt sr br bk0 sn nb 4x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.06084e+06, 849124, 0, 0, 5.06593e+06, 8.26573e+06, 0.813458, 0.757306, 0.923897, 1.44252, 0.00320502, 0.00320502, 0, 0.998673, 1.38807, 1.02748, 1.29467e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@128 aB wg 4x4x2 kr cab4x2 ks32 xaf vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 261, 49152, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05992e+06, -246452, -7356.34, 1.16386e+06, 0, 0, 0.552551, 0.934019, 0.885358, 1.68144, 0.00394818, 0.00394818, 0, 0.978321, 1.42873, 1.03457, 8.61849e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@96 at32+m32@96 aB wg 2x8x2 kr cab4 ks64 xaf st vav hi pt sr br bk0 sn nb 2x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 65536, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20806e+06, 323026, -63628.4, 430630, 0, 0, 0.878466, 0.849418, 0.805392, 1.56666, 0.00632332, 0.00632332, 0, 0.885776, 1.24132, 0.950129, 2.21324e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32x2 at32x2+m32@64 aB wg 2x4x4 kr cab3 ks32 xaf vav hi pt sr br bk0 sn nb 2x4 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 
262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07555e+06, 518207, -1629.24, 196804, 0, 0, 0.762617, 0.80574, 0.491797, 2.04474, 0.00767, 0.00767, 0, 0.988971, 1.22544, 0.980506, 2.21112e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at64+m32@96 aB wg 1x8x4 kr cab4x2 ks64 af vav hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 16777216}, {64, 8, 64}, {1, 8, 4}, 1, (WGType) 1, 261, 32768, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04941e+06, 600870, 2948.13, 138773, 0, 0, 1.04105, 0.750519, 0.818337, 2.45508, 0.0113281, 0.0113281, 0, 0.986082, 1.20691, 0.985358, 1.01144e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@96 at32+m32@96 aB wg 4x4x2 kr cab4x2 ks32 xaf vav hi pt sr br bk0 sn nb 4x4 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.66314e+06, -1.27279e+06, -300802, 1.65863e+06, 3.75931e+06, 5.86547e+06, 0.491219, 0.87616, 0.899685, 1.462, 0.00397407, 0.000371744, 0.00376005, 0.648719, 1.3183, 0.990919, 1.99015e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BI"}, "aB32 at32+m32@96 aB wg 2x8x2 kr cab3 ks64 af vav hi pt sr br bk0 sn nb 2x8 grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 64}, {64, 16, 64}, {2, 8, 2}, 1, (WGType) 1, 445, 49152, 65536, {4, 4, 4}, {true, 
true, true}}, {'E', 17, {1.59165e+06, -496791, -255103, 821170, 2.73285e+06, 3.18177e+06, 0.790684, 0.690631, 0.850764, 1.52446, 0.00584761, 0.000218975, 0.00563592, 0.617191, 1.54591, 0.999485, 7.93976e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@64 at32+m32@64 aB wg 2x4x4 kr cab4 ks32 xaf st vav hi pt sr br bk0 sn nb 2x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53804e+06, -244254, -110255, 404894, 2.415e+06, 3.12934e+06, 0.469656, 0.821322, 0.84661, 1.33465, 0.00730374, 0.00540283, 0.00285574, 0.877009, 1.23985, 0.959553, 2.29643e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#ABI"}, "aB32+m32@32 at32+m64@32 aB wg 1x4x8 kr cab4 ks32 xaf st vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.45404e+06, -177745, -43504.6, 259730, 2.40026e+06, 2.42483e+06, 0.652924, 0.534638, 0.729263, 1.27241, 0.00899012, 0.00717777, 0.00313284, 0.933371, 1.20556, 0.9698, 2.67914e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af vav hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 
189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "I"}, "aB64+m32@96 aS32 aB wg 16x1x2 kr cb4x2 ks32 xaf vav li nmk pt sr br bk0 grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {16, 1, 2}, 1, (WGType) 1, 445, 2048, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.58636e+06, -482491, -299697, 877643, 1.90054e+06, 1.73179e+06, 0.601985, 6.41237, 0.0513263, 1.10568, 0.0409021, 0.00376531, 0.0384593, 0.895641, 1.00114, 1.00052, -4.11446e-16}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB32x2+m32@64 at32x2+m32@64 aB wg 1x16 xaf st vav li pt sr br sb128 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03539e+06, 223719, 0, 0, 1.75473e+06, 2.61161e+06, 5.72432, 0.487304, 0.347712, 0.902737, 0.0217428, 0.0032454, 0.0206116, 0.78847, 1.24817, 0.762289, 3.8041e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 4x8 cab4 ks64 af vav hi pt bk0 grf256 sys l4 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.07428e+06, 870501, 0, 0, 0, 0, 0.501416, 1.12211, 0.858073, 1.52799, 0.00684843, 0.00684843, 0, 0.996251, 1.07336, 0.092233, 6.21635e-12}}}, +{{'F', "gemm", 
{"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB64 aS64 aB wg 4x8 cab4 ks128 af vav hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.11515e+06, 925681, 0, 0, 0, 0, 0.830567, 3.60606, 0.636814, 1.59255, 0.0203905, 0.0203905, 0, 0.862215, 1.00371, 1.00069, -4.07711e-16}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 aB wg 2x16 cab4 ks128 af vav hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05676e+06, 797092, 0, 0, 0, 0, 1.15513, 3.47579, 0.681838, 2.11327, 0.0385831, 0.0385831, 0, 0.774259, 1.0034, 1.00053, 5.87826e-17}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af vav hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, +{{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32x2 aS32x2 aB wg 2x16 cab4 ks128 af vav hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00981e+06, 799187, 0, 0, 0, 0, 1.93343, 4.24673, 2.12932, 5.7688, 0.146616, 0.146616, 0, 0.630819, 1.00163, 1.00055, -1.04571e-15}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "i"}, "av32 aS32+m32@8 aB wg 4x8 cab4 ks32 nse hi pt bk0 sn nb 4x8 grf256 kv afb sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.4409e+06, 852858, 0, 0, 4.12877e+06, 8.43776e+06, 0.61084, 0.782227, 0.876523, 1.49805, 0.0164756, 0.0164756, 0, 1, 1.10945, 0.995623, 8.03063e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#i"}, "aB8x2 aS8x2 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04685e+06, 855332, 0, 0, 3.9977e+06, 9.216e+06, 0.658789, 0.718726, 0.867169, 1.54002, 0.0168389, 0.0168389, 0, 1, 1.09776, 0.992762, 7.01917e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB8/4 aS8 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {991806, 971407, 0, 0, 4.04685e+06, 9.01939e+06, 1.72245, 
1.50029, 0.860423, 1.51374, 0.0176793, 0.0176793, 0, 1, 1.07978, 0.979007, 6.31428e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m128@96 am64+m64@112 aB wg 4x8 xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m128@96 am64+m64@128 aB wg 8x4 xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {895996, 619115, 0, 0, 5.44113e+06, 8.45414e+06, 0.385983, 0.342777, 0.788529, 1.22228, 0.00199536, 0.00199536, 0, 1, 1.60168, 1.14344, 3.24663e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m64@64 am64+m128@96 aB wg 8x4 xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {873940, 565190, 0, 0, 4.58752e+06, 7.00416e+06, 0.375538, 0.421029, 0.747052, 1.19407, 0.00251587, 0.00251587, 0, 1, 1.69069, 1.13204, 1.62736e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m64@64 am64+m64@96 aB wg 8x2x2 kr af vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.11662e+06, -910592, -33046.6, 1.19754e+06, 3.36527e+06, 5.0217e+06, 0.258583, 0.403911, 0.818673, 1.29478, 0.00269142, 0.000381417, 0.00261948, 0.597966, 1.74376, 1.14072, 1.43395e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m64@64 am64+m64@96 aB wg 8x2x2 kr af vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.05222e+06, -111578, 229.635, 796442, 0, 0, 0.253233, 0.406643, 0.813555, 1.45295, 0.00276155, 0.00276155, 0, 0.877781, 1.50355, 1.01103, 4.94859e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@128 am32 aB wg 16x2 cb3x2 ks64 af vav di hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {985303, 529193, 0, 0, 0, 0, 0.334488, 0.597794, 0.849156, 1.3743, 0.00359035, 0.00359035, 0, 0.903609, 1.40249, 1.04911, 2.49355e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am32x2+m128@64 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv 
afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.05983e+06, -474100, -2272.62, 713748, 2.81641e+06, 3.33414e+06, 0.403748, 0.373013, 0.772197, 1.18809, 0.00404296, 0.000494942, 0.00376492, 0.513109, 1.48851, 1.00899, 2.18946e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m128@64 am64+m32@64 aB wg 4x4x2 kr xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.07887e+06, 165901, -16323.3, 390698, 0, 0, 0.35439, 0.336657, 0.782861, 1.6985, 0.00390024, 0.00390024, 0, 0.971171, 1.481, 1.0484, 1.29175e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m128@128 am64+m32@128 aB wg 8x4 af vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {887443, 420819, 0, 0, 0, 0, 0.443197, 0.408057, 0.55357, 1.07857, 0.00424551, 0.00424551, 0, 1, 1.4159, 1.00616, 1.44951e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 aB wg 4x2x4 kr xaf vav di hi pt sr br sb128 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 
445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.14066e+06, -229433, -14533.6, 346037, 2.54935e+06, 2.45105e+06, 0.222742, 0.287911, 0.779924, 1.34129, 0.00482324, 0.00413971, 0.00147387, 0.486442, 1.45855, 0.962427, 5.617e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am64+m128@96 aB wg 8x4 af vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {883828, 389004, 0, 0, 0, 0, 0.372711, 0.671929, 0.348909, 0.985628, 0.00656641, 0.00656641, 0, 0.979495, 1.22863, 0.98572, 8.6565e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 aB wg 2x2x8 kr xaf vav di hi pt sr br sb128 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.14864e+06, -151615, -6901.62, 210708, 2.44695e+06, 2.41664e+06, 0.224008, 0.202288, 0.73347, 1.38945, 0.00597322, 0.00649637, 0.00100094, 0.864243, 1.42077, 0.800446, 1.1311e-11}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am64+m64@96 aB wg 4x4x2 kr xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {958974, 226613, 38729.4, 218499, 0, 0, 0.349594, 0.344345, 0.603399, 
2.13905, 0.00724009, 0.00724009, 0, 1, 1.31855, 0.978713, 1.64485e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am32+m128@96 aB wg 4x8 xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {879427, 374769, 0, 0, 0, 0, 0.681995, 0.637561, 0.593202, 1.53194, 0.0120684, 0.0120684, 0, 0.902553, 1.05814, 0.58533, 4.64073e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32x2+m64@96 am64+m32@128 aB wg 4x2x4 kr xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12606e+06, -137008, -24369.3, 228077, 2.14139e+06, 1.80224e+06, 0.232306, 0.352856, 0.366483, 1.01866, 0.0096663, 0.00801051, 0.00318484, 0.842293, 1.37613, 0.921541, 4.97764e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m32@96 am32x2+m64@64 aB wg 4x2x4 kr af vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {993405, 342847, -2034.13, 109213, 0, 0, 0.288855, 0.497534, 1.21807, 3.58096, 0.01097, 0.01097, 0, 1, 1.25266, 0.993647, 1.05629e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@128 am64+m128@128 aB wg 2x2x8 kr xaf vav di hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.09423e+06, -83565.2, -8479.72, 132562, 2.41664e+06, 1.75309e+06, 0.217848, 0.232009, 0.349329, 1.02701, 0.0120474, 0.012138, 0.00283781, 0.878271, 1.29462, 0.7804, 9.59757e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m32@96 am32x2+m64@64 aB wg 2x4x4 kr xaf st vav di hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {966602, 338111, -3348.28, 97829.3, 0, 0, 0.401147, 0.39872, 2.19696, 6.17459, 0.0151964, 0.0151964, 0, 1, 1.21501, 0.949127, 1.33783e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "IAB"}, "at32+m128@32 am128+m128@128 aB wg 8x4 af vav di li nmk pt sr br sb128 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 128}, {16, 4, 128}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01309e+06, 225543, 0, 0, 1.84812e+06, 1.93577e+06, 0.318973, 1.6272, 0.589134, 1.33561, 0.014082, 0.00175378, 0.0216268, 1, 1.06459, 0.958519, 4.93673e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32+m64@96 aB32 aB wg 8x4 cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 
256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05324e+06, 979401, 0, 0, 5.54107e+06, 7.84794e+06, 0.402313, 0.410019, 0.918697, 1.43736, 0.00226438, 0.00226438, 0, 1, 1.62189, 1.08285, 3.19321e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIp"}, "at32+m64@96 aB32 aB wg 16x2 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 128}, {16, 96, 128}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04068e+06, 873177, 0, 0, 3.37183e+06, 6.29965e+06, 0.384445, 0.528477, 0.860175, 1.32814, 0.00247393, 0.00247393, 0, 0.990759, 1.57684, 1.14116, 2.03781e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at64+m64@128 aB32x2 aB wg 16x2 cb3 ks64 xaf st vav di hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {973856, 781280, 0, 0, 3.47341e+06, 4.52198e+06, 0.376801, 0.745941, 0.834762, 1.36837, 0.00306578, 0.00306578, 0, 1, 1.548, 1.17568, 1.37045e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32x2+m64@64 aB32x2 aB wg 8x2x2 kr cb4 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 
64}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.76235e+06, -528635, -346576, 928208, 2.73039e+06, 3.28253e+06, 0.411331, 0.407059, 0.82943, 1.26791, 0.00365219, 0.000634094, 0.00340305, 0.949468, 1.57605, 1.11643, 2.18464e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB64 aB wg 8x4 cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {979561, 775985, 0, 0, 0, 0, 0.472998, 0.448938, 0.785458, 1.5451, 0.00427182, 0.00427182, 0, 1, 1.48807, 0.994577, 1.6929e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@32 aB64 aB wg 8x1x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.72793e+06, -260474, -161085, 447531, 2.31834e+06, 2.89178e+06, 0.238818, 0.354785, 0.799686, 1.20956, 0.00460907, 0.00312961, 0.00215107, 0.989106, 1.46994, 1.02912, 3.02074e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@64 aB32 aB wg 8x2x4 kr cb4 ks32 af vav di hi pt sr br bk0 sm sn dm sys l4 l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10055e+06, 
1.06985e+06, 1310.76, 199871, 0, 0, 0.375142, 0.660095, 0.719737, 1.72017, 0.00584039, 0.00584039, 0, 1, 1.38677, 1.01044, 1.17577e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@128 aB64x2 aB wg 8x4 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00856e+06, 729582, 0, 0, 0, 0, 0.391803, 0.680643, 0.557725, 1.58519, 0.00733339, 0.00733339, 0, 0.974679, 1.32826, 0.97557, 9.11096e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@64 aB32x2 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 64}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.84304e+06, -196719, -186130, 412502, 2.21921e+06, 2.048e+06, 0.286789, 0.263596, 0.436547, 1.25452, 0.00648641, 0.00488563, 0.00263508, 0.940889, 1.37843, 0.889564, 5.23147e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at64+m32@64 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05176e+06, 782569, -1901.36, 140619, 0, 0, 0.336566, 0.331805, 0.795679, 2.44302, 0.00752315, 0.00752315, 0, 1, 1.33186, 
0.99299, 1.10267e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@128 aB128 aB wg 4x8 cb4x2 ks128 xaf st vav di hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.68462e+06, 614668, 0, 0, 0, 0, 0.528613, 0.557291, -0.992969, 0.00078125, 0.0113836, 0.0113836, 0, 0.96247, 1.20355, 0.897505, 1.16415e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32x2+m64@96 aB64 aB wg 4x2x4 kr cb4x2 ks128 af vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.63724e+06, -117354, -138376, 322423, 1.85139e+06, 1.3271e+06, 0.217482, 0.358374, -0.173415, 0.826585, 0.00954542, 0.00723565, 0.00380699, 0.869435, 1.30653, 0.807528, 6.89227e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at64+m64@96 aB64x2 aB wg 2x2x8 kr cb4x2 ks64 xaf vav di hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.65053e+06, -99807.1, -68122.5, 193337, 2.53952e+06, 1.70394e+06, 0.244079, 0.25177, 0.0702585, 0.887446, 0.0117307, 0.00844281, 0.00495289, 0.997313, 1.45603, 0.945166, 4.40941e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB128+m64@64 aB wg 2x2x8 kr af vav di hi pt sr br sb128 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {995441, 575346, 2016.61, 75427, 0, 0, 0.311133, 0.344824, 2.79512, 7.15467, 0.0157164, 0.0157164, 0, 0.82504, 1.17356, 0.963482, 1.218e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at32x2+m128@32 aB64+m32@64 aB wg 4x2x4 kr af vav di li nmk pt sr br sb128 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 64}, {16, 8, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2871e+06, -132026, -84694.7, 251812, 2.17989e+06, 1.66871e+06, 0.26071, 0.702238, 0.035417, 1.12604, 0.0101197, 0.0087237, 0.010893, 1, 1.29244, 0.838716, 2.17924e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB128 aB wg 1x4x8 kr af vav di li pt sr br sb128 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m128@96 am64+m64@112 aB 
wg 4x8 xaf st vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m128@96 am64+m64@128 aB wg 8x4 xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {895996, 619115, 0, 0, 5.44113e+06, 8.45414e+06, 0.385983, 0.342777, 0.788529, 1.22228, 0.00199536, 0.00199536, 0, 1, 1.60168, 1.14344, 3.24663e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32x2+m64@64 am64+m128@96 aB wg 8x4 xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {873940, 565190, 0, 0, 4.58752e+06, 7.00416e+06, 0.375538, 0.421029, 0.747052, 1.19407, 0.00251587, 0.00251587, 0, 1, 1.69069, 1.13204, 1.62736e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m64@64 am64+m64@96 aB wg 8x2x2 kr af vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 
64}, {32, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.11662e+06, -910592, -33046.6, 1.19754e+06, 3.36527e+06, 5.0217e+06, 0.258583, 0.403911, 0.818673, 1.29478, 0.00269142, 0.000381417, 0.00261948, 0.597966, 1.74376, 1.14072, 1.43395e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32+m64@64 am64+m64@96 aB wg 8x2x2 kr af vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.05222e+06, -111578, 229.635, 796442, 0, 0, 0.253233, 0.406643, 0.813555, 1.45295, 0.00276155, 0.00276155, 0, 0.877781, 1.50355, 1.01103, 4.94859e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@128 am32 aB wg 16x2 cb3x2 ks64 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {985303, 529193, 0, 0, 0, 0, 0.334488, 0.597794, 0.849156, 1.3743, 0.00359035, 0.00359035, 0, 0.903609, 1.40249, 1.04911, 2.49355e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am32x2+m128@64 aB wg 4x4x2 kr xaf st vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.05983e+06, -474100, -2272.62, 713748, 
2.81641e+06, 3.33414e+06, 0.403748, 0.373013, 0.772197, 1.18809, 0.00404296, 0.000494942, 0.00376492, 0.513109, 1.48851, 1.00899, 2.18946e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m128@64 am64+m32@64 aB wg 4x4x2 kr xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.07887e+06, 165901, -16323.3, 390698, 0, 0, 0.35439, 0.336657, 0.782861, 1.6985, 0.00390024, 0.00390024, 0, 0.971171, 1.481, 1.0484, 1.29175e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m128@128 am64+m32@128 aB wg 8x4 af vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {887443, 420819, 0, 0, 0, 0, 0.443197, 0.408057, 0.55357, 1.07857, 0.00424551, 0.00424551, 0, 1, 1.4159, 1.00616, 1.44951e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 aB wg 4x2x4 kr xaf vav hi pt sr br sb128 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.14066e+06, -229433, -14533.6, 346037, 2.54935e+06, 2.45105e+06, 0.222742, 0.287911, 0.779924, 1.34129, 0.00482324, 0.00413971, 0.00147387, 0.486442, 1.45855, 0.962427, 5.617e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, 
{"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am64+m128@96 aB wg 8x4 af vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {883828, 389004, 0, 0, 0, 0, 0.372711, 0.671929, 0.348909, 0.985628, 0.00656641, 0.00656641, 0, 0.979495, 1.22863, 0.98572, 8.6565e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@96 am64+m64@96 aB wg 2x2x8 kr xaf vav hi pt sr br sb128 bk0 sm sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.14864e+06, -151615, -6901.62, 210708, 2.44695e+06, 2.41664e+06, 0.224008, 0.202288, 0.73347, 1.38945, 0.00597322, 0.00649637, 0.00100094, 0.864243, 1.42077, 0.800446, 1.1311e-11}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am64+m64@96 aB wg 4x4x2 kr xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 16384, {4, 8, 4}, {true, true, true}}, {'E', 17, {958974, 226613, 38729.4, 218499, 0, 0, 0.349594, 0.344345, 0.603399, 2.13905, 0.00724009, 0.00724009, 0, 1, 1.31855, 0.978713, 1.64485e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32+m64@96 am32+m128@96 aB wg 4x8 xaf st vav hi pt sr br sb128 bk0 sm sn grf256 
sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {879427, 374769, 0, 0, 0, 0, 0.681995, 0.637561, 0.593202, 1.53194, 0.0120684, 0.0120684, 0, 0.902553, 1.05814, 0.58533, 4.64073e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32x2+m64@96 am64+m32@128 aB wg 4x2x4 kr xaf st vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12606e+06, -137008, -24369.3, 228077, 2.14139e+06, 1.80224e+06, 0.232306, 0.352856, 0.366483, 1.01866, 0.0096663, 0.00801051, 0.00318484, 0.842293, 1.37613, 0.921541, 4.97764e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m32@96 am32x2+m64@64 aB wg 4x2x4 kr af vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {993405, 342847, -2034.13, 109213, 0, 0, 0.288855, 0.497534, 1.21807, 3.58096, 0.01097, 0.01097, 0, 1, 1.25266, 0.993647, 1.05629e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m64@128 am64+m128@128 aB wg 2x2x8 kr xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 
445, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.09423e+06, -83565.2, -8479.72, 132562, 2.41664e+06, 1.75309e+06, 0.217848, 0.232009, 0.349329, 1.02701, 0.0120474, 0.012138, 0.00283781, 0.878271, 1.29462, 0.7804, 9.59757e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at64+m32@96 am32x2+m64@64 aB wg 2x4x4 kr xaf st vav hi pt sr br sb128 bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 8, 4}, {true, true, true}}, {'E', 17, {966602, 338111, -3348.28, 97829.3, 0, 0, 0.401147, 0.39872, 2.19696, 6.17459, 0.0151964, 0.0151964, 0, 1, 1.21501, 0.949127, 1.33783e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {4, 8, 1}, "IAB"}, "at32+m128@32 am128+m128@128 aB wg 8x4 af vav li nmk pt sr br sb128 bk0 sm sys kv afb", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 128}, {16, 4, 128}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01309e+06, 225543, 0, 0, 1.84812e+06, 1.93577e+06, 0.318973, 1.6272, 0.589134, 1.33561, 0.014082, 0.00175378, 0.0216268, 1, 1.06459, 0.958519, 4.93673e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AI"}, "at32+m64@96 aB32 aB wg 8x4 cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05324e+06, 979401, 0, 0, 5.54107e+06, 7.84794e+06, 0.402313, 0.410019, 0.918697, 1.43736, 0.00226438, 
0.00226438, 0, 1, 1.62189, 1.08285, 3.19321e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#AIps"}, "at32+m64@96 aB32 aB wg 16x2 cb4x2 ks128 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1572864, 16777216}, {262144, 1572864, 128}, {16, 96, 128}, {16, 2, 1}, 1, (WGType) 1, 441, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04068e+06, 873177, 0, 0, 3.37183e+06, 6.29965e+06, 0.384445, 0.528477, 0.860175, 1.32814, 0.00247393, 0.00247393, 0, 0.990759, 1.57684, 1.14116, 2.03781e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ipq"}, "at64+m64@128 aB32x2 aB wg 16x2 cb3 ks64 xaf st vav hi pt sr br bk0 sm sn nb 0x2 dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {16, 2, 1}, 1, (WGType) 1, 441, 24576, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {973856, 781280, 0, 0, 3.47341e+06, 4.52198e+06, 0.376801, 0.745941, 0.834762, 1.36837, 0.00306578, 0.00306578, 0, 1, 1.548, 1.17568, 1.37045e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "at32x2+m64@64 aB32x2 aB wg 8x2x2 kr cb4 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 2, 2}, 1, (WGType) 1, 445, 32768, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.76235e+06, -528635, -346576, 928208, 2.73039e+06, 3.28253e+06, 0.411331, 0.407059, 0.82943, 1.26791, 0.00365219, 0.000634094, 0.00340305, 0.949468, 1.57605, 1.11643, 2.18464e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB64 aB wg 8x4 cb4x2 ks64 xaf vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {979561, 775985, 0, 0, 0, 0, 0.472998, 0.448938, 0.785458, 1.5451, 0.00427182, 0.00427182, 0, 1, 1.48807, 0.994577, 1.6929e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "at32+m64@32 aB64 aB wg 8x1x4 kr cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 1048576, 16777216}, {262144, 1048576, 64}, {16, 64, 64}, {8, 1, 4}, 1, (WGType) 1, 445, 16384, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.72793e+06, -260474, -161085, 447531, 2.31834e+06, 2.89178e+06, 0.238818, 0.354785, 0.799686, 1.20956, 0.00460907, 0.00312961, 0.00215107, 0.989106, 1.46994, 1.02912, 3.02074e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "at64+m64@64 aB32 aB wg 8x2x4 kr cb4 ks32 af vav hi pt sr br bk0 sm sn dm sys l2d", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {8, 2, 4}, 1, (WGType) 1, 261, 8192, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10055e+06, 1.06985e+06, 1310.76, 199871, 0, 0, 0.375142, 0.660095, 0.719737, 1.72017, 0.00584039, 0.00584039, 0, 1, 1.38677, 1.01044, 1.17577e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@128 aB64x2 aB wg 8x4 cb4x2 ks128 xaf st vav hi pt sr br bk0 sm 
sn nb 0x4 dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.00856e+06, 729582, 0, 0, 0, 0, 0.391803, 0.680643, 0.557725, 1.58519, 0.00733339, 0.00733339, 0, 0.974679, 1.32826, 0.97557, 9.11096e-13}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "at64+m64@64 aB32x2 aB wg 4x2x4 kr cb4x2 ks64 xaf st vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 64}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.84304e+06, -196719, -186130, 412502, 2.21921e+06, 2.048e+06, 0.286789, 0.263596, 0.436547, 1.25452, 0.00648641, 0.00488563, 0.00263508, 0.940889, 1.37843, 0.889564, 5.23147e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#Ip"}, "at64+m32@64 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 64}, {4, 2, 4}, 1, (WGType) 1, 261, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.05176e+06, 782569, -1901.36, 140619, 0, 0, 0.336566, 0.331805, 0.795679, 2.44302, 0.00752315, 0.00752315, 0, 1, 1.33186, 0.99299, 1.10267e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@128 aB128 aB wg 4x8 cb4x2 ks128 xaf st vav hi pt sr br bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 
16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.68462e+06, 614668, 0, 0, 0, 0, 0.528613, 0.557291, -0.992969, 0.00078125, 0.0113836, 0.0113836, 0, 0.96247, 1.20355, 0.897505, 1.16415e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32x2+m64@96 aB64 aB wg 4x2x4 kr cb4x2 ks128 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.63724e+06, -117354, -138376, 322423, 1.85139e+06, 1.3271e+06, 0.217482, 0.358374, -0.173415, 0.826585, 0.00954542, 0.00723565, 0.00380699, 0.869435, 1.30653, 0.807528, 6.89227e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#I"}, "at64+m64@96 aB64x2 aB wg 2x2x8 kr cb4x2 ks64 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.65053e+06, -99807.1, -68122.5, 193337, 2.53952e+06, 1.70394e+06, 0.244079, 0.25177, 0.0702585, 0.887446, 0.0117307, 0.00844281, 0.00495289, 0.997313, 1.45603, 0.945166, 4.40941e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@64 aB128+m64@64 aB wg 2x2x8 kr af vav hi pt sr br sb128 bk0 sm sn dm grf256 sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 
4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {995441, 575346, 2016.61, 75427, 0, 0, 0.311133, 0.344824, 2.79512, 7.15467, 0.0157164, 0.0157164, 0, 0.82504, 1.17356, 0.963482, 1.218e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "at32x2+m128@32 aB64+m32@64 aB wg 4x2x4 kr af vav li nmk pt sr br sb128 bk0 sm sn dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 64}, {16, 8, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2871e+06, -132026, -84694.7, 251812, 2.17989e+06, 1.66871e+06, 0.26071, 0.702238, 0.035417, 1.12604, 0.0101197, 0.0087237, 0.010893, 1, 1.29244, 0.838716, 2.17924e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "#I"}, "at32+m64@96 aB128 aB wg 1x4x8 kr af vav li pt sr br sb128 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 128}, {16, 16, 128}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.19588e+06, -90151.7, -17276.9, 143923, 2.18726e+06, 1.17965e+06, 0.50932, 0.206862, 0.351165, 0.961797, 0.00952689, 0.0094563, 0.00799782, 1, 1.08799, 0.29902, 3.8884e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aS32+S64@96 aB64+S32@96 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 64}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.18068e+06, 53960.8, 0, 0, 0, 0, 0.213645, 3.70034, 2.54839, 9.45781, 0.067707, 0.0150101, 
0.0808417, 1, 1.00383, 0, 0}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS16+m32@32 aB8x2+m16@32 aB wg 8x2 cb3 ks32 nse hi pt sr br bk0 sm sn grf256 kv afb l4 dm", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 2, 1}, 1, (WGType) 1, 441, 12288, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.43403e+06, 452176, 0, 0, 4.48102e+06, 9.67475e+06, 0.486151, 0.555121, 0.890086, 1.40877, 0.0164251, 0.000123601, 0.0164842, 0.490983, 1.14664, 1.00079, 8.17626e-13}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABpqI"}, "am64+S1,64@128 av64+B64@128 aS cs di sys grf256 af wg 8x4 bo sb512 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}}, -{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32 aS32 aB sys grf256 cab2 wg 4x4 ek l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.16401e+06, 348644, 0, 0, 0, 0, 0.807306, 0.892675, 0.990554, 1.4802, 0.00939438, 0.000733543, 0.0109328, 0.899502, 1.01113, 1.00523, 2.16142e-14}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABpqI"}, "am64+S1,64@128 av64+B64@128 aS cs sys grf256 af wg 8x4 bo sb512 sm bk0 sr", {16, 
(LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}}, +{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Is"}, "aS32 aS32 aB sys grf256 cab2 wg 4x4 ek l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.16401e+06, 348644, 0, 0, 0, 0, 0.807306, 0.892675, 0.990554, 1.4802, 0.00939438, 0.000733543, 0.0109328, 0.899502, 1.01113, 1.00523, 2.16142e-14}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 aB wg 8x4 cab3 ks32 nse hi pt sr bk0 grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.02524e+06, 922411, 0, 0, 4.21888e+06, 8.7081e+06, 0.917542, 0.658478, 0.919692, 1.40366, 0.0167399, 0.0167399, 0, 1, 1.08933, 0.991295, 6.92853e-13}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 
0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, -{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "gemm", {"O", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 
aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, {{'F', "gemm", {"O", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, 
{16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"O", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at16x2+m16@32 at16+m32@32 aB wg 16x1x2 kr kc16 nse nmk li pt sr sb256 bk0 sm grf256 kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {262144, 65536, 32}, {16, 4, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ip"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav di hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 
1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"N", 
"T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06, 1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "I"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4 l2d", {16, (LoopType) 
255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015, 2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ip"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, {{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+S1,16@24 aS16+S32@16 aB wg 2x2x8 kr vav hi pt sr sb256 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, 
{1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {16, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}}, -{{'F', "gemm", {"Q", "Q", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@16 aB8+B8@16 aU vav di wg 8x4 bo pt sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, +{{'F', "gemm", {"Q", "Q", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@16 aB8+B8@16 aU vav wg 8x4 bo pt sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, {{'F', "gemm", {"S", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"S", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+m32@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn grf256 l4", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {1, 1, 1}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB8+B8@8 aB nse di wg 4x8 bo pt sb256 kc8 bk0 sr", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m8@32 aS32+m16@40 aB wg 4x4 kc16 nse di hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08792e+06, 260070, 0, 0, 0, 0, 1.27159, 2.25336, 0.633711, 1.35704, 0.0632943, 
0.00105479, 0.0694168, 0.543903, 1.15915, 0.195161, 2.93818e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 am/S16+S32@32 aB wg 4x8 kc8 nse di hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {933004, 470490, 0, 0, 0, 0, 2.49562, 3.97982, 0.810184, 1.38841, 0.0630776, 0.0630776, 0, 1, 1.22055, -0.309162, 2.6504e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, ""}, "am16+m16@64 am/S32+m32@64 aB wg 4x8 kc16 nse di hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {895520, 534803, 0, 0, 0, 0, 1.20137, 3.44786, 1.88791, 3.67536, 0.0747548, 0.0747548, 0, 1, 1.37174, 0.989936, 2.27372e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, 256, 256}, {8, 8, 1}, ""}, "am16+m32@64 am/S32x2+m16@32 aB wg 2x16 kc16 nse di hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {938708, 543839, 0, 0, 0, 0, 2.39026, 3.22437, 2.56416, 5.86786, 0.101672, 0.101672, 0, 0.999145, 1.33995, 1.00257, 1.83338e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB8+B8@8 aB nse wg 4x8 bo pt sb256 kc8 bk0 sr", {16, (LoopType) 255, 128, 
{(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m8@32 aS32+m16@40 aB wg 4x4 kc16 nse hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08792e+06, 260070, 0, 0, 0, 0, 1.27159, 2.25336, 0.633711, 1.35704, 0.0632943, 0.00105479, 0.0694168, 0.543903, 1.15915, 0.195161, 2.93818e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 am/S16+S32@32 aB wg 4x8 kc8 nse hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {933004, 470490, 0, 0, 0, 0, 2.49562, 3.97982, 0.810184, 1.38841, 0.0630776, 0.0630776, 0, 1, 1.22055, -0.309162, 2.6504e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, ""}, "am16+m16@64 am/S32+m32@64 aB wg 4x8 kc16 nse hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {895520, 534803, 0, 0, 0, 0, 1.20137, 3.44786, 1.88791, 3.67536, 0.0747548, 0.0747548, 0, 1, 1.37174, 0.989936, 2.27372e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {256, 256, 256}, {8, 8, 1}, ""}, "am16+m32@64 am/S32x2+m16@32 aB wg 2x16 kc16 nse hi pt sb256 bk0 sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {938708, 543839, 0, 0, 0, 0, 2.39026, 3.22437, 2.56416, 5.86786, 0.101672, 0.101672, 0, 0.999145, 1.33995, 1.00257, 1.83338e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8/4+B8@32 aS8+S1,16@32 aB nse wg 4x8 bo pt kc8 sn sb256 grf256 bk0 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 0}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {877281, 856555, 0, 0, 6.69286e+06, 0, 1.87833, 2.50913, 0.836287, 1.45578, 0.0625082, 0.0625082, 0, 1, 1.00952, 0.859121, 3.28697e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {64, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@8 aS8x2+S16@8 aB wg 2x4x4 kr kc8 nse hi pt sr kv sb64 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {64, 
16, 8}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22775e+06, -292493, -31491.9, 424162, 3.35872e+06, 0, 1.38608, 1.51352, 0.865001, 1.65146, 0.0625896, 0.0512516, 0.0196923, 0.635005, 1.1356, -0.568771, 2.52473e-11}}}, @@ -900,18 +985,18 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {63, -1, -1}, {1, 1, 1}, ""}, "am16x2+m16@16 aS16+m16@16 aB wg 1x16x2 kr kc16 nse li pt sr kv sb256 bk0 sn grf256 afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {1, 16, 2}, 1, (WGType) 1, 413, 0, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.11666e+06, -375280, -55079.8, 590017, 2.65421e+06, 0, 6.85937, 1.10986, 0.561901, 1.18544, 0.0718347, 0.0118681, 0.0644105, 0.645395, 1.19384, 0.859778, 5.37885e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "am16x2+S32@16 aS32+S16@32 aS wg 1x4x8 kr kc16 nse li pt sr kv sb256 bk0 sn grf256 afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {65536, 262144, 16777216}, {8192, 8192, 0}, {4, 16, 32}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 1024, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21263e+06, -101219, -23787.5, 154376, 2.3765e+06, 0, 4.80141, 0.952132, 2.62236, 5.17717, 0.130559, 0.263753, 0.0519091, 1, 1.25383, 1.14072, -8.09176e-14}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+S32@24 aS8+S16@32 aS wg 1x8x4 kr kc8 nse li pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {8192, 8192, 0}, {1, 16, 8}, {1, 8, 4}, 1, (WGType) 1, 413, 0, 512, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16307e+06, -117224, -49945.9, 
211601, 2.21184e+06, 0, 27.752, 0.924823, 7.95811, 19.7907, 0.498836, 0.333805, 0.307337, 1, 1.19347, 0, 0}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m16@8 aS16+m32@8 aB wg 2x4x4 kr kc16 nse di hi pt sr sb256 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22732e+06, -192672, -38121.5, 322384, 3.32595e+06, 0, 2.49763, 1.54907, 0.628927, 1.39358, 0.0627876, 0.0504407, 0.0208002, 0.589308, 1.19034, 0.60621, 4.79866e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m16@32 aS16+m16@32 aB wg 2x4x4 kr kc16 nse di hi pt sr sb256 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.24447e+06, -148653, -56383, 266371, 2.85901e+06, 0, 1.66468, 2.34448, 0.802059, 1.33917, 0.0632256, 0.0464224, 0.0241823, 0.804536, 1.32602, 0.945203, 1.61386e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse di li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {32, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, 
-1}, {-1, -1, -1}, {1024, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m8@24 aS16+m16@24 aB wg 1x2x16 kr kc16 nse di hi pt sr br sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.19256e+06, -99702.7, -5250.1, 135470, 4.21888e+06, 0, 0.924905, 1.15646, 0.601147, 1.43162, 0.0628943, 0.0635026, 0.0173549, 1, 1.36155, 1.02238, 5.68694e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aS16+m8@24 aB wg 1x4x8 kr kc8 nse di hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {1, 4, 8}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {973570, 471411, 10282.9, 75820.8, 0, 0, 1.45806, 1.71929, 3.09927, 7.18832, 0.0643004, 0.0643004, 0, 0.97308, 1.43932, 0.981141, 7.50692e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1023, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aS16+m8@24 aB wg 1x4x8 kr kc8 nse di hi pt sb256 bk0 sn grf256 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21899e+06, -88481.9, -25272.1, 146166, 3.03923e+06, 0, 1.52455, 1.45603, 0.792316, 1.37923, 0.0633789, 0.0507551, 0.0245627, 1, 1.38276, 0.904775, 1.71039e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {256, 256, 64}, {1, 1, 1}, ""}, "aB8+m16@8 aS8x2 aB wg 1x4x4 kr kc8 nse di hi pt ar sb32 bk0", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 
2}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 8}, {1, 4, 4}, 1, (WGType) 1, 261, 0, 1024, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.19515e+06, 151613, 62335.9, 17126.5, 0, 0, 1.28521, 4.53132, 1.51228, 3.55467, 0.145838, 0.0796081, 0.0700619, 0.527675, 1.35925, 1.00198, 1.34953e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m16@8 aS16+m32@8 aB wg 2x4x4 kr kc16 nse hi pt sr sb256 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22732e+06, -192672, -38121.5, 322384, 3.32595e+06, 0, 2.49763, 1.54907, 0.628927, 1.39358, 0.0627876, 0.0504407, 0.0208002, 0.589308, 1.19034, 0.60621, 4.79866e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m16@32 aS16+m16@32 aB wg 2x4x4 kr kc16 nse hi pt sr sb256 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.24447e+06, -148653, -56383, 266371, 2.85901e+06, 0, 1.66468, 2.34448, 0.802059, 1.33917, 0.0632256, 0.0464224, 0.0241823, 0.804536, 1.32602, 0.945203, 1.61386e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {32, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 
3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {1024, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16+m8@24 aS16+m16@24 aB wg 1x2x16 kr kc16 nse hi pt sr br sb32 bk0 sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.19256e+06, -99702.7, -5250.1, 135470, 4.21888e+06, 0, 0.924905, 1.15646, 0.601147, 1.43162, 0.0628943, 0.0635026, 0.0173549, 1, 1.36155, 1.02238, 5.68694e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aS16+m8@24 aB wg 1x4x8 kr kc8 nse hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 16}, {1, 4, 8}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {973570, 471411, 10282.9, 75820.8, 0, 0, 1.45806, 1.71929, 3.09927, 7.18832, 0.0643004, 0.0643004, 0, 0.97308, 1.43932, 0.981141, 7.50692e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1023, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aS16+m8@24 aB wg 1x4x8 kr kc8 nse hi pt sb256 bk0 sn grf256 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21899e+06, -88481.9, -25272.1, 146166, 3.03923e+06, 0, 1.52455, 1.45603, 0.792316, 1.37923, 0.0633789, 0.0507551, 0.0245627, 1, 1.38276, 0.904775, 1.71039e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {256, 256, 64}, {1, 1, 1}, ""}, "aB8+m16@8 aS8x2 aB wg 1x4x4 kr kc8 nse hi pt ar sb32 bk0", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 8}, {1, 4, 4}, 1, (WGType) 1, 261, 0, 1024, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.19515e+06, 151613, 62335.9, 17126.5, 0, 0, 1.28521, 4.53132, 1.51228, 3.55467, 0.145838, 0.0796081, 0.0700619, 0.527675, 1.35925, 1.00198, 1.34953e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "am16x2+m32@16 aS32+m16@32 aS wg 1x4 kc16 nse li pt sr sb256 bk0 sn grf256 l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {65536, 262144, 16777216}, {65536, 262144, 16777216}, {4, 16, 32}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.15385e+06, 54677.4, 0, 0, 0, 0, 4.66453, 0.94861, 2.51588, 5.10353, 0.11684, 0.263033, 0.0520785, 1, 1.35189, 1.13162, 7.09638e-15}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aB16+m16@16 aB wg 4x4 kc8 nse di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 
1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, ""}, "am16+m16@64 am16+m32@64 aB wg 4x4 kc16 nse di hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10359e+06, 253001, 0, 0, 0, 0, 1.23463, 4.97057, 1.98081, 3.72207, 0.0797284, 0.00775855, 0.0752955, 0.909553, 1.43847, 0.946385, 6.92721e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am16+m32@32 am16+m16@32 aB wg 2x8 kc16 nse di hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.09369e+06, 252258, 0, 0, 0, 0, 1.95015, 5.48836, 2.57025, 5.95009, 0.14056, 0.0114998, 0.12796, 0.974789, 1.2964, 0.960056, 4.31831e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+m8@16 aB16+m16@16 aB wg 4x4 kc8 nse hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, 
{-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, ""}, "am16+m16@64 am16+m32@64 aB wg 4x4 kc16 nse hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {262144, 65536, 16777216}, {16, 4, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10359e+06, 253001, 0, 0, 0, 0, 1.23463, 4.97057, 1.98081, 3.72207, 0.0797284, 0.00775855, 0.0752955, 0.909553, 1.43847, 0.946385, 6.92721e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am16+m32@32 am16+m16@32 aB wg 2x8 kc16 nse hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {262144, 32768, 16777216}, {16, 2, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.09369e+06, 252258, 0, 0, 0, 0, 1.95015, 5.48836, 2.57025, 5.95009, 0.14056, 0.0114998, 0.12796, 0.974789, 1.2964, 0.960056, 4.31831e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aB8/4x2+m8@28 aB8/4x2+m8@28 aP nse wg 4x8 bo pt sb256 kc8 grf256 bk0 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {866662, 832973, 0, 0, 6.62733e+06, 0, 2.31399, 2.18462, 0.879335, 1.58093, 0.0624816, 0.0624816, 0, 1, 1.01086, 1.00539, 3.62849e-14}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m16@8 aB8+m8@16 aB wg 4x8 kc8 nse hi pt sb256 bk0 grf256 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 
262144, 32}, {64, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {882165, 581951, 0, 0, 4.12713e+06, 0, 2.40327, 3.47719, 0.766058, 1.43151, 0.062663, 0.062663, 0, 1, 1.01425, 1.00462, 1.16371e-13}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB16/8x2+B8@24 am8x2+B8@24 aB wg 4x4x2 kr kc8 nse hi pt sr sb256 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {-9.04747e+06, 584317, 1.02639e+07, -265146, 4.2025e+06, 0, 1.64736, 1.39786, 0.778015, 1.45273, 0.0625854, -3.41849e-05, 0.0630026, 0.438401, 1.03752, 0.690802, 4.57945e-12}}}, @@ -923,15 +1008,15 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "am16x2+m16@16 aB8x2+m8@24 aS wg 1x8x4 kr kc8 nse li pt sr sb256 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {65536, 262144, 16777216}, {65536, 262144, 32}, {4, 16, 16}, {1, 8, 4}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.26743e+06, -130651, -75631.7, 238106, 
2.415e+06, 0, 13.4604, 0.983422, 2.77777, 5.15729, 0.127127, 0.144409, 0.0797807, 1, 1.29538, 1.06434, 1.41696e-13}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "at8+m16@8 aB16x2+m16@8 aS wg 1x8x8 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {16384, 262144, 32}, {1, 16, 16}, {1, 8, 8}, 1, (WGType) 1, 413, 0, 512, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.10613e+06, -79156.7, -1736.09, 153649, 2.00704e+06, 0, 47.8566, 0.896699, 10.8519, 21.2763, 1.082, 0.293987, 0.241183, 0.333333, 1.10378, 0, 0}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8x2+B8@8 am16x2+S16@8 aB wg 4x8 kc8 nse di hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {869157, 563214, 0, 0, 0, 0, 2.39412, 4.22276, 0.853271, 1.30754, 0.0725087, 0.0725087, 0, 1, 1.25715, 0.942426, 4.35208e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8+m8@32 am16x2+m16@16 aB wg 2x8 kc8 nse di hi pt sr sb256 bk0 grf256 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.09546e+06, 254169, 0, 0, 0, 0, 1.63097, 3.84728, 1.50643, 2.37923, 0.0742032, 0.0154716, 0.0594372, 0.827878, 1.42273, 0.913754, 1.12977e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8x2+B8@8 am16x2+S16@8 aB wg 4x8 
kc8 nse hi pt sb256 bk0 grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {869157, 563214, 0, 0, 0, 0, 2.39412, 4.22276, 0.853271, 1.30754, 0.0725087, 0.0725087, 0, 1, 1.25715, 0.942426, 4.35208e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "am8+m8@32 am16x2+m16@16 aB wg 2x8 kc8 nse hi pt sr sb256 bk0 grf256 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 16}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.09546e+06, 254169, 0, 0, 0, 0, 1.63097, 3.84728, 1.50643, 2.37923, 0.0742032, 0.0154716, 0.0594372, 0.827878, 1.42273, 0.913754, 1.12977e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, -{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "aB8+m8@8 aB32+m32@8 aB wg 8x1x4 kr kc8 nse di li nmk pt sr sb32 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {524288, 65536, 32}, {32, 4, 32}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, 
{'E', 17, {1.18493e+06, -151099, 5931.1, 262767, 2.36749e+06, 0, 0.843802, 28.5958, 1.39394, 2.57371, 0.107758, 0.0474906, 0.0797103, 0.994029, 1.25885, 0.690983, 1.3586e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+S1,8@56 am/S8x2+S16@56 aB wg 4x8 kc8 nse di hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {900022, 588464, 0, 0, 0, 0, 2.11582, 2.1833, 0.547862, 1.23354, 0.0626136, 0.0626136, 0, 1, 1.1599, -0.725344, 3.06459e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+m32@24 am/S16+m8@32 aB wg 8x4 kc8 nse di hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {893971, 565638, 0, 0, 0, 0, 2.10566, 2.60265, 0.8239, 1.43017, 0.072004, 0.072004, 0, 0.497868, 1.18996, 0.90853, 4.46328e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16+m16@64 am/S16x2+m32@48 aB wg 4x4 kc16 nse di hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.30686e+06, 256588, 0, 0, 0, 0, 1.11607, 1.55713, 0.941507, 2.42007, 0.0726522, 0.00111933, 0.0659252, 0.759172, 1.32447, 0.944187, 7.04547e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+S8@56 am/S16x2+S16@48 aB wg 4x8 kc8 nse di hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {892566, 531408, 0, 0, 0, 0, 1.946, 3.76473, 1.95506, 3.63902, 0.0803044, 0.0803044, 0, 1, 1.40954, 0.984594, 4.35399e-12}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16x2+S16@48 at32+S16@64 aB wg 2x16 kc16 nse di hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {880883, 514420, 0, 0, 0, 0, 4.6228, 3.84817, 2.57586, 5.80879, 0.169176, 0.169176, 0, 1, 1.15837, 0.943093, 3.07402e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "aB8+m8@8 aB32+m32@8 aB wg 8x1x4 kr kc8 nse li nmk pt sr sb32 bk0 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {524288, 65536, 32}, {32, 4, 32}, {8, 1, 4}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18493e+06, -151099, 5931.1, 262767, 2.36749e+06, 0, 0.843802, 28.5958, 1.39394, 2.57371, 0.107758, 0.0474906, 0.0797103, 0.994029, 1.25885, 0.690983, 1.3586e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+S1,8@56 am/S8x2+S16@56 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 
8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {900022, 588464, 0, 0, 0, 0, 2.11582, 2.1833, 0.547862, 1.23354, 0.0626136, 0.0626136, 0, 1, 1.1599, -0.725344, 3.06459e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+m32@24 am/S16+m8@32 aB wg 8x4 kc8 nse hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {893971, 565638, 0, 0, 0, 0, 2.10566, 2.60265, 0.8239, 1.43017, 0.072004, 0.072004, 0, 0.497868, 1.18996, 0.90853, 4.46328e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16+m16@64 am/S16x2+m32@48 aB wg 4x4 kc16 nse hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.30686e+06, 256588, 0, 0, 0, 0, 1.11607, 1.55713, 0.941507, 2.42007, 0.0726522, 0.00111933, 0.0659252, 0.759172, 1.32447, 0.944187, 7.04547e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8x2+S8@56 am/S16x2+S16@48 aB wg 4x8 kc8 nse hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {892566, 531408, 0, 0, 0, 0, 1.946, 3.76473, 1.95506, 3.63902, 0.0803044, 0.0803044, 0, 1, 1.40954, 0.984594, 4.35399e-12}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at16x2+S16@48 at32+S16@64 aB wg 2x16 kc16 nse hi pt sb256 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 32768, 16777216}, {8192, 8192, 16777216}, {16, 2, 32}, {2, 16, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {880883, 514420, 0, 0, 0, 0, 4.6228, 3.84817, 2.57586, 5.80879, 0.169176, 0.169176, 0, 1, 1.15837, 0.943093, 3.07402e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aS16+S1,16@32 aB nse wg 4x8 bo pt kc8 sm sn sb256 grf256 bk0 sr kv afb", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 0}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 409, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {935084, 896033, 0, 0, 6.17677e+06, 0, 2.71042, 2.12983, 0.828082, 1.44356, 0.0625855, 0.0625855, 0, 1, 1.01199, 1.00613, 6.43507e-14}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+S1,8@16 aS16x2+S1,16@40 aB wg 4x4x2 kr kc8 nse hi pt sr sb32 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 0}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.53273e+06, -601303, -215849, 889641, 4.35814e+06, 0, 2.13947, 1.867, 0.799054, 1.46981, 0.062662, 9.4614e-05, 0.0633035, 0.43207, 1.15029, -0.297015, 2.33734e-11}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "p"}, "aS8x2+S1,16@32 am/S8x2+S1,8@24 aB wg 4x2x4 kr kc8 nse hi pt sr sb32 bk0 sm sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 
16777216}, {8192, 8192, 0}, {32, 32, 8}, {4, 2, 4}, 1, (WGType) 1, 413, 0, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.20785e+06, -377725, 50285.1, 535170, 6.62733e+06, 0, 1.42372, 1.15814, 0.840145, 1.59798, 0.062656, 0.0523573, 0.0192386, 0.636211, 1.17551, -0.657316, 3.00397e-11}}}, @@ -944,11 +1029,11 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "at8x2+m32@8 at8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 sm sn kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13247e+06, -102265, -1131.66, 143220, 3.04742e+06, 0, 1.75413, 0.928621, 0.480908, 0.891062, 0.0713067, 0.0781393, 0.013656, 0.976256, 1.18801, 0.74341, 6.89459e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, -1, -1}, {1, 1, 1}, ""}, "at16x2+S32@16 at16+S16@32 aS wg 1x4x8 kr kc16 nse li pt sr sb256 bk0 sm sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {65536, 262144, 16777216}, {8192, 8192, 0}, {4, 16, 16}, {1, 4, 8}, 1, (WGType) 1, 413, 0, 1024, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22164e+06, -75649.9, -28431.3, 129898, 2.46579e+06, 0, 4.47812, 0.846859, 2.59041, 5.08414, 0.0900204, 0.188187, 0.0683862, 1, 1.31716, 0.967325, 2.73803e-12}}}, {{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, -1, -1}, {1, 1, 1}, ""}, "am32+m32@32 at8x2+m16@24 aS wg 1x8x4 kr kc8 nse li pt sr sb256 bk0 sn grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {16384, 262144, 16777216}, {16384, 262144, 32}, {1, 16, 32}, {1, 8, 4}, 1, (WGType) 1, 413, 0, 512, {4, 4, 4}, {true, true, true}}, 
{'E', 17, {1.26572e+06, -129216, -67284, 239862, 2.15941e+06, 0, 24.5032, 0.852871, 5.82167, 16.1942, 0.788258, 0.295269, 0.225216, 0.354473, 1.07957, 0, 0}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8x2+m16@24 aS8x2+m32@24 aB wg 2x4x4 kr kc8 nse di hi pt sb256 bk0 sm sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {964742, 420594, -264.455, 102391, 0, 0, 1.36495, 1.22335, 2.7783, 6.70529, 0.075127, 0.075127, 0, 1, 1.36799, 0.837729, 1.63216e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8+S1,16@32 at16+S8@32 aB wg 4x4x2 kr kc8 nse di hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07352e+06, 346157, 1765.64, 296014, 0, 0, 1.38106, 2.15672, 0.936372, 2.08073, 0.062816, 0.062816, 0, 1, 1.1961, 0.493637, 1.23641e-11}}}, -{{'F', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aB8/4+B8@32 aU nse di wg 8x4 bo pt kc8 sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, 
{1, 1, 1}, ""}, "aS8x2+m16@24 aS8x2+m32@24 aB wg 2x4x4 kr kc8 nse hi pt sb256 bk0 sm sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {964742, 420594, -264.455, 102391, 0, 0, 1.36495, 1.22335, 2.7783, 6.70529, 0.075127, 0.075127, 0, 1, 1.36799, 0.837729, 1.63216e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "at8+S1,16@32 at16+S8@32 aB wg 4x4x2 kr kc8 nse hi pt sb32 bk0 sm sn grf256 sr l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07352e+06, 346157, 1765.64, 296014, 0, 0, 1.38106, 2.15672, 0.936372, 2.08073, 0.062816, 0.062816, 0, 1, 1.1961, 0.493637, 1.23641e-11}}}, +{{'F', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@32 aB8/4+B8@32 aU nse wg 8x4 bo pt kc8 sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIpq"}, "am16+B16@20 am16+m16@20 aB wg 4x8 xaf st rr vav hi pt sr br sb32 bk0 sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 
441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {870519, 757436, 0, 0, 7.0656e+06, 9.74029e+06, 1.61519, 1.51882, 0.777223, 1.2159, 0.00794099, 0.00794099, 0, 1, 1.52399, 1.13177, 3.83469e-12}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "am16+B8@32 am32+m16@32 aB wg 4x8 af rr vav hi pt sr br sb32 bk0 sn grf256 sys kv afb np", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 32}, {64, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {881335, 689481, 0, 0, 4.45891e+06, 7.58579e+06, 1.46799, 1.88849, 0.735162, 1.19884, 0.00844968, 0.00844968, 0, 1, 1.56029, 1.15624, 3.29649e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIps"}, "am16+B8@32 am32+m16@32 aB wg 4x8 af rr vav hi pt sr br sb32 bk0 sn grf256 sys kv afb np", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 393216, 16777216}, {1048576, 393216, 32}, {64, 24, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {881335, 689481, 0, 0, 4.45891e+06, 7.58579e+06, 1.46799, 1.88849, 0.735162, 1.19884, 0.00844968, 0.00844968, 0, 1, 1.56029, 1.15624, 3.29649e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIv"}, "am8+m8@12 am16+m16@12 aB wg 4x4x2 kr xaf st rr vav hi pt sr br sb32 bk0 sm sn grf256 kv afb sys np", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.05058e+06, -905176, 36068, 1.20994e+06, 4.27049e+06, 5.44768e+06, 0.832982, 1.70307, 0.823027, 1.31887, 0.00894305, 0.0016304, 0.00826202, 
0.641064, 1.55717, 1.18753, 2.39955e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "am8+m8@32 am32x2+m16@32 aB wg 4x4x2 kr cb3 ks32 xaf rr vav hi pt sr br bk0 sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 49152, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.495e+06, -552515, -223241, 832726, 2.84426e+06, 3.87482e+06, 1.28529, 1.52068, 0.884565, 1.51616, 0.0123021, 0.000807932, 0.0129924, 0.76811, 1.53199, 1.09418, 3.34525e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16x2+m16@16 am16x2+m16@16 aB wg 4x2x4 kr xaf st vav hi pt sr br sb32 bk0 sm sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {32, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 32768, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.151e+06, -214932, -14362.1, 332485, 3.12934e+06, 2.83443e+06, 0.856861, 1.31774, 0.784444, 1.33614, 0.0178404, 0.015689, 0.00568798, 0.760968, 1.36233, 0.935131, 1.65223e-11}}}, @@ -963,9 +1048,9 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB8+m16@20 aB8 aB wg 4x2x4 kr cb4 ks16 af vav hi pt sr br bk0 nb 0x2 dm grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 524288, 16777216}, {262144, 524288, 32}, {16, 32, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.95228e+06, -155772, -209098, 393418, 2.48218e+06, 2.2528e+06, 1.4244, 1.53085, 0.312276, 1.12087, 0.0242192, 0.00627943, 0.0193567, 1, 1.40693, 0.960855, 
4.73398e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB8+m8@16 aB16+m16@16 aB wg 4x2x4 kr af rr vav hi pt sr br sb32 bk0 dm grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.5179e+06, -146559, -113822, 345656, 2.31834e+06, 2.07258e+06, 0.765268, 2.39027, -0.0267412, 0.770134, 0.0358339, 0.0156392, 0.0233713, 1, 1.33018, 0.794846, 1.92976e-11}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 aB16x2+m16@20 aB wg 2x4x4 kr ca4x2 ks16 xaf st rr vav hi pt sr br bk0 nb 2x0 dm grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {262144, 131072, 32}, {16, 8, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.58566e+06, -116017, -144663, 284746, 2.40845e+06, 1.7367e+06, 1.41636, 1.39229, -0.840332, 0.397168, 0.0479738, 0.025812, 0.0299484, 1, 1.37335, 0.964517, 6.85737e-12}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+B8@32 am16 aB wg 4x8 cb4x2 ks32 af vav di hi pt bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991490, 510308, 0, 0, 0, 0, 1.21713, 3.27426, 1.0535, 2.33084, 0.0217767, 0.0217767, 0, 0.980474, 1.39168, 0.919941, 4.31334e-12}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+B16@32 
am16+S1,16@32 aB wg 8x4 af vav di hi pt sb256 bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {887182, 626290, 0, 0, 0, 0, 1.11701, 2.20089, 0.753196, 1.51546, 0.00982405, 0.00982405, 0, 0.986514, 1.51371, 1.10395, 2.7046e-12}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+B8@32 am16+S1,16@32 aB wg 4x4x2 kr af vav di hi pt sb256 bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02204e+06, 300562, 60646.4, 370867, 0, 0, 0.970642, 1.40385, 0.920267, 1.86295, 0.0130897, 0.0130897, 0, 0.835449, 1.50727, 1.0276, 2.82432e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+B8@32 am16 aB wg 4x8 cb4x2 ks32 af vav hi pt bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991490, 510308, 0, 0, 0, 0, 1.21713, 3.27426, 1.0535, 2.33084, 0.0217767, 0.0217767, 0, 0.980474, 1.39168, 0.919941, 4.31334e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+B16@32 am16+S1,16@32 aB wg 8x4 af vav hi pt sb256 bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, 
{true, true, true}}, {'E', 17, {887182, 626290, 0, 0, 0, 0, 1.11701, 2.20089, 0.753196, 1.51546, 0.00982405, 0.00982405, 0, 0.986514, 1.51371, 1.10395, 2.7046e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am16+B8@32 am16+S1,16@32 aB wg 4x4x2 kr af vav hi pt sb256 bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 261, 0, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.02204e+06, 300562, 60646.4, 370867, 0, 0, 0.970642, 1.40385, 0.920267, 1.86295, 0.0130897, 0.0130897, 0, 0.835449, 1.50727, 1.0276, 2.82432e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {64, 1, 1}, "ABIp"}, "am8x2+m16@32 at8x2+m8@32 aB wg 4x8 xaf rr vav hi pt sr br sb32 bk0 grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {64, 4, 4}, {true, true, true}}, {'E', 17, {876569, 665985, 0, 0, 7.66198e+06, 9.6256e+06, 1.95542, 2.14536, 0.79812, 1.24152, 0.00817485, 0.00817485, 0, 0.994621, 1.61371, 1.13434, 6.6789e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+B8@32 am16 aB wg 4x8 cb3x2 ks16 xaf vav hi pt sr br bk0 grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.0371e+06, 775586, 0, 0, 5.70163e+06, 9.92051e+06, 1.72964, 1.82897, 0.886229, 1.47344, 0.00841793, 0.00841793, 0, 1, 1.49653, 1.11416, 4.04426e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", 
"T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "am8+m16@16 am8x2 aB wg 4x4x2 kr cb4x2 ks16 af rr vav hi pt sr br bk0 sm grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 16}, {4, 4, 2}, 1, (WGType) 1, 445, 32768, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.59272e+06, -978725, -272730, 1.30124e+06, 4.09928e+06, 5.79174e+06, 1.05622, 2.23639, 0.894523, 1.42815, 0.0092523, 0.000785124, 0.00895202, 0.63727, 1.5504, 1.17763, 2.80357e-12}}}, @@ -980,8 +1065,8 @@ kcatalog::FlatCatalog<1024> _CATALOG_ {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8+m8@8 at8+m8@8 aB wg 1x4x8 kr xaf rr fn nmk vav hi pt sr br sb32 bk0 sm sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 209, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {1048576, 262144, 32}, {64, 16, 8}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18347e+06, -130993, -1022.14, 203731, 2.62144e+06, 2.65421e+06, 1.14288, 1.05988, 0.674346, 1.45385, 0.0233347, 0.0163703, 0.00886945, 0.989082, 1.38519, 0.90064, 2.11921e-11}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#BI"}, "aB8+B8@12 at8+m8@12 aB wg 2x2x8 kr xaf st vav hi pt sr br sb32 bk0 sm sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.22438e+06, -85563.8, -17080.9, 152837, 2.89997e+06, 2.13811e+06, 1.13375, 1.65157, 0.529939, 1.09905, 0.0356927, 0.0236702, 0.0148759, 0.98034, 1.32898, 0.869925, 2.23293e-11}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, 
-1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "aB8x2+m8@16 at8x2+m8@16 aB wg 2x2x8 kr xaf st rr vav hi pt sr br sb32 bk0 sn grf256 sys np kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 445, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18837e+06, -58547.5, -17231.8, 121783, 2.69517e+06, 1.93331e+06, 1.27086, 1.21231, 0.268122, 0.902316, 0.0486346, 0.0404013, 0.0158949, 0.936826, 1.27881, 0.949937, 1.10074e-11}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am16+B16@32 at16+S16@32 aB wg 4x8 af vav di hi pt sb256 bk0 grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {8192, 8192, 16777216}, {64, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {870199, 642863, 0, 0, 0, 0, 1.58315, 2.95571, 0.763338, 1.48553, 0.0107117, 0.0107117, 0, 0.95453, 1.55818, 1.10822, 3.1594e-12}}}, -{{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am8+B8@32 at8+S16@32 aB wg 4x8 af vav di hi pt sb256 bk0 grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {888363, 535816, 0, 0, 0, 0, 1.88142, 2.12019, 0.619881, 1.23713, 0.01468, 0.01468, 0, 0.990366, 1.42663, 0.981831, 3.45631e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am16+B16@32 at16+S16@32 aB wg 4x8 af vav hi pt sb256 bk0 grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, 
{1048576, 262144, 16777216}, {8192, 8192, 16777216}, {64, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {870199, 642863, 0, 0, 0, 0, 1.58315, 2.95571, 0.763338, 1.48553, 0.0107117, 0.0107117, 0, 0.95453, 1.55818, 1.10822, 3.1594e-12}}}, +{{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 4, 1}, "ABI"}, "am8+B8@32 at8+S16@32 aB wg 4x8 af vav hi pt sb256 bk0 grf256 sys sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 4, 4}, {true, true, true}}, {'E', 17, {888363, 535816, 0, 0, 0, 0, 1.88142, 2.12019, 0.619881, 1.23713, 0.01468, 0.01468, 0, 0.990366, 1.42663, 0.981831, 3.45631e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"N", "T", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "BIp"}, "aB8x2 at8+m8@4 aB wg 2x4x4 kr ca4 ks8 xaf st vav hi pt sr br bk0 sm sn nb 2x0 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 8}, {2, 4, 4}, 1, (WGType) 1, 261, 16384, 32768, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.05686e+06, 444393, 3182.91, 403482, 0, 0, 1.8936, 1.55376, 0.673465, 2.37577, 0.0130704, 0.0130704, 0, 0.921899, 1.45934, 1.03208, 4.75956e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABIp"}, "at8+m32@24 am16+m16@28 aB wg 4x8 xaf st vav di hi pt sr br sb32 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {64, 40, 16}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {889895, 713074, 0, 0, 6.02931e+06, 1.12312e+07, 2.11653, 1.73065, 0.848984, 1.28, 0.00816471, 
0.00816471, 0, 1, 1.55007, 1.13556, 3.63302e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 8, 1}, "ABIp"}, "at8x2+m32@24 am16+m16@32 aB wg 8x4 xaf vav di hi pt sr br sb32 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {898482, 632398, 0, 0, 5.24288e+06, 9.7321e+06, 1.68469, 1.5304, 0.790341, 1.22016, 0.00802145, 0.00802145, 0, 1, 1.75052, 1.14461, 3.6943e-12}}}, diff --git a/src/gpu/intel/jit/gemm/kernel_selector.cpp b/src/gpu/intel/jit/gemm/kernel_selector.cpp index 5cd3c7b631e..fb8ace81e63 100644 --- a/src/gpu/intel/jit/gemm/kernel_selector.cpp +++ b/src/gpu/intel/jit/gemm/kernel_selector.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2023 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/src/gpu/intel/jit/gemm/strategy_parser.cpp b/src/gpu/intel/jit/gemm/strategy_parser.cpp index 5922cb95e2d..66e362e473b 100644 --- a/src/gpu/intel/jit/gemm/strategy_parser.cpp +++ b/src/gpu/intel/jit/gemm/strategy_parser.cpp @@ -225,7 +225,7 @@ void parseStrategy(const char *str, HW hw, const GEMMProblem &problem, strategy.B_prefetch.padded |= isPacked(problem.B.layout); strategy.unroll[LoopK] = 1; - strategy.checkAdd32 = !native64Bit(hw) || (hw >= HW::XeHPC); + strategy.checkAdd32 = !native64Bit(hw) || (hw == HW::XeHPC); strategy.altCRemainder |= (strategy.C.accessType == AccessType::Block) || strategy.kParallel; diff --git a/src/gpu/intel/jit/gemm/utils.hpp b/src/gpu/intel/jit/gemm/utils.hpp index e200b37052d..4edbb21f6cb 100644 --- a/src/gpu/intel/jit/gemm/utils.hpp +++ b/src/gpu/intel/jit/gemm/utils.hpp @@ -17,6 +17,8 @@ #ifndef GPU_INTEL_JIT_GEMM_UTILS_HPP #define GPU_INTEL_JIT_GEMM_UTILS_HPP +#include <stdexcept> + #include "common/math_utils.hpp" #include "common/utils.hpp" @@ -70,6 +72,16 @@ static inline int largest_pow2_divisor(int x) { return x & ~(x - 1); } +class stub_exception : public std::runtime_error { +public: + stub_exception() + : std::runtime_error("Functionality not yet implemented") {} +}; + +[[noreturn]] static inline void stub() { + throw stub_exception(); +} + } // namespace jit } // namespace intel } // namespace gpu From 693a2052419fc6ae54c430f6972ad790c378ec2c Mon Sep 17 00:00:00 2001 From: Kealan Barbieri Date: Thu, 16 May 2024 11:53:21 -0700 Subject: [PATCH 172/187] gpu: intel: jit: gemm: enable f64 --- src/gpu/intel/compute/kernel_arg_list.hpp | 7 ++++++ src/gpu/intel/jit/gemm/gen_gemm.hpp | 25 +++++++++++++++----- src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp | 2 ++ src/gpu/intel/jit/gemm/kernel_selector.cpp | 2 +- src/gpu/intel/ocl/ref_matmul.hpp | 10 ++++++++ src/gpu/sycl/sycl_interop_gpu_kernel.cpp | 1 + tests/benchdnn/inputs/matmul/test_matmul_ci | 12 +++++----- tests/benchdnn/inputs/matmul/test_matmul_gpu | 8 +++---
tests/benchdnn/matmul/cfg.cpp | 4 ++++ tests/benchdnn/matmul/matmul.cpp | 8 ++++++- 10 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/gpu/intel/compute/kernel_arg_list.hpp b/src/gpu/intel/compute/kernel_arg_list.hpp index cb323c54aff..3ec7920614b 100644 --- a/src/gpu/intel/compute/kernel_arg_list.hpp +++ b/src/gpu/intel/compute/kernel_arg_list.hpp @@ -48,6 +48,7 @@ enum class scalar_type_t { _bfloat8, _bfloat16, _float, + _double, _half, _int, _int4, @@ -78,6 +79,7 @@ inline std::string to_string(scalar_type_t type) { CASE(_bfloat8); CASE(_bfloat16); CASE(_float); + CASE(_double); CASE(_half); CASE(_int); CASE(_int4); @@ -115,6 +117,10 @@ template <> struct scalar_type_traits { static const auto type = scalar_type_t::_float; }; +template <> +struct scalar_type_traits { + static const auto type = scalar_type_t::_double; +}; template <> struct scalar_type_traits { @@ -313,6 +319,7 @@ void set_scalar_arg_cvt(kernel_arg_list_t &arg_list, int index, T scalar, case scalar_type_t::_half: arg_list.set(index, (float16_t)scalar); break; + case scalar_type_t::_double: arg_list.set(index, (double)scalar); break; case scalar_type_t::_uchar: arg_list.set(index, (uint8_t)scalar); break; case scalar_type_t::_char: arg_list.set(index, (int8_t)scalar); break; default: assert(!"Cannot convert scalar to the requested type."); diff --git a/src/gpu/intel/jit/gemm/gen_gemm.hpp b/src/gpu/intel/jit/gemm/gen_gemm.hpp index e93ee64b55e..85f8372d097 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm.hpp @@ -138,13 +138,17 @@ struct gen_gemm_t : public gpu_gemm_t { VDISPATCH_GEMM(utils::one_of(d->acc_type, bf16, f32), VERBOSE_INCONSISTENT_DT, "a", "acc"); } else if (!wei_decomp_) { - VDISPATCH_GEMM( - utils::one_of(d->a_type(), f32, f16, f8_e5m2, f8_e4m3), + VDISPATCH_GEMM(utils::one_of(d->a_type(), f64, f32, f16, + f8_e5m2, f8_e4m3), VERBOSE_UNSUPPORTED_DT); VDISPATCH_GEMM(d->b_type() == d->a_type(), VERBOSE_INCONSISTENT_DT, "a", "b"); 
VDISPATCH_GEMM(utils::one_of(d->acc_type, d->a_type(), f32), VERBOSE_UNSUPPORTED_DT); + VDISPATCH_GEMM(IMPLICATION(utils::one_of(f64, d->a_type(), + d->b_type()), + dev_info_->has_native(f64)), + VERBOSE_UNSUPPORTED_DT); VDISPATCH_GEMM( IMPLICATION(utils::one_of(f8_e5m2, f8_e4m3, d->a_type(), d->b_type(), d->c_type()), @@ -161,11 +165,15 @@ struct gen_gemm_t : public gpu_gemm_t { VERBOSE_RUNTIMEDIM_UNSUPPORTED); VDISPATCH_GEMM( IMPLICATION(with_bias(), - utils::one_of(d->bias_type(), f32, bf16, f16, + utils::one_of(d->bias_type(), f64, f32, bf16, f16, f8_e5m2, f8_e4m3) && (d->bias_desc.ndims <= 3) && utils::one_of(bias_cmask(), 0, 1, 2, 3)), VERBOSE_UNSUPPORTED_BIAS_CFG); + VDISPATCH_GEMM( + IMPLICATION(with_bias(), + (d->c_type() != f64 || d->bias_type() == f64)), + VERBOSE_UNSUPPORTED_BIAS_CFG); VDISPATCH_GEMM(compute_engine->mayiuse_ngen_kernels(), VERBOSE_UNSUPPORTED_DEVICE_FEATURE, "ngen_kernels"); VDISPATCH_GEMM(attr()->has_default_values(attr_skip_mask), @@ -242,6 +250,7 @@ struct gen_gemm_t : public gpu_gemm_t { bool with_binary = (post_ops_.find(binary) != -1) || (post_ops_.find(prelu) != -1); + bool with_eltwise = (post_ops_.find(eltwise) != -1); // check GPU architecture bool arch_ok = utils::one_of(arch_, arch_t::gen9, arch_t::gen11, @@ -276,10 +285,16 @@ struct gen_gemm_t : public gpu_gemm_t { ? d->sum_ab_type : (utils::one_of(eff_a_type(), s8, u8) ? s32 : d->c_type()); - auto acc_type = utils::one_of(eff_a_type(), s8, u8) ? s32 : f32; + auto acc_type = utils::one_of(eff_a_type(), s8, u8) + ? s32 + : (utils::one_of(f64, eff_a_type(), eff_b_type()) ? f64 + : f32); if (swap_ab_) std::swap(ao_type, bo_type); if (d->c_type() == f16 && !has_systolic) acc_type = data_type::f16; + VDISPATCH_GEMM( + IMPLICATION(acc_type == f64, !with_eltwise && !with_binary), + VERBOSE_UNSUPPORTED_POSTOP); if (types::data_type_size(acc_type) < 4) { // Limited post-op support for low-precision accumulation. 
@@ -321,8 +336,6 @@ struct gen_gemm_t : public gpu_gemm_t { // accumulation unless fusion is enabled. if (kernel_desc_.driver_info()->kParallel() && !kernel_desc_.driver_info()->fusedPostOps()) { - bool with_eltwise = (post_ops_.find(eltwise) != -1); - VDISPATCH_GEMM(!with_eltwise && !with_binary && utils::one_of(d->c_type(), f32, s32), VERBOSE_UNSUPPORTED_POSTOP); diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp index cf6248fc34a..25e4f9eaa62 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.hpp @@ -36,6 +36,7 @@ namespace jit { static inline Type convert_dnnl_to_kernel_type(data_type_t type) { switch (type) { default: assert(!"Unknown type"); + case data_type::f64: return Type::f64; case data_type::f32: return Type::f32; case data_type::f16: return Type::f16; case data_type::bf16: return Type::bf16; @@ -76,6 +77,7 @@ struct gen_gemm_kernel_desc_t { case Type::bf16: return compute::scalar_type_t::_bfloat16; case Type::f16: return compute::scalar_type_t::_half; case Type::f32: return compute::scalar_type_t::_float; + case Type::f64: return compute::scalar_type_t::_double; default: return compute::scalar_type_t::undef; } } diff --git a/src/gpu/intel/jit/gemm/kernel_selector.cpp b/src/gpu/intel/jit/gemm/kernel_selector.cpp index fb8ace81e63..5cd3c7b631e 100644 --- a/src/gpu/intel/jit/gemm/kernel_selector.cpp +++ b/src/gpu/intel/jit/gemm/kernel_selector.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/src/gpu/intel/ocl/ref_matmul.hpp b/src/gpu/intel/ocl/ref_matmul.hpp index a3d182f3565..27b3ae86a24 100644 --- a/src/gpu/intel/ocl/ref_matmul.hpp +++ b/src/gpu/intel/ocl/ref_matmul.hpp @@ -50,6 +50,10 @@ struct ref_matmul_t : public gpu_primitive_t { dst_dt_ = dst_md()->data_type; wei_dt_ = weights_md(0)->data_type; bia_dt_ = with_bias() ? weights_md(1)->data_type : data_type::f32; + auto *compute_engine + = utils::downcast(engine); + + auto dev_info_ = compute_engine->device_info(); VDISPATCH_MATMUL( is_dense_format_kind(), VERBOSE_UNSUPPORTED_SPARSE_CFG); @@ -75,6 +79,8 @@ struct ref_matmul_t : public gpu_primitive_t { utils::one_of(bia_dt_, f32, u8, s8, s32))) || ((utils::everyone_is( f32, src_dt_, wei_dt_, dst_dt_) + || utils::everyone_is( + f64, src_dt_, wei_dt_, dst_dt_) || (utils::everyone_is( f16, src_dt_, wei_dt_) && utils::one_of( @@ -97,6 +103,10 @@ struct ref_matmul_t : public gpu_primitive_t { VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_MATMUL_SC(attr_.set_default_formats(dst_md(0)), VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL( + IMPLICATION(utils::one_of(f64, src_dt_, wei_dt_, dst_dt_), + dev_info_->has_native(f64)), + VERBOSE_UNSUPPORTED_DT); non_default_attrs_ = !attr()->has_default_values(); attr_info_ = attr_info_t::create(attr()); diff --git a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp index 9c39b08e7ae..2ac35528a7b 100644 --- a/src/gpu/sycl/sycl_interop_gpu_kernel.cpp +++ b/src/gpu/sycl/sycl_interop_gpu_kernel.cpp @@ -54,6 +54,7 @@ static void set_scalar_arg(::sycl::handler &cgh, int index, case scalar_type_t::_uint: cgh.set_arg(index, *static_cast(value)); break; + case scalar_type_t::_double: case scalar_type_t::_long: case scalar_type_t::_ulong: cgh.set_arg(index, *static_cast(value)); diff --git a/tests/benchdnn/inputs/matmul/test_matmul_ci b/tests/benchdnn/inputs/matmul/test_matmul_ci index 01676616523..ff941734df6 100644 --- a/tests/benchdnn/inputs/matmul/test_matmul_ci +++ 
b/tests/benchdnn/inputs/matmul/test_matmul_ci @@ -1,6 +1,6 @@ # Plain cases --reset ---dt=f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:s8,s8:s8:f32 +--dt=f64,f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:s8,s8:s8:f32 --bia_dt=f32 --bia_mask=2 --batch=shapes_2d_ci @@ -31,7 +31,7 @@ # Different tags --reset ---dt=f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:s8,s8:s8:f32 +--dt=f64,f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:s8,s8:s8:f32 --stag=ab,ba --wtag=ab,ba --dtag=ab,ba @@ -43,7 +43,7 @@ # Sum with different data type --reset ---dt=f32 +--dt=f64,f32 --attr-post-ops=sum:0.25:0:s32 --batch=shapes_2d --batch=shapes_3d @@ -54,7 +54,7 @@ # Arg scales check --reset ---dt=f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:u8,s8:s8:f32 +--dt=f64,f32,bf16,f16,f8_e5m2,f8_e4m3,u8:s8:u8,s8:s8:f32 --attr-scales=src:common:0.25+wei:common:0.5+dst:common:2,wei:per_oc --batch=shapes_2d_ci @@ -83,7 +83,7 @@ # Run-time dimensions check --reset ---dt=f32,bf16,f16,f8_e5m2,f8_e4m3 +--dt=f64,f32,bf16,f16,f8_e5m2,f8_e4m3 --runtime_dims_masks=15:15 --stag=ab,ba --wtag=ab,ba @@ -151,7 +151,7 @@ # test all the supported data type configurations + bias data types --reset ---dt=f32 +--dt=f64,f32 --bia_dt=undef,f32 --bia_mask=2,3 77x133:133x117 --bia_mask=4,6 15x24x16:15x16x32 diff --git a/tests/benchdnn/inputs/matmul/test_matmul_gpu b/tests/benchdnn/inputs/matmul/test_matmul_gpu index 97baaa64853..baff4ceac2b 100644 --- a/tests/benchdnn/inputs/matmul/test_matmul_gpu +++ b/tests/benchdnn/inputs/matmul/test_matmul_gpu @@ -1,6 +1,6 @@ --reset ---dt=f32,f16,bf16:bf16:f32,u8:s8:s8 +--dt=f64,f32,f16,bf16:bf16:f32,u8:s8:s8 --stag=ab,ba --wtag=ab,ba --dtag=ab --runtime_dims_masks=0,2:1,0:2,2:3,1:0,3:1,1:2,3:3 --bia_dt=undef,f32 @@ -24,7 +24,7 @@ # test any --reset ---dt=f32,f16,f16:f16:s8,f16:f16:u8,bf16,s8:s8:f32,s8:s8:f16 +--dt=f64,f32,f16,f16:f16:s8,f16:f16:u8,bf16,s8:s8:f32,s8:s8:f16 --bia_dt=undef --stag=ab,ba,any --wtag=ab,ba,any --dtag=ab,any 1x30:30x20 @@ -51,7 +51,7 @@ # 3d --reset ---dt=f32,f16,f16:f16:s8,f16:f16:u8,bf16,bf16:bf16:f32 
+--dt=f64,f32,f16,f16:f16:s8,f16:f16:u8,bf16,bf16:bf16:f32 --stag=abc,acb --wtag=abc,acb --dtag=abc --runtime_dims_masks=0,4:2,0:4,4:6,2:0,6:2,2:4,6:6,1:1,5:3,1:5,5:7,3:1,7:3,3:5,7:7 --bia_dt=undef,f32 @@ -64,7 +64,7 @@ --batch=harness_matmul_regression_f32 --reset ---dt=f32 +--dt=f64,f32 --stag=ab --wtag=ab --dtag=ab 96x8:8x512 diff --git a/tests/benchdnn/matmul/cfg.cpp b/tests/benchdnn/matmul/cfg.cpp index 1b626361eb8..cc3287f6827 100644 --- a/tests/benchdnn/matmul/cfg.cpp +++ b/tests/benchdnn/matmul/cfg.cpp @@ -63,6 +63,7 @@ float cfg_t::get_density(const cfg_t::density_args_t &density_args) const { cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { static const cfg_t::cfg_entry_t::cfg_map_t src_cfg_map = { + {{dnnl_f64}, {-64, 64}}, {{dnnl_f32}, {-64, 64}}, {{dnnl_bf16}, {-4, 4}}, {{dnnl_f16}, {-4, 4}}, @@ -73,6 +74,7 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { }; static const cfg_t::cfg_entry_t::cfg_map_t wei_cfg_map = { + {{dnnl_f64}, {-128, 128}}, {{dnnl_f32}, {-128, 128}}, {{dnnl_bf16}, {-8, 8}}, {{dnnl_f16}, {-2, 2}}, @@ -85,6 +87,7 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { }; static const cfg_t::cfg_entry_t::cfg_map_t bia_cfg_map = { + {{dnnl_f64}, {-8, 8}}, {{dnnl_f32}, {-8, 8}}, {{dnnl_bf16}, {-8, 8}}, {{dnnl_f16}, {-8, 8}}, @@ -101,6 +104,7 @@ cfg_t::cfg_entry_t::cfg_map_t cfg_t::get_cfg_map(data_kind_t kind) const { }; static const cfg_t::cfg_entry_t::cfg_map_t dst_cfg_map = { + {{dnnl_f64}, {-8, 8}}, {{dnnl_f32}, {-8, 8}}, {{dnnl_bf16}, {-8, 8}}, {{dnnl_f16}, {-4, 4}}, diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index 4f97203010c..9344bd43b17 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -498,8 +498,14 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { return; } - // GPU supports only default sum_dt argument. 
const auto &po = prb->attr.post_ops; + // F64 post-ops unsupported. + if (prb->dst_dt() == dnnl_f64 && !po.is_def()) { + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } + // GPU supports only default sum_dt argument. const int sum_idx = po.find(attr_t::post_ops_t::kind_t::SUM); if (sum_idx != -1 && po.entry[sum_idx].sum.dt != dnnl_data_type_undef) { res->state = SKIPPED; From 89693a4723d5bcca2891a7223d28fc80088b9239 Mon Sep 17 00:00:00 2001 From: "Xuxin, Zeng" Date: Thu, 16 May 2024 23:14:27 -0700 Subject: [PATCH 173/187] cpu: x64: fix assertion in bf16 conv for relo on AMX --- src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp | 3 ++- src/cpu/x64/jit_brgemm_conv.cpp | 1 + src/cpu/x64/jit_primitive_conf.hpp | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp index eeaed407f5d..91e9ac98c06 100644 --- a/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp +++ b/src/cpu/x64/jit_avx512_core_amx_conv_kernel.cpp @@ -696,7 +696,8 @@ void jit_avx512_core_amx_copy_to_pbuffer_t::copy_row(int icb) { void jit_avx512_core_amx_copy_to_pbuffer_t::copy_row_reduced_lowering() { assert(jcp.nb_ic_int == 1); - assert(jcp.ic_block_int * jcp.typesize_in == 64); + assert((jcp.is_bf32 ? 
jcp.ic_block : jcp.ic_block_int) * jcp.typesize_in + == 64); assert(jcp.is_nspc); auto load_mask = [this](int tail, Opmask kmask) { diff --git a/src/cpu/x64/jit_brgemm_conv.cpp b/src/cpu/x64/jit_brgemm_conv.cpp index c4d8b356237..97bdc249c3a 100644 --- a/src/cpu/x64/jit_brgemm_conv.cpp +++ b/src/cpu/x64/jit_brgemm_conv.cpp @@ -988,6 +988,7 @@ status_t brgemm_convolution_fwd_t::init(engine_t *engine) { ajcp.is_relo = true; ajcp.nb_ic_int = 1; ajcp.is_nspc = true; + ajcp.is_bf32 = jcp.is_bf32; ajcp.typesize_in = jcp.src_dsz; ajcp.ic_block_int = jcp.amx_w; diff --git a/src/cpu/x64/jit_primitive_conf.hpp b/src/cpu/x64/jit_primitive_conf.hpp index 445a7c8b775..6e4d14fe706 100644 --- a/src/cpu/x64/jit_primitive_conf.hpp +++ b/src/cpu/x64/jit_primitive_conf.hpp @@ -166,6 +166,7 @@ struct jit_conv_conf_t { data_type_t ddst_dt; data_type_t dsrc_dt; data_type_t dwei_dt; + bool is_bf32 {false}; bool expl_bcast; bool large_spatial, large_w_filter; int is_ic_scale, is_oc_scale; From feeddcb0d9e3bb605e03a7c165d639c896a578d4 Mon Sep 17 00:00:00 2001 From: "Xuxin, Zeng" Date: Thu, 16 May 2024 12:32:57 -0700 Subject: [PATCH 174/187] cpu: x64: dispatch small ic/oc shapes to VNNI on AMX --- src/cpu/x64/jit_brgemm_conv_utils.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/cpu/x64/jit_brgemm_conv_utils.cpp b/src/cpu/x64/jit_brgemm_conv_utils.cpp index ab0b30d2f94..301f9264b10 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.cpp @@ -2235,6 +2235,14 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, VERBOSE_IMPL_HEURISTIC_FAIL, "Dispatch the shape that requires large/small cache size to jit"); + // Dispatch the shape to VNNI for better performance on AMX + const bool is_int8_small_ic = jcp.oc == 32 && jcp.ic < jcp.simd_w / 2 + && is_int8_convolution && is_amx(jcp.isa) + && everyone_is(640, jcp.oh, jcp.ow, jcp.ih, jcp.iw) + && everyone_is(3, jcp.kh, jcp.kw); + VDISPATCH_CONV_IC(!is_int8_small_ic, 
VERBOSE_IMPL_HEURISTIC_FAIL, + "Dispatch the shape that has small ic/oc to VNNI"); + // to avoid cache concurrent write access from different threads size_t sc_size = sizeof(brgemm_batch_element_t); jcp.adjusted_batch_size From 74fb846b88e10194b5d342110e85276616dc143c Mon Sep 17 00:00:00 2001 From: "Xuxin, Zeng" Date: Mon, 20 May 2024 23:09:13 -0700 Subject: [PATCH 175/187] cpu: x64: brgemm: update oh blocking --- src/cpu/x64/jit_brgemm_conv_utils.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cpu/x64/jit_brgemm_conv_utils.cpp b/src/cpu/x64/jit_brgemm_conv_utils.cpp index 301f9264b10..5635fcbcbbd 100644 --- a/src/cpu/x64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/x64/jit_brgemm_conv_utils.cpp @@ -1028,7 +1028,7 @@ void brg_blocking_t::iterate_ker_block(brg_blocking_t &best_brgb, int kd_block_, 1, od, int(L1 / (ihp * src_w_block_size))); if (cur_od_block == 1) cur_oh_block = utils::saturate( - 1, oh, int(L1 / (src_w_block_size))); + 1, oh, static_cast(L1 / src_w_block_size)); } for (; cur_od_block > 1; cur_od_block--) { const auto sp_size = cur_od_block * cur_oh_block * iwp; @@ -1040,14 +1040,17 @@ void brg_blocking_t::iterate_ker_block(brg_blocking_t &best_brgb, int kd_block_, } } if (cur_od_block == 1) { - for (; cur_oh_block > 1; cur_oh_block--) { - const auto sp_size = cur_oh_block * iwp; - if ((static_cast(oh) / rnd_up(oh, cur_oh_block)) + auto tmp_oh_block = cur_oh_block; + while (tmp_oh_block >= 1) { + const auto sp_size = tmp_oh_block * iwp; + if ((static_cast(oh) / rnd_up(oh, tmp_oh_block)) > 0.9f && sp_size > 128) { L1_fit_res = true; + cur_oh_block = tmp_oh_block; break; } + tmp_oh_block--; } } if (L1_fit_res) { From b782e13aa0023074df578a84b5382f8324635787 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 21 May 2024 17:04:56 -0700 Subject: [PATCH 176/187] gpu: intel: jit: codegen: use fencewait() with SLM fence --- src/gpu/intel/jit/codegen/codegen.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/src/gpu/intel/jit/codegen/codegen.cpp b/src/gpu/intel/jit/codegen/codegen.cpp index e0697fd1169..f5ed28c4764 100644 --- a/src/gpu/intel/jit/codegen/codegen.cpp +++ b/src/gpu/intel/jit/codegen/codegen.cpp @@ -374,9 +374,8 @@ class ir_to_ngen_t : public ir_visitor_t { if (!attr.is_empty()) mod = mod | to_ngen(attr.as().mod); - const int dwords = ngen::GRF::bytes(hw) / sizeof(int32_t); host_->slmfence(mod, tmp, host_->r0); - host_->template mov(dwords, host_->null, tmp); + host_->fencewait(); } void barrier(const func_call_attr_t &attr) { @@ -386,9 +385,8 @@ class ir_to_ngen_t : public ir_visitor_t { if (!attr.is_empty()) mod = mod | to_ngen(attr.as().mod); - const int dwords = ngen::GRF::bytes(hw) / sizeof(int32_t); host_->slmfence(mod, tmp, host_->r0); - host_->template mov(dwords, host_->null, tmp); + host_->fencewait(); host_->barriermsg(mod, host_->signal_header_); host_->barrierwait(); } From bda15af917676b6531eb65296f9fee6b0c36cf4f Mon Sep 17 00:00:00 2001 From: "Taylor, Deb" Date: Wed, 22 May 2024 16:30:25 -0400 Subject: [PATCH 177/187] doc: Updated latex in rnn and prelu files Signed-off-by: Taylor, Deb --- doc/primitives/prelu.md | 14 ++++++-------- doc/primitives/rnn.md | 19 +------------------ 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/doc/primitives/prelu.md b/doc/primitives/prelu.md index c451063ca67..52c9669b097 100644 --- a/doc/primitives/prelu.md +++ b/doc/primitives/prelu.md @@ -55,17 +55,15 @@ and #dnnl_forward_inference propagation kinds. The backward propagation computes \f$\diffsrc\f$ and \f$\diffweights\f$. 
For no broadcast case, results are calculated using formula: -\f[ - \begin{align} - \mbox{diff_src}(n, c, h, w) &= +\f[ + \diffsrc(n, c, h, w) &= \begin{cases} - \mbox{diff_dst}(n, c, h, w) & \mbox{if } \src(n, c, h, w) > 0 \\ - \mbox{diff_dst}(n, c, h, w) \cdot \weights(n, c, h, w) & + \diffdst(n, c, h, w) & \mbox{if } \src(n, c, h, w) > 0 \\ + \diffdst(n, c, h, w) \cdot \weights(n, c, h, w) & \mbox{if } \src(n, c, h, w) \leq 0 \end{cases}\\\\ - \mbox{diff_weights}(n, c, h, w) &= - \min(\src(n, c, h, w), 0) \cdot \mbox{diff_dst}(n, c, h, w) - \end{align} + \diffweights(n, c, h, w) &= + \min(\src(n, c, h, w), 0) \cdot \diffdst(n, c, h, w) \f] Similar to forward propagation, result is calculated taking into diff --git a/doc/primitives/rnn.md b/doc/primitives/rnn.md index 85ead747aa9..e511a62720c 100644 --- a/doc/primitives/rnn.md +++ b/doc/primitives/rnn.md @@ -47,9 +47,8 @@ where \f$t,l\f$ are the indices of the timestamp and the layer of the cell being And here is the equation for LSTM cells: -\f[ \begin{equation*} +\f[ (h_{t, l},c_{t,l}) = Cell(h_{t, l-1}, h_{t-1, l}, c_{t-1,l}) -\end{equation*} \f] where \f$t,l\f$ are the indices of the timestamp and the layer of the cell being executed. @@ -84,10 +83,8 @@ functions.
The following equations defines the mathematical operation performed by the Vanilla RNN cell for the forward pass: \f[ -\begin{align} a_t &= W \cdot h_{t,l-1} + U \cdot h_{t-1, l} + B \\ h_t &= activation(a_t) -\end{align} \f] ### LSTM @@ -111,7 +108,6 @@ following equation gives the mathematical description of these gates and output for the forward pass: \f[ -\begin{align} i_t &= \sigma(W_i \cdot h_{t,l-1} + U_i \cdot h_{t-1, l} + B_i) \\ f_t &= \sigma(W_f \cdot h_{t,l-1} + U_f \cdot h_{t-1, l} + B_f) \\ \\ @@ -120,7 +116,6 @@ c_t &= f_t * c_{t-1} + i_t * \tilde c_t \\ \\ o_t &= \sigma(W_o \cdot h_{t,l-1} + U_o \cdot h_{t-1, l} + B_o) \\ h_t &= \tanh(c_t) * o_t -\end{align} \f] where \f$W_*\f$ are stored in \weightslayer, \f$U_*\f$ are stored in @@ -151,7 +146,6 @@ on the gates. For peephole weights, the gates order is `i`, `f`, and output for the forward pass: \f[ -\begin{align} i_t &= \sigma(W_i \cdot h_{t,l-1} + U_i \cdot h_{t-1, l} + P_i \cdot c_{t-1} + B_i) \\ f_t &= \sigma(W_f \cdot h_{t,l-1} + U_f \cdot h_{t-1, l} + P_f \cdot c_{t-1} + B_f) \\ \\ @@ -160,7 +154,6 @@ c_t &= f_t * c_{t-1} + i_t * \tilde c_t \\ \\ o_t &= \sigma(W_o \cdot h_{t,l-1} + U_o \cdot h_{t-1, l} + P_o \cdot c_t + B_o) \\ h_t &= \tanh(c_t) * o_t -\end{align} \f] where \f$P_*\f$ are stored in `weights_peephole`, and the other parameters are @@ -192,7 +185,6 @@ description of these gates and output for the forward pass (for simplicity, LSTM without peephole is shown): \f[ -\begin{align} i_t &= \sigma(W_i \cdot h_{t,l-1} + U_i \cdot h_{t-1,l} + B_i) \\ f_t &= \sigma(W_f \cdot h_{t,l-1} + U_f \cdot h_{t-1,l} + B_f) \\ & \\ @@ -201,7 +193,6 @@ LSTM without peephole is shown): & \\ o_t &= \sigma(W_o \cdot h_{t,l-1} + U_o \cdot h_{t-1,l} + B_o) \\ h_t &= R \cdot (\tanh(c_t) * o_t) -\end{align} \f] where \f$R\f$ is stored in `weights_projection`, and the other parameters are @@ -230,12 +221,10 @@ implicitly require the order of these gates to be `u`, `r`, and `o`. 
The following equation gives the mathematical definition of these gates. \f[ -\begin{align} u_t &= \sigma(W_u \cdot h_{t,l-1} + U_u \cdot h_{t-1, l} + B_u) \\ r_t &= \sigma(W_r \cdot h_{t,l-1} + U_r \cdot h_{t-1, l} + B_r) \\ o_t &= \tanh(W_o \cdot h_{t,l-1} + U_o \cdot (r_t * h_{t-1, l}) + B_o) \\ h_t &= u_t * h_{t-1, l} + (1 - u_t) * o_t -\end{align} \f] where \f$W_*\f$ are in \weightslayer, \f$U_*\f$ are in @@ -264,12 +253,10 @@ The following equation describes the mathematical behavior of the Linear-Before-Reset GRU cell. \f[ -\begin{align} u_t &= \sigma(W_u \cdot h_{t,l-1} + U_u \cdot h_{t-1, l} + B_u) \\ r_t &= \sigma(W_r \cdot h_{t,l-1} + U_r \cdot h_{t-1, l} + B_r) \\ o_t &= \tanh(W_o \cdot h_{t,l-1} + r_t *(U_o \cdot h_{t-1, l} + B_{u'}) + B_o) \\ h_t &= u_t * h_{t-1, l} + (1 - u_t) * o_t -\end{align} \f] Note that for all tensors with a dimension depending on the gate number, except @@ -300,13 +287,11 @@ implicitly require the order of these gates to be `u`, `r`, and `o`. The following equation gives the mathematical definition of these gates. \f[ -\begin{align} u_t &= \sigma(W_u \cdot h_{t,l-1} + U_u \cdot h_{t-1, l} + B_u) \\ r_t &= \sigma(W_r \cdot h_{t,l-1} + U_r \cdot h_{t-1, l} + B_r) \\ o_t &= \tanh(W_o \cdot h_{t,l-1} + U_o \cdot (r_t * h_{t-1, l}) + B_o) \\ \tilde u_t &= (1 - a_t) * u_t \\ h_t &= \tilde u_t * h_{t-1, l} + (1 - \tilde u_t) * o_t -\end{align} \f] where \f$W_*\f$ are in \weightslayer, \f$U_*\f$ are in @@ -330,13 +315,11 @@ The following equation describes the mathematical behavior of the Linear-Before-Reset AUGRU cell. 
\f[ -\begin{align} u_t &= \sigma(W_u \cdot h_{t,l-1} + U_u \cdot h_{t-1, l} + B_u) \\ r_t &= \sigma(W_r \cdot h_{t,l-1} + U_r \cdot h_{t-1, l} + B_r) \\ o_t &= \tanh(W_o \cdot h_{t,l-1} + r_t *(U_o \cdot h_{t-1, l} + B_{u'}) + B_o) \\ \tilde u_t &= (1 - a_t) * u_t \\ h_t &= \tilde u_t * h_{t-1, l} + (1 - \tilde u_t) * o_t -\end{align} \f] Note that for all tensors with a dimension depending on the gate number, except From 9b38149de71dc7e095caaa493a4c5f1a212bdd9c Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Thu, 23 May 2024 17:43:43 -0700 Subject: [PATCH 178/187] sycl: enable l0 to ocl device mapping in hw detect --- src/sycl/sycl_device_info.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/sycl/sycl_device_info.cpp b/src/sycl/sycl_device_info.cpp index bbddb68a482..2c258c84f95 100644 --- a/src/sycl/sycl_device_info.cpp +++ b/src/sycl/sycl_device_info.cpp @@ -30,8 +30,8 @@ namespace impl { namespace sycl { status_t sycl_device_info_t::init_arch(engine_t *engine) { - auto &device - = utils::downcast(engine)->device(); + auto *sycl_engine = utils::downcast(engine); + auto &device = sycl_engine->device(); // skip cpu engines if (!device.is_gpu()) return status::success; @@ -56,16 +56,11 @@ status_t sycl_device_info_t::init_arch(engine_t *engine) { } else if (be == xpu::sycl::backend_t::level0) { // TODO: add support for L0 binary ngen check // XXX: query from ocl_engine for now - gpu::intel::ocl::ocl_engine_factory_t f(engine_kind::gpu); - - engine_t *engine; - CHECK(f.engine_create(&engine, 0)); - - std::unique_ptr - compute_engine(utils::downcast< - gpu::intel::compute::compute_engine_t *>(engine)); + std::unique_ptr + ocl_engine; + CHECK(gpu::intel::sycl::create_ocl_engine(&ocl_engine, sycl_engine)); - auto *dev_info = compute_engine->device_info(); + auto *dev_info = ocl_engine->device_info(); ip_version_ = dev_info->ip_version(); gpu_arch_ = dev_info->gpu_arch(); stepping_id_ = dev_info->stepping_id(); From 
c08a722b81b7efc326c318803005ab31d8bd2f81 Mon Sep 17 00:00:00 2001 From: Denis Samoilov Date: Thu, 23 May 2024 17:45:37 -0700 Subject: [PATCH 179/187] gpu: ocl: restrict using index based ocl engine interface for sycl --- src/gpu/intel/ocl/ocl_engine.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gpu/intel/ocl/ocl_engine.hpp b/src/gpu/intel/ocl/ocl_engine.hpp index 1a5f3e76e6d..bc4b5ef5db3 100644 --- a/src/gpu/intel/ocl/ocl_engine.hpp +++ b/src/gpu/intel/ocl/ocl_engine.hpp @@ -41,6 +41,10 @@ class ocl_engine_factory_t : public engine_factory_t { } status_t engine_create(engine_t **engine, size_t index) const override { +#ifdef DNNL_WITH_SYCL + gpu_error_not_expected() << "This interface is not for use with SYCL"; + return status::runtime_error; +#endif status_t status; std::vector ocl_devices; From f9468c17dc3074c424afdbf9aec9b0483740d08e Mon Sep 17 00:00:00 2001 From: Kealan Barbieri Date: Thu, 23 May 2024 17:00:26 -0700 Subject: [PATCH 180/187] gpu: intel: jit: gemm: remove broken xelpg strat --- src/gpu/intel/jit/gemm/kernel.db | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gpu/intel/jit/gemm/kernel.db b/src/gpu/intel/jit/gemm/kernel.db index 4f301fc3a82..befdb3477d3 100644 --- a/src/gpu/intel/jit/gemm/kernel.db +++ b/src/gpu/intel/jit/gemm/kernel.db @@ -15,8 +15,8 @@ *******************************************************************************/ /*@kcatalog@*/ -kcatalog::FlatCatalog<1109> _CATALOG_ -{1, 8309, 1109, { +kcatalog::FlatCatalog<1108> _CATALOG_ +{1, 8310, 1108, { {{'9', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 as8x2 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab l4 acb nmk", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}}, {{'9', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 as16 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}}, @@ -393,7 +393,6 @@ kcatalog::FlatCatalog<1109> _CATALOG_ {{'E', "gemm", {"O", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, {{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, {{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "xyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 
16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB2x2 sB2x2 aB wg 4x8 kc2 cab4 ks8 nse bo sr bk0 sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, {{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, {{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 
4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, From a9d0007232b1385bb4b1ebcd6906a3f324c6cd5b Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Sat, 11 May 2024 10:40:33 -0700 Subject: [PATCH 181/187] src: gpu: intel: jit: pooling: avoid looping when init fails --- src/gpu/intel/jit/pooling/gen_pooling.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gpu/intel/jit/pooling/gen_pooling.cpp b/src/gpu/intel/jit/pooling/gen_pooling.cpp index 82cd9d8fe5e..5c9a3f54d08 100644 --- a/src/gpu/intel/jit/pooling/gen_pooling.cpp +++ b/src/gpu/intel/jit/pooling/gen_pooling.cpp @@ -153,6 +153,7 @@ status_t gen_pooling_fwd_t::init(engine_t *engine) { try { kernel_ = make_kernel(this, engine, cfg_, "gen_pooling_fwd", kernel_info_, grf_mode_t::any, *pd()); + break; } catch (const ngen::out_of_registers_exception &exc) { UNUSED(exc); ir_warning() << "loop too large: cut and retry!" 
<< std::endl; @@ -161,6 +162,10 @@ status_t gen_pooling_fwd_t::init(engine_t *engine) { ir_error_not_expected() << "minimal loop too large!"; break; } + } catch (const std::exception &exc) { + ir_error_not_expected() << exc.what(); + kernel_ = {}; + break; } } set_version(cfg_.n_cuts()); From 3236077120bca55cfbe1eaf3c9e7f4962e0e27d4 Mon Sep 17 00:00:00 2001 From: "Zhang, Yifei" Date: Thu, 23 May 2024 00:07:54 -0700 Subject: [PATCH 182/187] graph: backend: compiler: ops: update matmul config --- .../core/src/ops/templates/matmul_core.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/graph/backend/graph_compiler/core/src/ops/templates/matmul_core.cpp b/src/graph/backend/graph_compiler/core/src/ops/templates/matmul_core.cpp index 1fc341043cc..a5d1be7daf0 100644 --- a/src/graph/backend/graph_compiler/core/src/ops/templates/matmul_core.cpp +++ b/src/graph/backend/graph_compiler/core/src/ops/templates/matmul_core.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2022-2023 Intel Corporation + * Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -154,8 +154,15 @@ config_ptr gen_matmul_core_t::get_default_config(context_ptr ctx) const { : (pad48_K >= pad32_K ? 32 : 48); if (K < 32) { cfg.K_block = K; } } else { - if (A_plain_dims[1] < 64) { - cfg.K_block = utils::rnd_up(A_plain_dims[1], is_vnni_low_fp ? 2 : 4); + int K = static_cast(A_plain_dims[1]); + if (K < 64) { + cfg.K_block = utils::rnd_up(K, is_vnni_low_fp ? 2 : 4); + } else if (K < 256) { + int ceil64_K = static_cast(utils::rnd_up(K, 64)); + int ceil32_K = static_cast(utils::rnd_up(K, 32)); + int pad64_K = ceil64_K - K; + int pad32_K = ceil32_K - K; + cfg.K_block = pad64_K > pad32_K ? 
32 : 64; } } bool is_cfg_set = false; From 157de36c851fb457a9e030d8181298aed3cca5eb Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Thu, 23 May 2024 13:47:00 +0800 Subject: [PATCH 183/187] api: graph: use different template typename for set_attr fix doxygen warnings duplicate member id --- include/oneapi/dnnl/dnnl_graph.hpp | 41 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dnnl/dnnl_graph.hpp b/include/oneapi/dnnl/dnnl_graph.hpp index 3726f3e65af..2aceaf7c377 100644 --- a/include/oneapi/dnnl/dnnl_graph.hpp +++ b/include/oneapi/dnnl/dnnl_graph.hpp @@ -1027,12 +1027,12 @@ class op : public op_handle { /// Sets the attribute according to the name and type (int64_t). /// - /// @tparam Type Attribute's type. + /// @tparam Type_i Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. - template ::value> = true> - op &set_attr(attr name, const Type &value) { + template ::value> = true> + op &set_attr(attr name, const Type_i &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); error::wrap_c_api(dnnl_graph_op_set_attr_s64(get(), attr, &value, 1), "could not set attribute to the op"); @@ -1041,12 +1041,12 @@ class op : public op_handle { /// Sets the attribute according to the name and type (float). /// - /// @tparam Type Attribute's type. + /// @tparam Type_f Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. - template ::value> = true> - op &set_attr(attr name, const Type &value) { + template ::value> = true> + op &set_attr(attr name, const Type_f &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); error::wrap_c_api(dnnl_graph_op_set_attr_f32(get(), attr, &value, 1), "could not set attribute to the op"); @@ -1055,12 +1055,12 @@ class op : public op_handle { /// Sets the attribute according to the name and type (bool). /// - /// @tparam Type Attribute's type.
+ /// @tparam Type_b Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. - template ::value> = true> - op &set_attr(attr name, const Type &value) { + template ::value> = true> + op &set_attr(attr name, const Type_b &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); const uint8_t val = value; error::wrap_c_api(dnnl_graph_op_set_attr_bool(get(), attr, &val, 1), @@ -1070,12 +1070,13 @@ class op : public op_handle { /// Sets the attribute according to the name and type (string). /// - /// @tparam Type Attribute's type. + /// @tparam Type_s Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. - template ::value> = true> - op &set_attr(attr name, const Type &value) { + template ::value> = true> + op &set_attr(attr name, const Type_s &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); error::wrap_c_api(dnnl_graph_op_set_attr_str( get(), attr, value.c_str(), value.size()), @@ -1086,13 +1087,13 @@ class op : public op_handle { /// Sets the attribute according to the name and type /// (std::vector). /// - /// @tparam Type Attribute's type. + /// @tparam Type_is Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. - template >::value> = true> - op &set_attr(attr name, const Type &value) { + template >::value> = true> + op &set_attr(attr name, const Type_is &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); error::wrap_c_api(dnnl_graph_op_set_attr_s64( get(), attr, value.data(), value.size()), @@ -1102,13 +1103,13 @@ class op : public op_handle { /// Sets the attribute according to the name and type (std::vector). /// - /// @tparam Type Attribute's type. + /// @tparam Type_fs Attribute's type. /// @param name Attribute's name. /// @param value The attribute's value. /// @returns The Op self. 
- template >::value> = true> - op &set_attr(attr name, const Type &value) { + template >::value> = true> + op &set_attr(attr name, const Type_fs &value) { dnnl_graph_op_attr_t attr = convert_to_c(name); error::wrap_c_api(dnnl_graph_op_set_attr_f32( get(), attr, value.data(), value.size()), From 038c1be993a26c4ba3ee0861eb26fd87a2ac9c57 Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Thu, 23 May 2024 16:22:08 +0800 Subject: [PATCH 184/187] api: graph: add file commands for doxygen --- include/oneapi/dnnl/dnnl_graph.h | 2 +- include/oneapi/dnnl/dnnl_graph.hpp | 3 +++ include/oneapi/dnnl/dnnl_graph_ocl.hpp | 3 +++ include/oneapi/dnnl/dnnl_graph_sycl.hpp | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dnnl/dnnl_graph.h b/include/oneapi/dnnl/dnnl_graph.h index a2db234f984..a598754ff95 100644 --- a/include/oneapi/dnnl/dnnl_graph.h +++ b/include/oneapi/dnnl/dnnl_graph.h @@ -15,7 +15,7 @@ *******************************************************************************/ /// @file -/// C API +/// Graph C API #ifndef ONEAPI_DNNL_DNNL_GRAPH_H #define ONEAPI_DNNL_DNNL_GRAPH_H diff --git a/include/oneapi/dnnl/dnnl_graph.hpp b/include/oneapi/dnnl/dnnl_graph.hpp index 2aceaf7c377..8a180140136 100644 --- a/include/oneapi/dnnl/dnnl_graph.hpp +++ b/include/oneapi/dnnl/dnnl_graph.hpp @@ -14,6 +14,9 @@ * limitations under the License. *******************************************************************************/ +/// @file +/// Graph C++ API + #ifndef ONEAPI_DNNL_DNNL_GRAPH_HPP #define ONEAPI_DNNL_DNNL_GRAPH_HPP diff --git a/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/include/oneapi/dnnl/dnnl_graph_ocl.hpp index b893ff95f83..636dc0d1c47 100644 --- a/include/oneapi/dnnl/dnnl_graph_ocl.hpp +++ b/include/oneapi/dnnl/dnnl_graph_ocl.hpp @@ -14,6 +14,9 @@ * limitations under the License. 
*******************************************************************************/ +/// @file +/// Graph OpenCL interop API + #ifndef ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP #define ONEAPI_DNNL_DNNL_GRAPH_OCL_HPP diff --git a/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/include/oneapi/dnnl/dnnl_graph_sycl.hpp index 5569b8b852b..8f694f4b36b 100644 --- a/include/oneapi/dnnl/dnnl_graph_sycl.hpp +++ b/include/oneapi/dnnl/dnnl_graph_sycl.hpp @@ -14,6 +14,9 @@ * limitations under the License. *******************************************************************************/ +/// @file +/// Graph SYCL interop API + #ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP #define ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP From 6bfd8fdc89b6ea4e308dd3e9be981fa2c6f5c71c Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Mon, 13 May 2024 22:57:12 +0800 Subject: [PATCH 185/187] examples: graph: remove trailing transpose/reshape from sdpa example align with openvino's definition --- examples/graph/gpu_opencl_sdpa.cpp | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/examples/graph/gpu_opencl_sdpa.cpp b/examples/graph/gpu_opencl_sdpa.cpp index d7b8e3cdc46..c9f7498b8c5 100644 --- a/examples/graph/gpu_opencl_sdpa.cpp +++ b/examples/graph/gpu_opencl_sdpa.cpp @@ -84,9 +84,6 @@ void gpu_float_sdpa(data_type dtype, int batch_size, int seq_len, int num_head, dims qk_output_shape = {batch_size, num_head, seq_len, seq_len}; dims scale_shape = {1}; dims attention_mask_shape = {batch_size, 1, 1, seq_len}; - dims qkv_transpose_order = {0, 2, 1, 3}; - dims qkv_transposed_shape = {batch_size, seq_len, num_head, size_per_head}; - dims qkv_reshaped_shape = {batch_size * seq_len, head_dim}; size_t lt_id = 0; @@ -127,28 +124,12 @@ void gpu_float_sdpa(data_type dtype, int batch_size, int seq_len, int num_head, op matmul_v {4, op::kind::MatMul, {softmax_out, value_input}, {matmul_v_out}, "matmul_v"}; - logical_tensor qkv_transposed_out { - lt_id++, dtype, qkv_transposed_shape, layout_type::strided}; - op 
transpose {5, op::kind::StaticTranspose, {matmul_v_out}, - {qkv_transposed_out}, "transpose"}; - transpose.set_attr>( - op::attr::order, qkv_transpose_order); - - logical_tensor qkv_reshaped_out { - lt_id++, dtype, qkv_reshaped_shape, layout_type::strided}; - op reshape {6, op::kind::StaticReshape, {qkv_transposed_out}, - {qkv_reshaped_out}, "reshape"}; - reshape.set_attr(op::attr::special_zero, false); - reshape.set_attr>(op::attr::shape, qkv_reshaped_shape); - graph g(ekind); g.add_op(matmul_qk); g.add_op(scale_div); g.add_op(mask_add); g.add_op(softmax); g.add_op(matmul_v); - g.add_op(transpose); - g.add_op(reshape); g.finalize(); std::vector partitions = g.get_partitions(); @@ -163,7 +144,7 @@ void gpu_float_sdpa(data_type dtype, int batch_size, int seq_len, int num_head, std::vector inputs_ts, outputs_ts; std::vector> data_buffer; std::unordered_map global_outputs_ts_map; - // Input/output memory should be prepared by users. This helper funciton is + // Input/output memory should be prepared by users. This helper function is // for testing purpose and not part of API. 
allocate_ocl_graph_mem( inputs_ts, inputs, data_buffer, global_outputs_ts_map, eng, true); From 1162b98a81ee17508407161c5529402f23b92f9e Mon Sep 17 00:00:00 2001 From: "Lv, Tao A" Date: Wed, 15 May 2024 03:02:40 -0700 Subject: [PATCH 186/187] benchdnn: graph: inputs: add a f16 simplified sdpa case --- .../graph/complex_fusion/harness_mha_all | 1 + .../graph/complex_fusion/harness_mha_ci | 1 + .../mha/sdpa-plain-simplified-f16.json | 347 ++++++++++++++++++ 3 files changed, 349 insertions(+) create mode 100644 tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-plain-simplified-f16.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all index 3317af140a0..f5d4c31307d 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_all @@ -23,6 +23,7 @@ --reset --case=complex_fusion/mha/MHA_forward-Bert_large-train-bf16-bs4.json --reset --case=complex_fusion/mha/MHA_forward-Bert_large-train-fp32-bs4.json --reset --case=complex_fusion/mha/dynamic_quantized_mha-Bert_large-inf-int8-bs1-fake.json +--reset --case=complex_fusion/mha/sdpa-plain-simplified-f16.json # Rewrited graphs --reset --in-shapes=4:4x16x32x256+5:4x16x256x33+0:4x16x33x256+1:4x1x1x33+3:4x1x32x33 --case=complex_fusion/mha/MHA-GPT-inf-fp32-bs1.json diff --git a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci index add28c3b2f5..ab7056e793d 100644 --- a/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci +++ b/tests/benchdnn/inputs/graph/complex_fusion/harness_mha_ci @@ -16,3 +16,4 @@ --reset --case=complex_fusion/mha/MHA-starcoder-inf-fp32-bs1.json --reset --case=complex_fusion/mha/MHA-starcoder-inf-int8-bs1.json --reset --case=complex_fusion/mha/dynamic_quantized_mha-Bert_large-inf-int8-bs1-fake.json +--reset --case=complex_fusion/mha/sdpa-plain-simplified-f16.json diff --git
a/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-plain-simplified-f16.json b/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-plain-simplified-f16.json new file mode 100644 index 00000000000..a3385a1c7d0 --- /dev/null +++ b/tests/benchdnn/inputs/graph/complex_fusion/mha/sdpa-plain-simplified-f16.json @@ -0,0 +1,347 @@ +{ + "version": "3.6.0", + "engine_kind": "gpu", + "fpmath_mode": "strict", + "input_ports": [ + 0, + 1, + 3, + 5, + 8 + ], + "output_ports": [ + 9 + ], + "graph": [ + { + "id": 0, + "name": "matmul_qk", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 1 + } + }, + "inputs": [ + { + "id": 0, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 64 + ], + "stride": [ + 393216, + 24576, + 64, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 1, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 64 + ], + "stride": [ + 393216, + 24576, + 64, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 2, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 1, + "name": "scale_div", + "kind": "Divide", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 2, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 3, + "dtype": "f16", + "shape": [ + 1 + ], + "stride": [ + 1 + ], + "layout_type": "strided", + "property_type": "constant" + } + ], + "outputs": [ + { + "id": 4, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 2, + "name": 
"mask_add", + "kind": "Add", + "attrs": { + "auto_broadcast": { + "type": "string", + "value": "numpy" + } + }, + "inputs": [ + { + "id": 4, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 5, + "dtype": "f16", + "shape": [ + 1, + 1, + 1, + 384 + ], + "stride": [ + 384, + 384, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 6, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 3, + "name": "softmax", + "kind": "SoftMax", + "attrs": { + "axis": { + "type": "s64", + "value": -1 + } + }, + "inputs": [ + { + "id": 6, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 7, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ] + }, + { + "id": 4, + "name": "matmul_v", + "kind": "MatMul", + "attrs": { + "transpose_a": { + "type": "bool", + "value": 0 + }, + "transpose_b": { + "type": "bool", + "value": 0 + } + }, + "inputs": [ + { + "id": 7, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 384 + ], + "stride": [ + 2359296, + 147456, + 384, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + }, + { + "id": 8, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 64 + ], + "stride": [ + 393216, + 24576, + 64, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + } + ], + "outputs": [ + { + "id": 9, + "dtype": "f16", + "shape": [ + 1, + 16, + 384, + 64 + ], + "stride": [ + 393216, + 24576, + 64, + 1 + ], + "layout_type": "strided", + "property_type": "undef" + 
} + ] + } + ] +} + From 2b0c176889add633f77355792b05faa82a77cf09 Mon Sep 17 00:00:00 2001 From: Simon Ewing Date: Mon, 20 May 2024 15:54:58 -0700 Subject: [PATCH 187/187] gpu: reduction: update subsplitting heuristic --- .../ocl/reduction/combined_reduction.cpp | 86 ++++++++++--------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 216e40ba2e9..296d71380fc 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -184,52 +184,58 @@ status_t split_into_phases(const reduction_subproblem_t &subprb, data_type_t accum_data_type, const compute::compute_engine_t *compute_engine, std::vector &phases, bool large_grf_mode) { - - const int subgroup_size - = compute_engine->device_info()->max_subgroup_size(); - const dim_t inner_elems = subprb.inner_block.block; const dim_t reduction_elems = subprb.reduction_block.block; - const dim_t outer_elems = subprb.outer_block.block; - const dim_t inner_dim_per_sg - = nstl::max(dim_t {1}, subgroup_size / inner_elems); - const int num_EU = compute_engine->device_info()->eu_count(); - const dim_t num_sg_per_red_end - = outer_elems * utils::div_up(inner_elems, subgroup_size); - - //Heuristics: - // EU_mult: reduce parallelism to at most num_EU*EU_mult (reduces scheduling overhead?) 
- const int EU_mult = 20; - // Target single_phase_threshold horizontal reductions with each phase - const int single_phase_threshold = 1024; - - // Estimate the number of phases remaining, and divide it up evenly around this target - int N = static_cast(std::ceil(std::log2(reduction_elems) - / std::log2(single_phase_threshold * inner_dim_per_sg))); - N = std::max(1, N); // N must be positive - dim_t reduction_end = static_cast( - std::pow(reduction_elems, 1.0f - 1.0f / static_cast(N))); - - // Reduce parallelism and finalize reduction_end - reduction_end = nstl::clamp( - num_EU * EU_mult / num_sg_per_red_end, dim_t {1}, reduction_end); - reduction_end = get_previous_factor(reduction_elems, reduction_end); + //Heuristic: + // subsplitting has a high cost due to launching multiple sequential threads, + // so only split when parallelism is low and reductions per thread is large + reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type, + compute_engine, large_grf_mode); + const bool low_parallelism = [&compute_engine, &large_grf_mode, + &try_phase]() { + compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch(); + int threads_per_EU = large_grf_mode + ? 
4 + : compute::device_info_t::threads_per_eu(arch); + const int num_EU = compute_engine->device_info()->eu_count(); + const int min_threads = gpu_utils::dev_getenv( + "combined_reduction_occ_thresh", threads_per_EU * num_EU / 2); + const int dispatched_threads + = gpu_utils::into(try_phase.nd_range.global_range()[0] + / gpu_utils::into(try_phase.subgroup_size)); + return dispatched_threads < min_threads; + }(); + const bool large_reduction = [&try_phase]() { + const int slm_red + = gpu_utils::into(try_phase.nd_range.local_range()[0] + / gpu_utils::into(try_phase.subgroup_size)); + const dim_t sg_red = nstl::clamp( + try_phase.subgroup_size / try_phase.inner_block.block, + dim_t {1}, try_phase.reduction_block.block); + const dim_t red_per_thread + = try_phase.reduction_block.block / slm_red / sg_red; + const int red_thresh + = gpu_utils::dev_getenv("combined_reduction_split_thresh", 128); + return red_per_thread >= red_thresh; + }(); + if (!large_reduction || !low_parallelism) { + phases.emplace_back(try_phase); + return status::success; + } - // Create the phase and recursively enter - dim_t reduction_size = reduction_elems / reduction_end; + // Split into 2 phases + dim_t reduction_end = static_cast(std::sqrt(reduction_elems)); + reduction_end = get_previous_factor(reduction_elems, reduction_end); - if (reduction_end == 1) { - phases.emplace_back(subprb, accum_data_type, accum_data_type, + auto subdivided + = subdivide_subproblem(subprb, reduction_elems / reduction_end); + phases.emplace_back(subdivided[0], accum_data_type, accum_data_type, + compute_engine, large_grf_mode); + if (reduction_end > 1) { + phases.emplace_back(subdivided[1], accum_data_type, accum_data_type, compute_engine, large_grf_mode); - return status::success; - } else { - // Subdivide the subproblem by reducing by reduction_size first - auto subdivided = subdivide_subproblem(subprb, reduction_size); - phases.emplace_back(subdivided[0], accum_data_type, accum_data_type, - compute_engine, 
large_grf_mode); - return split_into_phases(subdivided[1], accum_data_type, compute_engine, - phases, large_grf_mode); } + return status::success; } status_t combined_reduction_t::pd_t::init_conf(engine_t *engine) {