Skip to content

Commit

Permalink
Merge branch 'local_HEAD' into ops_extend_amd
Browse files Browse the repository at this point in the history
  • Loading branch information
pyf committed May 29, 2024
2 parents e2ec34c + 2b0c176 commit 2f7d655
Show file tree
Hide file tree
Showing 521 changed files with 22,345 additions and 7,757 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/openssf-scorecard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4.1.5
with:
persist-credentials: false

- name: "Run analysis"
uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
with:
results_file: results.sarif
results_format: sarif
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/slack-pr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#===============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

name: Slack PR Notification
on:
# use pull_request_target to run on PRs from forks and have access to secrets
pull_request_target:
types: [labeled]

env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
channel: "onednn"

permissions:
pull-requests: read

jobs:
rfc:
name: RFC Notification
runs-on: ubuntu-latest
# Trigger when labeling a PR with "RFC"
if: |
github.event.action == 'labeled' &&
contains(toJson(github.event.pull_request.labels.*.name), '"RFC"')
steps:
- name: Notify Slack
uses: slackapi/slack-github-action@70cd7be8e40a46e8b0eced40b0de447bdb42f68e # v1.26.0
with:
channel-id: ${{ env.channel }}
slack-message: "${{ github.actor }} posted a RFC: ${{ github.event.pull_request.title }}. URL: ${{ github.event.pull_request.html_url }}"
16 changes: 13 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,17 @@ following information:
oneDNN uses gtests for lightweight functional testing and benchdnn for
performance and functional testing.

Be sure to extend the existing tests when fixing an issue.
Verify the modified code is covered by existing tests. If not, update the
coverage to validate the change and submit it as part of the PR.

Developing new benchdnn tests can be hard, so it is a good idea to start with
gtests first.
Use the following command to run tests selected by a build configuration:
``` sh
ctest
```

To modify the coverage, use the
[`ONEDNN_TEST_SET`](https://oneapi-src.github.io/oneDNN/dev_guide_build_options.html#onednn-test-set)
build option.

More details on how to run benchdnn can be found in
[benchdnn documentation](tests/benchdnn/doc/benchdnn_general_info.md#running-tests).
4 changes: 2 additions & 2 deletions cmake/ACL.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ******************************************************************************
# Copyright 2020-2023 Arm Limited and affiliates.
# Copyright 2020-2024 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -31,7 +31,7 @@ endif()

find_package(ACL REQUIRED)

set(ACL_MINIMUM_VERSION "23.11")
set(ACL_MINIMUM_VERSION "24.04")

if(ACL_FOUND)
file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
Expand Down
48 changes: 29 additions & 19 deletions cmake/SYCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,27 @@ else()
list(APPEND EXTRA_SHARED_LIBS OpenCL::OpenCL)
endif()

# CUDA and ROCm contain OpenCL headers that conflict with the OpenCL
# headers located in the compiler's directory.
# The workaround is to get interface include directories from all CUDA/ROCm
# import targets and lower their priority via `-idirafter` so that the
# compiler picks up the proper OpenCL headers.
macro(adjust_headers_priority targets)
if(NOT WIN32)
set(include_dirs)
foreach(import_target ${targets})
get_target_property(import_target_include_dirs ${import_target} INTERFACE_INCLUDE_DIRECTORIES)
set_target_properties(${import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
list(APPEND include_dirs ${import_target_include_dirs})
endforeach()

list(REMOVE_DUPLICATES include_dirs)
foreach(include_dir ${include_dirs})
append(CMAKE_CXX_FLAGS "-idirafter${include_dir}")
endforeach()
endif()
endmacro()

if(DNNL_SYCL_CUDA)
# XXX: Suppress warning coming from SYCL headers:
# error: use of function template name with no prior declaration in
Expand All @@ -80,31 +101,20 @@ if(DNNL_SYCL_CUDA)
find_package(cuBLAS REQUIRED)
find_package(cuDNN REQUIRED)

if(NOT WIN32)
# XXX: CUDA contains OpenCL headers that conflict with the OpenCL
# headers located in the compiler's directory.
# The workaround is the following:
# Get interface include directories from all CUDA related import
# targets and lower their priority via `-idirafter` so that the
# compiler picks up the proper OpenCL headers.
set(cuda_include_dirs)
foreach(cuda_import_target cuBLAS::cuBLAS;cuDNN::cuDNN)
get_target_property(cuda_import_target_include_dirs ${cuda_import_target} INTERFACE_INCLUDE_DIRECTORIES)
set_target_properties(${cuda_import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
list(APPEND cuda_include_dirs ${cuda_import_target_include_dirs})
endforeach()

list(REMOVE_DUPLICATES cuda_include_dirs)
foreach(cuda_include_dir ${cuda_include_dirs})
append(CMAKE_CXX_FLAGS "-idirafter${cuda_include_dir}")
endforeach()
endif()
adjust_headers_priority("cuBLAS::cuBLAS;cuDNN::cuDNN")
add_definitions_with_host_compiler("-DCUDA_NO_HALF")

list(APPEND EXTRA_SHARED_LIBS cuBLAS::cuBLAS cuDNN::cuDNN)
message(STATUS "DPC++ support is enabled (CUDA)")
elseif(DNNL_SYCL_HIP)
find_package(HIP REQUIRED)
find_package(rocBLAS REQUIRED)
find_package(MIOpen REQUIRED)

adjust_headers_priority("HIP::HIP;rocBLAS::rocBLAS;MIOpen::MIOpen")
add_definitions_with_host_compiler("-D__HIP_PLATFORM_AMD__=1")

list(APPEND EXTRA_SHARED_LIBS HIP::HIP rocBLAS::rocBLAS MIOpen::MIOpen)
message(STATUS "DPC++ support is enabled (HIP)")
else()
# In order to support large shapes.
Expand Down
18 changes: 14 additions & 4 deletions cmake/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ option(DNNL_EXPERIMENTAL_SPARSE
independently from DNNL_EXPERIMENTAL."
OFF) # disabled by default

option(DNNL_EXPERIMENTAL_UKERNEL
"Enable experimental functionality for ukernels. This option works
independently from DNNL_EXPERIMENTAL."
OFF) # disabled by default

option(DNNL_EXPERIMENTAL_PROFILING
"Enable experimental profiling capabilities. This option works independently
from DNNL_EXPERIMENTAL."
Expand Down Expand Up @@ -269,10 +274,15 @@ if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
endif()

set(DNNL_GPU_VENDOR "INTEL" CACHE STRING
"specifies target GPU vendor for GPU engines.
Can be INTEL (default) or NVIDIA.")
if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(INTEL|NVIDIA|AMD)$")
set(DNNL_GPU_VENDOR "NONE" CACHE STRING
"When DNNL_GPU_RUNTIME is not NONE, DNNL_GPU_VENDOR specifies the target
GPU vendor for GPU engines. Can be INTEL (default), NVIDIA or AMD.")

if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND DNNL_GPU_VENDOR STREQUAL "NONE")
set(DNNL_GPU_VENDOR "INTEL")
endif()

if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|INTEL|NVIDIA|AMD)$")
message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}")
endif()

Expand Down
2 changes: 1 addition & 1 deletion doc/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -1962,7 +1962,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS
PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_SPARSE DNNL_EXPERIMENTAL_UKERNEL

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
Expand Down
15 changes: 14 additions & 1 deletion doc/advanced/experimental.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ Both kinds of experimental features can be enabled simultaneously.
| Build time option | Description |
|:-------------------------------------------|:-------------------------------------------------------------------|
| ONEDNN_EXPERIMENTAL_SPARSE | Enable experimental API and functionality for sparse domain. |
| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API.
| ONEDNN_EXPERIMENTAL_UKERNEL | Enable experimental microkernel APIs and functionalities. |
| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API. |
| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND | Enable experimental graph compiler backend of the graph component. |


## Features details

### ONEDNN_EXPERIMENTAL_SPARSE
Expand Down Expand Up @@ -192,6 +194,17 @@ destination tensor should also work for the sparse one.
Multiplication and Reorder primitives
* Sparse memory can be created only for a CPU engine
### ONEDNN_EXPERIMENTAL_UKERNEL
This option enables a new set of CPU-only APIs to support block-level
functionalities. By composing these low-level, sequential operations, users can
implement their own custom operations/fusions, and tailor blocking/threading
logic to their applications.
More details on this API are available in the [Microkernel APIs
section](@ref dev_guide_ukernel_basic_concepts).
### ONEDNN_EXPERIMENTAL_PROFILING
This option enables profiling API that can be used to query different
profiling data.
Expand Down
9 changes: 4 additions & 5 deletions doc/advanced/understanding_memory_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,15 @@ For a single image (**N** = 1), this format is very similar to how
where the image is kept pixel by pixel and every pixel contains all
required information about colors (for instance, three channels for 24bit BMP).

NHWC data format is the default one for
[TensorFlow](https://www.tensorflow.org/performance/performance_guide#data_formats).
NHWC is the default data format for image recognition in
[TensorFlow](https://www.tensorflow.org/api_docs/python/tf/conv).

This layout corresponds to #dnnl_nhwc or dnnl::memory::format_tag::nhwc.


#### CHWN

The last example here for the plain data layout is **CHWN**, which is used by
[Neon](https://neon.nervanasys.com/index.html/design.html#data-layout).
This layout might be very interesting from a vectorization perspective if
The last example here for the plain data layout is **CHWN**. This layout might be very interesting from a vectorization perspective if
an appropriate batch size is used, but on the other hand users cannot always
have *good* batch size (for example, in case of real-time inference batch is
typically 1).
Expand Down Expand Up @@ -222,6 +220,7 @@ the function above is:
dnnl_memory_desc_create_with_strides(&md, ndims, dims, dnnl_f32, strides);
~~~

In particular, whenever a user creates memory with the #dnnl_nchw format,
oneDNN computes the strides and fills the structure on behalf of the
user.
Expand Down
2 changes: 1 addition & 1 deletion doc/build/build_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ oneDNN supports the following build-time options.
| ONEDNN_DEV_MODE | ON, **OFF** | Enables internal tracing and `debuginfo` logging in verbose output (for oneDNN developers) |
| ONEDNN_AARCH64_USE_ACL | ON, **OFF** | Enables integration with Arm Compute Library for AArch64 builds |
| ONEDNN_BLAS_VENDOR | **NONE**, ARMPL, ACCELERATE | Defines an external BLAS library to link to for GEMM-like operations |
| ONEDNN_GPU_VENDOR | **INTEL**, NVIDIA, AMD | Defines GPU vendor for GPU engines |
| ONEDNN_GPU_VENDOR | NONE, **INTEL**, NVIDIA, AMD | When ONEDNN_GPU_RUNTIME is not NONE, defines the GPU vendor for GPU engines; otherwise its value is NONE |
| ONEDNN_DPCPP_HOST_COMPILER | **DEFAULT**, *GNU or Clang C++ compiler executable* | Specifies host compiler executable for SYCL runtime |
| ONEDNN_LIBRARY_NAME | **dnnl**, *library name* | Specifies name of the library |
| ONEDNN_TEST_SET | SMOKE, **CI**, NIGHTLY, MODIFIER_NAME | Specifies the testing coverage enabled through the generated testing targets |
Expand Down
23 changes: 13 additions & 10 deletions doc/build/link.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@ on how oneDNN was built.

## Header Files

| File | Description |
|:-------------------------------------------|:----------------------------------|
| ``include/oneapi/dnnl/dnnl.h`` | C header |
| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header |
| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information |
| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API |
| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API |
| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API |
| File | Description |
|:---------------------------------------------|:-----------------------------------|
| ``include/oneapi/dnnl/dnnl.h`` | C header |
| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header |
| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information |
| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API |
| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API |
| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API |
| ``include/oneapi/dnnl/dnnl_ukernel.h`` | C header for ukernel API |
| ``include/oneapi/dnnl/dnnl_ukernel.hpp`` | C++ header for ukernel API |
| ``include/oneapi/dnnl/dnnl_ukernel_types.h`` | Auxiliary C header for ukernel API |

## Libraries

Expand Down
2 changes: 1 addition & 1 deletion doc/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ Examples {#dev_guide_examples}
| | CPU | @ref graph_cpu_inference_int8_cpp | |
| | CPU/GPU | @ref graph_sycl_getting_started_cpp | |
| | CPU | @ref graph_cpu_single_op_partition_cpp| |
| | GPU | @ref graph_gpu_single_op_partition_cpp| |
| | GPU | @ref graph_sycl_single_op_partition_cpp| |
2 changes: 1 addition & 1 deletion doc/graph/operations/Dequantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels.
| [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which de-quantization type is used. |string | `per_tensor` (default), `per_channel` | Optional |
| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel de-quantization is applied. |s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional |
| [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. |f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that map to float zero. |s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. |s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |

## Execution arguments

Expand Down
3 changes: 1 addition & 2 deletions doc/graph/operations/DynamicDequantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ element number of src tensor along the dimension axis.
@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` =
`per-tensor`, there should be only one element in the zps tensor. For `qtype` =
`per-channel`, the element number should be equal to the element number of input
tensor along the dimension axis. If not specified, the library can assume the
operator is symmetric de-quantization and perform kernel optimization accordingly.
tensor along the dimension axis. If omitted, zps values are assumed to be zero.

### Outputs

Expand Down
9 changes: 4 additions & 5 deletions doc/graph/operations/DynamicQuantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,10 @@ constructing an operation.
For `qtype` = `per-channel`, the element number should be equal to the element
number of src tensor along the dimension axis.

@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` = `per-tensor`, there should be only one
element in the zps tensor. For `qtype` = `per-channel`, the element number should be
equal to the element number of input tensor along the dimension axis. If not
specified, the library can assume the operator is symmetric quantization and
perform kernel optimization accordingly.
@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` =
`per-tensor`, there should be only one element in the zps tensor. For `qtype` =
`per-channel`, the element number should be equal to the element number of input
tensor along the dimension axis. If omitted, zps values are assumed to be zero.

### Outputs

Expand Down
2 changes: 1 addition & 1 deletion doc/graph/operations/Quantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels.
| [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which quantization type is used. | string | `per_tensor` (default), `per_channel` | Optional |
| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel quantization is applied. | s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional |
| [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. | f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`) | Optional |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that map to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |

## Execution arguments

Expand Down
Loading

0 comments on commit 2f7d655

Please sign in to comment.