Skip to content

Commit

Permalink
Merge branch 'local_HEAD' into ops_extend_amd
Browse files Browse the repository at this point in the history
  • Loading branch information
pyf committed May 29, 2024
2 parents e2ec34c + 2b0c176 commit 2f7d655
Show file tree
Hide file tree
Showing 521 changed files with 22,345 additions and 7,757 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/openssf-scorecard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4.1.5
with:
persist-credentials: false

- name: "Run analysis"
uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
with:
results_file: results.sarif
results_format: sarif
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/slack-pr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#===============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

name: Slack PR Notification
on:
# use pull_request_target to run on PRs from forks and have access to secrets
pull_request_target:
types: [labeled]

env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
channel: "onednn"

permissions:
pull-requests: read

jobs:
rfc:
name: RFC Notification
runs-on: ubuntu-latest
# Trigger when labeling a PR with "RFC"
if: |
github.event.action == 'labeled' &&
contains(toJson(github.event.pull_request.labels.*.name), '"RFC"')
steps:
- name: Notify Slack
uses: slackapi/slack-github-action@70cd7be8e40a46e8b0eced40b0de447bdb42f68e # v1.26.0
with:
channel-id: ${{ env.channel }}
slack-message: "${{ github.actor }} posted a RFC: ${{ github.event.pull_request.title }}. URL: ${{ github.event.pull_request.html_url }}"
16 changes: 13 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,17 @@ following information:
oneDNN uses gtests for lightweight functional testing and benchdnn for
performance and functional testing.

Be sure to extend the existing tests when fixing an issue.
Verify the modified code is covered by existing tests. If not, update the
coverage to validate the change and submit it as part of the PR.

Developing new benchdnn tests can be hard, so it is a good idea to start with
gtests first.
Use the following command to run tests selected by a build configuration:
``` sh
ctest
```

To modify the coverage, use the
[`ONEDNN_TEST_SET`](https://oneapi-src.github.io/oneDNN/dev_guide_build_options.html#onednn-test-set)
build option.

More details on how to run benchdnn can be found in
[benchdnn documentation](tests/benchdnn/doc/benchdnn_general_info.md#running-tests).
4 changes: 2 additions & 2 deletions cmake/ACL.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ******************************************************************************
# Copyright 2020-2023 Arm Limited and affiliates.
# Copyright 2020-2024 Arm Limited and affiliates.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -31,7 +31,7 @@ endif()

find_package(ACL REQUIRED)

set(ACL_MINIMUM_VERSION "23.11")
set(ACL_MINIMUM_VERSION "24.04")

if(ACL_FOUND)
file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
Expand Down
48 changes: 29 additions & 19 deletions cmake/SYCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,27 @@ else()
list(APPEND EXTRA_SHARED_LIBS OpenCL::OpenCL)
endif()

# CUDA and ROCm contain OpenCL headers that conflict with the OpenCL
# headers located in the compiler's directory.
# The workaround is to get interface include directories from all CUDA/ROCm
# import targets and lower their priority via `-idirafter` so that the
# compiler picks up the proper OpenCL headers.
macro(adjust_headers_priority targets)
if(NOT WIN32)
set(include_dirs)
foreach(import_target ${targets})
get_target_property(import_target_include_dirs ${import_target} INTERFACE_INCLUDE_DIRECTORIES)
set_target_properties(${import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
list(APPEND include_dirs ${import_target_include_dirs})
endforeach()

list(REMOVE_DUPLICATES include_dirs)
foreach(include_dir ${include_dirs})
append(CMAKE_CXX_FLAGS "-idirafter${include_dir}")
endforeach()
endif()
endmacro()

if(DNNL_SYCL_CUDA)
# XXX: Suppress warning coming from SYCL headers:
# error: use of function template name with no prior declaration in
Expand All @@ -80,31 +101,20 @@ if(DNNL_SYCL_CUDA)
find_package(cuBLAS REQUIRED)
find_package(cuDNN REQUIRED)

if(NOT WIN32)
# XXX: CUDA contains OpenCL headers that conflict with the OpenCL
# headers located in the compiler's directory.
# The workaround is the following:
# Get interface include directories from all CUDA related import
# targets and lower their priority via `-idirafter` so that the
# compiler picks up the proper OpenCL headers.
set(cuda_include_dirs)
foreach(cuda_import_target cuBLAS::cuBLAS;cuDNN::cuDNN)
get_target_property(cuda_import_target_include_dirs ${cuda_import_target} INTERFACE_INCLUDE_DIRECTORIES)
set_target_properties(${cuda_import_target} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
list(APPEND cuda_include_dirs ${cuda_import_target_include_dirs})
endforeach()

list(REMOVE_DUPLICATES cuda_include_dirs)
foreach(cuda_include_dir ${cuda_include_dirs})
append(CMAKE_CXX_FLAGS "-idirafter${cuda_include_dir}")
endforeach()
endif()
adjust_headers_priority("cuBLAS::cuBLAS;cuDNN::cuDNN")
add_definitions_with_host_compiler("-DCUDA_NO_HALF")

list(APPEND EXTRA_SHARED_LIBS cuBLAS::cuBLAS cuDNN::cuDNN)
message(STATUS "DPC++ support is enabled (CUDA)")
elseif(DNNL_SYCL_HIP)
find_package(HIP REQUIRED)
find_package(rocBLAS REQUIRED)
find_package(MIOpen REQUIRED)

adjust_headers_priority("HIP::HIP;rocBLAS::rocBLAS;MIOpen::MIOpen")
add_definitions_with_host_compiler("-D__HIP_PLATFORM_AMD__=1")

list(APPEND EXTRA_SHARED_LIBS HIP::HIP rocBLAS::rocBLAS MIOpen::MIOpen)
message(STATUS "DPC++ support is enabled (HIP)")
else()
# In order to support large shapes.
Expand Down
18 changes: 14 additions & 4 deletions cmake/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ option(DNNL_EXPERIMENTAL_SPARSE
independently from DNNL_EXPERIMENTAL."
OFF) # disabled by default

option(DNNL_EXPERIMENTAL_UKERNEL
"Enable experimental functionality for ukernels. This option works
independently from DNNL_EXPERIMENTAL."
OFF) # disabled by default

option(DNNL_EXPERIMENTAL_PROFILING
"Enable experimental profiling capabilities. This option works independently
from DNNL_EXPERIMENTAL."
Expand Down Expand Up @@ -269,10 +274,15 @@ if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
endif()

set(DNNL_GPU_VENDOR "INTEL" CACHE STRING
"specifies target GPU vendor for GPU engines.
Can be INTEL (default) or NVIDIA.")
if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(INTEL|NVIDIA|AMD)$")
set(DNNL_GPU_VENDOR "NONE" CACHE STRING
"When DNNL_GPU_RUNTIME is not NONE, DNNL_GPU_VENDOR specifies the target
GPU vendor for GPU engines. Can be INTEL (default), NVIDIA or AMD.")

if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND DNNL_GPU_VENDOR STREQUAL "NONE")
set(DNNL_GPU_VENDOR "INTEL")
endif()

if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|INTEL|NVIDIA|AMD)$")
message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}")
endif()

Expand Down
2 changes: 1 addition & 1 deletion doc/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -1962,7 +1962,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS
PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_SPARSE DNNL_EXPERIMENTAL_UKERNEL

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
Expand Down
15 changes: 14 additions & 1 deletion doc/advanced/experimental.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ Both kinds of experimental features can be enabled simultaneously.
| Build time option | Description |
|:-------------------------------------------|:-------------------------------------------------------------------|
| ONEDNN_EXPERIMENTAL_SPARSE | Enable experimental API and functionality for sparse domain. |
| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API.
| ONEDNN_EXPERIMENTAL_UKERNEL | Enable experimental microkernel APIs and functionalities. |
| ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API. |
| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND | Enable experimental graph compiler backend of the graph component. |


## Features details

### ONEDNN_EXPERIMENTAL_SPARSE
Expand Down Expand Up @@ -192,6 +194,17 @@ destination tensor should also work for the sparse one.
Multiplication and Reorder primitives
* Sparse memory can be created only for a CPU engine
### ONEDNN_EXPERIMENTAL_UKERNEL
This option enables a new set of CPU-only APIs to support block-level
functionalities. By composing these low-level, sequential operations, users can
implement their own custom operations/fusions, and tailor blocking/threading
logic to their applications.
More details on this API are available in the [Microkernel APIs
section](@ref dev_guide_ukernel_basic_concepts).
### ONEDNN_EXPERIMENTAL_PROFILING
This option enables profiling API that can be used to query different
profiling data.
Expand Down
9 changes: 4 additions & 5 deletions doc/advanced/understanding_memory_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,15 @@ For a single image (**N** = 1), this format is very similar to how
where the image is kept pixel by pixel and every pixel contains all
required information about colors (for instance, three channels for 24bit BMP).

NHWC data format is the default one for
[TensorFlow](https://www.tensorflow.org/performance/performance_guide#data_formats).
NHWC is the default data format for image recognition in
[TensorFlow](https://www.tensorflow.org/api_docs/python/tf/conv).

This layout corresponds to #dnnl_nhwc or dnnl::memory::format_tag::nhwc.


#### CHWN

The last example here for the plain data layout is **CHWN**, which is used by
[Neon](https://neon.nervanasys.com/index.html/design.html#data-layout).
This layout might be very interesting from a vectorization perspective if
The last example here for the plain data layout is **CHWN**. This layout might be very interesting from a vectorization perspective if
an appropriate batch size is used, but on the other hand users cannot always
have *good* batch size (for example, in case of real-time inference batch is
typically 1).
Expand Down Expand Up @@ -222,6 +220,7 @@ the function above is:
dnnl_memory_desc_create_with_strides(&md, ndims, dims, dnnl_f32, strides);
~~~

In particular, whenever a user creates memory with the #dnnl_nchw format,
oneDNN computes the strides and fills the structure on behalf of the
user.
Expand Down
2 changes: 1 addition & 1 deletion doc/build/build_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ oneDNN supports the following build-time options.
| ONEDNN_DEV_MODE | ON, **OFF** | Enables internal tracing and `debuginfo` logging in verbose output (for oneDNN developers) |
| ONEDNN_AARCH64_USE_ACL | ON, **OFF** | Enables integration with Arm Compute Library for AArch64 builds |
| ONEDNN_BLAS_VENDOR | **NONE**, ARMPL, ACCELERATE | Defines an external BLAS library to link to for GEMM-like operations |
| ONEDNN_GPU_VENDOR | **INTEL**, NVIDIA, AMD | Defines GPU vendor for GPU engines |
| ONEDNN_GPU_VENDOR | NONE, **INTEL**, NVIDIA, AMD | When ONEDNN_GPU_RUNTIME is not NONE, defines the GPU vendor for GPU engines; otherwise its value is NONE |
| ONEDNN_DPCPP_HOST_COMPILER | **DEFAULT**, *GNU or Clang C++ compiler executable* | Specifies host compiler executable for SYCL runtime |
| ONEDNN_LIBRARY_NAME | **dnnl**, *library name* | Specifies name of the library |
| ONEDNN_TEST_SET | SMOKE, **CI**, NIGHTLY, MODIFIER_NAME | Specifies the testing coverage enabled through the generated testing targets |
Expand Down
23 changes: 13 additions & 10 deletions doc/build/link.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@ on how oneDNN was built.

## Header Files

| File | Description |
|:-------------------------------------------|:----------------------------------|
| ``include/oneapi/dnnl/dnnl.h`` | C header |
| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header |
| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information |
| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API |
| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API |
| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API |
| File | Description |
|:---------------------------------------------|:-----------------------------------|
| ``include/oneapi/dnnl/dnnl.h`` | C header |
| ``include/oneapi/dnnl/dnnl.hpp`` | C++ header |
| ``include/oneapi/dnnl/dnnl_types.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_config.h`` | Auxiliary C header |
| ``include/oneapi/dnnl/dnnl_version.h`` | C header with version information |
| ``include/oneapi/dnnl/dnnl_graph.h`` | C header for graph API |
| ``include/oneapi/dnnl/dnnl_graph.hpp`` | C++ header for graph API |
| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API |
| ``include/oneapi/dnnl/dnnl_ukernel.h`` | C header for ukernel API |
| ``include/oneapi/dnnl/dnnl_ukernel.hpp`` | C++ header for ukernel API |
| ``include/oneapi/dnnl/dnnl_ukernel_types.h`` | Auxiliary C header for ukernel API |

## Libraries

Expand Down
2 changes: 1 addition & 1 deletion doc/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ Examples {#dev_guide_examples}
| | CPU | @ref graph_cpu_inference_int8_cpp | |
| | CPU/GPU | @ref graph_sycl_getting_started_cpp | |
| | CPU | @ref graph_cpu_single_op_partition_cpp| |
| | GPU | @ref graph_gpu_single_op_partition_cpp| |
| | GPU | @ref graph_sycl_single_op_partition_cpp| |
2 changes: 1 addition & 1 deletion doc/graph/operations/Dequantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels.
| [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which de-quantization type is used. |string | `per_tensor` (default), `per_channel` | Optional |
| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel de-quantization is applied. |s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional |
| [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. |f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that map to float zero. |s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. |s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |

## Execution arguments

Expand Down
3 changes: 1 addition & 2 deletions doc/graph/operations/DynamicDequantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ element number of src tensor along the dimension axis.
@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` =
`per-tensor`, there should be only one element in the zps tensor. For `qtype` =
`per-channel`, the element number should be equal to the element number of input
tensor along the dimension axis. If not specified, the library can assume the
operator is symmetric de-quantization and perform kernel optimization accordingly.
tensor along the dimension axis. If omitted, zps values are assumed to be zero.

### Outputs

Expand Down
9 changes: 4 additions & 5 deletions doc/graph/operations/DynamicQuantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,10 @@ constructing an operation.
For `qtype` = `per-channel`, the element number should be equal to the element
number of src tensor along the dimension axis.

@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` = `per-tensor`, there should be only one
element in the zps tensor. For `qtype` = `per-channel`, the element number should be
equal to the element number of input tensor along the dimension axis. If not
specified, the library can assume the operator is symmetric quantization and
perform kernel optimization accordingly.
@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` =
`per-tensor`, there should be only one element in the zps tensor. For `qtype` =
`per-channel`, the element number should be equal to the element number of input
tensor along the dimension axis. If omitted, zps values are assumed to be zero.

### Outputs

Expand Down
2 changes: 1 addition & 1 deletion doc/graph/operations/Quantize.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ where \f$ic\f$ is the number of channels.
| [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which quantization type is used. | string | `per_tensor` (default), `per_channel` | Optional |
| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel quantization is applied. | s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default | Optional |
| [scales](@ref dnnl::graph::op::attr::scales) | Scalings applied on the src data. | f32 | A f32 list (only contain one element if qtype is `per_tensor`) | Required |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that maps to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`) | Optional |
| [zps](@ref dnnl::graph::op::attr::zps) | Offset values that map to float zero. | s64 | A s64 list (only contain one element if qtype is `per_tensor`). If omitted, zps values are assumed to be zero. | Optional |

## Execution arguments

Expand Down
Loading

0 comments on commit 2f7d655

Please sign in to comment.