From 4b46c6f65041334d51b3593059a83f459bd964fe Mon Sep 17 00:00:00 2001 From: Monty Montgomery Date: Fri, 12 Jun 2020 20:54:30 -0400 Subject: [PATCH 1/4] First steps to removing unneeded hardwiring of u16 in the rdo_loop_decision pipeline Patch returns most of the LF RDO pipeline to the generic Pixel type rather than hardwiring it to u16. The intent is to re-allow use of lbd assembly for cdef, LRF, etc. More work needs to be done to eliminate redundant buffers, copies, etc, but this gets most of the typing work done for testing. --- src/cdef.rs | 40 +++++++++++++++++++++- src/lrf.rs | 8 ++--- src/rdo.rs | 99 +++++++++++++++++++++++++++++++---------------------- 3 files changed, 101 insertions(+), 46 deletions(-) diff --git a/src/cdef.rs b/src/cdef.rs index 4bf93a2ca7..efa9bc031e 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -319,7 +319,7 @@ pub fn cdef_analyze_superblock( // blocks, the minimum working unit of the CDEF filters. pub fn cdef_block8_frame( w_8: usize, h_8: usize, pattern_tile: &Tile<'_, T>, -) -> Frame { +) -> Frame { Frame { planes: [ { @@ -338,6 +338,44 @@ pub fn cdef_block8_frame( } } +// Allocates and returns a new Frame with its own memory that is +// patterned on the decimation of the Frame backing the passed-in +// Tile. The width and height are in units of 8-pixel (undecimated) +// blocks, the minimum working unit of the CDEF filters. The contents +// of the tile, beginning at the passed in superblock offset, are +// copied into the new Frame. +pub fn cdef_tile_copy( + tile: &Tile<'_, U>, sbo: TileSuperBlockOffset, w_8: usize, h_8: usize, +) -> Frame { + let mut out = { + Frame { + planes: { + let new_plane = |pli: usize| { + let &PlaneConfig { xdec, ydec, .. 
} = tile.planes[pli].plane_cfg; + Plane::new(w_8 << 3 >> xdec, h_8 << 3 >> ydec, xdec, ydec, 0, 0) + }; + [new_plane(0), new_plane(1), new_plane(2)] + }, + } + }; + // Copy data into frame + for pli in 0..3 { + let PlaneOffset { x, y } = sbo.plane_offset(tile.planes[pli].plane_cfg); + let out_width = out.planes[pli].cfg.width as isize; + let out_height = out.planes[pli].cfg.height as isize; + let mut out_region = out.planes[pli].as_region_mut(); + for yi in 0..out_height { + let out_row = &mut out_region[yi as usize]; + let in_row = &tile.planes[pli][(y + yi) as usize]; + for xi in 0..out_width { + out_row[xi as usize] = + T::cast_from(u16::cast_from(in_row[(x + xi) as usize])); + } + } + } + out +} + // Allocates and returns a new Frame with its own memory that is // patterned on the decimation of the Frame backing the passed-in // Tile. The width and height are in units of 8-pixel (undecimated) diff --git a/src/lrf.rs b/src/lrf.rs index 01359c43a4..4a34913f0e 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -597,10 +597,10 @@ pub fn setup_integral_image( } } -pub fn sgrproj_stripe_filter( +pub fn sgrproj_stripe_filter( set: u8, xqd: [i8; 2], fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, - cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, + cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, ) { let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect(); let bdm8 = fi.sequence.bit_depth - 8; @@ -808,8 +808,8 @@ pub fn sgrproj_stripe_filter( // Inputs are relative to the colocated slice views. 
pub fn sgrproj_solve( set: u8, fi: &FrameInvariants, - integral_image_buffer: &IntegralImageBuffer, input: &PlaneSlice, - cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, + integral_image_buffer: &IntegralImageBuffer, input: &PlaneSlice, + cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, ) -> (i8, i8) { let bdm8 = fi.sequence.bit_depth - 8; diff --git a/src/rdo.rs b/src/rdo.rs index aae7364b7d..55913f6337 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1884,7 +1884,7 @@ pub fn rdo_partition_decision( fn rdo_loop_plane_error( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants, ts: &TileStateMut<'_, T>, - blocks: &TileBlocks<'_>, test: &Frame, src: &Frame, pli: usize, + blocks: &TileBlocks<'_>, test: &Frame, src: &Frame, pli: usize, ) -> ScaledDistortion { let sb_w_blocks = if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w; @@ -2022,7 +2022,7 @@ pub fn rdo_loop_decision( // flagging the border pixels as inactive]. LR code currently does // not need and will not use padding area. It always edge-extends // the passed in rectangle. - let mut rec_subset = { + let mut rec_subset16:Frame = { let const_rec = ts.rec.as_const(); // a padding of 8 gets us a full block of border. CDEF // only needs 2 pixels, but deblocking is happier with full @@ -2045,20 +2045,21 @@ pub fn rdo_loop_decision( sb_h << SUPERBLOCK_TO_BLOCK_SHIFT, ); - // why copy and not just a view? Because CDEF optimization requires - // u16 working space. This avoids adding another generic buffer - // typing parameter and expanding code to handle all the possible - // input/output combinations. In the future we may decide to prefer - // that over the additional temp buffer (after doing the work needed - // to allow CDEF opt to work on 8 bit). 
- let src_subset = { - cdef_padded_tile_copy( + let src_subset:Frame = { + cdef_tile_copy( + &ts.input_tile, + base_sbo, + (pixel_w + 7) >> 3, + (pixel_h + 7) >> 3, + ) + }; + + let src_subset16:Frame = { + cdef_tile_copy( &ts.input_tile, base_sbo, (pixel_w + 7) >> 3, (pixel_h + 7) >> 3, - 0, - planes, ) }; @@ -2068,8 +2069,8 @@ pub fn rdo_loop_decision( // better results from CDEF/LRF RDO. let deblock_levels = deblock_filter_optimize( fi, - &rec_subset.as_tile(), - &src_subset.as_tile(), + &rec_subset16.as_tile(), + &src_subset16.as_tile(), &tileblocks_subset.as_const(), crop_w, crop_h, @@ -2084,7 +2085,7 @@ pub fn rdo_loop_decision( // finally, deblock the temp frame deblock_filter_frame( &deblock_copy, - &mut rec_subset.as_tile_mut(), + &mut rec_subset16.as_tile_mut(), &tileblocks_subset.as_const(), crop_w, crop_h, @@ -2094,18 +2095,38 @@ pub fn rdo_loop_decision( } } + let rec_subset:Frame = { + cdef_tile_copy( + &rec_subset16.as_tile(), + TileSuperBlockOffset(SuperBlockOffset { x: 0, y: 0 }), + (pixel_w + 7) >> 3, + (pixel_h + 7) >> 3, + ) + }; + let mut cdef_work = if fi.sequence.enable_cdef { - Some(cdef_padded_tile_copy( - &rec_subset.as_tile(), + Some(cdef_tile_copy ( + &rec_subset16.as_tile(), TileSuperBlockOffset(SuperBlockOffset { x: 0, y: 0 }), (pixel_w + 7) >> 3, (pixel_h + 7) >> 3, - 0, - planes, )) } else { None }; + + let mut cdef_dirs = if fi.sequence.enable_cdef { + Some(cdef_analyze_superblock_range( + fi, + &rec_subset16, + &tileblocks_subset.as_const(), + sb_w, + sb_h, + )) + } else { + None + }; + let mut lrf_work = if fi.sequence.enable_restoration { Some(cdef_block8_frame( (pixel_w + 7) >> 3, @@ -2117,21 +2138,14 @@ pub fn rdo_loop_decision( }; // Precompute directional analysis for CDEF - let cdef_data = { - if cdef_work.is_some() { - Some(( - &rec_subset, - cdef_analyze_superblock_range( - fi, - &rec_subset, - &tileblocks_subset.as_const(), - sb_w, - sb_h, - ), - )) - } else { - None - } + let mut cdef_data = if fi.sequence.enable_cdef { 
+ Some(( + &rec_subset16, + cdef_work.as_mut().unwrap(), + cdef_dirs.as_mut().unwrap(), + )) + } else { + None }; // CDEF/LRF decision iteration @@ -2144,8 +2158,7 @@ pub fn rdo_loop_decision( let mut lrf_change = true; while cdef_change || lrf_change { // search for improved cdef indices, superblock by superblock, if cdef is enabled. - if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) = - (&cdef_data, &mut cdef_work.as_mut()) + if let Some((rec_copy, cdef_ref, cdef_dirs)) = cdef_data.as_mut() { for sby in 0..sb_h { for sbx in 0..sb_w { @@ -2173,7 +2186,7 @@ pub fn rdo_loop_decision( ); cdef_filter_superblock( fi, - &rec_subset, + &rec_subset16, &mut cdef_ref_tm, &tileblocks_subset.as_const(), loop_sbo, @@ -2215,7 +2228,7 @@ pub fn rdo_loop_decision( fi, ts, &tileblocks_subset.as_const(), - cdef_ref, + &cdef_ref, &src_subset, pli, ); @@ -2293,7 +2306,7 @@ pub fn rdo_loop_decision( fi, ts, &tileblocks_subset.as_const(), - cdef_ref, + &cdef_ref, &src_subset, pli, ); @@ -2350,9 +2363,13 @@ pub fn rdo_loop_decision( // search for improved restoration filter parameters if restoration is enabled if let Some(lrf_ref) = &mut lrf_work.as_mut() { - let lrf_input = if cdef_work.is_some() { + let lrf_input = if let Some(( + _rec_copy, + cdef_work, + _cdef_dirs)) = &cdef_data + { // When CDEF is enabled, we pull from the CDEF output - &cdef_work.as_ref().unwrap() + &cdef_work } else { // When CDEF is disabled, we pull from the [optionally // deblocked] reconstruction From e8e8e51e2cbc9da39378b44ac81e317e8ef4d253 Mon Sep 17 00:00:00 2001 From: Monty Montgomery Date: Sat, 13 Jun 2020 00:39:47 -0400 Subject: [PATCH 2/4] Eliminate a redundant copy/buffer in loop filter RDO ...at the cost of more Generics combinations in the deblocking code. The patch expands the generic typing so deblocking can take mismatched input depths on reference and reconstruction (internal math is all i32 anyway). 
--- src/deblock.rs | 34 +++++++++++++++++----------------- src/rdo.rs | 14 +++----------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/deblock.rs b/src/deblock.rs index fc794824ec..c4a12e1c39 100644 --- a/src/deblock.rs +++ b/src/deblock.rs @@ -434,8 +434,8 @@ fn deblock_h_size4( // Assumes rec[0] and src[0] are set 2 taps back from the edge. // Accesses four taps, accumulates four pixels into the tally -fn sse_size4( - rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, +fn sse_size4( + rec: &PlaneRegion<'_, U>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { for i in 0..4 { @@ -568,8 +568,8 @@ fn deblock_h_size6( // Assumes rec[0] and src[0] are set 3 taps back from the edge. // Accesses six taps, accumulates four pixels into the tally -fn sse_size6( - rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, +fn sse_size6( + rec: &PlaneRegion<'_, U>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); @@ -751,8 +751,8 @@ fn deblock_h_size8( // Assumes rec[0] and src[0] are set 4 taps back from the edge. // Accesses eight taps, accumulates six pixels into the tally -fn sse_size8( - rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, +fn sse_size8( + rec: &PlaneRegion<'_, U>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); @@ -953,8 +953,8 @@ fn deblock_h_size14( // Assumes rec[0] and src[0] are set 7 taps back from the edge. 
// Accesses fourteen taps, accumulates twelve pixels into the tally -fn sse_size14( - rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, +fn sse_size14( + rec: &PlaneRegion<'_, U>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); @@ -1165,8 +1165,8 @@ fn filter_v_edge( } } -fn sse_v_edge( - blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, +fn sse_v_edge( + blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize, xdec: usize, ydec: usize, ) { @@ -1261,8 +1261,8 @@ fn filter_h_edge( } } -fn sse_h_edge( - blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, +fn sse_h_edge( + blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize, xdec: usize, ydec: usize, ) { @@ -1481,8 +1481,8 @@ pub fn deblock_plane( } // sse count of all edges in a single plane, accumulates into vertical and horizontal counts -fn sse_plane( - rec: &PlaneRegion, src: &PlaneRegion, +fn sse_plane( + rec: &PlaneRegion, src: &PlaneRegion, v_sse: &mut [i64; MAX_LOOP_FILTER + 2], h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, @@ -1575,8 +1575,8 @@ pub fn deblock_filter_frame( ); } -fn sse_optimize( - rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, +fn sse_optimize( + rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, monochrome: bool, ) -> [u8; 4] { // i64 allows us to accumulate a total of ~ 35 bits worth of pixels @@ -1643,7 +1643,7 @@ fn sse_optimize( #[hawktracer(deblock_filter_optimize)] pub fn deblock_filter_optimize( - fi: &FrameInvariants, rec: &Tile, input: &Tile, + fi: &FrameInvariants, rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, crop_h: usize, ) -> [u8; 4] { 
if fi.config.speed_settings.fast_deblock { diff --git a/src/rdo.rs b/src/rdo.rs index 55913f6337..4b04b1c3c8 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1998,7 +1998,8 @@ pub fn rdo_loop_decision( const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT; const MAX_LRU_SIZE: usize = MAX_SB_SIZE; - // Static allocation relies on the "minimal LRU area for all N planes" invariant. + // Static allocation relies on the "minimal LRU area for all N + // planes" invariant. let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE]; let mut best_lrf = [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; @@ -2054,15 +2055,6 @@ pub fn rdo_loop_decision( ) }; - let src_subset16:Frame = { - cdef_tile_copy( - &ts.input_tile, - base_sbo, - (pixel_w + 7) >> 3, - (pixel_h + 7) >> 3, - ) - }; - if deblock_p { // Find a good deblocking filter solution for the passed in area. // This is not RDO of deblocking itself, merely a solution to get @@ -2070,7 +2062,7 @@ pub fn rdo_loop_decision( let deblock_levels = deblock_filter_optimize( fi, &rec_subset16.as_tile(), - &src_subset16.as_tile(), + &src_subset.as_tile(), &tileblocks_subset.as_const(), crop_w, crop_h, From 24bdbf7c61bba63f3fb06d8831e6e086f04bd59a Mon Sep 17 00:00:00 2001 From: Monty Montgomery Date: Sun, 14 Jun 2020 02:25:19 -0400 Subject: [PATCH 3/4] Swap usage of PlaneSlice for PlaneRegion in loop filter, add superregions Extend PlaneRegions to allow wrapped re-expansion of subregions via superregion, superregion_mut and SuperIndex. Expansion is only allowed out to the boundaries of the original PlaneRegion when it was created via new() or from_slice() (and, by extension, from the tiler as well as region(), as_region(), etc). In short, the original tile boundaries are still enforced during resizing and dereferencing. superregion() and superregion_mut() are simply versions of subregion() and subregion_mut() that allow a negative x,y offset, as well as a size up to the original rectangle size. 
SuperIndex is simply an isize such that Index can dereference outside the current declared y/height, but not beyond the original Frame/Tile bounds. The patch then continues by replacing use of PlaneSlice with PlaneRegion throughout the loopfilters and loopfilter RDO, where use was previously split between the two (because PlaneSlice offers the entire plane, without any real bounds checking). --- src/asm/x86/cdef.rs | 76 ++++++------- src/asm/x86/lrf.rs | 36 +++--- src/cdef.rs | 209 +++++++++++++++++----------------- src/lrf.rs | 145 +++++++++++++----------- src/rdo.rs | 223 ++++++++++++++++++------------------- src/tiling/plane_region.rs | 147 ++++++++++++++++++++++++ src/tiling/tile.rs | 41 +++++++ src/tiling/tile_state.rs | 4 +- 8 files changed, 535 insertions(+), 346 deletions(-) diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index e9011a12c6..5a741dce95 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -9,8 +9,7 @@ use crate::cdef::*; use crate::cpu_features::CpuFeatureLevel; -use crate::frame::*; -use crate::tiling::PlaneRegionMut; +use crate::tiling::{PlaneRegion, PlaneRegionMut}; use crate::util::*; type CdefFilterFn = unsafe extern fn( @@ -41,8 +40,8 @@ const fn decimate_index(xdec: usize, ydec: usize) -> usize { ((ydec << 1) | xdec) & 3 } -pub(crate) unsafe fn cdef_filter_block( - dst: &mut PlaneRegionMut<'_, T>, src: *const u16, src_stride: isize, +pub(crate) fn cdef_filter_block( + dst: &mut PlaneRegionMut<'_, T>, src: &PlaneRegion<'_, u16>, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, bit_depth: usize, xdec: usize, ydec: usize, cpu: CpuFeatureLevel, ) { @@ -50,7 +49,6 @@ pub(crate) unsafe fn cdef_filter_block( rust::cdef_filter_block( dst, src, - src_stride, pri_strength, sec_strength, dir, @@ -67,40 +65,42 @@ pub(crate) unsafe fn cdef_filter_block( call_rust(&mut copy.as_region_mut()); copy }; - match T::type_enum() { - PixelType::U8 => { - match CDEF_FILTER_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { - 
Some(func) => { - (func)( - dst.data_ptr_mut() as *mut _, - T::to_asm_stride(dst.plane_cfg.stride), - src, - src_stride, - pri_strength, - sec_strength, - dir as i32, - damping, - ); + unsafe { + match T::type_enum() { + PixelType::U8 => { + match CDEF_FILTER_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { + Some(func) => { + (func)( + dst.data_ptr_mut() as *mut _, + T::to_asm_stride(dst.plane_cfg.stride), + src.data_ptr() as *const _, + T::to_asm_stride(src.plane_cfg.stride), + pri_strength, + sec_strength, + dir as i32, + damping, + ); + } + None => call_rust(dst), } - None => call_rust(dst), } - } - PixelType::U16 => { - match CDEF_FILTER_HBD_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { - Some(func) => { - (func)( - dst.data_ptr_mut() as *mut _, - T::to_asm_stride(dst.plane_cfg.stride), - src, - src_stride, - pri_strength, - sec_strength, - dir as i32, - damping, - (1 << bit_depth) - 1, + PixelType::U16 => { + match CDEF_FILTER_HBD_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { + Some(func) => { + (func)( + dst.data_ptr_mut() as *mut _, + T::to_asm_stride(dst.plane_cfg.stride), + src.data_ptr() as *const _, + T::to_asm_stride(src.plane_cfg.stride), + pri_strength, + sec_strength, + dir as i32, + damping, + (1 << bit_depth) - 1, ); + } + None => call_rust(dst), } - None => call_rust(dst), } } } @@ -159,7 +159,7 @@ type CdefDirFn = #[inline(always)] #[allow(clippy::let_and_return)] pub(crate) fn cdef_find_dir( - img: &PlaneSlice<'_, u16>, var: &mut u32, coeff_shift: usize, + img: &PlaneRegion<'_, u16>, var: &mut u32, coeff_shift: usize, cpu: CpuFeatureLevel, ) -> i32 { let call_rust = @@ -180,8 +180,8 @@ pub(crate) fn cdef_find_dir( // input, even when working with 8 bit input. Mostly done to limit // the amount of code being impacted. 
(func)( - img.as_ptr() as *const u16, - u16::to_asm_stride(img.plane.cfg.stride), + img.data_ptr() as *const u16, + u16::to_asm_stride(img.plane_cfg.stride), var as *mut u32, ) } diff --git a/src/asm/x86/lrf.rs b/src/asm/x86/lrf.rs index 7d705b2264..4b5d63d7c2 100644 --- a/src/asm/x86/lrf.rs +++ b/src/asm/x86/lrf.rs @@ -8,7 +8,7 @@ // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; -use crate::frame::PlaneSlice; +use crate::tiling::PlaneRegion; use crate::lrf::*; use crate::util::Pixel; #[cfg(target_arch = "x86")] @@ -93,7 +93,7 @@ pub fn sgrproj_box_ab_r2( #[inline] pub fn sgrproj_box_f_r0( - f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, + f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { @@ -108,7 +108,7 @@ pub fn sgrproj_box_f_r0( #[inline] pub fn sgrproj_box_f_r1( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, - cdeffed: &PlaneSlice, cpu: CpuFeatureLevel, + cdeffed: &PlaneRegion<'_, T>, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { return unsafe { @@ -122,7 +122,7 @@ pub fn sgrproj_box_f_r1( #[inline] pub fn sgrproj_box_f_r2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], - y: usize, w: usize, cdeffed: &PlaneSlice, cpu: CpuFeatureLevel, + y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { return unsafe { @@ -353,18 +353,18 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( #[inline] #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r0_8_avx2( - f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, + f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneRegion<'_, T>, ) { _mm256_storeu_si256( f.as_mut_ptr().add(x) as *mut _, _mm256_slli_epi32( if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( - cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) } 
else { _mm256_cvtepu16_epi32(_mm_loadu_si128( - cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) }, SGRPROJ_RST_BITS as i32, @@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2( #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r0_avx2( - f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, + f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, ) { for x in (0..w).step_by(8) { if x + 8 <= w { @@ -397,7 +397,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2( #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r1_8_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize, - cdeffed: &PlaneSlice, + cdeffed: &PlaneRegion<'_, T>, ) { let three = _mm256_set1_epi32(3); let four = _mm256_set1_epi32(4); @@ -474,11 +474,11 @@ unsafe fn sgrproj_box_f_r1_8_avx2( a, if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( - cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( - cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) }, ), @@ -497,7 +497,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2( #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r1_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, - cdeffed: &PlaneSlice, + cdeffed: &PlaneRegion<'_, T>, ) { for x in (0..w).step_by(8) { if x + 8 <= w { @@ -520,7 +520,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2( #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r2_8_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], - x: usize, y: usize, cdeffed: &PlaneSlice, + x: usize, y: usize, cdeffed: &PlaneRegion<'_, T>, ) { let five = _mm256_set1_epi32(5); let six = _mm256_set1_epi32(6); @@ -573,11 +573,11 @@ unsafe fn sgrproj_box_f_r2_8_avx2( _mm256_add_epi32(a, ao), if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( - 
cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( - cdeffed.subslice(x, y).as_ptr() as *const _ + cdeffed[y][x..].as_ptr() as *const _ )) }, ), @@ -588,11 +588,11 @@ unsafe fn sgrproj_box_f_r2_8_avx2( ao, if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( - cdeffed.subslice(x, y + 1).as_ptr() as *const _, + cdeffed[y+1][x..].as_ptr() as *const _, )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( - cdeffed.subslice(x, y + 1).as_ptr() as *const _, + cdeffed[y+1][x..].as_ptr() as *const _, )) }, ), @@ -619,7 +619,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2( #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r2_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], - y: usize, w: usize, cdeffed: &PlaneSlice, + y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, ) { for x in (0..w).step_by(8) { if x + 8 <= w { diff --git a/src/cdef.rs b/src/cdef.rs index efa9bc031e..4ffef66fa9 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -74,7 +74,7 @@ pub(crate) mod rust { // that term is never computed. 
See Section 2, step 2, of: // http://jmvalin.ca/notes/intra_paint.pdf pub fn cdef_find_dir( - img: &PlaneSlice<'_, u16>, var: &mut u32, coeff_shift: usize, + img: &PlaneRegion<'_, u16>, var: &mut u32, coeff_shift: usize, _cpu: CpuFeatureLevel, ) -> i32 { let mut cost: [i32; 8] = [0; 8]; @@ -153,8 +153,8 @@ pub(crate) mod rust { #[cold_for_target_arch("x86_64")] #[allow(clippy::erasing_op, clippy::identity_op, clippy::neg_multiply)] - pub(crate) unsafe fn cdef_filter_block( - dst: &mut PlaneRegionMut<'_, T>, input: *const u16, istride: isize, + pub(crate) fn cdef_filter_block( + dst: &mut PlaneRegionMut<'_, T>, src: &PlaneRegion<'_, u16>, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, bit_depth: usize, xdec: usize, ydec: usize, _cpu: CpuFeatureLevel, ) { @@ -165,6 +165,7 @@ pub(crate) mod rust { let cdef_sec_taps = [[2, 1], [2, 1]]; let pri_taps = cdef_pri_taps[((pri_strength >> coeff_shift) & 1) as usize]; let sec_taps = cdef_sec_taps[((pri_strength >> coeff_shift) & 1) as usize]; + let istride = src.plane_cfg.stride as isize; let cdef_directions = [ [-1 * istride + 1, -2 * istride + 2], [0 * istride + 1, -1 * istride + 2], @@ -175,58 +176,61 @@ pub(crate) mod rust { [1 * istride + 0, 2 * istride + 0], [1 * istride + 0, 2 * istride - 1], ]; - for i in 0..ysize { - for j in 0..xsize { - let ptr_in = input.offset(i * istride + j); - let x = *ptr_in; - let mut sum = 0 as i32; - let mut max = x; - let mut min = x; - for k in 0..2usize { - let cdef_dirs = [ - cdef_directions[dir][k], + unsafe { + for i in 0..ysize { + for j in 0..xsize { + // Next to go... 
+ let ptr_in = src.data_ptr().offset(i * istride + j); + let x = *ptr_in; + let mut sum = 0 as i32; + let mut max = x; + let mut min = x; + for k in 0..2usize { + let cdef_dirs = [ + cdef_directions[dir][k], cdef_directions[(dir + 2) & 7][k], - cdef_directions[(dir + 6) & 7][k], - ]; - let pri_tap = pri_taps[k]; - let p = - [*ptr_in.offset(cdef_dirs[0]), *ptr_in.offset(-cdef_dirs[0])]; - for p_elem in p.iter() { - sum += pri_tap - * constrain( - i32::cast_from(*p_elem) - i32::cast_from(x), - pri_strength, - damping, - ); - if *p_elem != CDEF_VERY_LARGE { - max = cmp::max(*p_elem, max); + cdef_directions[(dir + 6) & 7][k], + ]; + let pri_tap = pri_taps[k]; + let p = + [*ptr_in.offset(cdef_dirs[0]), *ptr_in.offset(-cdef_dirs[0])]; + for p_elem in p.iter() { + sum += pri_tap + * constrain( + i32::cast_from(*p_elem) - i32::cast_from(x), + pri_strength, + damping, + ); + if *p_elem != CDEF_VERY_LARGE { + max = cmp::max(*p_elem, max); + } + min = cmp::min(*p_elem, min); } - min = cmp::min(*p_elem, min); - } - let s = [ - *ptr_in.offset(cdef_dirs[1]), - *ptr_in.offset(-cdef_dirs[1]), - *ptr_in.offset(cdef_dirs[2]), - *ptr_in.offset(-cdef_dirs[2]), - ]; - let sec_tap = sec_taps[k]; - for s_elem in s.iter() { - if *s_elem != CDEF_VERY_LARGE { - max = cmp::max(*s_elem, max); - } - min = cmp::min(*s_elem, min); - sum += sec_tap - * constrain( + let s = [ + *ptr_in.offset(cdef_dirs[1]), + *ptr_in.offset(-cdef_dirs[1]), + *ptr_in.offset(cdef_dirs[2]), + *ptr_in.offset(-cdef_dirs[2]), + ]; + let sec_tap = sec_taps[k]; + for s_elem in s.iter() { + if *s_elem != CDEF_VERY_LARGE { + max = cmp::max(*s_elem, max); + } + min = cmp::min(*s_elem, min); + sum += sec_tap + * constrain( i32::cast_from(*s_elem) - i32::cast_from(x), - sec_strength, - damping, - ); + sec_strength, + damping, + ); + } } + let v = i32::cast_from(x) + ((8 + sum - (sum < 0) as i32) >> 4); + dst[i as usize][j as usize] = + T::cast_from(clamp(v, min as i32, max as i32)); } - let v = i32::cast_from(x) + ((8 + sum - 
(sum < 0) as i32) >> 4); - dst[i as usize][j as usize] = - T::cast_from(clamp(v, min as i32, max as i32)); } } } @@ -260,14 +264,14 @@ fn adjust_strength(strength: i32, var: i32) -> i32 { // boundaries (padding is untouched here). pub fn cdef_analyze_superblock_range( - fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, + fi: &FrameInvariants, in_tile: &Tile<'_, u16>, blocks: &TileBlocks<'_>, sb_w: usize, sb_h: usize, ) -> Vec { let mut ret = Vec::::with_capacity(sb_h * sb_w); for sby in 0..sb_h { for sbx in 0..sb_w { let sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); - ret.push(cdef_analyze_superblock(fi, in_frame, blocks, sbo)); + ret.push(cdef_analyze_superblock(fi, in_tile, blocks, sbo)); } } ret @@ -278,7 +282,7 @@ pub fn cdef_analyze_superblock_range( // boundaries (padding is untouched here). pub fn cdef_analyze_superblock( - fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, + fi: &FrameInvariants, in_tile: &Tile, blocks: &TileBlocks<'_>, sbo: TileSuperBlockOffset, ) -> CdefDirections { let coeff_shift = fi.sequence.bit_depth as usize - 8; @@ -296,11 +300,14 @@ pub fn cdef_analyze_superblock( if !skip { let mut var: u32 = 0; - let in_plane = &in_frame.planes[0]; - let in_po = sbo.plane_offset(&in_plane.cfg); - let in_slice = in_plane.slice(in_po); + let in_region = + in_tile.planes[0].subregion(Area::BlockRect { + bo: block_offset.0, + width: 8, + height: 8, + }); dir.dir[bx][by] = cdef_find_dir::( - &in_slice.reslice(8 * bx as isize, 8 * by as isize), + &in_region, &mut var, coeff_shift, fi.cpu_feature_level, @@ -516,7 +523,7 @@ pub fn cdef_padded_frame_copy(in_frame: &Frame) -> Frame { // large as the unpadded area of in // cdef_index is taken from the block context pub fn cdef_filter_superblock( - fi: &FrameInvariants, in_frame: &Frame, out: &mut TileMut<'_, U>, + fi: &FrameInvariants, input: &Tile, output: &mut TileMut<'_, U>, blocks: &TileBlocks<'_>, sbo: TileSuperBlockOffset, cdef_index: u8, cdef_dirs: 
&CdefDirections, ) { @@ -550,19 +557,24 @@ pub fn cdef_filter_superblock( let dir = cdef_dirs.dir[bx][by]; let var = cdef_dirs.var[bx][by]; for p in 0..planes { - let out_plane = &mut out.planes[p]; - let in_plane = &in_frame.planes[p]; - let in_po = sbo.plane_offset(&in_plane.cfg); - let xdec = in_plane.cfg.xdec; - let ydec = in_plane.cfg.ydec; - let in_stride = in_plane.cfg.stride; - let in_slice = &in_plane.slice(in_po); - let out_region = - &mut out_plane.subregion_mut(Area::BlockStartingAt { - bo: sbo.block_offset(0, 0).0, - }); + let out_plane = &mut output.planes[p]; + let in_plane = &input.planes[p]; + let xdec = in_plane.plane_cfg.xdec; + let ydec = in_plane.plane_cfg.ydec; let xsize = 8 >> xdec; let ysize = 8 >> ydec; + let in_region = + in_plane.subregion(Area::BlockRect { + bo: block_offset.0, + width: xsize, + height: ysize, + }); + let mut out_region = + out_plane.subregion_mut(Area::BlockRect { + bo: block_offset.0, + width: xsize, + height: ysize, + }); if !skip { let local_pri_strength; @@ -594,48 +606,29 @@ pub fn cdef_filter_superblock( } }; - unsafe { - let PlaneConfig { ypad, xpad, .. 
} = in_slice.plane.cfg; - assert!( - in_slice.rows_iter().len() + ypad - >= ((8 * by) >> ydec) + ysize + 2 - ); - assert!(in_slice.x - 2 >= -(xpad as isize)); - assert!(in_slice.y - 2 >= -(ypad as isize)); - - let mut dst = out_region.subregion_mut(Area::BlockRect { - bo: BlockOffset { x: 2 * bx, y: 2 * by }, - width: xsize, - height: ysize, - }); - let input = - in_slice[(8 * by) >> ydec][(8 * bx) >> xdec..].as_ptr(); - cdef_filter_block( - &mut dst, - input, - in_stride as isize, - local_pri_strength, - local_sec_strength, - local_dir, - local_damping, - bit_depth, - xdec, - ydec, - fi.cpu_feature_level, - ); - } + assert!( in_region.rect().x - 2 >= in_region.pad_rect().x ); + assert!( in_region.rect().y - 2 >= in_region.pad_rect().y ); + assert!( in_region.rect().x + in_region.rect().width as isize + 2 <= + in_region.pad_rect().x+in_region.pad_rect().width as isize ); + assert!( in_region.rect().y+in_region.rect().height as isize + 2 <= + in_region.pad_rect().y+in_region.pad_rect().height as isize ); + cdef_filter_block( + &mut out_region, + &in_region, + local_pri_strength, + local_sec_strength, + local_dir, + local_damping, + bit_depth, + xdec, + ydec, + fi.cpu_feature_level, + ); } else { // we need to copy input to output - let in_block = - in_slice.subslice((8 * bx) >> xdec, (8 * by) >> ydec); - let mut out_block = out_region.subregion_mut(Area::BlockRect { - bo: BlockOffset { x: 2 * bx, y: 2 * by }, - width: xsize, - height: ysize, - }); for i in 0..ysize { for j in 0..xsize { - out_block[i][j] = U::cast_from(in_block[i][j]); + out_region[i][j] = U::cast_from(in_region[i][j]); } } } @@ -726,11 +719,11 @@ pub fn cdef_filter_tile( for fbx in 0..fb_width { let sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby }); let cdef_index = tb.get_cdef(sbo); - let cdef_dirs = cdef_analyze_superblock(fi, &cdef_frame, tb, sbo); + let cdef_dirs = cdef_analyze_superblock(fi, &cdef_frame.as_tile(), tb, sbo); cdef_filter_superblock( fi, - &cdef_frame, + 
&cdef_frame.as_tile(), rec, tb, sbo, diff --git a/src/lrf.rs b/src/lrf.rs index 4a34913f0e..6456caff4a 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -19,10 +19,10 @@ use crate::color::ChromaSampling::Cs400; use crate::context::{MAX_PLANES, SB_SIZE}; use crate::encoder::FrameInvariants; use crate::frame::{ - AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice, + AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, }; use crate::hawktracer::*; -use crate::tiling::{Area, PlaneRegionMut, Rect}; +use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect, SuperIndex}; use crate::util::{clamp, CastFromPrimitive, ILog, Pixel}; use crate::api::SGRComplexityLevel; @@ -171,11 +171,11 @@ impl RestorationFilter { pub(crate) mod rust { use crate::cpu_features::CpuFeatureLevel; - use crate::frame::PlaneSlice; use crate::lrf::{ get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS, SGRPROJ_SGR_BITS, }; + use crate::tiling::PlaneRegion; use crate::util::CastFromPrimitive; use crate::Pixel; @@ -204,7 +204,7 @@ pub(crate) mod rust { af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, _cpu: CpuFeatureLevel, - ) { + ) { sgrproj_box_ab_internal( 1, af, @@ -242,7 +242,7 @@ pub(crate) mod rust { } pub(crate) fn sgrproj_box_f_r0( - f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, + f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed); @@ -250,16 +250,17 @@ pub(crate) mod rust { #[inline(always)] pub(crate) fn sgrproj_box_f_r0_internal( - f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, + f: &mut [u32], start_x: usize, y: usize, w: usize, + cdeffed: &PlaneRegion<'_, T>, ) { for x in start_x..w { - f[x] = (u32::cast_from(cdeffed.p(x, y))) << SGRPROJ_RST_BITS; + f[x] = (u32::cast_from(cdeffed[y][x])) << SGRPROJ_RST_BITS; } } pub(crate) fn sgrproj_box_f_r1( af: &[&[u32]; 3], bf: &[&[u32]; 
3], f: &mut [u32], y: usize, w: usize, - cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, + cdeffed: &PlaneRegion<'_, T>, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed); } @@ -267,7 +268,7 @@ pub(crate) mod rust { #[inline(always)] pub(crate) fn sgrproj_box_f_r1_internal( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize, - y: usize, w: usize, cdeffed: &PlaneSlice, + y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; for x in start_x..w { @@ -285,14 +286,14 @@ pub(crate) mod rust { + bf[1][x + 1] + bf[2][x + 1] + bf[1][x + 2]); - let v = a * u32::cast_from(cdeffed.p(x, y)) + b; + let v = a * u32::cast_from(cdeffed[y][x]) + b; f[x] = (v + (1 << shift >> 1)) >> shift; } } pub(crate) fn sgrproj_box_f_r2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], - y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, + y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed); } @@ -300,7 +301,7 @@ pub(crate) mod rust { #[inline(always)] pub(crate) fn sgrproj_box_f_r2_internal( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], - start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, + start_x: usize, y: usize, w: usize, cdeffed: &PlaneRegion<'_, T>, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; @@ -309,9 +310,9 @@ pub(crate) mod rust { let b = 5 * (bf[0][x] + bf[0][x + 2]) + 6 * (bf[0][x + 1]); let ao = 5 * (af[1][x] + af[1][x + 2]) + 6 * (af[1][x + 1]); let bo = 5 * (bf[1][x] + bf[1][x + 2]) + 6 * (bf[1][x + 1]); - let v = (a + ao) * u32::cast_from(cdeffed.p(x, y)) + b + bo; + let v = (a + ao) * u32::cast_from(cdeffed[y][x]) + b + bo; f0[x] = (v + (1 << shift >> 1)) >> shift; - let vo = ao * u32::cast_from(cdeffed.p(x, y + 1)) + bo; + let vo = ao * u32::cast_from(cdeffed[y+1][x]) + 
bo; f1[x] = (vo + (1 << shifto >> 1)) >> shifto; } } @@ -351,10 +352,8 @@ fn get_integral_square( struct VertPaddedIter<'a, T: Pixel> { // The two sources that can be selected when clipping - deblocked: &'a Plane, - cdeffed: &'a Plane, - // x index to choice where on the row to start - x: isize, + deblocked: &'a PlaneRegion<'a, T>, + cdeffed: &'a PlaneRegion<'a, T>, // y index that will be mutated y: isize, // The index at which to terminate. Can be larger than the slice length. @@ -372,40 +371,28 @@ struct VertPaddedIter<'a, T: Pixel> { impl<'a, 'b, T: Pixel> VertPaddedIter<'a, T> { fn new( - cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>, + cdeffed: &'a PlaneRegion<'a, T>, deblocked: &'a PlaneRegion<'a, T>, stripe_h: usize, crop: usize, ) -> VertPaddedIter<'a, T> { - // cdeffed and deblocked must start at the same coordinates from their - // underlying planes. Since cropping is provided via a separate params, the - // height of the underlying planes do not need to match. - assert_eq!(cdeffed.x, deblocked.x); - assert_eq!(cdeffed.y, deblocked.y); - // To share integral images, always use the max box filter radius of 2 let r = 2; // The number of rows outside the stripe are needed - let rows_above = r + 2; + let rows_above:isize = r + 2; let rows_below = 2; - // Offset crop and stripe_h so they are relative to the underlying plane - // and not the plane slice. - let crop = crop as isize + deblocked.y; - let stripe_end = stripe_h as isize + deblocked.y; - // Move y up the number rows above. 
- // If y is negative we repeat the first row - let y = deblocked.y - rows_above as isize; + // If y is above the tile/frame, we repeat the first row + let y = - rows_above as isize; VertPaddedIter { - deblocked: deblocked.plane, - cdeffed: cdeffed.plane, - x: deblocked.x, + deblocked, + cdeffed, y, - end: (rows_above + stripe_h + rows_below) as isize + y, - stripe_begin: deblocked.y, - stripe_end, - crop, + end: (stripe_h + rows_below) as isize, + stripe_begin: 0, + stripe_end: stripe_h as isize, + crop: crop as isize, } } } @@ -418,21 +405,22 @@ impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> { if self.end > self.y { // clamp before deciding the source // clamp vertically to storage at top and passed-in height at bottom - let cropped_y = clamp(self.y, 0, self.crop - 1); + let cropped_y = clamp( + self.y, + - self.deblocked.rect().y, + self.crop - 1); // clamp vertically to stripe limits let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1); // decide if we're vertically inside or outside the strip let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end as isize { - self.cdeffed + &self.cdeffed } else { - self.deblocked + &self.deblocked }; - // cannot directly return self.ps.row(row) due to lifetime issue - let range = src_plane.row_range(self.x, ly); self.y += 1; - Some(&src_plane.data[range]) + Some(&src_plane[SuperIndex(ly)]) } else { None } @@ -501,7 +489,7 @@ impl FusedIterator for HorzPaddedIter<'_, T> {} pub fn setup_integral_image( integral_image_buffer: &mut IntegralImageBuffer, integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize, - stripe_h: usize, cdeffed: &PlaneSlice, deblocked: &PlaneSlice, + stripe_h: usize, cdeffed: &PlaneRegion<'_, T>, deblocked: &PlaneRegion<'_, T>, ) { let integral_image = &mut integral_image_buffer.integral_image; let sq_integral_image = &mut integral_image_buffer.sq_integral_image; @@ -510,22 +498,41 @@ pub fn setup_integral_image( let left_w = 4; // max radius of 2 + 2 
padding let right_w = 3; // max radius of 2 + 1 padding - assert_eq!(cdeffed.x, deblocked.x); - // Find how many unique elements to use to the left and right - let left_uniques = if cdeffed.x == 0 { 0 } else { left_w }; + let left_uniques = if cdeffed.rect().x == 0 { + 0 + } else { + left_w + }; let right_uniques = right_w.min(crop_w - stripe_w); // Find the total number of unique elements used let row_uniques = left_uniques + stripe_w + right_uniques; // Negative start indices result in repeating the first element of the row - let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 }; + let start_index_x = if cdeffed.rect().x == 0 { + -(left_w as isize) + } else { + 0 + }; + // Shift inputs left to get to pre-data + let cdef_shifted = cdeffed.superregion(Area::Rect{ + x: -(left_uniques as isize), + y: 0, + width: row_uniques, + height: cdeffed.rect().height, + }); + let deblock_shifted = deblocked.superregion(Area::Rect{ + x: -(left_uniques as isize), + y: 0, + width: row_uniques, + height: deblocked.rect().height, + }); let mut rows_iter = VertPaddedIter::new( // Move left to encompass all the used data - &cdeffed.go_left(left_uniques), - &deblocked.go_left(left_uniques), + &cdef_shifted, + &deblock_shifted, // since r2 uses every other row, we need an extra row if stripe_h is odd stripe_h + (stripe_h & 1), crop_h, @@ -600,7 +607,7 @@ pub fn setup_integral_image( pub fn sgrproj_stripe_filter( set: u8, xqd: [i8; 2], fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, - cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, + cdeffed: &PlaneRegion, out: &mut PlaneRegionMut, ) { let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect(); let bdm8 = fi.sequence.bit_depth - 8; @@ -808,8 +815,8 @@ pub fn sgrproj_stripe_filter( // Inputs are relative to the colocated slice views. 
pub fn sgrproj_solve( set: u8, fi: &FrameInvariants, - integral_image_buffer: &IntegralImageBuffer, input: &PlaneSlice, - cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, + integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion, + cdeffed: &PlaneRegion, cdef_w: usize, cdef_h: usize, ) -> (i8, i8) { let bdm8 = fi.sequence.bit_depth - 8; @@ -1519,10 +1526,14 @@ impl RestorationState { (crop_h as isize - stripe_start_y) as usize, size, stripe_size, - &cdeffed.planes[pli] - .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), - &pre_cdef.planes[pli] - .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), + &cdeffed.planes[pli].region(Area::StartingAt { + x: x as isize, + y: stripe_start_y + }), + &pre_cdef.planes[pli].region(Area::StartingAt { + x: x as isize, + y: stripe_start_y + }), ); sgrproj_stripe_filter( @@ -1531,13 +1542,15 @@ impl RestorationState { fi, &stripe_filter_buffer, STRIPE_IMAGE_STRIDE, - &cdeffed.planes[pli] - .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), + &cdeffed.planes[pli].region(Area::StartingAt { + x: x as isize, + y: stripe_start_y + }), &mut out.planes[pli].region_mut(Area::Rect { - x: x as isize, - y: stripe_start_y, - width: size, - height: stripe_size, + x: x as isize, + y: stripe_start_y, + width: size, + height: stripe_size, }), ); } diff --git a/src/rdo.rs b/src/rdo.rs index 4b04b1c3c8..6a7a3694a0 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1884,7 +1884,7 @@ pub fn rdo_partition_decision( fn rdo_loop_plane_error( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants, ts: &TileStateMut<'_, T>, - blocks: &TileBlocks<'_>, test: &Frame, src: &Frame, pli: usize, + blocks: &TileBlocks<'_>, test: &Tile<'_, T>, src: &Tile<'_, T>, pli: usize, ) -> ScaledDistortion { let sb_w_blocks = if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w; @@ -1899,9 +1899,9 @@ fn rdo_loop_plane_error( if loop_bo.0.x < blocks.cols() && loop_bo.0.y < 
blocks.rows() { let src_plane = &src.planes[pli]; let test_plane = &test.planes[pli]; - let PlaneConfig { xdec, ydec, .. } = src_plane.cfg; - debug_assert_eq!(xdec, test_plane.cfg.xdec); - debug_assert_eq!(ydec, test_plane.cfg.ydec); + let &PlaneConfig { xdec, ydec, .. } = src_plane.plane_cfg; + debug_assert_eq!(xdec, test_plane.plane_cfg.xdec); + debug_assert_eq!(ydec, test_plane.plane_cfg.ydec); // Unfortunately, our distortion biases are only available via // Frame-absolute addressing, so we need a block offset @@ -1915,9 +1915,9 @@ fn rdo_loop_plane_error( ); let src_region = - src_plane.region(Area::BlockStartingAt { bo: loop_bo.0 }); + src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 }); let test_region = - test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 }); + test_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 }); err += if pli == 0 { // For loop filters, We intentionally use cdef_dist even with @@ -2037,23 +2037,22 @@ pub fn rdo_loop_decision( planes, ) }; + let mut rec_region16 = rec_subset16.as_tile_mut(); // sub-setted region of the TileBlocks for our working frame area - let mut tileblocks_subset = cw.bc.blocks.subregion( + let mut tileblocks_region = cw.bc.blocks.subregion( base_sbo.block_offset(0, 0).0.x, base_sbo.block_offset(0, 0).0.y, sb_w << SUPERBLOCK_TO_BLOCK_SHIFT, sb_h << SUPERBLOCK_TO_BLOCK_SHIFT, ); - let src_subset:Frame = { - cdef_tile_copy( - &ts.input_tile, - base_sbo, - (pixel_w + 7) >> 3, - (pixel_h + 7) >> 3, - ) - }; + let src_region:Tile<'_, T> = ts.input_tile.subregion (Area::Rect { + x: (base_sbo.0.x << SUPERBLOCK_TO_PLANE_SHIFT) as isize, + y: (base_sbo.0.y << SUPERBLOCK_TO_PLANE_SHIFT) as isize, + width: pixel_w, + height: pixel_h, + }); if deblock_p { // Find a good deblocking filter solution for the passed in area. @@ -2061,9 +2060,9 @@ pub fn rdo_loop_decision( // better results from CDEF/LRF RDO. 
let deblock_levels = deblock_filter_optimize( fi, - &rec_subset16.as_tile(), - &src_subset.as_tile(), - &tileblocks_subset.as_const(), + &rec_region16.as_const(), + &src_region, + &tileblocks_region.as_const(), crop_w, crop_h, ); @@ -2077,8 +2076,8 @@ pub fn rdo_loop_decision( // finally, deblock the temp frame deblock_filter_frame( &deblock_copy, - &mut rec_subset16.as_tile_mut(), - &tileblocks_subset.as_const(), + &mut rec_region16, + &tileblocks_region.as_const(), crop_w, crop_h, fi.sequence.bit_depth, @@ -2087,22 +2086,25 @@ pub fn rdo_loop_decision( } } - let rec_subset:Frame = { + let mut rec_subset:Frame = { cdef_tile_copy( - &rec_subset16.as_tile(), + &rec_region16.as_const(), TileSuperBlockOffset(SuperBlockOffset { x: 0, y: 0 }), (pixel_w + 7) >> 3, (pixel_h + 7) >> 3, ) }; + let rec_region = rec_subset.as_tile_mut(); - let mut cdef_work = if fi.sequence.enable_cdef { - Some(cdef_tile_copy ( - &rec_subset16.as_tile(), + let mut cdef_work; + let mut cdef_region = if fi.sequence.enable_cdef { + cdef_work = cdef_tile_copy ( + &rec_region16.as_const(), TileSuperBlockOffset(SuperBlockOffset { x: 0, y: 0 }), (pixel_w + 7) >> 3, (pixel_h + 7) >> 3, - )) + ); + Some(cdef_work.as_tile_mut()) } else { None }; @@ -2110,8 +2112,8 @@ pub fn rdo_loop_decision( let mut cdef_dirs = if fi.sequence.enable_cdef { Some(cdef_analyze_superblock_range( fi, - &rec_subset16, - &tileblocks_subset.as_const(), + &rec_region16.as_const(), + &tileblocks_region.as_const(), sb_w, sb_h, )) @@ -2119,12 +2121,14 @@ pub fn rdo_loop_decision( None }; - let mut lrf_work = if fi.sequence.enable_restoration { - Some(cdef_block8_frame( + let mut lrf_work; + let mut lrf_region = if fi.sequence.enable_restoration { + lrf_work = cdef_block8_frame( (pixel_w + 7) >> 3, (pixel_h + 7) >> 3, &ts.rec.as_const(), - )) + ); + Some(lrf_work.as_tile_mut()) } else { None }; @@ -2132,8 +2136,8 @@ pub fn rdo_loop_decision( // Precompute directional analysis for CDEF let mut cdef_data = if fi.sequence.enable_cdef 
{ Some(( - &rec_subset16, - cdef_work.as_mut().unwrap(), + &rec_region16, + cdef_region.as_mut().unwrap(), cdef_dirs.as_mut().unwrap(), )) } else { @@ -2150,7 +2154,7 @@ pub fn rdo_loop_decision( let mut lrf_change = true; while cdef_change || lrf_change { // search for improved cdef indices, superblock by superblock, if cdef is enabled. - if let Some((rec_copy, cdef_ref, cdef_dirs)) = cdef_data.as_mut() + if let Some((rec, cdef, cdef_dirs)) = cdef_data.as_mut() { for sby in 0..sb_h { for sbx in 0..sb_w { @@ -2166,21 +2170,11 @@ pub fn rdo_loop_decision( for cdef_index in 0..(1 << fi.cdef_bits) { let mut err = ScaledDistortion::zero(); let mut rate = 0; - - let mut cdef_ref_tm = TileMut::new( - cdef_ref, - TileRect { - x: 0, - y: 0, - width: cdef_ref.planes[0].cfg.width, - height: cdef_ref.planes[0].cfg.height, - }, - ); cdef_filter_superblock( fi, - &rec_subset16, - &mut cdef_ref_tm, - &tileblocks_subset.as_const(), + &rec_region16.as_const(), + cdef, + &tileblocks_region.as_const(), loop_sbo, cdef_index, &cdef_dirs[sby * sb_w + sbx], @@ -2190,23 +2184,23 @@ pub fn rdo_loop_decision( // We need the cropped-to-visible-frame area of this SB let wh = if fi.sequence.use_128x128_superblock { 128 } else { 64 }; - let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg; + let PlaneConfig { xdec, ydec, .. } = cdef.planes[pli].plane_cfg; let vis_width = (wh >> xdec).min( (crop_w >> xdec) - - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x + - loop_sbo.plane_offset(&cdef.planes[pli].plane_cfg).x as usize, ); let vis_height = (wh >> ydec).min( (crop_h >> ydec) - - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y + - loop_sbo.plane_offset(&cdef.planes[pli].plane_cfg).y as usize, ); // which LRU are we currently testing against? 
- if let (Some((lru_x, lru_y)), Some(lrf_ref)) = { + if let (Some((lru_x, lru_y)), Some(lrf)) = { let rp = &ts.restoration.planes[pli]; ( rp.restoration_unit_offset(base_sbo, loop_sbo, false), - &mut lrf_work, + &mut lrf_region, ) } { // We have a valid LRU, apply LRF, compute error @@ -2219,9 +2213,9 @@ pub fn rdo_loop_decision( 1, fi, ts, - &tileblocks_subset.as_const(), - &cdef_ref, - &src_subset, + &tileblocks_region.as_const(), + &cdef.as_const(), + &src_region, pli, ); rate += if fi.sequence.enable_restoration { @@ -2238,8 +2232,9 @@ pub fn rdo_loop_decision( } RestorationFilter::Sgrproj { set, xqd } => { // only run on this single superblock - let loop_po = - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg); + let loop_cdef = &cdef.planes[pli]. + subregion(Area::SuperBlockStartingAt{sbo: loop_sbo.0}, + ); // todo: experiment with borrowing border pixels // rather than edge-extending. Right now this is // hard-clipping to the superblock boundary. @@ -2250,8 +2245,8 @@ pub fn rdo_loop_decision( vis_height, vis_width, vis_height, - &cdef_ref.planes[pli].slice(loop_po), - &cdef_ref.planes[pli].slice(loop_po), + loop_cdef, + loop_cdef, ); sgrproj_stripe_filter( set, @@ -2259,13 +2254,17 @@ pub fn rdo_loop_decision( fi, &ts.integral_buffer, SOLVE_IMAGE_STRIDE, - &cdef_ref.planes[pli].slice(loop_po), - &mut lrf_ref.planes[pli].region_mut(Area::Rect { - x: loop_po.x, - y: loop_po.y, - width: vis_width, - height: vis_height, - }), + loop_cdef, + // set the outer access bounds via as_region_mut + // (the entire scratch buffer), then get a + // subregion view. This allows lrf access to + // padding. 
+ &mut lrf.planes[pli].subregion_mut( + Area::SuperBlockRect { + sbo: loop_sbo.0, + width: vis_width, + height: vis_height, + }), ); err += rdo_loop_plane_error( base_sbo, @@ -2274,9 +2273,9 @@ pub fn rdo_loop_decision( 1, fi, ts, - &tileblocks_subset.as_const(), - lrf_ref, - &src_subset, + &tileblocks_region.as_const(), + &lrf.as_const(), + &src_region, pli, ); rate += cw.count_lrf_switchable( @@ -2297,9 +2296,9 @@ pub fn rdo_loop_decision( 1, fi, ts, - &tileblocks_subset.as_const(), - &cdef_ref, - &src_subset, + &tileblocks_region.as_const(), + &cdef.as_const(), + &src_region, pli, ); // no relative cost differeneces to different @@ -2319,26 +2318,16 @@ pub fn rdo_loop_decision( if best_new_index != prev_best_index { cdef_change = true; best_index[sby * sb_w + sbx] = best_new_index; - tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8); + tileblocks_region.set_cdef(loop_sbo, best_new_index as u8); } - let mut cdef_ref_tm = TileMut::new( - cdef_ref, - TileRect { - x: 0, - y: 0, - width: cdef_ref.planes[0].cfg.width, - height: cdef_ref.planes[0].cfg.height, - }, - ); - // Keep cdef output up to date; we need it for restoration // both below and above (padding) cdef_filter_superblock( fi, - rec_copy, - &mut cdef_ref_tm, - &tileblocks_subset.as_const(), + &rec.as_const(), + cdef, + &tileblocks_region.as_const(), loop_sbo, best_index[sby * sb_w + sbx] as u8, &cdef_dirs[sby * sb_w + sbx], @@ -2354,18 +2343,18 @@ pub fn rdo_loop_decision( lrf_change = false; // search for improved restoration filter parameters if restoration is enabled - if let Some(lrf_ref) = &mut lrf_work.as_mut() { + if let Some(lrf_output) = &mut lrf_region.as_mut() { let lrf_input = if let Some(( - _rec_copy, - cdef_work, + _rec, + cdef, _cdef_dirs)) = &cdef_data { // When CDEF is enabled, we pull from the CDEF output - &cdef_work + cdef } else { // When CDEF is disabled, we pull from the [optionally // deblocked] reconstruction - &rec_subset + &rec_region }; for pli in 0..planes { // 
Nominal size of LRU in pixels before clipping to visible frame @@ -2374,7 +2363,7 @@ pub fn rdo_loop_decision( let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift; // height, in sb, of an LRU in this plane let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift; - let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg; + let PlaneConfig { xdec, ydec, .. } = lrf_output.planes[pli].plane_cfg; for lru_y in 0..lru_h[pli] { // number of LRUs vertically for lru_x in 0..lru_w[pli] { @@ -2388,9 +2377,6 @@ pub fn rdo_loop_decision( pli, false, ) { - let src_plane = &src_subset.planes[pli]; // uncompressed input for reference - let lrf_in_plane = &lrf_input.planes[pli]; - let lrf_po = loop_sbo.plane_offset(&src_plane.cfg); let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli]; let mut best_cost = best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli]; @@ -2404,9 +2390,9 @@ pub fn rdo_loop_decision( lru_sb_h, fi, ts, - &tileblocks_subset.as_const(), - lrf_input, - &src_subset, + &tileblocks_region.as_const(), + &lrf_input.as_const(), + &src_region, pli, ); let rate = cw.count_lrf_switchable( @@ -2429,12 +2415,16 @@ pub fn rdo_loop_decision( // We need the cropped-to-visible-frame computation area of this LRU let vis_width = unit_size.min( (crop_w >> xdec) - - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize, + - loop_sbo.plane_offset(&lrf_output.planes[pli].plane_cfg).x as usize, ); let vis_height = unit_size.min( (crop_h >> ydec) - - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize, + - loop_sbo.plane_offset(&lrf_output.planes[pli].plane_cfg).y as usize, ); + let src_plane = src_region.planes[pli].subregion( + Area::SuperBlockStartingAt{sbo: loop_sbo.0}); + let lrf_in_plane = lrf_input.planes[pli]. + subregion(Area::SuperBlockStartingAt{sbo: loop_sbo.0}); // todo: experiment with borrowing border pixels // rather than edge-extending. 
Right now this is @@ -2446,8 +2436,8 @@ pub fn rdo_loop_decision( vis_height, vis_width, vis_height, - &lrf_in_plane.slice(lrf_po), - &lrf_in_plane.slice(lrf_po), + &lrf_in_plane, + &lrf_in_plane, ); for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity) @@ -2456,8 +2446,8 @@ pub fn rdo_loop_decision( set, fi, &ts.integral_buffer, - &src_plane.slice(lrf_po), - &lrf_in_plane.slice(lrf_po), + &src_plane, + &lrf_in_plane, vis_width, vis_height, ); @@ -2470,13 +2460,18 @@ pub fn rdo_loop_decision( fi, &ts.integral_buffer, SOLVE_IMAGE_STRIDE, - &lrf_in_plane.slice(lrf_po), - &mut lrf_ref.planes[pli].region_mut(Area::Rect { - x: lrf_po.x, - y: lrf_po.y, - width: vis_width, - height: vis_height, - }), + &lrf_in_plane, + // set the outer access bounds via as_region_mut() + // (the entire scratch buffer), then get a + // subregion view. This allows lrf access to + // padding. + &mut lrf_output.planes[pli]. + subregion_mut( + Area::SuperBlockRect { + sbo: loop_sbo.0, + width: vis_width, + height: vis_height, + }), ); } let err = rdo_loop_plane_error( @@ -2486,9 +2481,9 @@ pub fn rdo_loop_decision( lru_sb_h, fi, ts, - &tileblocks_subset.as_const(), - lrf_ref, - &src_subset, + &tileblocks_region.as_const(), + &lrf_output.as_const(), + &src_region, pli, ); let rate = cw.count_lrf_switchable( diff --git a/src/tiling/plane_region.rs b/src/tiling/plane_region.rs index 44801f5eec..b0acf067a2 100644 --- a/src/tiling/plane_region.rs +++ b/src/tiling/plane_region.rs @@ -38,6 +38,14 @@ impl Rect { height: self.height >> ydec, } } + pub fn to_area(&self) -> Area { + Area::Rect { + x: self.x, + y: self.y, + width: self.width, + height: self.height, + } + } } // Structure to describe a rectangle area in several ways @@ -65,6 +73,11 @@ pub enum Area { /// a rectangle starting at given block offset until the bottom-right corner /// of the parent BlockStartingAt { bo: BlockOffset }, + /// A well-defined rectangle with offset expressed in superblocks + SuperBlockRect { sbo: 
SuperBlockOffset, width: usize, height: usize }, + /// a rectangle starting at given superblock offset until the + /// bottom-right corner of the parent + SuperBlockStartingAt { sbo: SuperBlockOffset }, } impl Area { @@ -101,6 +114,22 @@ impl Area { height: (parent_height as isize - y) as usize, } } + Area::SuperBlockRect { sbo, width, height } => Rect { + x: (sbo.x >> xdec << SUPERBLOCK_TO_PLANE_SHIFT) as isize, + y: (sbo.y >> ydec << SUPERBLOCK_TO_PLANE_SHIFT) as isize, + width, + height, + }, + Area::SuperBlockStartingAt { sbo } => { + let x = (sbo.x >> xdec << SUPERBLOCK_TO_PLANE_SHIFT) as isize; + let y = (sbo.y >> ydec << SUPERBLOCK_TO_PLANE_SHIFT) as isize; + Rect { + x, + y, + width: (parent_width as isize - x) as usize, + height: (parent_height as isize - y) as usize, + } + } } } } @@ -115,6 +144,7 @@ pub struct PlaneRegion<'a, T: Pixel> { pub plane_cfg: &'a PlaneConfig, // private to guarantee borrowing rules rect: Rect, + bounds: Rect, phantom: PhantomData<&'a T>, } @@ -127,9 +157,12 @@ pub struct PlaneRegionMut<'a, T: Pixel> { data: *mut T, // points to (plane_cfg.x, plane_cfg.y) pub plane_cfg: &'a PlaneConfig, rect: Rect, + bounds: Rect, phantom: PhantomData<&'a mut T>, } +pub struct SuperIndex(pub isize); + // common impl for PlaneRegion and PlaneRegionMut macro_rules! plane_region_common { // $name: PlaneRegion or PlaneRegionMut @@ -145,6 +178,12 @@ macro_rules! plane_region_common { data: unsafe { std::ptr::null_mut::() }, plane_cfg: cfg, rect, + bounds: Rect{ + x: -(cfg.xorigin as isize), + y: -(cfg.yorigin as isize), + width: cfg.width + cfg.xorigin + cfg.xpad, + height: cfg.height + cfg.yorigin + cfg.ypad, + }, phantom: PhantomData, } } @@ -158,14 +197,26 @@ macro_rules! 
plane_region_common { data: unsafe { data.$as_ptr().offset(origin) }, plane_cfg: cfg, rect, + bounds: Rect{ + x: -(cfg.xorigin as isize), + y: -(cfg.yorigin as isize), + width: cfg.width + cfg.xorigin + cfg.xpad, + height: cfg.height + cfg.yorigin + cfg.ypad, + }, phantom: PhantomData, } } + #[inline(always)] pub fn new(plane: &'a $($opt_mut)? Plane, rect: Rect) -> Self { Self::from_slice(& $($opt_mut)? plane.data, &plane.cfg, rect) } + #[inline(always)] + pub fn restrict(&mut self) { + self.bounds = self.rect; + } + #[inline(always)] pub fn data_ptr(&self) -> *const T { self.data @@ -176,6 +227,11 @@ macro_rules! plane_region_common { &self.rect } + #[inline(always)] + pub fn pad_rect(&self) -> &Rect { + &self.bounds + } + #[inline(always)] pub fn rows_iter(&self) -> RowsIter<'_, T> { RowsIter { @@ -263,6 +319,43 @@ macro_rules! plane_region_common { data, plane_cfg: &self.plane_cfg, rect: absolute_rect, + bounds: self.bounds, + phantom: PhantomData, + } + } + + #[inline(always)] + // as with subregion above, but allows re-expanding the region. + // This will _not_ allow expanding a region beyond the original + // rectangle created with new() or from_slice(). As such, it + // protects the original Tile/Frame boundaries. 
+ pub fn superregion(&self, area: Area) -> PlaneRegion<'_, T> { + let rect = area.to_rect( + self.plane_cfg.xdec, + self.plane_cfg.ydec, + self.rect.width, + self.rect.height, + ); + assert!(self.rect.x + rect.x >= self.bounds.x); + assert!(rect.x + self.rect.x <= self.bounds.width as isize + + self.bounds.x); + assert!(self.rect.y + rect.y >= self.bounds.y); + assert!(rect.y + self.rect.y <= self.bounds.height as isize + + self.bounds.y); + let data = unsafe { + self.data.offset(rect.y * self.plane_cfg.stride as isize + rect.x) + }; + let absolute_rect = Rect { + x: self.rect.x + rect.x, + y: self.rect.y + rect.y, + width: rect.width, + height: rect.height, + }; + PlaneRegion { + data, + plane_cfg: &self.plane_cfg, + rect: absolute_rect, + bounds: self.bounds, phantom: PhantomData, } } @@ -345,6 +438,20 @@ macro_rules! plane_region_common { } } } + + impl Index for $name<'_, T> { + type Output = [T]; + + #[inline(always)] + fn index(&self, index: SuperIndex) -> &Self::Output { + assert!(index.0 < self.bounds.y + self.bounds.height as isize - self.rect.y); + assert!(index.0 >= self.bounds.y - self.rect.y); + unsafe { + let ptr = self.data.offset(index.0 * self.plane_cfg.stride as isize); + slice::from_raw_parts(ptr, self.rect.width) + } + } + } } } @@ -416,6 +523,43 @@ impl<'a, T: Pixel> PlaneRegionMut<'a, T> { data, plane_cfg: self.plane_cfg, rect: absolute_rect, + bounds: self.bounds, + phantom: PhantomData, + } + } + + #[inline(always)] + // as with subregion_mut above, but allows re-expanding the region. + // This will _not_ allow expanding a region beyond the original + // rectangle created with new() or from_slice(). As such, it + // protects the original Tile/Frame boundaries. 
+ pub fn superregion_mut(&self, area: Area) -> PlaneRegionMut<'_, T> { + let rect = area.to_rect( + self.plane_cfg.xdec, + self.plane_cfg.ydec, + self.rect.width, + self.rect.height, + ); + assert!(self.rect.x + rect.x >= self.bounds.x); + assert!(rect.x + self.rect.x <= self.bounds.width as isize + + self.bounds.x); + assert!(self.rect.y + rect.y >= self.bounds.y); + assert!(rect.y + self.rect.y <= self.bounds.height as isize + + self.bounds.y); + let data = unsafe { + self.data.offset(rect.y * self.plane_cfg.stride as isize + rect.x) + }; + let absolute_rect = Rect { + x: self.rect.x + rect.x, + y: self.rect.y + rect.y, + width: rect.width, + height: rect.height, + }; + PlaneRegionMut { + data, + plane_cfg: &self.plane_cfg, + rect: absolute_rect, + bounds: self.bounds, phantom: PhantomData, } } @@ -426,6 +570,7 @@ impl<'a, T: Pixel> PlaneRegionMut<'a, T> { data: self.data, plane_cfg: self.plane_cfg, rect: self.rect, + bounds: self.bounds, phantom: PhantomData, } } @@ -549,6 +694,7 @@ impl<'a, T: Pixel> Iterator for VertWindows<'a, T> { data: self.data, plane_cfg: self.plane_cfg, rect: self.output_rect, + bounds: self.output_rect, phantom: PhantomData, }; self.data = unsafe { self.data.add(self.plane_cfg.stride) }; @@ -583,6 +729,7 @@ impl<'a, T: Pixel> Iterator for HorzWindows<'a, T> { data: self.data, plane_cfg: self.plane_cfg, rect: self.output_rect, + bounds: self.output_rect, phantom: PhantomData, }; self.data = unsafe { self.data.add(1) }; diff --git a/src/tiling/tile.rs b/src/tiling/tile.rs index b621fddb2e..fbfe824b42 100644 --- a/src/tiling/tile.rs +++ b/src/tiling/tile.rs @@ -131,6 +131,47 @@ macro_rules! tile_common { ], } } + + #[inline(always)] + pub fn restrict(mut self) -> Self { + self.planes[0].restrict(); + self.planes[1].restrict(); + self.planes[2].restrict(); + self + } + + // Return a view to a subregion of the Tile + // + // The subregion must be included in (i.e. must not exceed) this Tile. 
+ // + // It is described by an `Area`, relative to the luma plane of + // this region. + #[inline(always)] + pub fn subregion(&self, area: Area) -> Tile<'_, T> { + let tile_rect = area.to_rect( + 0, + 0, + self.planes[0].rect().width, + self.planes[0].rect().height, + ); + Tile { + planes: { + let sub_plane = |pli: usize| { + let plane = &self.planes[pli]; + let &PlaneConfig { xdec, ydec, .. } = self.planes[pli].plane_cfg; + let rect = tile_rect.decimated(xdec, ydec); + assert!(rect.x >= 0 && rect.x as usize <= plane.rect().width); + assert!(rect.y >= 0 && rect.y as usize <= plane.rect().height); + assert!(rect.x as usize + rect.width <= + plane.rect().x as usize + plane.rect().width); + assert!(rect.y as usize + rect.height <= + plane.rect().y as usize + plane.rect().height); + plane.subregion(rect.to_area()) + }; + [sub_plane(0), sub_plane(1), sub_plane(2)] + }, + } + } } } } diff --git a/src/tiling/tile_state.rs b/src/tiling/tile_state.rs index 7866ee2d4e..78d36315d7 100644 --- a/src/tiling/tile_state.rs +++ b/src/tiling/tile_state.rs @@ -160,11 +160,11 @@ impl<'a, T: Pixel> TileStateMut<'a, T> { width, height, input: &fs.input, - input_tile: Tile::new(&fs.input, luma_rect), + input_tile: Tile::new(&fs.input, luma_rect).restrict(), input_hres: &fs.input_hres, input_qres: &fs.input_qres, deblock: &fs.deblock, - rec: TileMut::new(Arc::make_mut(&mut fs.rec), luma_rect), + rec: TileMut::new(Arc::make_mut(&mut fs.rec), luma_rect).restrict(), qc: Default::default(), segmentation: &fs.segmentation, restoration: TileRestorationStateMut::new( From 4adf2468c8638fa676b42409f32ac9002dca5b8d Mon Sep 17 00:00:00 2001 From: Monty Montgomery Date: Tue, 23 Jun 2020 14:56:09 -0400 Subject: [PATCH 4/4] Implement fully bounds-checked pad and visible area access Sanitized, 2-D access to the visible or padded region of a PlaneRegion. Currently implemented through index operator overloading via some horrifying fat pointer abuse. "I don't think slices are meant to do that." 
"Yeah, but they're doing it!" --- src/lrf.rs | 69 ++++++++------------------ src/tiling/plane_region.rs | 99 +++++++++++++++++++++++++++++++++----- 2 files changed, 108 insertions(+), 60 deletions(-) diff --git a/src/lrf.rs b/src/lrf.rs index 6456caff4a..2867631613 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -22,7 +22,7 @@ use crate::frame::{ AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, }; use crate::hawktracer::*; -use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect, SuperIndex}; +use crate::tiling::{Area, PadIndex, PadRow, PlaneRegion, PlaneRegionMut, Rect}; use crate::util::{clamp, CastFromPrimitive, ILog, Pixel}; use crate::api::SGRComplexityLevel; @@ -398,7 +398,7 @@ impl<'a, 'b, T: Pixel> VertPaddedIter<'a, T> { } impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> { - type Item = &'a [T]; + type Item = &'a PadRow<'a, T>; #[inline(always)] fn next(&mut self) -> Option { @@ -420,7 +420,7 @@ impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> { &self.deblocked }; self.y += 1; - Some(&src_plane[SuperIndex(ly)]) + Some(&src_plane[PadIndex(ly)]) } else { None } @@ -439,24 +439,25 @@ impl ExactSizeIterator for VertPaddedIter<'_, T> {} impl FusedIterator for VertPaddedIter<'_, T> {} struct HorzPaddedIter<'a, T: Pixel> { - // Active area cropping is done using the length of the slice - slice: &'a [T], + row: &'a PadRow<'a, T>, // x index of the iterator // When less than 0, repeat the first element. When greater than end, repeat // the last element index: isize, + crop: isize, // The index at which to terminate. Can be larger than the slice length. 
- end: usize, + end: isize, } impl<'a, T: Pixel> HorzPaddedIter<'a, T> { fn new( - slice: &'a [T], start_index: isize, width: usize, + row: &'a PadRow<'a,T>, index: isize, crop: isize, end: isize, ) -> HorzPaddedIter<'a, T> { HorzPaddedIter { - slice, - index: start_index, - end: (width as isize + start_index) as usize, + row, + index, + crop, + end, } } } @@ -468,9 +469,9 @@ impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> { fn next(&mut self) -> Option { if self.index < self.end as isize { // clamp to the edges of the frame - let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize; + let x = clamp(self.index, -self.row.x(), self.crop as isize - 1); self.index += 1; - Some(&self.slice[x]) + Some(&self.row[PadIndex(x)]) } else { None } @@ -498,51 +499,21 @@ pub fn setup_integral_image( let left_w = 4; // max radius of 2 + 2 padding let right_w = 3; // max radius of 2 + 1 padding - // Find how many unique elements to use to the left and right - let left_uniques = if cdeffed.rect().x == 0 { - 0 - } else { - left_w - }; - let right_uniques = right_w.min(crop_w - stripe_w); - - // Find the total number of unique elements used - let row_uniques = left_uniques + stripe_w + right_uniques; - - // Negative start indices result in repeating the first element of the row - let start_index_x = if cdeffed.rect().x == 0 { - -(left_w as isize) - } else { - 0 - }; - - // Shift inputs left to get to pre-data - let cdef_shifted = cdeffed.superregion(Area::Rect{ - x: -(left_uniques as isize), - y: 0, - width: row_uniques, - height: cdeffed.rect().height, - }); - let deblock_shifted = deblocked.superregion(Area::Rect{ - x: -(left_uniques as isize), - y: 0, - width: row_uniques, - height: deblocked.rect().height, - }); let mut rows_iter = VertPaddedIter::new( // Move left to encompass all the used data - &cdef_shifted, - &deblock_shifted, + &cdeffed, + &deblocked, // since r2 uses every other row, we need an extra row if stripe_h is odd stripe_h + (stripe_h & 1), 
crop_h, ) - .map(|row: &[T]| { + .map(|row: &PadRow<'_, T>| { HorzPaddedIter::new( // Limit how many unique elements we use - &row[..row_uniques], - start_index_x, - left_w + stripe_w + right_w, + row, + -left_w, + crop_w as isize, + stripe_w as isize + right_w, ) }); diff --git a/src/tiling/plane_region.rs b/src/tiling/plane_region.rs index b0acf067a2..0dbd09de7e 100644 --- a/src/tiling/plane_region.rs +++ b/src/tiling/plane_region.rs @@ -16,6 +16,7 @@ use crate::util::*; use std::iter::FusedIterator; use std::marker::PhantomData; use std::ops::{Index, IndexMut}; +use std::rc::Rc; use std::slice; /// Rectangle of a plane region, in pixels @@ -134,6 +135,86 @@ impl Area { } } +pub struct PadIndex(pub isize); +pub struct PadRow<'a, T: Pixel>(pub [PlaneRegion<'a, T>]); +pub struct PadRowMut<'a, T: Pixel>(pub [PlaneRegionMut<'a, T>]); + +macro_rules! pad_row_common { + // $name: PadRow or PadRowMut + ($name:ident, $parent:ident $(,$opt_mut:tt)?) => { + impl<'a, T: Pixel> $name<'a, T> { + pub fn x(&self) -> isize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.rect().x + } + } + pub fn y(&self) -> isize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.rect().y + } + } + pub fn pad_x(&self) -> isize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.pad_rect().x + } + } + pub fn pad_y(&self) -> isize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.pad_rect().y + } + } + pub fn width(&self) -> usize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.pad_rect().width + } + } + pub fn pad_width(&self) -> usize { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + pr.pad_rect().width + } + } + } + impl<'a, T: Pixel> Index for $name<'a, T> { + type Output = T; + #[inline(always)] + fn index(&self, index: PadIndex) -> &Self::Output { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + let row = self.0.len() as isize - pr.rect().y + pr.pad_rect().y - 1; + assert!(row >= pr.pad_rect().y - pr.rect().y); + assert!(row < 
pr.pad_rect().height as isize + pr.pad_rect().y - pr.rect().y); + assert!(index.0 >= pr.pad_rect().x - pr.rect().x); + assert!(index.0 < pr.pad_rect().width as isize + pr.pad_rect().x - pr.rect().x); + &*pr.data.offset(row * pr.plane_cfg.stride as isize + index.0) + } + } + } + impl<'a, T: Pixel> Index for $name<'a, T> { + type Output = T; + #[inline(always)] + fn index(&self, index: usize) -> &Self::Output { + unsafe { + let pr: &$parent<'a, T> = &self.0[0]; + let row = self.0.len() as isize - pr.rect().y + pr.pad_rect().y - 1; + assert!(row >= pr.pad_rect().y - pr.rect().y); + assert!(row < pr.pad_rect().height as isize + pr.pad_rect().y - pr.rect().y); + assert!((index as isize) < pr.pad_rect().width as isize + pr.pad_rect().x - pr.rect().x); + &*pr.data.offset(row * pr.plane_cfg.stride as isize + index as isize) + } + } + } + } +} + +pad_row_common!(PadRow, PlaneRegion); +pad_row_common!(PadRowMut, PlaneRegionMut, mut); + /// Bounded region of a plane /// /// This allows to give access to a rectangular area of a plane without @@ -161,8 +242,6 @@ pub struct PlaneRegionMut<'a, T: Pixel> { phantom: PhantomData<&'a mut T>, } -pub struct SuperIndex(pub isize); - // common impl for PlaneRegion and PlaneRegionMut macro_rules! plane_region_common { // $name: PlaneRegion or PlaneRegionMut @@ -426,7 +505,7 @@ macro_rules! plane_region_common { unsafe impl Send for $name<'_, T> {} unsafe impl Sync for $name<'_, T> {} - impl Index for $name<'_, T> { + impl<'a, T: Pixel> Index for $name<'a, T> { type Output = [T]; #[inline(always)] @@ -439,16 +518,14 @@ macro_rules! 
plane_region_common { } } - impl Index for $name<'_, T> { - type Output = [T]; - + impl<'a, T: Pixel> Index for $name<'a, T> { + type Output = PadRow<'a, T>; #[inline(always)] - fn index(&self, index: SuperIndex) -> &Self::Output { - assert!(index.0 < self.bounds.y + self.bounds.height as isize - self.rect.y); + fn index(&self, index: PadIndex) -> &Self::Output { assert!(index.0 >= self.bounds.y - self.rect.y); + assert!(index.0 < self.bounds.height as isize + self.bounds.y - self.rect.y); unsafe { - let ptr = self.data.offset(index.0 * self.plane_cfg.stride as isize); - slice::from_raw_parts(ptr, self.rect.width) + &*(slice::from_raw_parts(self, (index.0 + self.rect.y - self.bounds.y + 1) as usize) as *const [Self] as *const PadRow<'a, T>) } } } @@ -533,7 +610,7 @@ impl<'a, T: Pixel> PlaneRegionMut<'a, T> { // This will _not_ allow expanding a region beyond the original // rectangle created with new() or from_slice(). As such, it // protects the original Tile/Frame boundaries. - pub fn superregion_mut(&self, area: Area) -> PlaneRegionMut<'_, T> { + pub fn superregion_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T> { let rect = area.to_rect( self.plane_cfg.xdec, self.plane_cfg.ydec,