From b6373766bc4168dd1e2cbaeb261ffe9c8476858d Mon Sep 17 00:00:00 2001
From: Nicole LeGare
Date: Mon, 11 Mar 2024 10:35:01 -0700
Subject: [PATCH] `Rav1dFrameContext_lf::lr_lpf_line`: Convert to offsets

---
 src/cdef_apply.rs | 23 ++++++++++++--------
 src/decode.rs     | 54 +++++++++++++++++++++--------------------
 src/internal.rs   |  7 +++---
 src/lf_apply.rs   | 35 ++++++++++--------------------
 src/lr_apply.rs   | 15 ++++++-------
 5 files changed, 59 insertions(+), 75 deletions(-)

diff --git a/src/cdef_apply.rs b/src/cdef_apply.rs
index ac0293abf..4d4b5b780 100644
--- a/src/cdef_apply.rs
+++ b/src/cdef_apply.rs
@@ -196,6 +196,7 @@ pub(crate) unsafe fn rav1d_cdef_brow(
     let uv_stride: ptrdiff_t = BD::pxstride(f.cur.stride[1]);
 
     let cdef_line_buf = BD::cast_pixel_slice_mut(&mut f.lf.cdef_line_buf);
+    let lr_line_buf = BD::cast_pixel_slice(&f.lf.lr_line_buf);
 
     let mut bit = false;
     for by in (by_start..by_end).step_by(2) {
@@ -328,7 +329,7 @@ pub(crate) unsafe fn rav1d_cdef_brow(
                     } else {
                         offset = (sby * ((4 as c_int) << sb128) - 4) as isize * y_stride
                             + (bx * 4) as isize;
-                        top = f.lf.lr_lpf_line[0].cast::<BD::Pixel>().offset(offset);
+                        top = lr_line_buf.as_ptr().add(f.lf.lr_lpf_line[0]).offset(offset);
                     }
                     bot = bptrs[0].offset(8 * y_stride as isize);
                     st_y = false;
@@ -347,7 +348,7 @@ pub(crate) unsafe fn rav1d_cdef_brow(
                     } else {
                         let line = sby * ((4 as c_int) << sb128) + 4 * sb128 + 2;
                         offset = line as isize * y_stride + (bx * 4) as isize;
-                        bot = f.lf.lr_lpf_line[0].cast::<BD::Pixel>().offset(offset);
+                        bot = lr_line_buf.as_ptr().add(f.lf.lr_lpf_line[0]).offset(offset);
                     }
                     st_y = false;
                 } else {
@@ -421,8 +422,10 @@ pub(crate) unsafe fn rav1d_cdef_brow(
                                 let line_0 = sby * ((4 as c_int) << sb128) - 4;
                                 offset = line_0 as isize * uv_stride
                                     + (bx * 4 >> ss_hor) as isize;
-                                top =
-                                    f.lf.lr_lpf_line[pl].cast::<BD::Pixel>().offset(offset);
+                                top = lr_line_buf
+                                    .as_ptr()
+                                    .add(f.lf.lr_lpf_line[pl])
+                                    .offset(offset);
                             }
                             bot = bptrs[pl].offset(((8 >> ss_ver) * uv_stride) as isize);
                             st_uv = false;
@@ -441,11 +444,13 @@ pub(crate) unsafe fn rav1d_cdef_brow(
                                     .add(f.lf.cdef_lpf_line[pl])
                                     .offset(offset);
                             } else {
-                                let line_1 = sby * ((4 as c_int) << sb128) + 4 * sb128 + 2;
-                                offset = line_1 as isize * uv_stride
-                                    + (bx * 4 >> ss_hor) as isize;
-                                bot =
-                                    f.lf.lr_lpf_line[pl].cast::<BD::Pixel>().offset(offset);
+                                let line = sby * ((4 as c_int) << sb128) + 4 * sb128 + 2;
+                                offset =
+                                    line as isize * uv_stride + (bx * 4 >> ss_hor) as isize;
+                                bot = lr_line_buf
+                                    .as_ptr()
+                                    .add(f.lf.lr_lpf_line[pl])
+                                    .offset(offset);
                             }
                             st_uv = false;
                         } else {
diff --git a/src/decode.rs b/src/decode.rs
index ef1773523..63d9751af 100644
--- a/src/decode.rs
+++ b/src/decode.rs
@@ -4446,38 +4446,32 @@ pub(crate) unsafe fn rav1d_decode_frame_init(
     };
     y_stride = f.sr_cur.p.stride[0];
     uv_stride = f.sr_cur.p.stride[1];
-    if y_stride * num_lines as isize != f.lf.lr_buf_plane_sz[0] as isize
-        || uv_stride * num_lines as isize * 2 != f.lf.lr_buf_plane_sz[1] as isize
-    {
-        // lr simd may overread the input, so slightly over-allocate the lpf buffer
-        let mut alloc_sz: usize = 128;
-        alloc_sz += y_stride.unsigned_abs() * num_lines as usize;
-        alloc_sz += uv_stride.unsigned_abs() * num_lines as usize * 2;
-        // TODO: Fallible allocation
-        // On allocation failure set `f.lf.lr_buf_plane_sz` to 0.
-        f.lf.lr_line_buf.resize(alloc_sz, 0);
-        let mut ptr = f.lf.lr_line_buf.as_mut_ptr();
-        ptr = ptr.offset(64);
-        if y_stride < 0 {
-            f.lf.lr_lpf_line[0] =
-                ptr.offset(-(y_stride * (num_lines as isize - 1))) as *mut DynPixel;
-        } else {
-            f.lf.lr_lpf_line[0] = ptr as *mut DynPixel;
-        }
-        ptr = ptr.offset(y_stride.abs() * num_lines as isize);
-        if uv_stride < 0 {
-            f.lf.lr_lpf_line[1] =
-                ptr.offset(-(uv_stride * (num_lines as isize * 1 - 1))) as *mut DynPixel;
-            f.lf.lr_lpf_line[2] =
-                ptr.offset(-(uv_stride * (num_lines as isize * 2 - 1))) as *mut DynPixel;
-        } else {
-            f.lf.lr_lpf_line[1] = ptr as *mut DynPixel;
-            f.lf.lr_lpf_line[2] = ptr.offset(uv_stride * num_lines as isize) as *mut DynPixel;
-        }
+    // lr simd may overread the input, so slightly over-allocate the lpf buffer
+    let mut alloc_sz: usize = 128;
+    alloc_sz += y_stride.unsigned_abs() * num_lines as usize;
+    alloc_sz += uv_stride.unsigned_abs() * num_lines as usize * 2;
+    // TODO: Fallible allocation
+    f.lf.lr_line_buf.resize(alloc_sz, 0);
+
+    let y_stride_px = bpc.pxstride(y_stride);
+    let uv_stride_px = bpc.pxstride(uv_stride);
 
-        f.lf.lr_buf_plane_sz[0] = y_stride as c_int * num_lines;
-        f.lf.lr_buf_plane_sz[1] = uv_stride as c_int * num_lines * 2;
+    let mut offset = bpc.pxstride(64usize);
+    if y_stride < 0 {
+        f.lf.lr_lpf_line[0] = offset.wrapping_add_signed(-(y_stride_px * (num_lines as isize - 1)));
+    } else {
+        f.lf.lr_lpf_line[0] = offset;
+    }
+    offset = offset.wrapping_add_signed(y_stride_px.abs() * num_lines as isize);
+    if uv_stride < 0 {
+        f.lf.lr_lpf_line[1] =
+            offset.wrapping_add_signed(-(uv_stride_px * (num_lines as isize * 1 - 1)));
+        f.lf.lr_lpf_line[2] =
+            offset.wrapping_add_signed(-(uv_stride_px * (num_lines as isize * 2 - 1)));
+    } else {
+        f.lf.lr_lpf_line[1] = offset;
+        f.lf.lr_lpf_line[2] = offset.wrapping_add_signed(uv_stride_px * num_lines as isize);
     }
 
     // update allocation for loopfilter masks
diff --git a/src/internal.rs b/src/internal.rs
index 02139885d..af2117a77 100644
--- a/src/internal.rs
+++ b/src/internal.rs
@@ -455,16 +455,15 @@ pub struct Rav1dFrameContext_lf {
     pub level: Vec<[u8; 4]>,
     pub mask: Vec<Av1Filter>, /* len = w*h */
     pub lr_mask: Vec<Av1Restoration>,
-    pub lr_buf_plane_sz: [c_int; 2], /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */
    pub lim_lut: Align16<Av1FilterLUT>,
     pub last_sharpness: c_int,
     pub lvl: [[[[u8; 2]; 8]; 4]; 8], /* [8 seg_id][4 dir][8 ref][2 is_gmv] */
     pub tx_lpf_right_edge: TxLpfRightEdge,
     pub cdef_line_buf: AlignedVec32<u8>, /* AlignedVec32<BD::Pixel> */
     pub lr_line_buf: AlignedVec64<u8>,
-    pub cdef_line: [[usize; 3]; 2], /* [2 pre/post][3 plane] */
-    pub cdef_lpf_line: [usize; 3], /* plane */
-    pub lr_lpf_line: [*mut DynPixel; 3], /* plane */
+    pub cdef_line: [[usize; 3]; 2],  /* [2 pre/post][3 plane] */
+    pub cdef_lpf_line: [usize; 3],   /* plane */
+    pub lr_lpf_line: [usize; 3],     /* plane */
 
     // in-loop filter per-frame state keeping
     pub start_of_tile_row: *mut u8,
diff --git a/src/lf_apply.rs b/src/lf_apply.rs
index 400a8eb71..1dceb7b6a 100644
--- a/src/lf_apply.rs
+++ b/src/lf_apply.rs
@@ -155,35 +155,22 @@ pub(crate) unsafe fn rav1d_copy_lpf(
     let seq_hdr = &***f.seq_hdr.as_ref().unwrap();
     let tt_off = have_tt * sby * ((4 as c_int) << seq_hdr.sb128);
 
-    let lr_plane_sz = &f.lf.lr_buf_plane_sz;
     let y_stride = BD::pxstride(lr_stride[0]);
     let uv_stride = BD::pxstride(lr_stride[1]);
-    let y_span = lr_plane_sz[0] as isize - y_stride;
-    let uv_span = lr_plane_sz[1] as isize / 2 - uv_stride;
-    let dst: [&mut [BD::Pixel]; 3] = [
-        slice::from_raw_parts_mut(
-            (f.lf.lr_lpf_line[0] as *mut BD::Pixel).offset(cmp::min(y_span, 0)),
-            lr_plane_sz[0] as usize,
-        ),
-        slice::from_raw_parts_mut(
-            (f.lf.lr_lpf_line[1] as *mut BD::Pixel).offset(cmp::min(uv_span, 0)),
-            lr_plane_sz[1] as usize / 2,
-        ),
-        slice::from_raw_parts_mut(
-            (f.lf.lr_lpf_line[2] as *mut BD::Pixel).offset(cmp::min(uv_span, 0)),
-            lr_plane_sz[1] as usize / 2,
-        ),
-    ];
-    let dst_offset: [usize; 2] = [
-        (tt_off as isize * y_stride - cmp::min(y_span, 0)) as usize,
-        (tt_off as isize * uv_stride - cmp::min(uv_span, 0)) as usize,
+    let y_offset = (tt_off as isize * y_stride) as usize;
+    let uv_offset = (tt_off as isize * uv_stride) as usize;
+    let dst_offset = [
+        f.lf.lr_lpf_line[0] + y_offset,
+        f.lf.lr_lpf_line[1] + uv_offset,
+        f.lf.lr_lpf_line[2] + uv_offset,
     ];
 
     // TODO Also check block level restore type to reduce copying.
     let restore_planes = f.lf.restore_planes;
 
     let cdef_line_buf = BD::cast_pixel_slice_mut(&mut f.lf.cdef_line_buf);
+    let lr_line_buf = BD::cast_pixel_slice_mut(&mut f.lf.lr_line_buf);
 
     if seq_hdr.cdef != 0 || restore_planes & LR_RESTORE_Y as c_int != 0 {
         let h = f.cur.p.h;
@@ -193,7 +180,7 @@ pub(crate) unsafe fn rav1d_copy_lpf(
         if restore_planes & LR_RESTORE_Y as c_int != 0 || resize == 0 {
             backup_lpf::<BD>(
                 c,
-                dst[0],
+                lr_line_buf,
                 dst_offset[0],
                 lr_stride[0],
                 src[0],
@@ -265,7 +252,7 @@ pub(crate) unsafe fn rav1d_copy_lpf(
            if restore_planes & LR_RESTORE_U as c_int != 0 || resize == 0 {
                backup_lpf::<BD>(
                    c,
-                    dst[1],
+                    lr_line_buf,
                    dst_offset[1],
                    lr_stride[1],
                    src[1],
@@ -325,8 +312,8 @@ pub(crate) unsafe fn rav1d_copy_lpf(
            if restore_planes & LR_RESTORE_V as c_int != 0 || resize == 0 {
                backup_lpf::<BD>(
                    c,
-                    dst[2],
-                    dst_offset[1],
+                    lr_line_buf,
+                    dst_offset[2],
                    lr_stride[1],
                    src[2],
                    (src_offset[1] as isize - offset_uv as isize * BD::pxstride(src_stride[1]))
diff --git a/src/lr_apply.rs b/src/lr_apply.rs
index da2635389..b9c8a66f5 100644
--- a/src/lr_apply.rs
+++ b/src/lr_apply.rs
@@ -18,7 +18,6 @@ use libc::ptrdiff_t;
 use std::cmp;
 use std::ffi::c_int;
 use std::ffi::c_uint;
-use std::slice;
 
 pub type LrRestorePlanes = c_uint;
 pub const LR_RESTORE_V: LrRestorePlanes = 4;
@@ -47,12 +46,8 @@ unsafe fn lr_stripe(
     let sby = y + (if y != 0 { 8 << ss_ver } else { 0 }) >> 6 - ss_ver + seq_hdr.sb128;
     let have_tt = (c.tc.len() > 1) as c_int;
     let lpf_stride = BD::pxstride(stride);
-    let lpf_plane_sz = BD::pxstride(f.lf.lr_buf_plane_sz[(plane != 0) as usize] as isize);
-    let mut lpf_offset = cmp::max(lpf_stride - lpf_plane_sz, 0);
-    let lpf = &slice::from_raw_parts(
-        (f.lf.lr_lpf_line[plane as usize] as *const BD::Pixel).offset(-lpf_offset),
-        lpf_plane_sz.unsigned_abs(),
-    );
+    let lr_line_buf = BD::cast_pixel_slice(&f.lf.lr_line_buf);
+    let mut lpf_offset = f.lf.lr_lpf_line[plane as usize] as isize;
     lpf_offset += (have_tt * (sby * (4 << seq_hdr.sb128) - 4)) as isize * lpf_stride + x as isize;
     // The first stripe of the frame is shorter by 8 luma pixel rows.
     let mut stripe_h = cmp::min(64 - 8 * (y == 0) as c_int >> ss_ver, row_h - y);
@@ -102,7 +97,11 @@ unsafe fn lr_stripe(
         p.as_mut_ptr().add(p_offset).cast(),
         stride,
         left.as_ptr().cast(),
-        lpf.as_ptr().offset(lpf_offset).cast(),
+        // NOTE: The calculated pointer may point to before the beginning of
+        // `lr_line_buf`, so we must use `.wrapping_offset` here.
+        // `.wrapping_offset` is needed since `.offset` requires the pointer is in bounds,
+        // which `.wrapping_offset` does not, and delays that requirement to when the pointer is dereferenced
+        lr_line_buf.as_ptr().wrapping_offset(lpf_offset).cast(),
         unit_w,
         stripe_h,
         &mut params,
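
The change above boils down to this: each `lr_lpf_line` entry is now a pixel offset into the shared `lr_line_buf` allocation instead of a raw `*mut DynPixel`, the base offset is biased upward when a plane's stride is negative so that every row still lands inside the buffer, and the offset is only turned back into a pointer (via `wrapping_offset`) at the point of use. The standalone Rust sketch below illustrates that bookkeeping; the names (`PlaneLines`, `row_offset`) are illustrative only and do not exist in rav1d, and the layout is simplified to a single plane.

// Minimal sketch of the pointer-to-offset pattern, assuming a single plane
// stored in one shared buffer (not rav1d's actual types or layout).
struct PlaneLines {
    base: usize,   // offset of row 0 into the shared buffer, in pixels
    stride: isize, // distance between rows, in pixels; may be negative
}

impl PlaneLines {
    // Offset of the start of `row`, still relative to the shared buffer.
    fn row_offset(&self, row: usize) -> isize {
        self.base as isize + row as isize * self.stride
    }
}

fn main() {
    let (num_lines, width) = (4usize, 8usize);
    let stride = -(width as isize); // bottom-up layout, like a negative lr stride

    // One shared allocation for all lines (cf. `lr_line_buf`).
    let buf = vec![0u8; width * num_lines];

    // With a negative stride, bias the base offset toward the last row in
    // memory, mirroring the `if y_stride < 0` branch in decode.rs above.
    let plane = PlaneLines {
        base: width * (num_lines - 1),
        stride,
    };

    for row in 0..num_lines {
        let off = plane.row_offset(row);
        assert!(off >= 0 && off as usize + width <= buf.len());
        // The offset becomes a pointer only at the point of use;
        // `wrapping_offset` defers the in-bounds requirement to the dereference.
        let row_ptr = buf.as_ptr().wrapping_offset(off);
        // Safety: the assert above guarantees the row lies inside `buf`.
        assert_eq!(unsafe { *row_ptr }, 0);
    }
}

One benefit of storing offsets rather than pointers is that the entries stay meaningful if the backing `Vec` reallocates, and the buffer itself can be reborrowed (mutably or immutably) only where it is actually used, as the `cast_pixel_slice` calls in the hunks above do.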