Source code

Revision control

Copy as Markdown

Other Tools

; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"
; Read-only constants used by the 16 bpc film-grain kernels in this file.
; They are addressed relative to the per-function `base` define.
SECTION_RODATA 16
pd_16: times 4 dd 16
pw_1: times 8 dw 1
pw_16384: times 8 dw 16384
pw_8192: times 8 dw 8192
pw_23_22: dw 23, 22
times 3 dw 0, 32
; 16-entry byte table (entries are 0 or 0x80) looked up through pshufb in the
; seed-update step of the grain generators
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
pw_27_17_17_27: dw 27, 17, 17, 27
times 2 dw 0, 32
; per-lane masks ANDed with the current seed at the top of the generator loops
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; seed XOR values; the chroma generators index this with the uv plane number
; using a 4-byte stride
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pb_1: times 4 db 1
; multipliers used with pmulhuw/pmulhrsw
hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers for the generated grain; the generators index this
; table with grain_scale_shift minus the bit-depth adjustment
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
; rounding constants; the AR filters index this table with ar_coeff_shift
round_vals: dw 32, 64, 128, 256, 512, 1024
; clip limits; the visible part of fgy_32x32xn_16bpc indexes them by bit
; depth and FGData.clip_to_restricted_range
max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
min: dw 0, 16*4, 16*16
; these two should be next to each other
pw_4: times 2 dw 4
pw_16: times 2 dw 16
; JMP_TABLE function_name, n0, n1, ...
; Emits a table of dwords, one per numeric argument: each entry is the address
; of local label `.ar<n>` inside the (mangled) function minus the address of
; the table itself. The table is exported to the rest of the file under the
; name `<function_name>_table`.
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%%table:
%rep %0 - 1
; %2 is the next label suffix; %rotate moves the following argument into place
dd %%prefix %+ .ar%2 - %%base
%rotate 1
%endrep
%endmacro
; one table per grain generator, covering the four ar_coeff_lag handlers
JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
SECTION .text
; PIC_ptr(a): address expression for constant `a`. On x86-32 it is formed
; relative to `base` (which each function defines after loading its anchor
; register); on x86-64 the symbol is used directly.
%if ARCH_X86_32
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
; m(x): mangled name of function x with the current instruction-set SUFFIX
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; vpgatherdw dst, src, base, tmp_gpr0, tmp_gpr1 [, cnt=8 [, stride=1 [, tmp_xmm]]]
; Software gather of 16-bit elements: for each of the first `cnt` word indices
; held in `src`, loads the word at [base + index*stride] into the matching
; word lane of `dst`. Indices are pulled out of the vector two at a time
; through tmp_gpr1 (low word -> tmp_gpr0, high word stays in tmp_gpr1).
; If no tmp_xmm is passed, `src` itself is used as the shuffle scratch and is
; overwritten; both GPR temporaries are always clobbered.
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 8
%define %%tmp %8
%endif
%rep (%6/2)
%if %%idx == 0
; first pair: read indices 0/1 from src, and stage indices 2/3 in %%tmp
movd %5 %+ d, %2
pshuflw %%tmp, %2, q3232
%else
movd %5 %+ d, %%tmp
%if %6 == 8
; advance %%tmp so that its low dword holds the next pair of indices
%if %%idx == 2
punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
psrlq %%tmp, 32
%endif
%endif
%endif
; split the dword into two zero-extended 16-bit indices
movzx %4 %+ d, %5 %+ w
shr %5 %+ d, 16
%if %%idx == 0
; movd also clears the rest of dst; lane 1 is replaced by the pinsrw below
movd %1, [%3+%4*%7]
%else
pinsrw %1, [%3+%4*%7], %%idx + 0
%endif
pinsrw %1, [%3+%5*%7], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro
; SPLATD dst, src
; Broadcasts the low dword of src to all four dword lanes of dst.
; The load is skipped when dst and src name the same operand.
%macro SPLATD 2 ; dst, src
%ifidn %1, %2
    ; value is already in dst, nothing to load
%else
    movd            %1, %2
%endif
    pshufd          %1, %1, q0000
%endmacro
; SPLATW dst, src
; Broadcasts the low word of src to all eight word lanes of dst.
; The load is skipped when dst and src name the same operand.
%macro SPLATW 2 ; dst, src
%ifidn %1, %2
    ; value is already in dst, nothing to load
%else
    movd            %1, %2
%endif
    pshuflw         %1, %1, q0000       ; fill the low four words
    punpcklqdq      %1, %1              ; mirror them into the high half
%endmacro
; generate_grain_y_16bpc(buf, fg_data, bdmax)
; Fills the luma grain buffer with 73*82 words looked up in gaussian_sequence,
; driven by a 16-bit pseudo-random seed that starts at FGData.seed, then jumps
; to the .ar0-.ar3 handler selected by FGData.ar_coeff_lag.
; r4 anchors the constant tables (`base`); four grain values are produced per
; loop iteration.
INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
lea r4, [pb_mask]
%define base r4-pb_mask
%else
cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
LEA r4, $$
%define base r4-$$
%endif
movq m1, [base+rnd_next_upperbit_mask]
movq m4, [base+mul_bits]
movq m7, [base+hmul_bits]
mov r3d, [fg_dataq+FGData.grain_scale_shift]
lea r5d, [bdmaxq+1]
shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
sub r3, r5
; m6 = pmulhrsw multiplier picked from `round` by the adjusted scale shift
SPLATW m6, [base+round+r3*2-2]
mova m5, [base+pb_mask]
SPLATW m0, [fg_dataq+FGData.seed]
; r3 runs from -(buffer size in bytes) up to 0; bufq is moved to the end of
; the buffer so that [bufq+r3] walks it front to back
mov r3, -73*82*2
sub bufq, r3
%if ARCH_X86_64
lea r6, [gaussian_sequence]
%endif
.loop:
pand m2, m0, m1
psrlw m3, m2, 10
por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
pmullw m2, m4 ; bits 0x0f00 are set
pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
psllq m2, m3, 30
por m2, m3
psllq m3, m2, 15
por m2, m3 ; aggregate each bit into next seed's high bit
pmulhuw m3, m0, m7
por m2, m3 ; 4 next output seeds
; the last of the four seeds becomes the input of the next iteration
pshuflw m0, m2, q3333
; keep the top 11 bits of each seed as the gaussian_sequence index
psrlw m2, 5
%if ARCH_X86_64
vpgatherdw m3, m2, r6, r5, r7, 4, 2
%else
vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
%endif
paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
; shifts by 0, which pmulhrsw does not support
pmulhrsw m3, m6
movq [bufq+r3], m3
add r3, 4*2
jl .loop
; auto-regression code
; table entries are offsets relative to the table itself, so add it back
movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
jmp r3
; Lag-1 auto-regression over the luma grain: 70 rows of 76 values, filtered
; in place. The three taps of the row above plus the rounding term are
; computed for four pixels at once with pmaddwd; the left-neighbour tap (cf3)
; depends on the previous output, so it is applied one pixel at a time in
; scalar code, followed by the shift and the clamp to [min, max].
.ar1:
%if WIN64
DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
lea bufq, [r0-2*(82*73-(82*3+79))]
PUSH r8
%else
%if ARCH_X86_64
DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
%else ; x86-32
DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
PUSH r6
%define shiftd r1d
%endif
sub bufq, 2*(82*73-(82*3+79))
%endif
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
movd m4, [fg_dataq+FGData.ar_coeffs_y]
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
; fg_data is no longer needed from here on; its register is re-purposed
%if WIN64
DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
%elif ARCH_X86_64
DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
%else ; x86-32
%undef shiftd
DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
%define hd dword r0m
%define maxd dword minm
%endif
; sign-extend the four coefficient bytes to words
%if cpuflag(sse4)
pmovsxbw m4, m4
%else
pxor m3, m3
pcmpgtb m3, m4
punpcklbw m4, m3
%endif
; replace cf3 by 1 so that pmaddwd of (top/right, rnd) pairs adds the rounding
pinsrw m4, [base+pw_1], 3
pshufd m5, m4, q1111
pshufd m4, m4, q0000
SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
mov hd, 70
; max = bdmax >> 1, min = ~max
sar maxd, 1
mov mind, maxd
xor mind, -1
.y_loop_ar1:
mov xq, -76
movsx val3d, word [bufq+xq*2-2]
.x_loop_ar1:
movu m0, [bufq+xq*2-82*2-2] ; top/left
psrldq m2, m0, 2 ; top
psrldq m1, m0, 4 ; top/right
punpcklwd m0, m2
punpcklwd m1, m3
pmaddwd m0, m4
pmaddwd m1, m5
paddd m0, m1
.x_loop_ar1_inner:
movd val0d, m0
psrldq m0, 4
imul val3d, cf3d
add val3d, val0d
sar val3d, shiftb
movsx val0d, word [bufq+xq*2]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmp val3d, mind
cmovl val3d, mind
mov word [bufq+xq*2], val3w
; keep val3d in-place as left for next x iteration
inc xq
jz .x_loop_ar1_end
; a new group of four top-row sums is needed every fourth pixel
test xq, 3
jnz .x_loop_ar1_inner
jmp .x_loop_ar1
.x_loop_ar1_end:
add bufq, 82*2
dec hd
jg .y_loop_ar1
%if WIN64
POP r8
%elif ARCH_X86_32
POP r6
%undef maxd
%undef hd
%endif
; lag 0 applies no filtering to luma: return straight away
.ar0:
RET
; Lag-2 auto-regression over the luma grain: 70 rows of 76 values, filtered
; in place. Coefficient pairs are kept in m6-m11, the rounding term in m12,
; max_grain/min_grain in m13/m14 (and a blend mask in m15 without SSE4);
; on x86-32 m8-m15 are stack slots. The two rows above are summed for four
; pixels at once, the current-row taps are applied serially per pixel.
.ar2:
%if ARCH_X86_32
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, bdmax, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m0, [base+round_vals-12+shiftq*2]
pshuflw m0, m0, q0000
movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
pxor m2, m2
; zero-extend the rounding constant to dwords
punpcklwd m0, m2
; sign-extend the coefficient bytes to words
pcmpgtb m2, m6
punpckhbw m3, m6, m2
punpcklbw m6, m2
; broadcast one coefficient pair per register
pshufd m2, m6, q3333
pshufd m1, m6, q2222
pshufd m7, m6, q1111
pshufd m6, m6, q0000
pshufd m4, m3, q1111
pshufd m3, m3, q0000
%if ARCH_X86_64
SWAP 0, 12
SWAP 1, 8
SWAP 2, 9
SWAP 3, 10
SWAP 4, 11
%else
%define m12 [rsp+0*16]
%define m8 [rsp+1*16]
%define m9 [rsp+2*16]
%define m10 [rsp+3*16]
%define m11 [rsp+4*16]
mova m12, m0
mova m8, m1
mova m9, m2
mova m10, m3
mova m11, m4
mov bdmaxd, bdmaxm
%endif
sar bdmaxd, 1
SPLATW m0, bdmaxd ; max_grain
pcmpeqw m1, m1
%if !cpuflag(sse4)
; build a mask that clears only word 1 (stand-in for pblendw 00000010b)
pcmpeqw m2, m2
psrldq m2, 14
pslldq m2, 2
pxor m2, m1
%endif
pxor m1, m0 ; min_grain
%if ARCH_X86_64
SWAP 0, 13
SWAP 1, 14
SWAP 2, 15
%else
%define m13 [rsp+5*16]
%define m14 [rsp+6*16]
mova m13, m0
mova m14, m1
%if !cpuflag(sse4)
%define m15 [rsp+7*16]
mova m15, m2
%endif
%endif
sub bufq, 2*(82*73-(82*3+79))
DEFINE_ARGS buf, fg_data, h, x
mov hd, 70
.y_loop_ar2:
mov xq, -76
.x_loop_ar2:
movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
; interleave neighbouring samples into word pairs for pmaddwd
psrldq m2, m0, 2
psrldq m3, m0, 4
psrldq m4, m0, 6
psrldq m5, m0, 8
punpcklwd m0, m2
punpcklwd m3, m4
punpcklwd m5, m1
psrldq m2, m1, 2
psrldq m4, m1, 4
punpcklwd m2, m4
psrldq m4, m1, 6
psrldq m1, 8
punpcklwd m4, m1
pmaddwd m0, m6
pmaddwd m3, m7
pmaddwd m5, m8
pmaddwd m2, m9
pmaddwd m4, m10
paddd m0, m3
paddd m5, m2
paddd m0, m4
paddd m0, m5 ; accumulated top 2 rows
paddd m0, m12
movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
pshufd m4, m1, q3321
pxor m2, m2
pcmpgtw m2, m4
punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
; m1 words 0/1 hold the two left neighbours of the current pixel
pmaddwd m2, m1, m11
paddd m2, m0
psrldq m0, 4 ; shift top to next pixel
psrad m2, [fg_dataq+FGData.ar_coeff_shift]
paddd m2, m4
packssdw m2, m2
pminsw m2, m13
pmaxsw m2, m14
psrldq m4, 4
pslldq m2, 2
psrldq m1, 2
; insert the new value as word 1 so that it is a left neighbour next time
%if cpuflag(sse4)
pblendw m1, m2, 00000010b
%else
pand m1, m15
pandn m3, m15, m2
por m1, m3
%endif
; overwrite previous pixel, this should be ok
movd [bufq+xq*2-2], m1
inc xq
jz .x_loop_ar2_end
test xq, 3
jnz .x_loop_ar2_inner
jmp .x_loop_ar2
.x_loop_ar2_end:
add bufq, 82*2
dec hd
jg .y_loop_ar2
%if ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
RET
; Lag-3 auto-regression over the luma grain: 70 rows of 76 values, filtered
; in place. Coefficient pairs live in m5-m12 plus three or four scratch slots
; at `tmp`; m13 holds the current-row taps together with the rounding term,
; m14/m15 hold min_grain/max_grain. On x86-32 m8-m15 are stack slots. Without
; SSE4, m12 holds a blend mask and its coefficient pair moves to [tmp+48].
.ar3:
DEFINE_ARGS buf, fg_data, bdmax, shift
%if WIN64
; carve out an aligned scratch area; the old rsp is kept in r6
mov r6, rsp
and rsp, ~15
sub rsp, 64
%define tmp rsp
%elif ARCH_X86_64
%define tmp rsp+stack_offset-72
%else
ALLOC_STACK -16*12
%define tmp rsp
mov bdmaxd, bdmaxm
%endif
sar bdmaxd, 1
SPLATW m7, bdmaxd ; max_grain
pcmpeqw m6, m6
%if !cpuflag(sse4)
; build a mask that clears only word 2 (stand-in for pblendw 00000100b)
pcmpeqw m4, m4
psrldq m4, 14
pslldq m4, 4
pxor m4, m6
%endif
pxor m6, m7 ; min_grain
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_64
SWAP 6, 14
SWAP 7, 15
%else
%define m14 [rsp+10*16]
%define m15 [esp+11*16]
mova m14, m6
mova m15, m7
%endif
; build cf0-1 until 18-19 in m5-12 and r0/1
pxor m1, m1
movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
pcmpgtb m1, m0
punpckhbw m2, m0, m1
punpcklbw m0, m1
%if cpuflag(sse4)
pshufd m4, m2, q3333
%else
; m4 is taken by the blend mask, so park this coefficient pair in memory
pshufd m5, m2, q3333
mova [tmp+48], m5
%endif
pshufd m3, m2, q2222
pshufd m1, m2, q0000
pshufd m2, m2, q1111
pshufd m7, m0, q2222
pshufd m6, m0, q1111
pshufd m5, m0, q0000
pshufd m0, m0, q3333
%if ARCH_X86_64
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
%else
%define m8 [rsp+4*16]
%define m9 [esp+5*16]
%define m10 [rsp+6*16]
%define m11 [esp+7*16]
%define m12 [rsp+8*16]
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
mova m12, m4
%endif
; build cf20,round in r2
; build cf21-23,round*2 in m13
pxor m1, m1
movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
pcmpgtb m1, m0
punpcklbw m0, m1
pshufd m1, m0, q0000
pshufd m2, m0, q1111
mova [tmp+ 0], m1
mova [tmp+16], m2
psrldq m3, m0, 10
pinsrw m3, [base+round_vals+shiftq*2-10], 3
%if ARCH_X86_64
SWAP 3, 13
%else
%define m13 [esp+9*16]
mova m13, m3
%endif
pinsrw m0, [base+round_vals+shiftq*2-12], 5
pshufd m3, m0, q2222
mova [tmp+32], m3
DEFINE_ARGS buf, fg_data, h, x
sub bufq, 2*(82*73-(82*3+79))
mov hd, 70
.y_loop_ar3:
mov xq, -76
.x_loop_ar3:
movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
pmaddwd m0, m5
pmaddwd m2, m6
pmaddwd m3, m7
paddd m0, m2
paddd m0, m3
; m0 = top line first 6 multiplied by cf, m1 = top line last entry
movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
pmaddwd m1, m8
pmaddwd m4, m9
pmaddwd m3, m10
pmaddwd m2, m11
paddd m1, m4
paddd m3, m2
paddd m0, m1
paddd m0, m3
; m0 = top 2 lines multiplied by cf
movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
; pair the last sample of the row with 1 so that [tmp+32] adds the rounding
punpcklwd m2, [base+pw_1]
%if cpuflag(sse4)
pmaddwd m1, m12
%else
pmaddwd m1, [tmp+48]
%endif
pmaddwd m3, [tmp+ 0]
pmaddwd m4, [tmp+16]
pmaddwd m2, [tmp+32]
paddd m1, m3
paddd m4, m2
paddd m0, m1
paddd m0, m4
; m0 = top 3 lines multiplied by cf plus rounding for downshift
movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmaddwd m2, m1, m13
pshufd m3, m2, q1111
paddd m2, m3 ; left+cur
paddd m2, m0 ; add top
psrldq m0, 4
psrad m2, [fg_dataq+FGData.ar_coeff_shift]
packssdw m2, m2
pminsw m2, m15
pmaxsw m2, m14
pslldq m2, 4
psrldq m1, 2
; insert the new value as word 2 so that it is a left neighbour next time
%if cpuflag(sse4)
pblendw m1, m2, 00000100b
%else
pand m1, m12
pandn m3, m12, m2
por m1, m3
%endif
; overwrite a couple of pixels, should be ok
movq [bufq+xq*2-4], m1
inc xq
jz .x_loop_ar3_end
test xq, 3
jnz .x_loop_ar3_inner
jmp .x_loop_ar3
.x_loop_ar3_end:
add bufq, 82*2
dec hd
jg .y_loop_ar3
%if WIN64
; undo the manual stack realignment done on entry to .ar3
mov rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
RET
; generate_grain_uv_fn ss_name, ss_x, ss_y
; Instantiates generate_grain_uv_<ss_name>_16bpc(buf, bufy, fg_data, uv, bdmax)
; for one chroma subsampling layout. The function first fills the chroma grain
; buffer from gaussian_sequence (seed = FGData.seed XOR pw_seed_xor[uv]), then
; jumps to the .ar0-.ar3 handler selected by FGData.ar_coeff_lag.
; With ss_x set, 44 values are written per 82-word row for 73-35*ss_y rows;
; otherwise all 82*73 words are filled in a single run.
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
%define base r8-pb_mask
lea r8, [pb_mask]
movifnidn bdmaxd, bdmaxm
lea r6d, [bdmaxq+1]
%else
; only buf is loaded into a register here; the other arguments are read from
; their stack slots when needed
cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%define base r2-$$
LEA r2, $$
mov fg_dataq, r2m
mov r6d, r4m
inc r6d
%endif
movq m1, [base+rnd_next_upperbit_mask]
movq m4, [base+mul_bits]
movq m7, [base+hmul_bits]
mov r5d, [fg_dataq+FGData.grain_scale_shift]
shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
sub r5, r6
; m6 = pmulhrsw multiplier picked from `round` by the adjusted scale shift
SPLATW m6, [base+round+r5*2-2]
mova m5, [base+pb_mask]
SPLATW m0, [fg_dataq+FGData.seed]
; perturb the seed with a per-plane constant
%if ARCH_X86_64
SPLATW m2, [base+pw_seed_xor+uvq*4]
%else
mov r5d, r3m
SPLATW m2, [base+pw_seed_xor+r5*4]
%endif
pxor m0, m2
%if ARCH_X86_64
lea r6, [gaussian_sequence]
%endif
%if %2
mov hd, 73-35*%3
add bufq, 44*2
.loop_y:
mov xq, -44
%else
mov xq, -82*73
add bufq, 82*73*2
%endif
.loop_x:
pand m2, m0, m1
psrlw m3, m2, 10
por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
pmullw m2, m4 ; bits 0x0f00 are set
pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
psllq m2, m3, 30
por m2, m3
psllq m3, m2, 15
por m2, m3 ; aggregate each bit into next seed's high bit
pmulhuw m3, m0, m7
por m2, m3 ; 4 next output seeds
; the last of the four seeds becomes the input of the next iteration
pshuflw m0, m2, q3333
; keep the top 11 bits of each seed as the gaussian_sequence index
psrlw m2, 5
%if ARCH_X86_64
vpgatherdw m3, m2, r6, r9, r10, 4, 2
%else
vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2
%endif
paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
; shifts by 0, which pmulhrsw does not support
pmulhrsw m3, m6
movq [bufq+xq*2], m3
add xq, 4
jl .loop_x
%if %2
add bufq, 82*2
dec hd
jg .loop_y
%endif
; auto-regression code
; table entries are offsets relative to the table itself, so add it back
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
jmp r5
; Lag 0 for chroma: there are no chroma neighbour taps, only the luma tap.
; Each chroma grain value gets (co-located luma grain * coefficient), scaled
; and rounded, added to it and is then clamped to [min_grain, max_grain].
; With subsampling the luma samples are first averaged horizontally (and
; vertically when ss_y is set). Eight values are handled per iteration; the
; final, partial group of each row is merged or stored separately at .end.
.ar0:
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
ALLOC_STACK -16*2
mov bufyq, r1m
mov uvd, r3m
%endif
; each plane's coefficient set is 28 bytes apart
imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
SPLATW m3, [base+hmul_bits+shiftq*2-10]
%if ARCH_X86_64
sar bdmaxd, 1
SPLATW m1, bdmaxd ; max_grain
%else
SPLATW m1, r4m
psraw m1, 1
%endif
pcmpeqw m7, m7
pxor m7, m1 ; min_grain
%if ARCH_X86_64
SWAP 1, 14
DEFINE_ARGS buf, bufy, h, x
%else
%define m14 [rsp+0*16]
mova m14, m1
DEFINE_ARGS buf, bufy, pic_reg, h, x
%endif
; sign-extend the luma coefficient byte to a word and broadcast it
pxor m5, m5
pcmpgtb m5, m4
punpcklbw m4, m5
%if %2
; averaging multiplier for the summed luma samples
SPLATW m6, [base+hmul_bits+2+%3*2]
%endif
SPLATW m4, m4
pxor m5, m5
%if %2
%if !cpuflag(sse4)
; mask selecting words 6-7 (stand-in for pblendw 11000000b)
pcmpeqw m2, m2
pslldq m2, 12
%if ARCH_X86_64
SWAP 2, 12
%else
%define m12 [rsp+1*16]
mova m12, m2
%endif
%endif
%endif
%if %2
sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
%else
sub bufq, 2*(82*70-3)
%endif
add bufyq, 2*(3+82*3)
mov hd, 70-35*%3
.y_loop_ar0:
; first 32 pixels
xor xd, xd
.x_loop_ar0:
movu m0, [bufyq+xq*(2<<%2)]
%if %2
%if %3
movu m2, [bufyq+xq*4+82*2]
paddw m0, m2
%endif
movu m1, [bufyq+xq*4 +16]
%if %3
movu m2, [bufyq+xq*4+82*2+16]
paddw m1, m2
%endif
; sum horizontal pairs, then scale the sum down to an average
phaddw m0, m1
pmulhrsw m0, m6
%endif
; luma * coefficient in 32 bits, then reduce back to words
punpckhwd m1, m0, m5
punpcklwd m0, m5
REPX {pmaddwd x, m4}, m0, m1
REPX {psrad x, 5}, m0, m1
packssdw m0, m1
pmulhrsw m0, m3
movu m1, [bufq+xq*2]
paddw m0, m1
pminsw m0, m14
pmaxsw m0, m7
; the last group of the row is only partly inside the filtered area
cmp xd, 72-40*%2
je .end
movu [bufq+xq*2], m0
add xd, 8
jmp .x_loop_ar0
; last 6/4 pixels
.end:
%if %2
; keep the original contents of words 6-7, take the filtered words 0-5
%if cpuflag(sse4)
pblendw m0, m1, 11000000b
%else
pand m1, m12
pandn m2, m12, m0
por m0, m1, m2
%endif
movu [bufq+xq*2], m0
%else
movq [bufq+xq*2], m0
%endif
add bufq, 82*2
add bufyq, 82*(2<<%3)
dec hd
jg .y_loop_ar0
%if ARCH_X86_32
%undef m12
%undef m14
%endif
RET
; Lag-1 auto-regression for chroma: 70-35*ss_y rows of 76>>ss_x values,
; filtered in place. The three taps of the row above, the luma tap and the
; rounding term are computed for four pixels at once; the left-neighbour tap
; (cf3) is applied serially in scalar code, followed by the shift and the
; clamp to [min, max]. On x86-32 h, min and max are kept in argument stack
; slots.
.ar1:
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
%else
RESET_STACK_STATE
DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
mov bufyq, r1m
mov uvd, r3m
%endif
; each plane's coefficient set is 28 bytes apart
imul uvd, 28
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
%if WIN64
DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
%if %2
lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
%else
lea bufq, [r0-2*(82*69+3)]
%endif
%else
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
%else
DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
%define hd dword r1m
%define mind dword r3m
%define maxd dword r4m
%endif
%if %2
sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
sub bufq, 2*(82*69+3)
%endif
%endif
; fg_data is addressed by raw register number here because the names above
; have already been re-assigned
%if ARCH_X86_64
mov shiftd, [r2+FGData.ar_coeff_shift]
%else
mov shiftd, [r3+FGData.ar_coeff_shift]
%endif
pxor m5, m5
pcmpgtb m5, m4
punpcklbw m4, m5 ; cf0-4 in words
pshuflw m4, m4, q2100
psrldq m4, 2 ; cf0-3,4 in words
pshufd m5, m4, q1111
pshufd m4, m4, q0000
movd m3, [base+round_vals+shiftq*2-12] ; rnd
pxor m6, m6
punpcklwd m3, m6
%if %2
; averaging multiplier for the summed luma samples
SPLATW m6, [base+hmul_bits+2+%3*2]
%endif
SPLATD m3, m3
add bufyq, 2*(79+82*3)
mov hd, 70-35*%3
; max = bdmax >> 1, min = ~max
sar maxd, 1
%if ARCH_X86_64
mov mind, maxd
xor mind, -1
%else
DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
mov r2, maxd
xor r2, -1
mov mind, r2
%endif
.y_loop_ar1:
mov xq, -(76>>%2)
movsx val3d, word [bufq+xq*2-2]
.x_loop_ar1:
movu m0, [bufq+xq*2-82*2-2] ; top/left
; fetch the co-located luma grain, summing sample pairs when subsampled
%if %2
movu m7, [bufyq+xq*4]
%if %3
movu m1, [bufyq+xq*4+82*2]
phaddw m7, m1
%else
phaddw m7, m7
%endif
%else
movq m7, [bufyq+xq*2]
%endif
psrldq m2, m0, 2 ; top
psrldq m1, m0, 4 ; top/right
punpcklwd m0, m2
%if %2
%if %3
; add the sums of the two luma rows together
pshufd m2, m7, q3232
paddw m7, m2
%endif
pmulhrsw m7, m6
%endif
; pair top/right with luma so that one pmaddwd covers both taps
punpcklwd m1, m7
pmaddwd m0, m4
pmaddwd m1, m5
paddd m0, m1
paddd m0, m3
.x_loop_ar1_inner:
movd val0d, m0
psrldq m0, 4
imul val3d, cf3d
add val3d, val0d
sar val3d, shiftb
movsx val0d, word [bufq+xq*2]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmp val3d, mind
cmovl val3d, mind
mov word [bufq+xq*2], val3w
; keep val3d in-place as left for next x iteration
inc xq
jz .x_loop_ar1_end
; a new group of four top-row sums is needed every fourth pixel
test xq, 3
jnz .x_loop_ar1_inner
jmp .x_loop_ar1
.x_loop_ar1_end:
add bufq, 82*2
add bufyq, 82*2<<%3
dec hd
jg .y_loop_ar1
%if ARCH_X86_32
%undef maxd
%undef mind
%undef hd
%endif
RET
; Lag-2 auto-regression for chroma: 70-35*ss_y rows of 76>>ss_x values,
; filtered in place. Coefficient pairs are kept in m6-m11, the luma tap and
; the rounding term share m12, max_grain/min_grain are in m13/m14. m15 holds
; the luma averaging multiplier with SSE4 or the blend mask without it.
; On x86-32 m8-m15 are stack slots.
.ar2:
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
ALLOC_STACK -16*8
mov bufyq, r1m
mov uvd, r3m
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
; each plane's coefficient set is 28 bytes apart
imul uvd, 28
%if ARCH_X86_64
sar bdmaxd, 1
SPLATW m5, bdmaxd ; max_grain
%else
SPLATW m5, r4m
psraw m5, 1
%endif
pcmpeqw m6, m6
%if !cpuflag(sse4)
; build a mask that clears only word 1 (stand-in for pblendw 00000010b)
pcmpeqw m7, m7
psrldq m7, 14
pslldq m7, 2
pxor m7, m6
%endif
pxor m6, m5 ; min_grain
%if %2 && cpuflag(sse4)
SPLATW m7, [base+hmul_bits+2+%3*2]
%endif
%if ARCH_X86_64
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
%else
%define m13 [rsp+5*16]
%define m14 [rsp+6*16]
%define m15 [rsp+7*16]
mova m13, m5
mova m14, m6
mova m15, m7
%endif
; coef values
movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
pxor m1, m1
pcmpgtb m1, m0
punpckhbw m2, m0, m1
punpcklbw m0, m1
; place the rounding constant next to the luma coefficient (word 5)
pinsrw m2, [base+round_vals-12+shiftq*2], 5
pshufd m6, m0, q0000
pshufd m7, m0, q1111
pshufd m1, m0, q3333
pshufd m0, m0, q2222
pshufd m3, m2, q1111
pshufd m4, m2, q2222
pshufd m2, m2, q0000
%if ARCH_X86_64
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
%else
%define m8 [rsp+0*16]
%define m9 [rsp+1*16]
%define m10 [rsp+2*16]
%define m11 [rsp+3*16]
%define m12 [rsp+4*16]
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
mova m12, m4
%endif
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, h, x
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
%endif
%if %2
sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
sub bufq, 2*(82*69+3)
%endif
add bufyq, 2*(79+82*3)
mov hd, 70-35*%3
.y_loop_ar2:
mov xq, -(76>>%2)
.x_loop_ar2:
movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
psrldq m4, m0, 2 ; y=-2,x=[-1,+5]
psrldq m1, m0, 4 ; y=-2,x=[-0,+5]
psrldq m3, m0, 6 ; y=-2,x=[+1,+5]
psrldq m2, m0, 8 ; y=-2,x=[+2,+5]
punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
pmaddwd m0, m6
pmaddwd m1, m7
pmaddwd m2, m8
paddd m0, m1
paddd m0, m2
psrldq m3, m5, 2 ; y=-1,x=[-1,+5]
psrldq m1, m5, 4 ; y=-1,x=[-0,+5]
psrldq m4, m5, 6 ; y=-1,x=[+1,+5]
psrldq m2, m5, 8 ; y=-1,x=[+2,+5]
punpcklwd m3, m1
punpcklwd m4, m2
pmaddwd m3, m9
pmaddwd m4, m10
paddd m3, m4
paddd m0, m3
; luma component & rounding
%if %2
movu m1, [bufyq+xq*4]
%if %3
movu m2, [bufyq+xq*4+82*2]
phaddw m1, m2
pshufd m2, m1, q3232
paddw m1, m2
%else
phaddw m1, m1
%endif
; scale the luma sum down to an average
%if cpuflag(sse4)
pmulhrsw m1, m15
%elif %3
pmulhrsw m1, [base+pw_8192]
%else
pmulhrsw m1, [base+pw_16384]
%endif
%else
movq m1, [bufyq+xq*2]
%endif
; pair luma with 1 so that m12 (luma cf, round) adds both in one pmaddwd
punpcklwd m1, [base+pw_1]
pmaddwd m1, m12
paddd m0, m1
movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
pshufd m2, m1, q3321
pxor m3, m3
pcmpgtw m3, m2
punpcklwd m2, m3 ; y=0,x=[0,3] in dword
.x_loop_ar2_inner:
; m1 words 0/1 hold the two left neighbours of the current pixel
pmaddwd m3, m1, m11
paddd m3, m0
psrldq m0, 4 ; shift top to next pixel
psrad m3, [fg_dataq+FGData.ar_coeff_shift]
; we do not need to packssdw since we only care about one value
; NOTE(review): a packssdw is nevertheless executed two lines below, so the
; comment above looks stale
paddd m3, m2
packssdw m3, m3
pminsw m3, m13
pmaxsw m3, m14
psrldq m1, 2
pslldq m3, 2
psrldq m2, 4
; insert the new value as word 1 so that it is a left neighbour next time
%if cpuflag(sse4)
pblendw m1, m3, 00000010b
%else
pand m1, m15
pandn m4, m15, m3
por m1, m4
%endif
; overwrite previous pixel, should be ok
movd [bufq+xq*2-2], m1
inc xq
jz .x_loop_ar2_end
test xq, 3
jnz .x_loop_ar2_inner
jmp .x_loop_ar2
.x_loop_ar2_end:
add bufq, 82*2
add bufyq, 82*2<<%3
dec hd
jg .y_loop_ar2
%if ARCH_X86_32
%undef m13
%undef m14
%undef m15
%endif
RET
; Lag-3 auto-regression for chroma: 70-35*ss_y rows of 76>>ss_x values,
; filtered in place. Six coefficient pairs are parked at `tmp`, the rest live
; in m6-m10; m12 holds the rounding term, m13 the current-row taps, m14/m15
; max_grain/min_grain. m11 holds the luma averaging multiplier with SSE4 or
; the blend mask without it. On x86-32 m8-m15 are stack slots.
.ar3:
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%if WIN64
; carve out an aligned scratch area; the old rsp is kept in r6
mov r6, rsp
and rsp, ~15
sub rsp, 96
%define tmp rsp
%else
%define tmp rsp+stack_offset-120
%endif
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
ALLOC_STACK -16*14
mov bufyq, r1m
mov uvd, r3m
%define tmp rsp
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
; each plane's coefficient set is 28 bytes apart
imul uvd, 28
; m4 = rounding constant widened to dwords
SPLATW m4, [base+round_vals-12+shiftq*2]
pxor m5, m5
pcmpgtw m5, m4
punpcklwd m4, m5
%if ARCH_X86_64
sar bdmaxd, 1
SPLATW m6, bdmaxd ; max_grain
%else
SPLATW m6, r4m
psraw m6, 1
%endif
pcmpeqw m7, m7
%if !cpuflag(sse4)
; build a mask that clears only word 2 (stand-in for pblendw 00000100b)
pcmpeqw m3, m3
psrldq m3, 14
pslldq m3, 4
pxor m3, m7
%endif
pxor m7, m6 ; min_grain
%if %2 && cpuflag(sse4)
SPLATW m3, [base+hmul_bits+2+%3*2]
%endif
%if ARCH_X86_64
SWAP 3, 11
SWAP 4, 12
SWAP 6, 14
SWAP 7, 15
%else
%define m11 [rsp+ 9*16]
%define m12 [rsp+10*16]
%define m14 [rsp+12*16]
%define m15 [rsp+13*16]
mova m11, m3
mova m12, m4
mova m14, m6
mova m15, m7
%endif
; cf from y=-3,x=-3 until y=-1,x=-2
movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
pxor m1, m1
pcmpgtb m1, m0
punpckhbw m2, m0, m1
punpcklbw m0, m1
pshufd m1, m0, q0000
pshufd m3, m0, q1111
pshufd m4, m0, q2222
pshufd m0, m0, q3333
pshufd m5, m2, q0000
pshufd m6, m2, q1111
mova [tmp+16*0], m1
mova [tmp+16*1], m3
mova [tmp+16*2], m4
mova [tmp+16*3], m0
mova [tmp+16*4], m5
mova [tmp+16*5], m6
pshufd m6, m2, q2222
pshufd m7, m2, q3333
; cf from y=-1,x=-1 to y=0,x=-1 + luma component
movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
pxor m1, m1
pcmpgtb m1, m0
punpckhbw m2, m0, m1 ; luma
punpcklbw m0, m1
pshufd m3, m0, q3232
psrldq m5, m0, 10
; y=0,x=[-3 to -1] + "1.0" for current pixel
pinsrw m5, [base+round_vals-10+shiftq*2], 3
; y=-1,x=[-1 to +2]
pshufd m1, m0, q0000
pshufd m0, m0, q1111
; y=-1,x=+3 + luma
punpcklwd m3, m2
pshufd m3, m3, q0000
%if ARCH_X86_64
SWAP 1, 8
SWAP 0, 9
SWAP 3, 10
SWAP 5, 13
DEFINE_ARGS buf, bufy, fg_data, h, x
%else
%define m8 [rsp+ 6*16]
%define m9 [rsp+ 7*16]
%define m10 [rsp+ 8*16]
%define m13 [rsp+11*16]
mova m8, m1
mova m9, m0
mova m10, m3
mova m13, m5
DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
%endif
%if %2
sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
sub bufq, 2*(82*69+3)
%endif
add bufyq, 2*(79+82*3)
mov hd, 70-35*%3
.y_loop_ar3:
mov xq, -(76>>%2)
.x_loop_ar3:
; first line
movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
pmaddwd m0, [tmp+0*16]
pmaddwd m2, [tmp+1*16]
pmaddwd m3, [tmp+2*16]
paddd m0, m2
paddd m0, m3 ; first 6 x of top y
; second line [m0/1 are busy]
movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
shufps m3, m4, m5, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
pmaddwd m1, [tmp+3*16]
pmaddwd m4, [tmp+4*16]
pmaddwd m3, [tmp+5*16]
pmaddwd m5, m6
paddd m1, m4
paddd m3, m5
paddd m0, m1
paddd m0, m3 ; top 2 lines
; third line [m0 is busy] & luma + round
movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
; fetch the co-located luma grain, summing sample pairs when subsampled
%if %2
movu m5, [bufyq+xq*4]
%if %3
movu m4, [bufyq+xq*4+82*2]
phaddw m5, m4
%else
phaddw m5, m5
%endif
%else
movq m5, [bufyq+xq*2]
%endif
palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
%if %3
; add the sums of the two luma rows together
pshufd m4, m5, q3232
paddw m5, m4
%endif
%if %2
; scale the luma sum down to an average
%if cpuflag(sse4)
pmulhrsw m5, m11
%elif %3
pmulhrsw m5, [base+pw_8192]
%else
pmulhrsw m5, [base+pw_16384]
%endif
%endif
punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
; pair the last sample of the row with luma so that m10 covers both taps
punpcklwd m2, m5
pmaddwd m1, m7
pmaddwd m3, m8
pmaddwd m4, m9
pmaddwd m2, m10
paddd m1, m3
paddd m4, m2
paddd m0, m12 ; += round
paddd m1, m4
paddd m0, m1
movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmaddwd m2, m1, m13
pshufd m3, m2, q1111
paddd m2, m3 ; left+cur
paddd m2, m0 ; add top
psrldq m0, 4
psrad m2, [fg_dataq+FGData.ar_coeff_shift]
packssdw m2, m2
pminsw m2, m14
pmaxsw m2, m15
pslldq m2, 4
psrldq m1, 2
; insert the new value as word 2 so that it is a left neighbour next time
%if cpuflag(sse4)
pblendw m1, m2, 00000100b
%else
pand m1, m11
pandn m3, m11, m2
por m1, m3
%endif
; overwrite previous pixels, should be ok
movq [bufq+xq*2-4], m1
inc xq
jz .x_loop_ar3_end
test xq, 3
jnz .x_loop_ar3_inner
jmp .x_loop_ar3
.x_loop_ar3_end:
add bufq, 82*2
add bufyq, 82*2<<%3
dec hd
jg .y_loop_ar3
%if WIN64
; undo the manual stack realignment done on entry to .ar3
mov rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
RET
%endmacro
; instantiate one chroma generator per subsampling layout (name, ss_x, ss_y)
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
; SCRATCH src_idx, dst_idx, slot
; Makes the value currently in m<src_idx> available under the name m<dst_idx>.
; With 16 vector registers this is a plain register rename; on x86-32 the
; value is spilled to stack slot `slot` and m<dst_idx> becomes an alias for
; that memory location.
%macro SCRATCH 3
%if !ARCH_X86_32
    SWAP            %1, %2
%else
    mova [rsp+%3*mmsize], m%1
    %define m%2 [rsp+%3*mmsize]
%endif
%endmacro
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
dst, src, scaling, unused1, fg_data, picptr, unused2
; copy stack arguments to new position post-alignment, so that we
; don't have to keep the old stack location in a separate register
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov r5, r8m
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
mov r0m, r0
mov r2m, r1
mov r4m, r2
mov r6m, r3
mov r7m, r4
mov r8m, r5
%else
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
mov srcq, srcm
mov scalingq, r5m
mov fg_dataq, r3m
%if STACK_ALIGNMENT < mmsize
mov r6, r9m
%define r9m [rsp+8*mmsize+ 4*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
mov r9m, r6
%endif
LEA r5, $$
%define base r5-$$
mov r5m, picptrq
%else
cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
SPLATW m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if ARCH_X86_32
DECLARE_REG_TMP 0, 3
%else
DECLARE_REG_TMP 9, 10
%endif
mov t0d, r9m ; bdmax
sar t0d, 11 ; is_12bpc
inc t0d
mov t1d, r6d
imul t1d, t0d
dec t0d
SPLATW m5, [base+min+t1*2]
lea t0d, [t0d*3]
lea t0d, [r6d*2+t0d]
SPLATW m4, [base+max+t0*2]
SPLATW m2, r9m
pcmpeqw m1, m1
psraw m7, m2, 1 ; max_grain
pxor m1, m7 ; min_grain
SPLATD m6, [base+pd_16]
SCRATCH 1, 9, 0
SCRATCH 2, 10, 1
SCRATCH 3, 11, 2
SCRATCH 4, 12, 3
SCRATCH 5, 13, 4
SCRATCH 6, 14, 5
SCRATCH 7, 15, 6
mova m6, [base+pw_27_17_17_27] ; for horizontal filter
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
DECLARE_REG_TMP 0
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
sby, see
DECLARE_REG_TMP 7
%endif
mov sbyd, r8m
movzx t0d, byte [fg_dataq+FGData.overlap_flag]
test t0d, t0d
jz .no_vertical_overlap
test sbyd, sbyd
jnz .vertical_overlap
.no_vertical_overlap:
mov dword r8m, t0d
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
imul seed, (173 << 24) | 37
%else
imul seed, sbyd, (173 << 24) | 37
%endif
add seed, (105 << 24) | 178
rol seed, 8
movzx seed, seew
xor seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r3m, seed
mov wq, r4m
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
unused1, unused2, see, src_bak
%endif
lea src_bakq, [srcq+wq*2]
mov r9mp, src_bakq
neg wq
sub dstmp, srcq
%if ARCH_X86_32
mov r4m, wq
%endif
.loop_x:
%if ARCH_X86_32
mov seed, r3m
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164
lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, src_bak
%endif
.loop_x_odd:
movzx hd, word r7m
mov grain_lutq, grain_lutmp
.loop_y:
; src
pand m0, m10, [srcq+ 0]
pand m1, m10, [srcq+16] ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
%else
vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
%endif
REPX {psrlw x, 8}, m2, m3
; grain = grain_lut[offy+y][offx+x]
movu m4, [grain_lutq+offxyq*2]
movu m5, [grain_lutq+offxyq*2+16]
; noise = round2(scaling[src] * grain, scaling_shift)
REPX {pmullw x, m11}, m2, m3
pmulhrsw m4, m2
pmulhrsw m5, m3
; dst = clip_pixel(src, noise)
paddw m0, m4
paddw m1, m5
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
add srcq, r2mp ; src += stride
add grain_lutq, 82*2
dec hd
jg .loop_y
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end
%if ARCH_X86_32
mov srcq, r9mp
add srcq, r4mp
add srcq, r4mp
%else
mov src_bakq, r9mp
lea srcq, [src_bakq+wq*2]
%endif
btc dword r8m, 2
jc .next_blk
add offxyd, 16
test dword r8m, 2
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r12d, 16 ; top_offxy += 16
%endif
jmp .loop_x_odd_v_overlap
.next_blk:
test dword r8m, 1
jz .loop_x
; r8m = sbym
test dword r8m, 2
jnz .loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
add offxyd, 16
mov [rsp+8*mmsize+0*gprsize], offxyd
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov seed, r3m
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak, left_offxy
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164
lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, src_bak, left_offxy
%endif
mov hd, dword r7m
mov grain_lutq, grain_lutmp
.loop_y_h_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+0*gprsize]
movd m4, [grain_lutq+r5*2]
%else
movd m4, [grain_lutq+left_offxyq*2]
%endif
punpcklwd m4, m5
pmaddwd m4, m6
paddd m4, m14
psrad m4, 5
packssdw m4, m4
pminsw m4, m15
pmaxsw m4, m9
shufps m4, m5, q3210
; src
pand m0, m10, [srcq+ 0]
pand m1, m10, [srcq+16] ; m0-1: src as word
; scaling[src]
%if ARCH_X86_32
vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
REPX {psrlw x, 8}, m2, m3
; noise = round2(scaling[src] * grain, scaling_shift)
movu m5, [grain_lutq+offxyq*2+16]
REPX {pmullw x, m11}, m2, m3
pmulhrsw m4, m2
pmulhrsw m5, m3
; dst = clip_pixel(src, noise)
paddw m0, m4
paddw m1, m5
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
add srcq, r2mp
add grain_lutq, 82*2
dec hd
jg .loop_y_h_overlap
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end
%if ARCH_X86_32
mov srcq, r9mp
add srcq, r4mp
add srcq, r4mp
%else
mov src_bakq, r9mp
lea srcq, [src_bakq+wq*2]
%endif
or dword r8m, 4
add offxyd, 16
; r8m = sbym
test dword r8m, 2
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r12d, 16 ; top_offxy += 16
%endif
jmp .loop_x_odd_v_overlap
.end:
RET
.vertical_overlap:
or t0d, 2
mov r8m, t0d
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
sby, see
%endif
movzx sbyd, sbyb
%if ARCH_X86_32
imul r4, [fg_dataq+FGData.seed], 0x00010001
DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
imul t0d, sbyd, 173 * 0x00010001
imul sbyd, 37 * 0x01000100
add t0d, (105 << 16) | 188
add sbyd, (178 << 24) | (141 << 8)
and t0d, 0x00ff00ff
and sbyd, 0xff00ff00
xor seed, t0d
%if ARCH_X86_32
xor sbyd, seed
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r3m, seed
mov wq, r4m
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
unused1, unused2, see, src_bak
%endif
lea src_bakq, [srcq+wq*2]
mov r9mp, src_bakq
neg wq
sub dstmp, srcq
%if ARCH_X86_32
mov r4m, wq
%endif
.loop_x_v_overlap:
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27]
mov seed, r3m
%else
SPLATD m7, [pw_27_17_17_27]
%endif
; we assume from the block above that bits 8-15 of r7d are zero'ed
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp t0b ; parity of top_seed
shr seed, 16
shl t0d, 16
test seeb, seeh
setp t0b ; parity of cur_seed
or r6d, 0x00010001
xor t0d, r6d
mov seed, t0d
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak, unused, top_offxy
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*2+0x10001*747+32*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, src_bak, unused, top_offxy
%endif
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
shr offxyd, 16
.loop_x_odd_v_overlap:
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
mov hd, dword r7m
mov grain_lutq, grain_lutmp
.loop_y_v_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+1*gprsize]
movu m2, [grain_lutq+r5*2]
%else
movu m2, [grain_lutq+top_offxyq*2]
%endif
punpckhwd m4, m2, m3
punpcklwd m2, m3
REPX {pmaddwd x, m7}, m4, m2
REPX {paddd x, m14}, m4, m2
REPX {psrad x, 5}, m4, m2
packssdw m2, m4
pminsw m2, m15
pmaxsw m2, m9
movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
movu m3, [grain_lutq+r5*2+16]
%else
movu m3, [grain_lutq+top_offxyq*2+16]
%endif
punpckhwd m5, m3, m4
punpcklwd m3, m4
REPX {pmaddwd x, m7}, m5, m3
REPX {paddd x, m14}, m5, m3
REPX {psrad x, 5}, m5, m3
packssdw m3, m5
pminsw m3, m15
pmaxsw m3, m9
; src
pand m0, m10, [srcq+ 0] ; m0-1: src as word
pand m1, m10, [srcq+16] ; m0-1: src as word
; scaling[src]
; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
psrlw m4, 8
pmullw m4, m11
pmulhrsw m4, m2
%if ARCH_X86_32
vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
psrlw m5, 8
pmullw m5, m11
pmulhrsw m5, m3
; dst = clip_pixel(src, noise)
paddw m0, m4
paddw m1, m5
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
add srcq, r2mp
add grain_lutq, 82*2
dec hw
jz .end_y_v_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
xor hd, 0x10000
test hd, 0x10000
jnz .loop_y_v_overlap
jmp .loop_y
.end_y_v_overlap:
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end_hv
%if ARCH_X86_32
mov srcq, r9mp
add srcq, r4mp
add srcq, r4mp
%else
mov src_bakq, r9mp
lea srcq, [src_bakq+wq*2]
%endif
btc dword r8m, 2
jc .next_blk_v
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
add offxyd, 16
jmp .loop_x_odd_v_overlap
.next_blk_v:
; since fg_dataq.overlap is guaranteed to be set, we never jump
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
.loop_x_hv_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r0, [rsp+8*mmsize+1*gprsize]
add r3, 16
add r0, 16
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
mov seed, r3m
xor r0, r0
%else
; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp t0b ; parity of top_seed
shr seed, 16
shl t0d, 16
test seeb, seeh
setp t0b ; parity of cur_seed
or r6d, 0x00010001
xor t0d, r6d
mov seed, t0d
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offyq+16]
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*2+0x10001*747+32*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
shr offxyd, 16
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
movzx hd, word r7m
mov grain_lutq, grain_lutmp
.loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
movu m4, [grain_lutq+r0*2]
movd m5, [grain_lutq+r5*2]
mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
movd m3, [grain_lutq+r5*2]
%else
movu m4, [grain_lutq+top_offxyq*2]
movd m5, [grain_lutq+left_offxyq*2]
movd m3, [grain_lutq+topleft_offxyq*2]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklwd m5, m2
punpcklwd m3, m4
REPX {pmaddwd x, m6}, m5, m3
REPX {paddd x, m14}, m5, m3
REPX {psrad x, 5}, m5, m3
packssdw m5, m3
pminsw m5, m15
pmaxsw m5, m9
shufps m3, m5, m2, q3210
shufps m5, m4, q3232
; followed by v interpolation (top | cur -> cur)
movu m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
movu m1, [grain_lutq+r0*2+16]
%else
movu m1, [grain_lutq+top_offxyq*2+16]
%endif
punpcklwd m2, m5, m3
punpckhwd m5, m3
punpcklwd m3, m1, m0
punpckhwd m1, m0
REPX {pmaddwd x, m7}, m2, m5, m3, m1
REPX {paddd x, m14}, m2, m5, m3, m1
REPX {psrad x, 5}, m2, m5, m3, m1
packssdw m2, m5
packssdw m3, m1
REPX {pminsw x, m15}, m2, m3
REPX {pmaxsw x, m9}, m2, m3
; src
pand m0, m10, [srcq+ 0]
pand m1, m10, [srcq+16] ; m0-1: src as word
; scaling[src]
; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
psrlw m4, 8
pmullw m4, m11
pmulhrsw m2, m4
%if ARCH_X86_32
vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
psrlw m5, 8
pmullw m5, m11
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
add srcq, r2mp
add grain_lutq, 82*2
dec hw
jz .end_y_hv_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
xor hd, 0x10000
test hd, 0x10000
jnz .loop_y_hv_overlap
jmp .loop_y_h_overlap
.end_y_hv_overlap:
or dword r8m, 4
%if ARCH_X86_32
add r4mp, 16
%else
add wq, 16
%endif
jge .end_hv
%if ARCH_X86_32
mov r5, r5m
add offxyd, 16
add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
mov srcq, r9mp
add srcq, r4mp
add srcq, r4mp
%else
add offxyd, 16
add top_offxyd, 16
mov src_bakq, r9mp
lea srcq, [src_bakq+wq*2]
%endif
jmp .loop_x_odd_v_overlap
.end_hv:
RET
; 32-bit builds only: re-issue DECLARE_ARG for argument indices 0-9 between
; the function that ends above and the FGUV_FN macro that follows.
; NOTE(review): presumably this resets the r0m..r9m argument-slot macros to
; their x86inc defaults, since functions in this file %define those names to
; private [rsp+...] slots when STACK_ALIGNMENT < mmsize - confirm against the
; DECLARE_ARG definition in ext/x86/x86inc.asm and the overrides made by the
; preceding function.
%if ARCH_X86_32
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
%endif
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r1m
mov r2, r2m
mov r4, r3m
mov r3, r4m
mov r5, r5m
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
mov r0m, r0
mov r2m, r2
mov r4m, r3
mov r5m, r5
mov r0, r6m
mov r2, r7m
mov r3, r8m
mov r5, r9m
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
mov r6m, r0
mov r7m, r2
mov r8m, r3
mov r9m, r5
mov r2, r10m
mov r3, r11m
mov r5, r12m
mov r0, r13m
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
mov r10m, r2
mov r11m, r3
mov r12m, r5
SPLATW m2, r13m
%else
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
mov srcq, srcm
mov fg_dataq, r3m
%endif
LEA r5, $$
%define base r5-$$
DECLARE_REG_TMP 0, 2, 3
%else
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r8-pb_mask
lea r8, [pb_mask]
DECLARE_REG_TMP 9, 10, 11
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
SPLATW m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if STACK_ALIGNMENT >= mmsize
mov t0d, r13m ; bdmax
%endif
sar t0d, 11 ; is_12bpc
inc t0d
mov t1d, r6d
imul t1d, t0d
dec t0d
SPLATW m5, [base+min+t1*2]
lea t1d, [t0d*3]
mov t2d, r12m
inc t2d
imul r6d, t2d
add t1d, r6d
SPLATW m4, [base+max+t1*2]
%if STACK_ALIGNMENT >= mmsize
SPLATW m2, r13m
%endif
SCRATCH 2, 10, 2
SCRATCH 3, 11, 3
SCRATCH 4, 12, 4
SCRATCH 5, 13, 5
%define mzero m7
%if %3
SPLATD m2, [base+pw_23_22]
%endif
%if ARCH_X86_32
mov scalingq, r5m
mov r5m, r5
%else
mov r13mp, strideq
%endif
pcmpeqw m0, m0
psraw m1, m10, 1
pxor m0, m1
SCRATCH 0, 8, 0
SCRATCH 1, 9, 1
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
DECLARE_REG_TMP 0
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
DECLARE_REG_TMP 9
%endif
%if %1
mov r6d, r11m
SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4]
SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
punpcklwd m6, m1, m0
SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4]
SPLATD m7, [base+pw_4+t0*4]
pmullw m5, m7
%else
SPLATD m6, [base+pd_16]
%if %2
mova m5, [base+pw_23_22]
%else
mova m5, [base+pw_27_17_17_27]
%endif
%endif
SCRATCH 6, 14, 6
SCRATCH 5, 15, 7
%if ARCH_X86_32
DECLARE_REG_TMP 0
%else
DECLARE_REG_TMP 7
%endif
mov sbyd, r8m
mov t0d, [fg_dataq+FGData.overlap_flag]
test t0d, t0d
jz %%no_vertical_overlap
test sbyd, sbyd
jnz %%vertical_overlap
%%no_vertical_overlap:
mov r8m, t0d
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
imul seed, (173 << 24) | 37
%else
imul seed, sbyd, (173 << 24) | 37
%endif
add seed, (105 << 24) | 178
rol seed, 8
movzx seed, seew
xor seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
mov dstq, r0mp
mov lumaq, r9mp
mov wq, r4m
lea r3, [srcq+wq*2]
mov r1mp, r3
lea r3, [dstq+wq*2]
mov r11mp, r3
lea r3, [lumaq+wq*(2<<%2)]
mov r12mp, r3
%if %3
shl r10mp, 1
%endif
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
unused2, unused3, see, unused4, unused5, unused6, luma, lstride
mov lstrideq, r10mp
%if %3
add lstrideq, lstrideq
%endif
mov lumaq, r9mp
lea r10, [srcq+wq*2]
lea r11, [dstq+wq*2]
lea r12, [lumaq+wq*(2<<%2)]
mov r10mp, r10
mov r11mp, r11
mov r12mp, r12
%endif
neg wq
%if ARCH_X86_32
mov r4mp, wq
%endif
%%loop_x:
%if ARCH_X86_32
mov seed, r3m
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, unused1, unused2, unused3, luma, lstride
mov offxd, seed
mov offyd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, unused1, unused2, unused3, luma, lstride
%endif
%if %2 == 0
%%loop_x_odd:
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y:
; src
mova m0, [srcq]
mova m1, [srcq+16] ; m0-1: src as word
; luma_src
pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
mov lumaq, r9m
%endif
mova m4, [lumaq+ 0]
mova m6, [lumaq+(16<<%2)]
%if %2
phaddw m4, [lumaq+16]
phaddw m6, [lumaq+48]
%endif
%if ARCH_X86_32
add lumaq, r10mp
mov r9m, lumaq
%endif
%if %2
pavgw m4, mzero
pavgw m6, mzero
%endif
%if %1
punpckhwd m3, m4, m0
punpcklwd m4, m0
punpckhwd m5, m6, m1
punpcklwd m6, m1 ; { luma, chroma }
REPX {pmaddwd x, m14}, m3, m4, m5, m6
REPX {psrad x, 6}, m3, m4, m5, m6
packssdw m4, m3
packssdw m6, m5
REPX {paddw x, m15}, m4, m6
REPX {pmaxsw x, mzero}, m4, m6
REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
%else
REPX {pand x, m10}, m4, m6
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1
vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
%else
vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1
vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
%endif
REPX {psrlw x, 8}, m3, m5
; grain = grain_lut[offy+y][offx+x]
movu m4, [grain_lutq+offxyq*2]
movu m6, [grain_lutq+offxyq*2+16]
; noise = round2(scaling[luma_src] * grain, scaling_shift)
REPX {pmullw x, m11}, m3, m5
pmulhrsw m4, m3
pmulhrsw m6, m5
; dst = clip_pixel(src, noise)
paddw m0, m4
paddw m1, m6
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+ 0], m0
mova [dstq+16], m1
%if ARCH_X86_32
add srcq, r2mp
add dstq, r2mp
mov dstmp, dstq
%else
add srcq, r13mp
add dstq, r13mp
add lumaq, lstrideq
%endif
add grain_lutq, 82*2
dec hd
jg %%loop_y
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
mov wq, r4mp
%endif
add wq, 16
jge %%end
%if ARCH_X86_32
mov srcq, r1mp
%else
mov srcq, r10mp
%endif
mov dstq, r11mp
mov lumaq, r12mp
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
lea lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
mov r0m, dstq
mov r9m, lumaq
mov r4m, wq
%endif
%if %2 == 0
btc dword r8m, 2
jc %%next_blk
add offxyd, 16
test dword r8m, 2
jz %%loop_x_odd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
jmp %%loop_x_odd_v_overlap
%%next_blk:
%endif
test dword r8m, 1
je %%loop_x
; r8m = sbym
test dword r8m, 2
jnz %%loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
add offxyd, 16
mov [rsp+8*mmsize+0*gprsize], offxyd
DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
mov seed, r3m
%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
test seeb, seeh
lea seed, [r6+0x8000]
cmovp seed, r6d ; updated seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, luma, lstride
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
mov offxd, seed
mov offyd, seed
%endif
ror offyd, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, left_offxy, unused1, unused2, luma, lstride
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y_h_overlap:
mova m0, [srcq]
mova m1, [srcq+16]
; luma_src
pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
mov lumaq, r9m
%endif
mova m4, [lumaq+ 0]
mova m6, [lumaq+(16<<%2)]
%if %2
phaddw m4, [lumaq+16]
phaddw m6, [lumaq+48]
%endif
%if ARCH_X86_32
add lumaq, r10mp
mov r9m, lumaq
%endif
%if %2
pavgw m4, mzero
pavgw m6, mzero
%endif
%if %1
punpckhwd m3, m4, m0
punpcklwd m4, m0
punpckhwd m5, m6, m1
punpcklwd m6, m1 ; { luma, chroma }
REPX {pmaddwd x, m14}, m3, m4, m5, m6
REPX {psrad x, 6}, m3, m4, m5, m6
packssdw m4, m3
packssdw m6, m5
REPX {paddw x, m15}, m4, m6
REPX {pmaxsw x, mzero}, m4, m6
REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
%else
REPX {pand x, m10}, m4, m6
%endif
; grain = grain_lut[offy+y][offx+x]
movu m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+0*gprsize]
movd m5, [grain_lutq+r5*2]
%else
movd m5, [grain_lutq+left_offxyq*2+ 0]
%endif
punpcklwd m5, m7 ; {left0, cur0}
%if %1
%if ARCH_X86_32
mov r5, r5m
%endif
%if %2
pmaddwd m5, [PIC_ptr(pw_23_22)]
%else
pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
%endif
paddd m5, [PIC_ptr(pd_16)]
%else
pmaddwd m5, m15
paddd m5, m14
%endif
psrad m5, 5
packssdw m5, m5
pmaxsw m5, m8
pminsw m5, m9
shufps m5, m7, q3210
movu m3, [grain_lutq+offxyq*2+16]
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
%else
vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
%endif
REPX {psrlw x, 8}, m7, m4
; noise = round2(scaling[luma_src] * grain, scaling_shift)
REPX {pmullw x, m11}, m7, m4
pmulhrsw m5, m7
pmulhrsw m3, m4
; dst = clip_pixel(src, noise)
paddw m0, m5
paddw m1, m3
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+ 0], m0
mova [dstq+16], m1
%if ARCH_X86_32
add srcq, r2mp
add dstq, r2mp
mov dstmp, dstq
%else
add srcq, r13mp
add dstq, r13mp
add lumaq, lstrideq
%endif
add grain_lutq, 82*2
dec hd
jg %%loop_y_h_overlap
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
mov wq, r4mp
%endif
add wq, 16
jge %%end
%if ARCH_X86_32
mov srcq, r1mp
%else
mov srcq, r10mp
%endif
mov dstq, r11mp
mov lumaq, r12mp
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
lea lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
mov r0mp, dstq
mov r9mp, lumaq
mov r4m, wq
%endif
%if %2
; r8m = sbym
test dword r8m, 2
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
%else
or dword r8m, 4
add offxyd, 16
; r8m = sbym
test dword r8m, 2
jz %%loop_x_odd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxy += 16
%endif
jmp %%loop_x_odd_v_overlap
%endif
%%end:
RET
%%vertical_overlap:
or t0d, 2
mov r8m, t0d
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
sby, see, unused1, unused2, unused3, lstride
%endif
movzx sbyd, sbyb
%if ARCH_X86_32
imul r4, [fg_dataq+FGData.seed], 0x00010001
DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
imul t0d, sbyd, 173 * 0x00010001
imul sbyd, 37 * 0x01000100
add t0d, (105 << 16) | 188
add sbyd, (178 << 24) | (141 << 8)
and t0d, 0x00ff00ff
and sbyd, 0xff00ff00
xor seed, t0d
%if ARCH_X86_32
xor sbyd, seed
DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
mov r3m, seed
mov dstq, r0mp
mov lumaq, r9mp
mov wq, r4m
lea r3, [srcq+wq*2]
mov r1mp, r3
lea r3, [dstq+wq*2]
mov r11mp, r3
lea r3, [lumaq+wq*(2<<%2)]
mov r12mp, r3
%if %3
shl r10mp, 1
%endif
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
unused1, unused2, see, unused3, unused4, unused5, luma, lstride
mov lstrideq, r10mp
%if %3
add lstrideq, lstrideq
%endif
mov lumaq, r9mp
lea r10, [srcq+wq*2]
lea r11, [dstq+wq*2]
lea r12, [lumaq+wq*(2<<%2)]
mov r10mp, r10
mov r11mp, r11
mov r12mp, r12
%endif
neg wq
%if ARCH_X86_32
mov r4m, wq
%endif
%%loop_x_v_overlap:
%if ARCH_X86_32
mov seed, r3m
xor t0d, t0d
%else
; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp t0b ; parity of top_seed
shr seed, 16
shl t0d, 16
test seeb, seeh
setp t0b ; parity of cur_seed
or r6d, 0x00010001
xor t0d, r6d
mov seed, t0d
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, unused1, top_offxy, unused2, luma, lstride
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, unused1, top_offxy, unused2, luma, lstride
%endif
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
shr offxyd, 16
%if %2 == 0
%%loop_x_odd_v_overlap:
%endif
%if %3 == 0
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y_v_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
movu m5, [grain_lutq+r0*2]
%else
movu m5, [grain_lutq+top_offxyq*2]
%endif
punpckhwd m7, m5, m3
punpcklwd m5, m3 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
%if ARCH_X86_32
mov r5, r5m
%endif
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
REPX {paddd x, m14}, m7, m5
%endif
REPX {psrad x, 5}, m7, m5
packssdw m3, m5, m7
pmaxsw m3, m8
pminsw m3, m9
; grain = grain_lut[offy+y][offx+x]
movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
movu m5, [grain_lutq+r0*2+16]
%else
movu m5, [grain_lutq+top_offxyq*2+16]
%endif
punpckhwd m7, m5, m4
punpcklwd m5, m4 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
REPX {paddd x, m14}, m7, m5
%endif
REPX {psrad x, 5}, m7, m5
packssdw m4, m5, m7
pmaxsw m4, m8
pminsw m4, m9
; src
mova m0, [srcq]
mova m1, [srcq+16]
; luma_src
pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
mov lumaq, r9mp
%endif
mova m5, [lumaq+ 0]
mova m6, [lumaq+(16<<%2)]
%if %2
phaddw m5, [lumaq+16]
phaddw m6, [lumaq+48]
%endif
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
%if %2
pavgw m5, mzero
pavgw m6, mzero
%endif
%if %1
punpckhwd m7, m5, m0
punpcklwd m5, m0
REPX {pmaddwd x, m14}, m7, m5
REPX {psrad x, 6}, m7, m5
packssdw m5, m7
punpckhwd m7, m6, m1
punpcklwd m6, m1 ; { luma, chroma }
REPX {pmaddwd x, m14}, m7, m6
REPX {psrad x, 6}, m7, m6
packssdw m6, m7
pxor mzero, mzero
REPX {paddw x, m15}, m5, m6
REPX {pmaxsw x, mzero}, m5, m6
REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
%else
REPX {pand x, m10}, m5, m6
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
%else
vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
%endif
REPX {psrlw x, 8}, m7, m5
; noise = round2(scaling[luma_src] * grain, scaling_shift)
REPX {pmullw x, m11}, m7, m5
pmulhrsw m3, m7
pmulhrsw m4, m5
; dst = clip_pixel(src, noise)
paddw m0, m3
paddw m1, m4
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+ 0], m0
mova [dstq+16], m1
dec hw
jle %%end_y_v_overlap
%if ARCH_X86_32
add srcq, r2mp
add dstq, r2mp
mov dstmp, dstq
%else
add srcq, r13mp
add dstq, r13mp
add lumaq, lstrideq
%endif
add grain_lutq, 82*2
%if %3
jmp %%loop_y
%else
btc hd, 16
jc %%loop_y
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
jmp %%loop_y_v_overlap
%endif
%%end_y_v_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end_hv
%if ARCH_X86_32
mov srcq, r1mp
%else
mov srcq, r10mp
%endif
mov dstq, r11mp
mov lumaq, r12mp
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
lea lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
mov r0mp, dstq
mov r9mp, lumaq
mov r4m, wq
%endif
%if %2
; since fg_dataq.overlap is guaranteed to be set, we never jump
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
%else
btc dword r8m, 2
jc %%loop_x_hv_overlap
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
jmp %%loop_x_odd_v_overlap
%endif
%%loop_x_hv_overlap:
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
add offxyd, 16
add t0d, 16
mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
mov seed, r3m
xor t0d, t0d
%else
; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
mov r6d, seed
or seed, 0xeff4eff4
test seeb, seeh
setp t0b ; parity of top_seed
shr seed, 16
shl t0d, 16
test seeb, seeh
setp t0b ; parity of cur_seed
or r6d, 0x00010001
xor t0d, r6d
mov seed, t0d
ror seed, 1 ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
mov r3m, seed
DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
mov offxd, offyd
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offyq+16]
mov offyd, seed
mov offxd, seed
%endif
ror offyd, 8
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
%else
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
%endif
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
shr offxyd, 16
%if %3 == 0
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
movd m5, [grain_lutq+r5*2]
%else
movd m5, [grain_lutq+left_offxyq*2]
%endif
movu m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+2*gprsize]
movu m4, [grain_lutq+r0*2]
%if %2
pinsrw m5, [grain_lutq+r5*2], 2
%else
movd m3, [grain_lutq+r5*2]
%endif
%else
movu m4, [grain_lutq+top_offxyq*2]
%if %2
pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
%else
movd m3, [grain_lutq+topleft_offxyq*2]
%endif
%endif
%if %2 == 0
punpckldq m5, m3
%endif
punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
%if %1
%if ARCH_X86_32
mov r5, r5m
%endif
%if %2
movddup m0, [PIC_ptr(pw_23_22)]
%else
movddup m0, [PIC_ptr(pw_27_17_17_27)]
%endif
%else
pshufd m0, m15, q1010
%endif
pmaddwd m5, m0
%if %1
paddd m5, [PIC_ptr(pd_16)]
%else
paddd m5, m14
%endif
psrad m5, 5
packssdw m5, m5
pmaxsw m5, m8
pminsw m5, m9
shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
shufps m5, m4, q3231 ; top0-7 post-h_filter
punpckhwd m7, m5, m3
punpcklwd m5, m3 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
%else
REPX {paddd x, m14}, m5, m7
%endif
REPX {psrad x, 5}, m5, m7
packssdw m3, m5, m7
pmaxsw m3, m8
pminsw m3, m9
; right half
movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
movu m0, [grain_lutq+r0*2+16]
%else
movu m0, [grain_lutq+top_offxyq*2+16]
%endif
punpckhwd m1, m0, m4
punpcklwd m0, m4 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m1, m0
%if %1
REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
%else
REPX {paddd x, m14}, m1, m0
%endif
REPX {psrad x, 5}, m1, m0
packssdw m4, m0, m1
pmaxsw m4, m8
pminsw m4, m9
; src
mova m0, [srcq]
mova m1, [srcq+16]
; luma_src
pxor mzero, mzero
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
mov lumaq, r9mp
%endif
mova m6, [lumaq+ 0]
mova m5, [lumaq+(16<<%2)]
%if %2
phaddw m6, [lumaq+16]
phaddw m5, [lumaq+48]
%endif
%if ARCH_X86_32
add lumaq, r10mp
mov r9mp, lumaq
%endif
%if %2
pavgw m6, mzero
pavgw m5, mzero
%endif
%if %1
punpckhwd m7, m6, m0
punpcklwd m6, m0
REPX {pmaddwd x, m14}, m7, m6
REPX {psrad x, 6}, m7, m6
packssdw m6, m7
punpckhwd m7, m5, m1
punpcklwd m5, m1 ; { luma, chroma }
REPX {pmaddwd x, m14}, m7, m5
REPX {psrad x, 6}, m7, m5
packssdw m5, m7
pxor mzero, mzero
REPX {paddw x, m15}, m6, m5
REPX {pmaxsw x, mzero}, m6, m5
REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
%else
REPX {pand x, m10}, m6, m5
%endif
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
%else
%if %3 == 0
; register shortage :)
push r12
%endif
vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
%if %3 == 0
pop r12
%endif
%endif
REPX {psrlw x, 8}, m7, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
REPX {pmullw x, m11}, m7, m6
pmulhrsw m3, m7
pmulhrsw m4, m6
; dst = clip_pixel(src, noise)
paddw m0, m3
paddw m1, m4
pmaxsw m0, m13
pmaxsw m1, m13
pminsw m0, m12
pminsw m1, m12
movifnidn dstq, dstmp
mova [dstq+ 0], m0
mova [dstq+16], m1
%if ARCH_X86_32
add srcq, r2mp
add dstq, r2mp
mov dstmp, dstq
%else
add srcq, r13mp
add dstq, r13mp
add lumaq, lstrideq
%endif
add grain_lutq, 82*2
dec hw
%if %3
jg %%loop_y_h_overlap
%else
jle %%end_y_hv_overlap
btc hd, 16
jc %%loop_y_h_overlap
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
jmp %%loop_y_hv_overlap
%%end_y_hv_overlap:
%endif
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
mov wq, r4m
%endif
add wq, 16
jge %%end_hv
%if ARCH_X86_32
mov srcq, r1mp
%else
mov srcq, r10mp
%endif
mov dstq, r11mp
mov lumaq, r12mp
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
lea lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
mov dstmp, dstq
mov r9mp, lumaq
mov r4m, wq
%endif
%if %2
jmp %%loop_x_hv_overlap
%else
or dword r8m, 4
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxy += 16
%endif
jmp %%loop_x_odd_v_overlap
%endif
%%end_hv:
RET
%endmacro
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
%%FGUV_32x32xN_LOOP 0, %2, %3
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
%endmacro
; Instantiate the FGUV_FN macro once per chroma layout.
; Arguments as used by the macro body:
;   %1 - layout tag (420/422/444); NOTE(review): presumably used to build the
;        function name, its use is outside the part of the macro reviewed here.
;   %2 - horizontal-subsampling flag. When set, each luma vector is formed
;        from two adjacent loads combined with phaddw and then halved with
;        rounding (pavgw against zero), the luma pointer advances twice as
;        fast per column step, and the left-edge overlap weights are taken
;        from pw_23_22 instead of pw_27_17_17_27.
;   %3 - vertical-subsampling flag. When set, the hv-overlap row loop runs a
;        single row before handing over to the h-overlap loop; when clear it
;        reloads m2 with the second pw_27_17_17_27 weight pair and blends one
;        more row first (tracked by toggling bit 16 of h).
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0