; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA
; dav1d_obmc_masks[] << 9
obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul: dd 0, 1, 2, 3
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q: times 8 db 0
times 8 db 4
times 8 db 8
times 8 db 12
pw_2: times 8 dw 2
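; note on layout: some constants overlap with the data following them, e.g.
; pw_16 is declared as only 4 words but prep_mul starts with 4 more words of
; 16, so a full 16-byte load from pw_16 still yields eight 16s; pw_2048 and
; bidir_mul overlap the same way. pw_256 (bytes 00 01 repeated) also doubles
; as a pshufb mask that broadcasts the low word of a register.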
pw_16: times 4 dw 16
prep_mul: times 4 dw 16
times 8 dw 4
pw_64: times 8 dw 64
pw_256: times 8 dw 256
pw_2048: times 4 dw 2048
bidir_mul: times 4 dw 2048
pw_8192: times 8 dw 8192
pw_27615: times 8 dw 27615
pw_32766: times 8 dw 32766
pw_m512: times 8 dw -512
pd_63: times 4 dd 63
pd_64: times 4 dd 64
pd_512: times 4 dd 512
pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000: times 4 dd 0x4000
pq_0x400000: times 2 dq 0x400000
pq_0x40000000: times 2 dq 0x40000000
pd_65538: times 2 dd 65538
put_bilin_h_rnd: times 4 dw 8
times 4 dw 10
s_8tap_h_rnd: times 2 dd 2
times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
times 2 dd 128
s_8tap_h_sh: dd 2, 4
put_s_8tap_v_sh: dd 10, 8
bidir_rnd: times 4 dw -16400
times 4 dw -16388
put_8tap_h_rnd: dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
warp8x8_shift: dd 11, 13
warp8x8_rnd1: dd 1024, 1024, 4096, 4096
warp8x8_rnd2: times 4 dw 4096
times 4 dw 16384
warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
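; the jump tables hold per-width offsets to the .w* entry points; the table
; symbol is pre-biased by the smallest width so that tzcnt(w) can index it
; directly (dd entries, scaled by 4, for the bidir tables; dw entries,
; scaled by 2, for the base/scaled tables below).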
BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
%macro SCALED_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dw %%base %+ .w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_1024:
%xdefine %1_%2_dy1_table (%%dy_1024 - %3)
%rep %0 - 2
dw %%base %+ .dy1_w%3 - %%base
%rotate 1
%endrep
%rotate 2
%%dy_2048:
%xdefine %1_%2_dy2_table (%%dy_2048 - %3)
%rep %0 - 2
dw %%base %+ .dy2_w%3 - %%base
%rotate 1
%endrep
%endmacro
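; SCALED_JMP_TABLE emits three sub-tables per function: the generic entry
; points plus .dy1_* / .dy2_* variants used when the vertical step is
; exactly one (dy == 1024) or two (dy == 2048) source rows per output row.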
SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
SECTION .text
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
INIT_XMM ssse3
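; put_bilin_16bpc: straight copy or bilinear interpolation.
;   .h: dst = ((16-mx)*p[x] + mx*p[x+1] + rnd) >> 4, rnd selected per
;       bitdepth from put_bilin_h_rnd (the 12-bit value folds in the extra
;       rounding step noted at .h below).
;   .v: dst = a + (((b-a)*my + 8) >> 4); my is pre-scaled by << 11 so a
;       single pmulhrsw does the multiply, round and shift.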
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
mov mxyd, r6m ; mx
LEA t0, put_ssse3
movifnidn wd, wm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
add wq, t0
movifnidn hd, hm
jmp wq
.put_w2:
mov r4d, [srcq+ssq*0]
mov r6d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r4d
mov [dstq+dsq*1], r6d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq [dstq+dsq*0], m0
movq [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu m0, [srcq+ssq*0+16*0]
movu m1, [srcq+ssq*0+16*1]
movu m2, [srcq+ssq*1+16*0]
movu m3, [srcq+ssq*1+16*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+16*0], m0
mova [dstq+dsq*0+16*1], m1
mova [dstq+dsq*1+16*0], m2
mova [dstq+dsq*1+16*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
add srcq, ssq
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
add dstq, dsq
dec hd
jg .put_w32
RET
.put_w64:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, ssq
mova [dstq+16*4], m0
mova [dstq+16*5], m1
mova [dstq+16*6], m2
mova [dstq+16*7], m3
add dstq, dsq
dec hd
jg .put_w64
RET
.put_w128:
add srcq, 16*8
add dstq, 16*8
.put_w128_loop:
movu m0, [srcq-16*8]
movu m1, [srcq-16*7]
movu m2, [srcq-16*6]
movu m3, [srcq-16*5]
mova [dstq-16*8], m0
mova [dstq-16*7], m1
mova [dstq-16*6], m2
mova [dstq-16*5], m3
movu m0, [srcq-16*4]
movu m1, [srcq-16*3]
movu m2, [srcq-16*2]
movu m3, [srcq-16*1]
mova [dstq-16*4], m0
mova [dstq-16*3], m1
mova [dstq-16*2], m2
mova [dstq-16*1], m3
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, ssq
mova [dstq+16*4], m0
mova [dstq+16*5], m1
mova [dstq+16*6], m2
mova [dstq+16*7], m3
add dstq, dsq
dec hd
jg .put_w128_loop
RET
.h:
movd m5, mxyd
mov mxyd, r7m ; my
mova m4, [base+pw_16]
pshufb m5, [base+pw_256]
psubw m4, m5
test mxyd, mxyd
jnz .hv
; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
mov r6d, r8m ; bitdepth_max
shr r6d, 11
movddup m3, [base+put_bilin_h_rnd+r6*8]
movifnidn hd, hm
sub wd, 8
jg .h_w16
je .h_w8
cmp wd, -4
je .h_w4
.h_w2:
movq m1, [srcq+ssq*0]
movhps m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmullw m0, m4, m1
psrlq m1, 16
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 4
movd [dstq+dsq*0], m0
punpckhqdq m0, m0
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
movq m1, [srcq+ssq*0+2]
movhps m1, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 4
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m0, 4
psrlw m1, 4
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
.h_w16_loop0:
mov r6, wq
.h_w16_loop:
movu m0, [srcq+r6*2+ 0]
movu m1, [srcq+r6*2+ 2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
movu m1, [srcq+r6*2+16]
movu m2, [srcq+r6*2+18]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m0, 4
psrlw m1, 4
mova [dstq+r6*2+16*0], m0
mova [dstq+r6*2+16*1], m1
add r6, 16
jl .h_w16_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w16_loop0
RET
.v:
shl mxyd, 11
movd m5, mxyd
pshufb m5, [base+pw_256]
movifnidn hd, hm
cmp wd, 4
jg .v_w8
je .v_w4
.v_w2:
movd m0, [srcq+ssq*0]
.v_w2_loop:
movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m2, m0, m1
movd m0, [srcq+ssq*0]
punpcklqdq m1, m0
psubw m1, m2
pmulhrsw m1, m5
paddw m1, m2
movd [dstq+dsq*0], m1
punpckhqdq m1, m1
movd [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq m0, [srcq+ssq*0]
.v_w4_loop:
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m2, m0, m1
movq m0, [srcq+ssq*0]
punpcklqdq m1, m0
psubw m1, m2
pmulhrsw m1, m5
paddw m1, m2
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
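; wider blocks are processed in 8-pixel columns; on x86_64, r6 keeps h in
; its low byte and the number of remaining column passes (in steps of 256)
; in the upper bits.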
%if ARCH_X86_64
%if WIN64
push r7
%endif
shl wd, 5
mov r7, srcq
lea r6d, [wq+hq-256]
mov r4, dstq
%else
mov r6, srcq
%endif
.v_w8_loop0:
movu m0, [srcq+ssq*0]
.v_w8_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw m1, m3, m0
pmulhrsw m1, m5
paddw m1, m0
movu m0, [srcq+ssq*0]
psubw m2, m0, m3
pmulhrsw m2, m5
paddw m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
%if ARCH_X86_64
add r7, 16
add r4, 16
movzx hd, r6b
mov srcq, r7
mov dstq, r4
sub r6d, 1<<8
%else
mov dstq, dstmp
add r6, 16
mov hd, hm
add dstq, 16
mov srcq, r6
mov dstmp, dstq
sub wd, 8
%endif
jg .v_w8_loop0
%if WIN64
pop r7
%endif
RET
.hv:
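; .hv: the horizontal pass keeps ~14 bits of precision (weights are scaled
; by 4 for 10-bit so both depths match after the >> 2); the vertical blend
; computes (b-a)*2 * (my << 11) with pmulhw, and the final pmulhrsw by
; pw_2048 (10-bit) or pw_8192 (12-bit) rounds back to pixel range.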
WIN64_SPILL_XMM 8
shl mxyd, 11
mova m3, [base+pw_2]
movd m6, mxyd
mova m7, [base+pw_8192]
pshufb m6, [base+pw_256]
test dword r8m, 0x800
jnz .hv_12bpc
psllw m4, 2
psllw m5, 2
mova m7, [base+pw_2048]
.hv_12bpc:
movifnidn hd, hm
cmp wd, 4
jg .hv_w8
je .hv_w4
.hv_w2:
movddup m0, [srcq+ssq*0]
pshufhw m1, m0, q0321
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w2_loop:
movq m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps m2, [srcq+ssq*0]
pmullw m1, m4, m2
psrlq m2, 16
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2 ; 1 _ 2 _
shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
movd [dstq+dsq*0], m1
punpckhqdq m1, m1
movd [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
movddup m0, [srcq+ssq*0]
movddup m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w4_loop:
movq m1, [srcq+ssq*1]
movq m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
movhps m2, [srcq+ssq*0+2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2 ; 1 2
shufpd m2, m0, m1, 0x01 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
push r7
%endif
shl wd, 5
lea r6d, [wq+hq-256]
mov r4, srcq
mov r7, dstq
%else
mov r6, srcq
%endif
.hv_w8_loop0:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m3
paddw m0, m1
psrlw m0, 2
.hv_w8_loop:
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw m1, m4
pmullw m2, m5
paddw m1, m3
paddw m1, m2
psrlw m1, 2
psubw m2, m1, m0
paddw m2, m2
pmulhw m2, m6
paddw m2, m0
pmulhrsw m2, m7
mova [dstq+dsq*0], m2
movu m0, [srcq+ssq*0]
movu m2, [srcq+ssq*0+2]
pmullw m0, m4
pmullw m2, m5
paddw m0, m3
paddw m0, m2
psrlw m0, 2
psubw m2, m0, m1
paddw m2, m2
pmulhw m2, m6
paddw m2, m1
pmulhrsw m2, m7
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
%if ARCH_X86_64
add r4, 16
add r7, 16
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
%else
mov dstq, dstmp
add r6, 16
mov hd, hm
add dstq, 16
mov srcq, r6
mov dstmp, dstq
sub wd, 8
%endif
jg .hv_w8_loop0
%if WIN64
pop r7
%endif
RET
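; prep_bilin_16bpc: same filtering as put_bilin, but the output is dav1d's
; signed 16-bit intermediate: the unfiltered path stores (pel << 4) - 8192
; for 10-bit and (pel << 2) - 8192 for 12-bit (prep_mul supplies the scale,
; pw_8192/pw_32766 the bias); the filtered paths keep the same scale and bias.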
cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
movifnidn mxyd, r5m ; mx
LEA r6, prep_ssse3
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
tzcnt wd, wd
movzx wd, word [base+prep_ssse3_table+wq*2]
mov r5d, r7m ; bitdepth_max
mova m5, [base+pw_8192]
add wq, r6
shr r5d, 11
movddup m4, [base+prep_mul+r5*8]
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*2]
movhps m1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m4
psubw m0, m5
psubw m1, m5
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 4
jg .prep_w4
RET
.prep_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
movu m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
movu m2, [srcq+strideq*1+16*0]
movu m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 2
jg .prep_w16
RET
.prep_w32:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
dec hd
jg .prep_w32
RET
.prep_w64:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*4], m0
mova [tmpq+16*5], m1
mova [tmpq+16*6], m2
mova [tmpq+16*7], m3
add tmpq, 16*8
dec hd
jg .prep_w64
RET
.prep_w128:
movu m0, [srcq+16* 0]
movu m1, [srcq+16* 1]
movu m2, [srcq+16* 2]
movu m3, [srcq+16* 3]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
movu m0, [srcq+16* 4]
movu m1, [srcq+16* 5]
movu m2, [srcq+16* 6]
movu m3, [srcq+16* 7]
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+16*4], m0
mova [tmpq+16*5], m1
mova [tmpq+16*6], m2
mova [tmpq+16*7], m3
movu m0, [srcq+16* 8]
movu m1, [srcq+16* 9]
movu m2, [srcq+16*10]
movu m3, [srcq+16*11]
add tmpq, 16*16
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq-16*8], m0
mova [tmpq-16*7], m1
mova [tmpq-16*6], m2
mova [tmpq-16*5], m3
movu m0, [srcq+16*12]
movu m1, [srcq+16*13]
movu m2, [srcq+16*14]
movu m3, [srcq+16*15]
add srcq, strideq
REPX {pmullw x, m4}, m0, m1, m2, m3
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq-16*4], m0
mova [tmpq-16*3], m1
mova [tmpq-16*2], m2
mova [tmpq-16*1], m3
dec hd
jg .prep_w128
RET
.h:
movd m4, mxyd
mov mxyd, r6m ; my
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
mova m5, [base+pw_32766]
psubw m3, m4
test dword r7m, 0x800
jnz .h_12bpc
psllw m3, 2
psllw m4, 2
.h_12bpc:
test mxyd, mxyd
jnz .hv
sub wd, 8
je .h_w8
jg .h_w16
.h_w4:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*0+2]
movhps m1, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 16
sub hd, 2
jg .h_w4
RET
.h_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m0, 2
psraw m1, 2
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 2
jg .h_w8
RET
.h_w16:
lea srcq, [srcq+wq*2]
neg wq
.h_w16_loop0:
mov r6, wq
.h_w16_loop:
movu m0, [srcq+r6*2+ 0]
movu m1, [srcq+r6*2+ 2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
movu m1, [srcq+r6*2+16]
movu m2, [srcq+r6*2+18]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m0, 2
psraw m1, 2
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 16*2
add r6, 16
jl .h_w16_loop
add srcq, strideq
dec hd
jg .h_w16_loop0
RET
.v:
movd m4, mxyd
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
mova m5, [base+pw_32766]
psubw m3, m4
test dword r7m, 0x800
jnz .v_12bpc
psllw m3, 2
psllw m4, 2
.v_12bpc:
cmp wd, 8
je .v_w8
jg .v_w16
.v_w4:
movq m0, [srcq+strideq*0]
.v_w4_loop:
movq m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
punpcklqdq m1, m0, m2 ; 0 1
movq m0, [srcq+strideq*0]
punpcklqdq m2, m0 ; 1 2
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
mova [tmpq], m1
add tmpq, 16
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movu m0, [srcq+strideq*0]
.v_w8_loop:
movu m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4, m2
psubw m0, m5
paddw m1, m0
movu m0, [srcq+strideq*0]
psraw m1, 2
pmullw m2, m3
mova [tmpq+16*0], m1
pmullw m1, m4, m0
psubw m2, m5
paddw m1, m2
psraw m1, 2
mova [tmpq+16*1], m1
add tmpq, 16*2
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
%if WIN64
push r7
%endif
mov r5, srcq
%if ARCH_X86_64
lea r6d, [wq*4-32]
mov wd, wd
lea r6d, [hq+r6*8]
mov r7, tmpq
%else
mov r6d, wd
%endif
.v_w16_loop0:
movu m0, [srcq+strideq*0]
.v_w16_loop:
movu m2, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m0, m3
pmullw m1, m4, m2
psubw m0, m5
paddw m1, m0
movu m0, [srcq+strideq*0]
psraw m1, 2
pmullw m2, m3
mova [tmpq+wq*0], m1
pmullw m1, m4, m0
psubw m2, m5
paddw m1, m2
psraw m1, 2
mova [tmpq+wq*2], m1
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w16_loop
%if ARCH_X86_64
add r5, 16
add r7, 16
movzx hd, r6b
mov srcq, r5
mov tmpq, r7
sub r6d, 1<<8
%else
mov tmpq, tmpmp
add r5, 16
mov hd, hm
add tmpq, 16
mov srcq, r5
mov tmpmp, tmpq
sub r6d, 8
%endif
jg .v_w16_loop0
%if WIN64
pop r7
%endif
RET
.hv:
WIN64_SPILL_XMM 7
shl mxyd, 11
movd m6, mxyd
pshufb m6, [base+pw_256]
cmp wd, 8
je .hv_w8
jg .hv_w16
.hv_w4:
movddup m0, [srcq+strideq*0]
movddup m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w4_loop:
movq m1, [srcq+strideq*1]
movq m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
movhps m1, [srcq+strideq*0]
movhps m2, [srcq+strideq*0+2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2 ; 1 2
shufpd m2, m0, m1, 0x01 ; 0 1
mova m0, m1
psubw m1, m2
pmulhrsw m1, m6
paddw m1, m2
mova [tmpq], m1
add tmpq, 16
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w8_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
psubw m2, m1, m0
pmulhrsw m2, m6
paddw m2, m0
mova [tmpq+16*0], m2
movu m0, [srcq+strideq*0]
movu m2, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m2, m4
psubw m0, m5
paddw m0, m2
psraw m0, 2
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+16*1], m2
add tmpq, 16*2
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
%if WIN64
push r7
%endif
mov r5, srcq
%if ARCH_X86_64
lea r6d, [wq*4-32]
mov wd, wd
lea r6d, [hq+r6*8]
mov r7, tmpq
%else
mov r6d, wd
%endif
.hv_w16_loop0:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m1, m4
psubw m0, m5
paddw m0, m1
psraw m0, 2
.hv_w16_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
pmullw m1, m3
pmullw m2, m4
psubw m1, m5
paddw m1, m2
psraw m1, 2
psubw m2, m1, m0
pmulhrsw m2, m6
paddw m2, m0
mova [tmpq+wq*0], m2
movu m0, [srcq+strideq*0]
movu m2, [srcq+strideq*0+2]
pmullw m0, m3
pmullw m2, m4
psubw m0, m5
paddw m0, m2
psraw m0, 2
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
mova [tmpq+wq*2], m2
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .hv_w16_loop
%if ARCH_X86_64
add r5, 16
add r7, 16
movzx hd, r6b
mov srcq, r5
mov tmpq, r7
sub r6d, 1<<8
%else
mov tmpq, tmpmp
add r5, 16
mov hd, hm
add tmpq, 16
mov srcq, r5
mov tmpmp, tmpq
sub r6d, 8
%endif
jg .hv_w16_loop0
%if WIN64
pop r7
%endif
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
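; each FILTER_* constant packs two row offsets into mc_subpel_filters: the
; high word selects the 8-tap filter set (regular/smooth/sharp), the low
; word the 4-tap set used for small block dimensions (sharp has no 4-tap
; variant and falls back to the regular one).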
%macro FN 4 ; prefix, type, type_h, type_v
cglobal %1_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
%endif
%endmacro
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp, SHARP, SHARP
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
PUT_8TAP_FN smooth, SMOOTH, SMOOTH
PUT_8TAP_FN sharp_regular, SHARP, REGULAR
PUT_8TAP_FN regular_sharp, REGULAR, SHARP
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_FN regular, REGULAR, REGULAR
%if ARCH_X86_32
cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r1b
%define myd r1
%define myq r1
%define m8 [esp+16*0]
%define m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%else
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%endif
%define base t2-put_ssse3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
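; mx/my (0..15) are replicated into bytes 0-2 by the imul and then biased
; by FILTER_*: bits 8-15 still hold the raw subpel fraction (hence the
; 0xf00 tests below), while byte 0 gives the 4-tap row and byte 2 the
; 8-tap row within subpel_filters.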
LEA t2, put_ssse3
movifnidn wd, wm
movifnidn srcq, srcmp
movifnidn ssq, ssmp
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
movifnidn dstq, dstmp
movifnidn dsq, dsmp
add wq, t2
%if WIN64
pop r8
pop r7
%endif
jmp wq
.h:
test myd, 0xf00
jnz .hv
mov myd, r8m
movd m5, r8m
shr myd, 11
movddup m4, [base+put_8tap_h_rnd+myq*8]
movifnidn dsq, dsmp
pshufb m5, [base+pw_256]
cmp wd, 4
jg .h_w8
movzx mxd, mxb
lea srcq, [srcq-2]
movq m3, [base+subpel_filters+mxq*8]
movifnidn dstq, dstmp
punpcklbw m3, m3
psraw m3, 8 ; sign-extend
je .h_w4
.h_w2:
mova m2, [base+spel_h_shuf2]
pshufd m3, m3, q2121
.h_w2_loop:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m0, m2
pshufb m1, m2
pmaddwd m0, m3
pmaddwd m1, m3
phaddd m0, m1
paddd m0, m4
psrad m0, 6
packssdw m0, m0
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
movd [dstq+dsq*0], m0
pshuflw m0, m0, q3232
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
WIN64_SPILL_XMM 8
mova m6, [base+spel_h_shufA]
mova m7, [base+spel_h_shufB]
pshufd m2, m3, q1111
pshufd m3, m3, q2222
.h_w4_loop:
movu m1, [srcq]
add srcq, ssq
pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
pshufb m1, m7 ; 2 3 3 4 4 5 5 6
pmaddwd m0, m2
pmaddwd m1, m3
paddd m0, m4
paddd m0, m1
psrad m0, 6
packssdw m0, m0
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
movq [dstq], m0
add dstq, dsq
dec hd
jg .h_w4_loop
RET
.h_w8:
WIN64_SPILL_XMM 12
shr mxd, 16
movq m3, [base+subpel_filters+mxq*8]
movifnidn dstq, dstmp
mova m6, [base+spel_h_shufA]
mova m7, [base+spel_h_shufB]
%if UNIX64
mov wd, wd
%endif
lea srcq, [srcq+wq*2]
punpcklbw m3, m3
lea dstq, [dstq+wq*2]
psraw m3, 8
neg wq
%if ARCH_X86_32
ALLOC_STACK -16*4
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
%else
pshufd m8, m3, q0000
pshufd m9, m3, q1111
pshufd m10, m3, q2222
pshufd m11, m3, q3333
%endif
.h_w8_loop0:
mov r6, wq
.h_w8_loop:
movu m0, [srcq+r6*2- 6]
movu m1, [srcq+r6*2+ 2]
pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4
pshufb m0, m7 ; 2 3 3 4 4 5 5 6
pmaddwd m2, m8 ; abcd0
pmaddwd m0, m9 ; abcd1
pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8
pshufb m1, m7 ; 6 7 7 8 8 9 9 a
paddd m2, m4
paddd m0, m2
pmaddwd m2, m10, m3 ; abcd2
pmaddwd m3, m8 ; efgh0
paddd m0, m2
pmaddwd m2, m11, m1 ; abcd3
pmaddwd m1, m9 ; efgh1
paddd m0, m2
movu m2, [srcq+r6*2+10]
paddd m3, m4
paddd m1, m3
pshufb m3, m2, m6 ; 8 9 9 a a b b c
pshufb m2, m7 ; a b b c c d d e
pmaddwd m3, m10 ; efgh2
pmaddwd m2, m11 ; efgh3
paddd m1, m3
paddd m1, m2
psrad m0, 6
psrad m1, 6
packssdw m0, m1
pxor m1, m1
pminsw m0, m5
pmaxsw m0, m1
mova [dstq+r6*2], m0
add r6, 8
jl .h_w8_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w8_loop0
RET
.v:
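; vertical 8-tap. the output needs effectively (sum + 32) >> 6 with
; clamping; this is split into psrad by 5, pack, clamp negatives to zero
; (pmaxsw), pavgw against zero for the final +1 >> 1, then pminsw against
; the pixel maximum.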
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
WIN64_SPILL_XMM 15
movd m7, r8m
movifnidn dstq, dstmp
movifnidn dsq, dsmp
punpcklbw m3, m3
pshufb m7, [base+pw_256]
psraw m3, 8 ; sign-extend
%if ARCH_X86_32
ALLOC_STACK -16*7
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
%else
pshufd m8, m3, q0000
pshufd m9, m3, q1111
pshufd m10, m3, q2222
pshufd m11, m3, q3333
%endif
lea r6, [ssq*3]
sub srcq, r6
cmp wd, 2
jne .v_w4
.v_w2:
movd m1, [srcq+ssq*0]
movd m4, [srcq+ssq*1]
movd m2, [srcq+ssq*2]
add srcq, r6
movd m5, [srcq+ssq*0]
movd m3, [srcq+ssq*1]
movd m6, [srcq+ssq*2]
add srcq, r6
movd m0, [srcq+ssq*0]
punpckldq m1, m4 ; 0 1
punpckldq m4, m2 ; 1 2
punpckldq m2, m5 ; 2 3
punpckldq m5, m3 ; 3 4
punpckldq m3, m6 ; 4 5
punpckldq m6, m0 ; 5 6
punpcklwd m1, m4 ; 01 12
punpcklwd m2, m5 ; 23 34
punpcklwd m3, m6 ; 45 56
pxor m6, m6
.v_w2_loop:
movd m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddwd m5, m8, m1 ; a0 b0
mova m1, m2
pmaddwd m2, m9 ; a1 b1
paddd m5, m2
mova m2, m3
pmaddwd m3, m10 ; a2 b2
paddd m5, m3
punpckldq m3, m0, m4 ; 6 7
movd m0, [srcq+ssq*0]
punpckldq m4, m0 ; 7 8
punpcklwd m3, m4 ; 67 78
pmaddwd m4, m11, m3 ; a3 b3
paddd m5, m4
psrad m5, 5
packssdw m5, m5
pmaxsw m5, m6
pavgw m5, m6
pminsw m5, m7
movd [dstq+dsq*0], m5
pshuflw m5, m5, q3232
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
%if ARCH_X86_32
shl wd, 14
%if STACK_ALIGNMENT < 16
mov [esp+4*29], srcq
mov [esp+4*30], dstq
%else
mov srcmp, srcq
%endif
lea wd, [wq+hq-(1<<16)]
%else
shl wd, 6
mov r7, srcq
mov r8, dstq
lea wd, [wq+hq-(1<<8)]
%endif
.v_w4_loop0:
movq m1, [srcq+ssq*0]
movq m2, [srcq+ssq*1]
movq m3, [srcq+ssq*2]
add srcq, r6
movq m4, [srcq+ssq*0]
movq m5, [srcq+ssq*1]
movq m6, [srcq+ssq*2]
add srcq, r6
movq m0, [srcq+ssq*0]
punpcklwd m1, m2 ; 01
punpcklwd m2, m3 ; 12
punpcklwd m3, m4 ; 23
punpcklwd m4, m5 ; 34
punpcklwd m5, m6 ; 45
punpcklwd m6, m0 ; 56
%if ARCH_X86_32
jmp .v_w4_loop_start
.v_w4_loop:
mova m1, m12
mova m2, m13
mova m3, m14
.v_w4_loop_start:
pmaddwd m1, m8 ; a0
pmaddwd m2, m8 ; b0
mova m12, m3
mova m13, m4
pmaddwd m3, m9 ; a1
pmaddwd m4, m9 ; b1
paddd m1, m3
paddd m2, m4
mova m14, m5
mova m4, m6
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m1, m5
paddd m2, m6
movq m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklwd m5, m0, m6 ; 67
movq m0, [srcq+ssq*0]
pmaddwd m3, m11, m5 ; a3
punpcklwd m6, m0 ; 78
paddd m1, m3
pmaddwd m3, m11, m6 ; b3
paddd m2, m3
psrad m1, 5
psrad m2, 5
packssdw m1, m2
pxor m2, m2
pmaxsw m1, m2
pavgw m1, m2
pminsw m1, m7
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
%if STACK_ALIGNMENT < 16
mov srcq, [esp+4*29]
mov dstq, [esp+4*30]
movzx hd, ww
add srcq, 8
add dstq, 8
mov [esp+4*29], srcq
mov [esp+4*30], dstq
%else
mov srcq, srcmp
mov dstq, dstmp
movzx hd, ww
add srcq, 8
add dstq, 8
mov srcmp, srcq
mov dstmp, dstq
%endif
sub wd, 1<<16
%else
.v_w4_loop:
pmaddwd m12, m8, m1 ; a0
pmaddwd m13, m8, m2 ; b0
mova m1, m3
mova m2, m4
pmaddwd m3, m9 ; a1
pmaddwd m4, m9 ; b1
paddd m12, m3
paddd m13, m4
mova m3, m5
mova m4, m6
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m12, m5
paddd m13, m6
movq m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklwd m5, m0, m6 ; 67
movq m0, [srcq+ssq*0]
pmaddwd m14, m11, m5 ; a3
punpcklwd m6, m0 ; 78
paddd m12, m14
pmaddwd m14, m11, m6 ; b3
paddd m13, m14
psrad m12, 5
psrad m13, 5
packssdw m12, m13
pxor m13, m13
pmaxsw m12, m13
pavgw m12, m13
pminsw m12, m7
movq [dstq+dsq*0], m12
movhps [dstq+dsq*1], m12
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
add r7, 8
add r8, 8
movzx hd, wb
mov srcq, r7
mov dstq, r8
sub wd, 1<<8
%endif
jg .v_w4_loop0
RET
.hv:
RESET_STACK_STATE
%if ARCH_X86_32
movd m4, r8m
mova m6, [base+pd_512]
pshufb m4, [base+pw_256]
%else
%if WIN64
ALLOC_STACK 16*6, 16
%endif
movd m15, r8m
pshufb m15, [base+pw_256]
%endif
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
je .hv_w4
movq m0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
mov dstq, dstmp
mov dsq, dsmp
mova m5, [base+spel_h_shuf2]
ALLOC_STACK -16*8
%else
mova m6, [base+pd_512]
mova m9, [base+spel_h_shuf2]
%endif
pshuflw m0, m0, q2121
pxor m7, m7
punpcklbw m7, m0
punpcklbw m3, m3
psraw m3, 8 ; sign-extend
test dword r8m, 0x800
jz .hv_w2_10bpc
psraw m7, 2
psllw m3, 2
.hv_w2_10bpc:
lea r6, [ssq*3]
sub srcq, 2
sub srcq, r6
%if ARCH_X86_32
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova m9, m5
mova m11, m0
mova m12, m1
mova m13, m2
mova m14, m3
mova m15, m4
%else
pshufd m11, m3, q0000
pshufd m12, m3, q1111
pshufd m13, m3, q2222
pshufd m14, m3, q3333
%endif
movu m2, [srcq+ssq*0]
movu m3, [srcq+ssq*1]
movu m1, [srcq+ssq*2]
add srcq, r6
movu m4, [srcq+ssq*0]
%if ARCH_X86_32
REPX {pshufb x, m5}, m2, m3, m1, m4
%else
REPX {pshufb x, m9}, m2, m3, m1, m4
%endif
REPX {pmaddwd x, m7}, m2, m3, m1, m4
phaddd m2, m3 ; 0 1
phaddd m1, m4 ; 2 3
movu m3, [srcq+ssq*1]
movu m4, [srcq+ssq*2]
add srcq, r6
movu m0, [srcq+ssq*0]
%if ARCH_X86_32
REPX {pshufb x, m5}, m3, m4, m0
%else
REPX {pshufb x, m9}, m3, m4, m0
%endif
REPX {pmaddwd x, m7}, m3, m4, m0
phaddd m3, m4 ; 4 5
phaddd m0, m0 ; 6 6
REPX {paddd x, m6}, m2, m1, m3, m0
REPX {psrad x, 10}, m2, m1, m3, m0
packssdw m2, m1 ; 0 1 2 3
packssdw m3, m0 ; 4 5 6 _
palignr m4, m3, m2, 4 ; 1 2 3 4
pshufd m5, m3, q0321 ; 5 6 _ _
punpcklwd m1, m2, m4 ; 01 12
punpckhwd m2, m4 ; 23 34
punpcklwd m3, m5 ; 45 56
.hv_w2_loop:
movu m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movu m5, [srcq+ssq*0]
pshufb m4, m9
pshufb m5, m9
pmaddwd m4, m7
pmaddwd m5, m7
phaddd m4, m5
pmaddwd m5, m11, m1 ; a0 b0
mova m1, m2
pmaddwd m2, m12 ; a1 b1
paddd m5, m2
mova m2, m3
pmaddwd m3, m13 ; a2 b2
paddd m5, m3
paddd m4, m6
psrad m4, 10 ; 7 8
packssdw m0, m4
pshufd m3, m0, q2103
punpckhwd m3, m0 ; 67 78
mova m0, m4
pmaddwd m4, m14, m3 ; a3 b3
paddd m5, m6
paddd m5, m4
psrad m5, 10
packssdw m5, m5
pxor m4, m4
pminsw m5, m15
pmaxsw m5, m4
movd [dstq+dsq*0], m5
pshuflw m5, m5, q3232
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w8:
shr mxd, 16
.hv_w4:
movq m2, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
RESET_STACK_STATE
mov dstq, dstmp
mov dsq, dsmp
mova m0, [base+spel_h_shufA]
mova m1, [base+spel_h_shufB]
ALLOC_STACK -16*15
mova m8, m0
mova m9, m1
mova m14, m6
%else
mova m8, [base+spel_h_shufA]
mova m9, [base+spel_h_shufB]
%endif
pxor m0, m0
punpcklbw m0, m2
punpcklbw m3, m3
psraw m3, 8
test dword r8m, 0x800
jz .hv_w4_10bpc
psraw m0, 2
psllw m3, 2
.hv_w4_10bpc:
lea r6, [ssq*3]
sub srcq, 6
sub srcq, r6
%if ARCH_X86_32
%define tmp esp+16*8
shl wd, 14
%if STACK_ALIGNMENT < 16
mov [esp+4*61], srcq
mov [esp+4*62], dstq
%else
mov srcmp, srcq
%endif
mova [tmp+16*5], m4
lea wd, [wq+hq-(1<<16)]
pshufd m1, m0, q0000
pshufd m2, m0, q1111
pshufd m5, m0, q2222
pshufd m0, m0, q3333
mova m10, m1
mova m11, m2
mova m12, m5
mova m13, m0
%else
%if WIN64
%define tmp rsp
%else
%define tmp rsp-104 ; red zone
%endif
shl wd, 6
mov r7, srcq
mov r8, dstq
lea wd, [wq+hq-(1<<8)]
pshufd m10, m0, q0000
pshufd m11, m0, q1111
pshufd m12, m0, q2222
pshufd m13, m0, q3333
mova [tmp+16*5], m15
%endif
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova [tmp+16*1], m0
mova [tmp+16*2], m1
mova [tmp+16*3], m2
mova [tmp+16*4], m3
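; PUT_8TAP_HV_H: horizontal 8-tap for one 4-pixel row of the hv path; four
; 32-bit sums end up in m%1, rounded by %5 and shifted right by %4.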
%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
pmaddwd m%3, m10
pmaddwd m%1, m11
paddd m%3, %5
paddd m%1, m%3
pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
pmaddwd m%3, m12
pmaddwd m%2, m13
paddd m%1, m%3
paddd m%1, m%2
psrad m%1, %4
%endmacro
.hv_w4_loop0:
%if ARCH_X86_64
mova m14, [pd_512]
%endif
movu m4, [srcq+ssq*0+0]
movu m1, [srcq+ssq*0+8]
movu m5, [srcq+ssq*1+0]
movu m2, [srcq+ssq*1+8]
movu m6, [srcq+ssq*2+0]
movu m3, [srcq+ssq*2+8]
add srcq, r6
PUT_8TAP_HV_H 4, 1, 0, 10
PUT_8TAP_HV_H 5, 2, 0, 10
PUT_8TAP_HV_H 6, 3, 0, 10
movu m7, [srcq+ssq*0+0]
movu m2, [srcq+ssq*0+8]
movu m1, [srcq+ssq*1+0]
movu m3, [srcq+ssq*1+8]
PUT_8TAP_HV_H 7, 2, 0, 10
PUT_8TAP_HV_H 1, 3, 0, 10
movu m2, [srcq+ssq*2+0]
movu m3, [srcq+ssq*2+8]
add srcq, r6
PUT_8TAP_HV_H 2, 3, 0, 10
packssdw m4, m7 ; 0 3
packssdw m5, m1 ; 1 4
movu m0, [srcq+ssq*0+0]
movu m1, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 1, 3, 10
packssdw m6, m2 ; 2 5
packssdw m7, m0 ; 3 6
punpcklwd m1, m4, m5 ; 01
punpckhwd m4, m5 ; 34
punpcklwd m2, m5, m6 ; 12
punpckhwd m5, m6 ; 45
punpcklwd m3, m6, m7 ; 23
punpckhwd m6, m7 ; 56
%if ARCH_X86_32
jmp .hv_w4_loop_start
.hv_w4_loop:
mova m1, [tmp+16*6]
mova m2, m15
.hv_w4_loop_start:
mova m7, [tmp+16*1]
pmaddwd m1, m7 ; a0
pmaddwd m2, m7 ; b0
mova m7, [tmp+16*2]
mova [tmp+16*6], m3
pmaddwd m3, m7 ; a1
mova m15, m4
pmaddwd m4, m7 ; b1
mova m7, [tmp+16*3]
paddd m1, m3
paddd m2, m4
mova m3, m5
pmaddwd m5, m7 ; a2
mova m4, m6
pmaddwd m6, m7 ; b2
paddd m1, m5
paddd m2, m6
movu m7, [srcq+ssq*1+0]
movu m5, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
PUT_8TAP_HV_H 7, 5, 6, 10
packssdw m0, m7 ; 6 7
mova [tmp+16*0], m0
movu m0, [srcq+ssq*0+0]
movu m5, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 5, 6, 10
mova m6, [tmp+16*0]
packssdw m7, m0 ; 7 8
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, [tmp+16*4]
paddd m1, m7 ; a3
pmaddwd m7, m6, [tmp+16*4]
paddd m2, m7 ; b3
psrad m1, 9
psrad m2, 9
packssdw m1, m2
pxor m7, m7
pmaxsw m1, m7
pavgw m7, m1
pminsw m7, [tmp+16*5]
movq [dstq+dsq*0], m7
movhps [dstq+dsq*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
%if STACK_ALIGNMENT < 16
mov srcq, [esp+4*61]
mov dstq, [esp+4*62]
add srcq, 8
add dstq, 8
mov [esp+4*61], srcq
mov [esp+4*62], dstq
%else
mov srcq, srcmp
mov dstq, dstmp
add srcq, 8
add dstq, 8
mov srcmp, srcq
mov dstmp, dstq
%endif
movzx hd, ww
sub wd, 1<<16
%else
.hv_w4_loop:
mova m15, [tmp+16*1]
pmaddwd m14, m15, m1 ; a0
pmaddwd m15, m2 ; b0
mova m7, [tmp+16*2]
mova m1, m3
pmaddwd m3, m7 ; a1
mova m2, m4
pmaddwd m4, m7 ; b1
mova m7, [tmp+16*3]
paddd m14, m3
paddd m15, m4
mova m3, m5
pmaddwd m5, m7 ; a2
mova m4, m6
pmaddwd m6, m7 ; b2
paddd m14, m5
paddd m15, m6
movu m7, [srcq+ssq*1+0]
movu m5, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
packssdw m0, m7 ; 6 7
mova [tmp+16*0], m0
movu m0, [srcq+ssq*0+0]
movu m5, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
mova m6, [tmp+16*0]
packssdw m7, m0 ; 7 8
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, [tmp+16*4]
paddd m14, m7 ; a3
pmaddwd m7, m6, [tmp+16*4]
paddd m15, m7 ; b3
psrad m14, 9
psrad m15, 9
packssdw m14, m15
pxor m7, m7
pmaxsw m14, m7
pavgw m7, m14
pminsw m7, [tmp+16*5]
movq [dstq+dsq*0], m7
movhps [dstq+dsq*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
add r7, 8
add r8, 8
movzx hd, wb
mov srcq, r7
mov dstq, r8
sub wd, 1<<8
%endif
jg .hv_w4_loop0
RET
%undef tmp
%if ARCH_X86_32
DECLARE_REG_TMP 2, 1, 6, 4
%elif WIN64
DECLARE_REG_TMP 6, 4, 7, 4
%else
DECLARE_REG_TMP 6, 7, 7, 8
%endif
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN sharp, SHARP, SHARP
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
PREP_8TAP_FN smooth, SMOOTH, SMOOTH
PREP_8TAP_FN sharp_regular, SHARP, REGULAR
PREP_8TAP_FN regular_sharp, REGULAR, SHARP
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular, REGULAR, REGULAR
%if ARCH_X86_32
cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r2b
%define myd r2
%define myq r2
%else
cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
%endif
%define base t2-prep_ssse3
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
LEA t2, prep_ssse3
movifnidn wd, wm
movifnidn srcq, srcmp
test mxd, 0xf00
jnz .h
movifnidn hd, hm
test myd, 0xf00
jnz .v
tzcnt wd, wd
mov myd, r7m ; bitdepth_max
movzx wd, word [base+prep_ssse3_table+wq*2]
mova m5, [base+pw_8192]
shr myd, 11
add wq, t2
movddup m4, [base+prep_mul+myq*8]
movifnidn ssq, ssmp
movifnidn tmpq, tmpmp
lea r6, [ssq*3]
%if WIN64
pop r7
%endif
jmp wq
.h:
test myd, 0xf00
jnz .hv
movifnidn ssq, r2mp
movifnidn hd, r4m
movddup m5, [base+prep_8tap_1d_rnd]
cmp wd, 4
jne .h_w8
movzx mxd, mxb
movq m0, [base+subpel_filters+mxq*8]
mova m3, [base+spel_h_shufA]
mova m4, [base+spel_h_shufB]
movifnidn tmpq, tmpmp
sub srcq, 2
WIN64_SPILL_XMM 8
punpcklbw m0, m0
psraw m0, 8
test dword r7m, 0x800
jnz .h_w4_12bpc
psllw m0, 2
.h_w4_12bpc:
pshufd m6, m0, q1111
pshufd m7, m0, q2222
.h_w4_loop:
movu m1, [srcq+ssq*0]
movu m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
pshufb m1, m4 ; 2 3 3 4 4 5 5 6
pmaddwd m0, m6
pmaddwd m1, m7
paddd m0, m5
paddd m0, m1
pshufb m1, m2, m3
pshufb m2, m4
pmaddwd m1, m6
pmaddwd m2, m7
paddd m1, m5
paddd m1, m2
psrad m0, 4
psrad m1, 4
packssdw m0, m1
mova [tmpq], m0
add tmpq, 16
sub hd, 2
jg .h_w4_loop
RET
.h_w8:
WIN64_SPILL_XMM 11
shr mxd, 16
movq m2, [base+subpel_filters+mxq*8]
mova m4, [base+spel_h_shufA]
mova m6, [base+spel_h_shufB]
movifnidn tmpq, r0mp
add wd, wd
punpcklbw m2, m2
add srcq, wq
psraw m2, 8
add tmpq, wq
neg wq
test dword r7m, 0x800
jnz .h_w8_12bpc
psllw m2, 2
.h_w8_12bpc:
pshufd m7, m2, q0000
%if ARCH_X86_32
ALLOC_STACK -16*3
pshufd m0, m2, q1111
pshufd m1, m2, q2222
pshufd m2, m2, q3333
mova m8, m0
mova m9, m1
mova m10, m2
%else
pshufd m8, m2, q1111
pshufd m9, m2, q2222
pshufd m10, m2, q3333
%endif
.h_w8_loop0:
mov r6, wq
.h_w8_loop:
movu m0, [srcq+r6- 6]
movu m1, [srcq+r6+ 2]
pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
pshufb m0, m6 ; 2 3 3 4 4 5 5 6
pmaddwd m2, m7 ; abcd0
pmaddwd m0, m8 ; abcd1
pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
pshufb m1, m6 ; 6 7 7 8 8 9 9 a
paddd m2, m5
paddd m0, m2
pmaddwd m2, m9, m3 ; abcd2
pmaddwd m3, m7 ; efgh0
paddd m0, m2
pmaddwd m2, m10, m1 ; abcd3
pmaddwd m1, m8 ; efgh1
paddd m0, m2
movu m2, [srcq+r6+10]
paddd m3, m5
paddd m1, m3
pshufb m3, m2, m4 ; 8 9 9 a a b b c
pshufb m2, m6 ; a b b c c d d e

pmaddwd m3, m9 ; efgh2
pmaddwd m2, m10 ; efgh3
paddd m1, m3
paddd m1, m2
psrad m0, 4
psrad m1, 4
packssdw m0, m1
mova [tmpq+r6], m0
add r6, 16
jl .h_w8_loop
add srcq, ssq
sub tmpq, wq
dec hd
jg .h_w8_loop0
RET
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
movq m3, [base+subpel_filters+myq*8]
WIN64_SPILL_XMM 15
movddup m7, [base+prep_8tap_1d_rnd]
movifnidn ssq, r2mp
movifnidn tmpq, r0mp
punpcklbw m3, m3
psraw m3, 8 ; sign-extend
test dword r7m, 0x800
jnz .v_12bpc
psllw m3, 2
.v_12bpc:
%if ARCH_X86_32
ALLOC_STACK -16*7
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
%else
pshufd m8, m3, q0000
pshufd m9, m3, q1111
pshufd m10, m3, q2222
pshufd m11, m3, q3333
%endif
lea r6, [ssq*3]
sub srcq, r6
mov r6d, wd
shl wd, 6
mov r5, srcq
%if ARCH_X86_64
mov r7, tmpq
%elif STACK_ALIGNMENT < 16
mov [esp+4*29], tmpq
%endif
lea wd, [wq+hq-(1<<8)]
.v_loop0:
movq m1, [srcq+ssq*0]
movq m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq m3, [srcq+ssq*0]
movq m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq m5, [srcq+ssq*0]
movq m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq m0, [srcq+ssq*0]
punpcklwd m1, m2 ; 01
punpcklwd m2, m3 ; 12
punpcklwd m3, m4 ; 23
punpcklwd m4, m5 ; 34
punpcklwd m5, m6 ; 45
punpcklwd m6, m0 ; 56
%if ARCH_X86_32
jmp .v_loop_start
.v_loop:
mova m1, m12
mova m2, m13
mova m3, m14
.v_loop_start:
pmaddwd m1, m8 ; a0
pmaddwd m2, m8 ; b0
mova m12, m3
mova m13, m4
pmaddwd m3, m9 ; a1
pmaddwd m4, m9 ; b1
paddd m1, m3
paddd m2, m4
mova m14, m5
mova m4, m6
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m1, m5
paddd m2, m6
movq m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklwd m5, m0, m6 ; 67
movq m0, [srcq+ssq*0]
pmaddwd m3, m11, m5 ; a3
punpcklwd m6, m0 ; 78
paddd m1, m7
paddd m1, m3
pmaddwd m3, m11, m6 ; b3
paddd m2, m7
paddd m2, m3
psrad m1, 4
psrad m2, 4
packssdw m1, m2
movq [tmpq+r6*0], m1
movhps [tmpq+r6*2], m1
lea tmpq, [tmpq+r6*4]
sub hd, 2
jg .v_loop
%if STACK_ALIGNMENT < 16
mov tmpq, [esp+4*29]
add r5, 8
add tmpq, 8
mov srcq, r5
mov [esp+4*29], tmpq
%else
mov tmpq, tmpmp
add r5, 8
add tmpq, 8
mov srcq, r5
mov tmpmp, tmpq
%endif
%else
.v_loop:
pmaddwd m12, m8, m1 ; a0
pmaddwd m13, m8, m2 ; b0
mova m1, m3
mova m2, m4
pmaddwd m3, m9 ; a1
pmaddwd m4, m9 ; b1
paddd m12, m3
paddd m13, m4
mova m3, m5
mova m4, m6
pmaddwd m5, m10 ; a2
pmaddwd m6, m10 ; b2
paddd m12, m5
paddd m13, m6
movq m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklwd m5, m0, m6 ; 67
movq m0, [srcq+ssq*0]
pmaddwd m14, m11, m5 ; a3
punpcklwd m6, m0 ; 78
paddd m12, m7
paddd m12, m14
pmaddwd m14, m11, m6 ; b3
paddd m13, m7
paddd m13, m14
psrad m12, 4
psrad m13, 4
packssdw m12, m13
movq [tmpq+r6*0], m12
movhps [tmpq+r6*2], m12
lea tmpq, [tmpq+r6*4]
sub hd, 2
jg .v_loop
add r5, 8
add r7, 8
mov srcq, r5
mov tmpq, r7
%endif
movzx hd, wb
sub wd, 1<<8
jg .v_loop0
RET
.hv:
RESET_STACK_STATE
movzx t3d, mxb
shr mxd, 16
cmp wd, 4
cmove mxd, t3d
movifnidn hd, r4m
movq m2, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
mov ssq, r2mp
mov tmpq, r0mp
mova m0, [base+spel_h_shufA]
mova m1, [base+spel_h_shufB]
mova m4, [base+prep_8tap_2d_rnd]
ALLOC_STACK -16*14
mova m8, m0
mova m9, m1
mova m14, m4
%else
%if WIN64
ALLOC_STACK 16*6, 16
%endif
mova m8, [base+spel_h_shufA]
mova m9, [base+spel_h_shufB]
%endif
pxor m0, m0
punpcklbw m0, m2
punpcklbw m3, m3
psraw m0, 4
psraw m3, 8
test dword r7m, 0x800
jz .hv_10bpc
psraw m0, 2
.hv_10bpc:
lea r6, [ssq*3]
sub srcq, 6
sub srcq, r6
mov r6d, wd
shl wd, 6
mov r5, srcq
%if ARCH_X86_32
%define tmp esp+16*8
%if STACK_ALIGNMENT < 16
mov [esp+4*61], tmpq
%endif
pshufd m1, m0, q0000
pshufd m2, m0, q1111
pshufd m5, m0, q2222
pshufd m0, m0, q3333
mova m10, m1
mova m11, m2
mova m12, m5
mova m13, m0
%else
%if WIN64
%define tmp rsp
%else
%define tmp rsp-88 ; red zone
%endif
mov r7, tmpq
pshufd m10, m0, q0000
pshufd m11, m0, q1111
pshufd m12, m0, q2222
pshufd m13, m0, q3333
%endif
lea wd, [wq+hq-(1<<8)]
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
mova [tmp+16*1], m0
mova [tmp+16*2], m1
mova [tmp+16*3], m2
mova [tmp+16*4], m3
.hv_loop0:
%if ARCH_X86_64
mova m14, [prep_8tap_2d_rnd]
%endif
movu m4, [srcq+ssq*0+0]
movu m1, [srcq+ssq*0+8]
movu m5, [srcq+ssq*1+0]
movu m2, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
movu m6, [srcq+ssq*0+0]
movu m3, [srcq+ssq*0+8]
PUT_8TAP_HV_H 4, 1, 0, 6
PUT_8TAP_HV_H 5, 2, 0, 6
PUT_8TAP_HV_H 6, 3, 0, 6
movu m7, [srcq+ssq*1+0]
movu m2, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
movu m1, [srcq+ssq*0+0]
movu m3, [srcq+ssq*0+8]
PUT_8TAP_HV_H 7, 2, 0, 6
PUT_8TAP_HV_H 1, 3, 0, 6
movu m2, [srcq+ssq*1+0]
movu m3, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
PUT_8TAP_HV_H 2, 3, 0, 6
packssdw m4, m7 ; 0 3
packssdw m5, m1 ; 1 4
movu m0, [srcq+ssq*0+0]
movu m1, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 1, 3, 6
packssdw m6, m2 ; 2 5
packssdw m7, m0 ; 3 6
punpcklwd m1, m4, m5 ; 01
punpckhwd m4, m5 ; 34
punpcklwd m2, m5, m6 ; 12
punpckhwd m5, m6 ; 45
punpcklwd m3, m6, m7 ; 23
punpckhwd m6, m7 ; 56
%if ARCH_X86_32
jmp .hv_loop_start
.hv_loop:
mova m1, [tmp+16*5]
mova m2, m15
.hv_loop_start:
mova m7, [tmp+16*1]
pmaddwd m1, m7 ; a0
pmaddwd m2, m7 ; b0
mova m7, [tmp+16*2]
mova [tmp+16*5], m3
pmaddwd m3, m7 ; a1
mova m15, m4
pmaddwd m4, m7 ; b1
mova m7, [tmp+16*3]
paddd m1, m14
paddd m2, m14
paddd m1, m3
paddd m2, m4
mova m3, m5
pmaddwd m5, m7 ; a2
mova m4, m6
pmaddwd m6, m7 ; b2
paddd m1, m5
paddd m2, m6
movu m7, [srcq+ssq*1+0]
movu m5, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
PUT_8TAP_HV_H 7, 5, 6, 6
packssdw m0, m7 ; 6 7
mova [tmp+16*0], m0
movu m0, [srcq+ssq*0+0]
movu m5, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 5, 6, 6
mova m6, [tmp+16*0]
packssdw m7, m0 ; 7 8
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, [tmp+16*4]
paddd m1, m7 ; a3
pmaddwd m7, m6, [tmp+16*4]
paddd m2, m7 ; b3
psrad m1, 6
psrad m2, 6
packssdw m1, m2
movq [tmpq+r6*0], m1
movhps [tmpq+r6*2], m1
lea tmpq, [tmpq+r6*4]
sub hd, 2
jg .hv_loop
%if STACK_ALIGNMENT < 16
mov tmpq, [esp+4*61]
add r5, 8
add tmpq, 8
mov srcq, r5
mov [esp+4*61], tmpq
%else
mov tmpq, tmpmp
add r5, 8
add tmpq, 8
mov srcq, r5
mov tmpmp, tmpq
%endif
%else
.hv_loop:
mova m15, [tmp+16*1]
mova m7, [prep_8tap_2d_rnd]
pmaddwd m14, m15, m1 ; a0
pmaddwd m15, m2 ; b0
paddd m14, m7
paddd m15, m7
mova m7, [tmp+16*2]
mova m1, m3
pmaddwd m3, m7 ; a1
mova m2, m4
pmaddwd m4, m7 ; b1
mova m7, [tmp+16*3]
paddd m14, m3
paddd m15, m4
mova m3, m5
pmaddwd m5, m7 ; a2
mova m4, m6
pmaddwd m6, m7 ; b2
paddd m14, m5
paddd m15, m6
movu m7, [srcq+ssq*1+0]
movu m5, [srcq+ssq*1+8]
lea srcq, [srcq+ssq*2]
PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
packssdw m0, m7 ; 6 7
mova [tmp+16*0], m0
movu m0, [srcq+ssq*0+0]
movu m5, [srcq+ssq*0+8]
PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
mova m6, [tmp+16*0]
packssdw m7, m0 ; 7 8
punpcklwd m5, m6, m7 ; 67
punpckhwd m6, m7 ; 78
pmaddwd m7, m5, [tmp+16*4]
paddd m14, m7 ; a3
pmaddwd m7, m6, [tmp+16*4]
paddd m15, m7 ; b3
psrad m14, 6
psrad m15, 6
packssdw m14, m15
movq [tmpq+r6*0], m14
movhps [tmpq+r6*2], m14
lea tmpq, [tmpq+r6*4]
sub hd, 2
jg .hv_loop
add r5, 8
add r7, 8
mov srcq, r5
mov tmpq, r7
%endif
movzx hd, wb
sub wd, 1<<8
jg .hv_loop0
RET
%undef tmp
%macro movifprep 2
%if isprep
mov %1, %2
%endif
%endmacro
%macro SAVE_REG 1
%xdefine r%1_save r%1
%xdefine r%1q_save r%1q
%xdefine r%1d_save r%1d
%if ARCH_X86_32
%define r%1m_save [rstk+stack_offset+(%1+1)*4]
%endif
%endmacro
%macro LOAD_REG 1
%xdefine r%1 r%1_save
%xdefine r%1q r%1q_save
%xdefine r%1d r%1d_save
%if ARCH_X86_32
%define r%1m r%1m_save
%endif
%undef r%1d_save
%undef r%1q_save
%undef r%1_save
%endmacro
%macro REMAP_REG 2-3
%xdefine r%1 r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%if ARCH_X86_32
%if %3 == 0
%xdefine r%1m r%2m
%else
%define r%1m [rstk+stack_offset+(%1+1)*4]
%endif
%endif
%endmacro
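; prep_8tap_scaled has one register argument less than put (tmp vs dst+ds),
; so for prep the rN names are temporarily remapped one register down; this
; lets the bulk of the scaled MC code be shared between the two, with the
; default mapping restored around RET.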
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
%if ARCH_X86_64
SAVE_REG 14
%assign %%i 14
%rep 14
%assign %%j %%i-1
REMAP_REG %%i, %%j
%assign %%i %%i-1
%endrep
%else
SAVE_REG 5
%assign %%i 5
%rep 5
%assign %%j %%i-1
REMAP_REG %%i, %%j, 0
%assign %%i %%i-1
%endrep
%endif
%endif
%endmacro
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
%if isprep
%assign %%i 1
%if ARCH_X86_64
%rep 13
%assign %%j %%i+1
REMAP_REG %%i, %%j
%assign %%i %%i+1
%endrep
LOAD_REG 14
%else
%rep 4
%assign %%j %%i+1
REMAP_REG %%i, %%j, 1
%assign %%i %%i+1
%endrep
LOAD_REG 5
%endif
%endif
%endmacro
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
RET
%if %1
MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%endif
%endmacro
%if ARCH_X86_32
%macro MC_4TAP_SCALED_H 1 ; dst_mem
movu m7, [srcq+ssq*0]
movu m2, [srcq+ssq*1]
movu m5, [r4 +ssq*0]
movu m6, [r4 +ssq*1]
lea srcq, [srcq+ssq*2]
lea r4, [r4 +ssq*2]
REPX {pshufb x, m12}, m7, m2
REPX {pmaddwd x, m13}, m7, m2
REPX {pshufb x, m14}, m5, m6
REPX {pmaddwd x, m15}, m5, m6
phaddd m7, m5
phaddd m2, m6
mova m5, [esp+0x00]
movd m6, [esp+0x10]
paddd m7, m5
paddd m2, m5
psrad m7, m6
psrad m2, m6
packssdw m7, m2
mova [stk+%1], m7
%endmacro
%endif
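; horizontal pass for scaled MC: every output pixel has its own source x
; offset (derived from mx+dx*[0..n]) and its own 8-tap filter kept on the
; stack, so each pixel is gathered with a separate unaligned load before
; pmaddwd.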
%if ARCH_X86_64
%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
movu m%1, [srcq+ r4*2]
movu m%2, [srcq+ r6*2]
movu m%3, [srcq+ r7*2]
movu m%4, [srcq+ r9*2]
movu m%5, [srcq+r10*2]
movu m%6, [srcq+r11*2]
movu m%7, [srcq+r13*2]
movu m%8, [srcq+ rX*2]
add srcq, ssq
pmaddwd m%1, [stk+0x10]
pmaddwd m%2, [stk+0x20]
pmaddwd m%3, [stk+0x30]
pmaddwd m%4, [stk+0x40]
pmaddwd m%5, [stk+0x50]
pmaddwd m%6, [stk+0x60]
pmaddwd m%7, [stk+0x70]
pmaddwd m%8, [stk+0x80]
phaddd m%1, m%2
phaddd m%3, m%4
phaddd m%5, m%6
phaddd m%7, m%8
phaddd m%1, m%3
phaddd m%5, m%7
paddd m%1, hround
paddd m%5, hround
psrad m%1, m12
psrad m%5, m12
packssdw m%1, m%5
%endmacro
%else
%macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
%if %3 == 1
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
%endif
movu m0, [srcq+r0*2]
movu m1, [srcq+rX*2]
movu m2, [srcq+r4*2]
movu m3, [srcq+r5*2]
mov r0, [stk+16]
mov rX, [stk+20]
mov r4, [stk+24]
mov r5, [stk+28]
pmaddwd m0, [stk+%1+0x00]
pmaddwd m1, [stk+%1+0x10]
pmaddwd m2, [stk+%1+0x20]
pmaddwd m3, [stk+%1+0x30]
phaddd m0, m1
phaddd m2, m3
movu m4, [srcq+r0*2]
movu m5, [srcq+rX*2]
movu m6, [srcq+r4*2]
movu m7, [srcq+r5*2]
add srcq, ssq
pmaddwd m4, [stk+%1+0xa0]
pmaddwd m5, [stk+%1+0xb0]
pmaddwd m6, [stk+%1+0xc0]
pmaddwd m7, [stk+%1+0xd0]
phaddd m4, m5
phaddd m6, m7
phaddd m0, m2
phaddd m4, m6
paddd m0, hround
paddd m4, hround
psrad m0, m12
psrad m4, m12
packssdw m0, m4
%if %2 != 0
mova [stk+%2], m0
%endif
%endmacro
%endif
%macro MC_8TAP_SCALED 1
%ifidn %1, put
%assign isput 1
%assign isprep 0
%if ARCH_X86_64
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%else
cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%endif
%else ; ARCH_X86_32
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%else
cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%endif
%endif
%xdefine base_reg r12
%else ; prep
%assign isput 0
%assign isprep 1
%if ARCH_X86_64
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
%xdefine tmp_stridem r14q
%else
cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
%define tmp_stridem qword [stk+0x138]
%endif
%xdefine base_reg r11
%else ; ARCH_X86_32
%if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
%else
cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
%endif
%define tmp_stridem dword [stk+0x138]
%endif
%endif
%if ARCH_X86_32
mov [esp+0x1f0], t0d
mov [esp+0x1f4], t1d
%if isput && required_stack_alignment > STACK_ALIGNMENT
mov dstd, dstm
mov dsd, dsm
mov srcd, srcm
mov ssd, ssm
mov hd, hm
mov r4, mxm
%define r0m [esp+0x200]
%define dsm [esp+0x204]
%define dsmp dsm
%define r1m dsm
%define r2m [esp+0x208]
%define ssm [esp+0x20c]
%define r3m ssm
%define hm [esp+0x210]
%define mxm [esp+0x214]
mov r0m, dstd
mov dsm, dsd
mov r2m, srcd
mov ssm, ssd
mov hm, hd
mov r0, mym
mov r1, dxm
mov r2, dym
%define mym [esp+0x218]
%define dxm [esp+0x21c]
%define dym [esp+0x220]
mov mxm, r4
mov mym, r0
mov dxm, r1
mov dym, r2
tzcnt wd, wm
%endif
%if isput
mov r3, pxmaxm
%define pxmaxm r3
%else
mov r2, pxmaxm
%endif
%if isprep && required_stack_alignment > STACK_ALIGNMENT
%xdefine base_reg r5
%else
%xdefine base_reg r6
%endif
%endif
LEA base_reg, %1_8tap_scaled_16bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
tzcnt wd, wm
%endif
%if ARCH_X86_64
%if isput
mov r7d, pxmaxm
%endif
%else
%define m8 m0
%define m9 m1
%define m14 m4
%define m15 m3
%endif
movd m8, dxm
movd m14, mxm
%if isput
movd m15, pxmaxm
%endif
pshufd m8, m8, q0000
pshufd m14, m14, q0000
%if isput
pshuflw m15, m15, q0000
punpcklqdq m15, m15
%endif
%if isprep
%if UNIX64
mov r5d, t0d
DECLARE_REG_TMP 5, 7
%endif
%if ARCH_X86_64
mov r6d, pxmaxm
%endif
%endif
%if ARCH_X86_64
mov dyd, dym
%endif
%if isput
%if WIN64
mov r8d, hm
DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
%define hm r5m
%define dxm r8m
%elif ARCH_X86_64
DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
%define hm r6m
%else
%endif
%if ARCH_X86_64
%if required_stack_alignment > STACK_ALIGNMENT
%define dsm [rsp+0x138]
%define rX r1
%define rXd r1d
%else
%define dsm dsq
%define rX r14
%define rXd r14d
%endif
%else
%define rX r1
%endif
%else ; prep
%if WIN64
mov r7d, hm
DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
%define hm r4m
%define dxm r7m
%elif ARCH_X86_64
DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
%xdefine hm r7m
%endif
MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%if ARCH_X86_64
%define rX r14
%define rXd r14d
%else
%define rX r3
%endif
%endif
%if ARCH_X86_64
shr r7d, 11
mova m10, [base+pd_0x3ff]
movddup m11, [base+s_8tap_h_rnd+r7*8]
movd m12, [base+s_8tap_h_sh+r7*4]
%if isput
movddup m13, [base+put_s_8tap_v_rnd+r7*8]
movd m7, [base+put_s_8tap_v_sh+r7*4]
%define pxmaxm [rsp]
mova pxmaxm, m15
punpcklqdq m12, m7
%endif
lea ss3q, [ssq*3]
movzx r7d, t1b
shr t1d, 16
cmp hd, 6
cmovs t1d, r7d
sub srcq, ss3q
%else
%define m10 [base+pd_0x3ff]
%define m11 [esp+0x00]
%define m12 [esp+0x10]
shr r3, 11
movddup m1, [base+s_8tap_h_rnd+r3*8]
movd m2, [base+s_8tap_h_sh+r3*4]
%if isput
%define m13 [esp+0x20]
%define pxmaxm [esp+0x30]
%define stk esp+0x40
movddup m5, [base+put_s_8tap_v_rnd+r3*8]
movd m6, [base+put_s_8tap_v_sh+r3*4]
mova pxmaxm, m15
punpcklqdq m2, m6
mova m13, m5
%else
%define m13 [base+pd_m524256]
%endif
mov ssd, ssm
mova m11, m1
mova m12, m2
MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
mov r1, [esp+0x1f4]
lea r0, [ssd*3]
movzx r2, r1b
shr r1, 16
cmp dword hm, 6
cmovs r1, r2
mov [esp+0x1f4], r1
%if isprep
mov r1, r1m
%endif
mov r2, r2m
sub srcq, r0
MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%define ss3q r0
%define myd r4
%define dyd dword dym
%define hd dword hm
%endif
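; a vertical step of exactly one (dy == 1024) or two (dy == 2048) source
; rows per output row takes the dedicated dy1/dy2 paths.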
cmp dyd, 1024
je .dy1
cmp dyd, 2048
je .dy2
movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
add wq, base_reg
jmp wq
%if isput
.w2:
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
movzx r4, byte [esp+0x1f0]
sub srcq, 2
movd m15, r4
%endif
pxor m9, m9
punpckldq m9, m8
paddd m14, m9 ; mx+dx*[0-1]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
pand m8, m14, m10
psrld m8, 6
paddd m15, m8
movd r4d, m15
pshufd m15, m15, q0321
%if ARCH_X86_64
movd r6d, m15
%else
movd r3d, m15
%endif
mova m5, [base+bdct_lb_q]
mova m6, [base+spel_s_shuf2]
movd m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
movd m7, [base+subpel_filters+r6*8+2]
%else
movd m7, [base+subpel_filters+r3*8+2]
%endif
pxor m2, m2
pcmpeqd m8, m2
psrld m14, 10
paddd m14, m14
%if ARCH_X86_32
mov r3, r3m
pshufb m14, m5
paddb m14, m6
mova [stk], m14
SWAP m5, m0
SWAP m6, m3
%define m15 m6
%endif
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
punpckldq m15, m7
%if ARCH_X86_64
pshufb m14, m5
paddb m14, m6
pand m9, m8
pandn m8, m15
SWAP m15, m8
por m15, m9
movu m4, [srcq+ssq*0]
movu m5, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
movu m7, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
%else
pand m7, m5, [base+pd_0x4000]
pandn m5, m15
por m5, m7
%define m15 m5
%endif
punpcklbw m15, m15
psraw m15, 8
REPX {pshufb x, m14}, m0, m1, m2, m3
REPX {pmaddwd x, m15}, m0, m1, m2, m3
%if ARCH_X86_64
REPX {pshufb x, m14}, m4, m5, m6, m7
REPX {pmaddwd x, m15}, m4, m5, m6, m7
phaddd m0, m1
phaddd m2, m3
phaddd m4, m5
phaddd m6, m7
REPX {paddd x, m11}, m0, m2, m4, m6
REPX {psrad x, m12}, m0, m2, m4, m6
packssdw m0, m2 ; 0 1 2 3
packssdw m4, m6 ; 4 5 6 7
SWAP m1, m4
%else
mova [stk+0x10], m15
phaddd m0, m1
phaddd m2, m3
movu m1, [srcq+ssq*0]
movu m7, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
movu m3, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
REPX {pshufb x, m14}, m1, m7, m6, m3
REPX {pmaddwd x, m15}, m1, m7, m6, m3
phaddd m1, m7
phaddd m6, m3
REPX {paddd x, m11}, m0, m2, m1, m6
REPX {psrad x, m12}, m0, m2, m1, m6
packssdw m0, m2
packssdw m1, m6
%define m14 [stk+0x00]
%define m15 [stk+0x10]
%endif
palignr m2, m1, m0, 4 ; 1 2 3 4
punpcklwd m3, m0, m2 ; 01 12
punpckhwd m0, m2 ; 23 34
pshufd m5, m1, q0321 ; 5 6 7 _
punpcklwd m2, m1, m5 ; 45 56
punpckhwd m4, m1, m5 ; 67 __
%if ARCH_X86_32
mov myd, mym
mov r0, r0m
mova [stk+0x20], m3
mova [stk+0x30], m0
mova [stk+0x40], m2
mova [stk+0x50], m4
%endif
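; vertical loop: the low 10 bits of myd are the subpel y position and
; bits 9:6 select the 8-tap filter phase (phase 0 keeps the 64<<24
; identity filter loaded above the cmovnz)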
.w2_loop:
and myd, 0x3ff
%if ARCH_X86_64
mov r6d, 64 << 24
mov r4d, myd
shr r4d, 6
lea r4d, [t1+r4]
cmovnz r6q, [base+subpel_filters+r4*8]
movq m10, r6q
punpcklbw m10, m10
psraw m10, 8
pshufd m7, m10, q0000
pshufd m8, m10, q1111
pmaddwd m5, m3, m7
pmaddwd m6, m0, m8
pshufd m9, m10, q2222
pshufd m10, m10, q3333
pmaddwd m7, m2, m9
pmaddwd m8, m4, m10
paddd m5, m6
paddd m7, m8
%else
mov r1, [esp+0x1f4]
xor r3, r3
mov r5, myd
shr r5, 6
lea r1, [r1+r5]
mov r5, 64 << 24
cmovnz r3, [base+subpel_filters+r1*8+4]
cmovnz r5, [base+subpel_filters+r1*8+0]
movd m6, r3
movd m7, r5
punpckldq m7, m6
punpcklbw m7, m7
psraw m7, 8
pshufd m5, m7, q0000
pshufd m6, m7, q1111
pmaddwd m3, m5
pmaddwd m0, m6
pshufd m5, m7, q2222
pshufd m7, m7, q3333
pmaddwd m2, m5
pmaddwd m4, m7
paddd m3, m0
paddd m2, m4
SWAP m5, m3
SWAP m7, m2
%define m8 m3
%endif
paddd m5, m13
pshufd m6, m12, q1032
pxor m8, m8
paddd m5, m7
psrad m5, m6
packssdw m5, m5
pmaxsw m5, m8
pminsw m5, pxmaxm
movd [dstq], m5
add dstq, dsmp
dec hd
jz .ret
%if ARCH_X86_64
add myd, dyd
%else
add myd, dym
%endif
test myd, ~0x3ff
%if ARCH_X86_32
SWAP m3, m5
SWAP m2, m7
mova m3, [stk+0x20]
mova m0, [stk+0x30]
mova m2, [stk+0x40]
mova m4, [stk+0x50]
%endif
jz .w2_loop
%if ARCH_X86_32
mov r3, r3m
%endif
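; y crossed into a new source row; bit 10 of myd distinguishes an
; advance of one row (fall through) from two rows (.w2_skip_line)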
movu m5, [srcq]
test myd, 0x400
jz .w2_skip_line
add srcq, ssq
shufps m3, m0, q1032 ; 01 12
shufps m0, m2, q1032 ; 23 34
shufps m2, m4, q1032 ; 45 56
pshufb m5, m14
pmaddwd m5, m15
phaddd m5, m5
paddd m5, m11
psrad m5, m12
packssdw m5, m5
palignr m4, m5, m1, 12
punpcklqdq m1, m4, m4 ; 6 7 6 7
punpcklwd m4, m1, m5 ; 67 __
%if ARCH_X86_32
mova [stk+0x20], m3
mova [stk+0x30], m0
mova [stk+0x40], m2
mova [stk+0x50], m4
%endif
jmp .w2_loop
.w2_skip_line:
movu m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova m3, m0 ; 01 12
mova m0, m2 ; 23 34
pshufb m5, m14
pshufb m6, m14
pmaddwd m5, m15
pmaddwd m6, m15
phaddd m5, m6
paddd m5, m11
psrad m5, m12
packssdw m5, m5 ; 6 7 6 7
punpckhqdq m1, m5 ; 4 5 6 7
pshufd m5, m1, q0321 ; 5 6 7 _
punpcklwd m2, m1, m5 ; 45 56
punpckhwd m4, m1, m5 ; 67 __
%if ARCH_X86_32
mova [stk+0x20], m3
mova [stk+0x30], m0
mova [stk+0x40], m2
mova [stk+0x50], m4
%endif
jmp .w2_loop
%endif
INIT_XMM ssse3
.w4:
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
mova [rsp+0x20], m12
%if isput
mova [rsp+0x30], m13
%endif
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
%define m8 m0
%xdefine m14 m4
%define m15 m3
movzx r4, byte [esp+0x1f0]
sub srcq, 2
movd m15, r4
%endif
pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%else
%define m9 [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
pand m0, m14, m10
psrld m0, 6
paddd m15, m0
pshufd m7, m15, q1032
%if ARCH_X86_64
movd r4d, m15
movd r11d, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd r6d, m15
movd r13d, m7
mova m10, [base+bdct_lb_q+ 0]
mova m11, [base+bdct_lb_q+16]
movd m13, [base+subpel_filters+ r4*8+2]
movd m2, [base+subpel_filters+ r6*8+2]
movd m15, [base+subpel_filters+r11*8+2]
movd m4, [base+subpel_filters+r13*8+2]
%else
movd r0, m15
movd r4, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd rX, m15
movd r5, m7
mova m5, [base+bdct_lb_q+ 0]
mova m6, [base+bdct_lb_q+16]
movd m1, [base+subpel_filters+r0*8+2]
movd m2, [base+subpel_filters+rX*8+2]
movd m3, [base+subpel_filters+r4*8+2]
movd m7, [base+subpel_filters+r5*8+2]
movifprep r3, r3m
SWAP m4, m7
%define m10 m5
%define m11 m6
%define m12 m1
%define m13 m1
%endif
psrld m14, 10
paddd m14, m14
punpckldq m13, m2
punpckldq m15, m4
punpcklqdq m13, m15
pxor m2, m2
pcmpeqd m0, m2
%if ARCH_X86_64
pand m9, m0
%else
pand m2, m9, m0
%define m9 m2
SWAP m7, m4
%endif
pandn m0, m13
%if ARCH_X86_64
SWAP m13, m0
%else
%define m13 m0
%endif
por m13, m9
punpckhbw m15, m13, m13
punpcklbw m13, m13
psraw m15, 8
psraw m13, 8
pshufb m12, m14, m10
pshufb m14, m11
mova m10, [base+spel_s_shuf2]
movd r4d, m14
shr r4d, 24
%if ARCH_X86_32
mova [stk+0x20], m13
mova [stk+0x30], m15
pxor m2, m2
%endif
pshufb m7, m14, m2
psubb m14, m7
paddb m12, m10
paddb m14, m10
%if ARCH_X86_64
lea r6, [r4+ssq*1]
lea r11, [r4+ssq*2]
lea r13, [r4+ss3q ]
movu m7, [srcq+ssq*0]
movu m9, [srcq+ssq*1]
movu m8, [srcq+ssq*2]
movu m10, [srcq+ss3q ]
movu m1, [srcq+r4 ]
movu m3, [srcq+r6 ]
movu m2, [srcq+r11 ]
movu m4, [srcq+r13 ]
lea srcq, [srcq+ssq*4]
REPX {pshufb x, m12}, m7, m9, m8, m10
REPX {pmaddwd x, m13}, m7, m9, m8, m10
REPX {pshufb x, m14}, m1, m2, m3, m4
REPX {pmaddwd x, m15}, m1, m2, m3, m4
mova m5, [rsp+0x10]
movd xm6, [rsp+0x20]
phaddd m7, m1
phaddd m9, m3
phaddd m8, m2
phaddd m10, m4
movu m1, [srcq+ssq*0]
movu m2, [srcq+ssq*1]
movu m3, [srcq+ssq*2]
movu m4, [srcq+ss3q ]
REPX {paddd x, m5}, m7, m9, m8, m10
REPX {psrad x, xm6}, m7, m9, m8, m10
packssdw m7, m9 ; 0 1
packssdw m8, m10 ; 2 3
movu m0, [srcq+r4 ]
movu m9, [srcq+r6 ]
movu m10, [srcq+r11 ]
movu m11, [srcq+r13 ]
lea srcq, [srcq+ssq*4]
REPX {pshufb x, m12}, m1, m2, m3, m4
REPX {pmaddwd x, m13}, m1, m2, m3, m4
REPX {pshufb x, m14}, m0, m9, m10, m11
REPX {pmaddwd x, m15}, m0, m9, m10, m11
phaddd m1, m0
phaddd m2, m9
phaddd m3, m10
phaddd m4, m11
REPX {paddd x, m5}, m1, m2, m3, m4
REPX {psrad x, xm6}, m1, m2, m3, m4
packssdw m1, m2 ; 4 5
packssdw m3, m4 ; 6 7
SWAP m9, m1
shufps m4, m7, m8, q1032 ; 1 2
shufps m5, m8, m9, q1032 ; 3 4
shufps m6, m9, m3, q1032 ; 5 6
pshufd m10, m3, q1032 ; 7 _
punpcklwd m0, m7, m4 ; 01
punpckhwd m7, m4 ; 12
punpcklwd m1, m8, m5 ; 23
punpckhwd m8, m5 ; 34
punpcklwd m2, m9, m6 ; 45
punpckhwd m9, m6 ; 56
punpcklwd m3, m10 ; 67
mova [rsp+0x40], m7
mova [rsp+0x50], m8
mova [rsp+0x60], m9
%else
mova [stk+0x00], m12
mova [stk+0x10], m14
add r4, srcq
MC_4TAP_SCALED_H 0x40 ; 0 1
MC_4TAP_SCALED_H 0x50 ; 2 3
MC_4TAP_SCALED_H 0x60 ; 4 5
MC_4TAP_SCALED_H 0x70 ; 6 7
mova m4, [stk+0x40]
mova m5, [stk+0x50]
mova m6, [stk+0x60]
mova m7, [stk+0x70]
mov [stk+0xc0], r4
shufps m1, m4, m5, q1032 ; 1 2
shufps m2, m5, m6, q1032 ; 3 4
shufps m3, m6, m7, q1032 ; 5 6
pshufd m0, m7, q1032 ; 7 _
mova [stk+0xb0], m0
punpcklwd m0, m4, m1 ; 01
punpckhwd m4, m1 ; 12
punpcklwd m1, m5, m2 ; 23
punpckhwd m5, m2 ; 34
punpcklwd m2, m6, m3 ; 45
punpckhwd m6, m3 ; 56
punpcklwd m3, m7, [stk+0xb0] ; 67
mov myd, mym
mov r0, r0m
mova [stk+0x40], m0 ; 01
mova [stk+0x50], m1 ; 23
mova [stk+0x60], m2 ; 45
mova [stk+0x70], m3 ; 67
mova [stk+0x80], m4 ; 12
mova [stk+0x90], m5 ; 34
mova [stk+0xa0], m6 ; 56
%define m12 [stk+0x00]
%define m14 [stk+0x10]
%define m13 [stk+0x20]
%define m15 [stk+0x30]
%define hrnd_mem [esp+0x00]
%define hsh_mem [esp+0x10]
%if isput
%define vrnd_mem [esp+0x20]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%endif
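; vertical loop for the 4-wide column; the interleaved row pairs
; (01/23/45/67, plus 12/34/56 kept on the stack) feed pmaddwd with the
; per-row coefficient pairs selected from myd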
.w4_loop:
and myd, 0x3ff
%if ARCH_X86_64
mov r11d, 64 << 24
mov r13d, myd
shr r13d, 6
lea r13d, [t1+r13]
cmovnz r11q, [base+subpel_filters+r13*8]
movq m9, r11q
punpcklbw m9, m9
psraw m9, 8
pshufd m7, m9, q0000
pshufd m8, m9, q1111
pmaddwd m4, m0, m7
pmaddwd m5, m1, m8
pshufd m7, m9, q2222
pshufd m9, m9, q3333
pmaddwd m6, m2, m7
pmaddwd m8, m3, m9
%if isput
movd m9, [rsp+0x28]
%define vrnd_mem [rsp+0x30]
%else
%define vrnd_mem [base+pd_m524256]
%endif
paddd m4, m5
paddd m6, m8
paddd m4, m6
paddd m4, vrnd_mem
%else
mov mym, myd
mov r5, [esp+0x1f4]
xor r3, r3
shr r4, 6
lea r5, [r5+r4]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r5*8+0]
cmovnz r3, [base+subpel_filters+r5*8+4]
movd m7, r4
movd m6, r3
punpckldq m7, m6
punpcklbw m7, m7
psraw m7, 8
pshufd m4, m7, q0000
pshufd m5, m7, q1111
pshufd m6, m7, q2222
pshufd m7, m7, q3333
pmaddwd m0, m4
pmaddwd m1, m5
pmaddwd m2, m6
pmaddwd m3, m7
%if isput
movd m4, [esp+0x18]
%endif
paddd m0, m1
paddd m2, m3
paddd m0, vrnd_mem
paddd m0, m2
SWAP m4, m0
%define m9 m0
%endif
%if isput
pxor m5, m5
psrad m4, m9
packssdw m4, m4
pmaxsw m4, m5
pminsw m4, pxmaxm
movq [dstq], m4
add dstq, dsmp
%else
psrad m4, 6
packssdw m4, m4
movq [tmpq], m4
add tmpq, 8
%endif
dec hd
jz .ret
%if ARCH_X86_64
add myd, dyd
test myd, ~0x3ff
jz .w4_loop
mova m8, [rsp+0x10]
movd m9, [rsp+0x20]
movu m4, [srcq]
movu m5, [srcq+r4]
test myd, 0x400
jz .w4_skip_line
mova m0, [rsp+0x40]
mova [rsp+0x40], m1
mova m1, [rsp+0x50]
mova [rsp+0x50], m2
mova m2, [rsp+0x60]
mova [rsp+0x60], m3
pshufb m4, m12
pshufb m5, m14
pmaddwd m4, m13
pmaddwd m5, m15
phaddd m4, m5
paddd m4, m8
psrad m4, m9
packssdw m4, m4
punpcklwd m3, m10, m4
mova m10, m4
add srcq, ssq
jmp .w4_loop
.w4_skip_line:
movu m6, [srcq+ssq*1]
movu m7, [srcq+r6]
mova m0, [rsp+0x50]
mova m11, [rsp+0x60]
pshufb m4, m12
pshufb m6, m12
pshufb m5, m14
pshufb m7, m14
pmaddwd m4, m13
pmaddwd m6, m13
pmaddwd m5, m15
pmaddwd m7, m15
mova [rsp+0x40], m0
mova [rsp+0x50], m11
phaddd m4, m5
phaddd m6, m7
paddd m4, m8
paddd m6, m8
psrad m4, m9
psrad m6, m9
packssdw m4, m6
punpcklwd m9, m10, m4
mova [rsp+0x60], m9
pshufd m10, m4, q1032
mova m0, m1
mova m1, m2
mova m2, m3
punpcklwd m3, m4, m10
lea srcq, [srcq+ssq*2]
jmp .w4_loop
%else
SWAP m0, m4
mov myd, mym
mov r3, r3m
add myd, dym
test myd, ~0x3ff
jnz .w4_next_line
mova m0, [stk+0x40]
mova m1, [stk+0x50]
mova m2, [stk+0x60]
mova m3, [stk+0x70]
jmp .w4_loop
.w4_next_line:
mov r5, [stk+0xc0]
movu m4, [srcq]
movu m5, [r5]
test myd, 0x400
jz .w4_skip_line
add [stk+0xc0], ssq
mova m0, [stk+0x80]
mova m3, [stk+0x50]
mova [stk+0x40], m0
mova [stk+0x80], m3
mova m1, [stk+0x90]
mova m6, [stk+0x60]
mova [stk+0x50], m1
mova [stk+0x90], m6
mova m2, [stk+0xa0]
mova m7, [stk+0x70]
mova [stk+0x60], m2
mova [stk+0xa0], m7
pshufb m4, m12
pshufb m5, m14
pmaddwd m4, m13
pmaddwd m5, m15
phaddd m4, m5
paddd m4, hrnd_mem
psrad m4, hsh_mem
packssdw m4, m4
punpcklwd m3, [stk+0xb0], m4
mova [stk+0xb0], m4
mova [stk+0x70], m3
add srcq, ssq
jmp .w4_loop
.w4_skip_line:
movu m6, [srcq+ssq*1]
movu m7, [r5 +ssq*1]
lea r5, [r5 +ssq*2]
mov [stk+0xc0], r5
mova m0, [stk+0x50]
mova m1, [stk+0x60]
mova m2, [stk+0x70]
mova m3, [stk+0x90]
pshufb m4, m12
pshufb m6, m12
pshufb m5, m14
pshufb m7, m14
pmaddwd m4, m13
pmaddwd m6, m13
pmaddwd m5, m15
pmaddwd m7, m15
mova [stk+0x40], m0
mova [stk+0x50], m1
mova [stk+0x60], m2
mova [stk+0x80], m3
phaddd m4, m5
phaddd m6, m7
mova m5, [stk+0xa0]
mova m7, [stk+0xb0]
paddd m4, hrnd_mem
paddd m6, hrnd_mem
psrad m4, hsh_mem
psrad m6, hsh_mem
packssdw m4, m6
punpcklwd m7, m4
pshufd m6, m4, q1032
mova [stk+0x90], m5
mova [stk+0xa0], m7
mova [stk+0xb0], m6
punpcklwd m3, m4, m6
mova [stk+0x70], m3
lea srcq, [srcq+ssq*2]
jmp .w4_loop
%endif
INIT_XMM ssse3
%if ARCH_X86_64
%define stk rsp+0x20
%endif
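; widths of 8 and up are processed as w/8 8-wide columns; [stk+0xf0]
; holds the column count and tmp_stridem the prep output stride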
.w8:
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .w_start
.w16:
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .w_start
.w32:
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .w_start
.w64:
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .w_start
.w128:
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.w_start:
%if ARCH_X86_64
%ifidn %1, put
movifnidn dsm, dsq
%endif
mova [rsp+0x10], m11
%define hround m11
shr t0d, 16
movd m15, t0d
%if isprep
mova m13, [base+pd_m524256]
%endif
%else
%define hround [esp+0x00]
%define m12 [esp+0x10]
%define m10 [base+pd_0x3ff]
%define m8 m0
%xdefine m14 m4
%define m15 m3
%if isprep
%define ssq ssm
%endif
mov r4, [esp+0x1f0]
shr r4, 16
movd m15, r4
mov r0, r0m
mov myd, mym
%endif
sub srcq, 6
pslld m7, m8, 2 ; dx*4
pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
mova [stk+0x100], m7
mova [stk+0x120], m15
mov [stk+0x0f8], srcq
mov [stk+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
mov hm, hd
%elif ARCH_X86_32
mov r5, hm
mov [stk+0x0f4], myd
mov [stk+0x134], r5
%endif
jmp .hloop
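; end of a column: advance dst/tmp by 8 pixels (16 bytes) and restore
; the per-column state saved in .w_start, then run the next column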
.hloop_prep:
dec dword [stk+0x0f0]
jz .ret
%if ARCH_X86_64
add qword [stk+0x130], 16
mov hd, hm
%else
add dword [stk+0x130], 16
mov myd, [stk+0x0f4]
mov r5, [stk+0x134]
mov r0, [stk+0x130]
%endif
mova m7, [stk+0x100]
mova m14, [stk+0x110]
%if ARCH_X86_64
mova m10, [base+pd_0x3ff]
mova m11, [rsp+0x10]
%endif
mova m15, [stk+0x120]
mov srcq, [stk+0x0f8]
%if ARCH_X86_64
mov r0q, [stk+0x130] ; dstq / tmpq
%else
mov mym, myd
mov hm, r5
mov r0m, r0
mov r3, r3m
%endif
paddd m14, m7
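; horizontal setup: with scaling, each of the 8 output columns has its
; own source x and subpel phase (mx+dx*[0-7]), so eight 8-tap filters
; are gathered individually from subpel_filters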
.hloop:
%if ARCH_X86_64
mova m9, [base+pq_0x40000000]
%else
%define m9 [base+pq_0x40000000]
%endif
pxor m1, m1
psrld m2, m14, 10
mova [stk], m2
pand m6, m14, m10
psrld m6, 6
paddd m5, m15, m6
pcmpeqd m6, m1
pshufd m2, m5, q1032
%if ARCH_X86_64
movd r4d, m5
movd r6d, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r7d, m5
movd r9d, m2
movq m0, [base+subpel_filters+r4*8]
movq m1, [base+subpel_filters+r6*8]
movhps m0, [base+subpel_filters+r7*8]
movhps m1, [base+subpel_filters+r9*8]
%else
movd r0, m5
movd rX, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r4, m5
movd r5, m2
movq m0, [base+subpel_filters+r0*8]
movq m1, [base+subpel_filters+rX*8]
movhps m0, [base+subpel_filters+r4*8]
movhps m1, [base+subpel_filters+r5*8]
%endif
paddd m14, m7 ; mx+dx*[4-7]
pand m5, m14, m10
psrld m5, 6
paddd m15, m5
pxor m2, m2
pcmpeqd m5, m2
mova [stk+0x110], m14
pshufd m4, m15, q1032
%if ARCH_X86_64
movd r10d, m15
movd r11d, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r13d, m15
movd rXd, m4
movq m2, [base+subpel_filters+r10*8]
movq m3, [base+subpel_filters+r11*8]
movhps m2, [base+subpel_filters+r13*8]
movhps m3, [base+subpel_filters+ rX*8]
psrld m14, 10
movq r11, m14
punpckhqdq m14, m14
movq rX, m14
mov r10d, r11d
shr r11, 32
mov r13d, rXd
shr rX, 32
mov r4d, [stk+ 0]
mov r6d, [stk+ 4]
mov r7d, [stk+ 8]
mov r9d, [stk+12]
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m14, m5, q1100
pshufd m5, m5, q3322
pand m7, m9, m4
pand m8, m9, m6
pand m15, m9, m14
pand m9, m9, m5
pandn m4, m0
pandn m6, m1
pandn m14, m2
pandn m5, m3
por m7, m4
por m8, m6
por m15, m14
por m9, m5
punpcklbw m0, m7, m7
punpckhbw m7, m7
punpcklbw m1, m8, m8
punpckhbw m8, m8
psraw m0, 8
psraw m7, 8
psraw m1, 8
psraw m8, 8
punpcklbw m2, m15, m15
punpckhbw m15, m15
punpcklbw m3, m9, m9
punpckhbw m9, m9
psraw m2, 8
psraw m15, 8
psraw m3, 8
psraw m9, 8
mova [stk+0x10], m0
mova [stk+0x20], m7
mova [stk+0x30], m1
mova [stk+0x40], m8
mova [stk+0x50], m2
mova [stk+0x60], m15
mova [stk+0x70], m3
mova [stk+0x80], m9
MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
mova [stk+0x90], m1
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
mova [stk+0xa0], m2
MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
mova [stk+0xb0], m3
MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
mova [stk+0xc0], m4
MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
mova [stk+0xd0], m5
MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
mova m5, [stk+0xd0]
mova m1, [stk+0x90]
mova m2, [stk+0xa0]
mova m3, [stk+0xb0]
mova m9, [stk+0xc0]
mov myd, mym
mov dyd, dym
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m8 ; 67a
punpckhwd m7, m8 ; 67b
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m9 ; 23a
punpckhwd m3, m9 ; 23b
mova [stk+0x90], m4
mova [stk+0xa0], m5
mova [stk+0xb0], m6
mova [stk+0xc0], m7
%define hround [rsp+0x10]
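; vertical loop (x86-64): filter the four interleaved row-pair sets
; (01/23 in regs, 45/67 on the stack, each split into a/b halves) with
; the per-row coefficients selected from myd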
.vloop:
and myd, 0x3ff
mov r6d, 64 << 24
mov r4d, myd
shr r4d, 6
lea r4d, [t1+r4]
cmovnz r6q, [base+subpel_filters+r4*8]
movq m11, r6q
punpcklbw m11, m11
psraw m11, 8
pshufd m5, m11, q0000
pshufd m7, m11, q1111
pshufd m10, m11, q2222
pshufd m11, m11, q3333
pmaddwd m4, m5, m0
pmaddwd m5, m5, m1
pmaddwd m6, m7, m2
pmaddwd m7, m7, m3
paddd m4, m13
paddd m5, m13
paddd m4, m6
paddd m5, m7
pmaddwd m6, [stk+0x90], m10
pmaddwd m7, [stk+0xa0], m10
pmaddwd m8, [stk+0xb0], m11
pmaddwd m9, [stk+0xc0], m11
paddd m4, m6
paddd m5, m7
%if isput
pshufd m6, m12, q1032
%endif
paddd m4, m8
paddd m5, m9
%else
movd r0, m15
movd rX, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r4, m15
movd r5, m4
mova m14, [stk+0x110]
movq m2, [base+subpel_filters+r0*8]
movq m3, [base+subpel_filters+rX*8]
movhps m2, [base+subpel_filters+r4*8]
movhps m3, [base+subpel_filters+r5*8]
psrld m14, 10
mova [stk+16], m14
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m7, m5, q1100
pshufd m5, m5, q3322
pand m0, m9, m4
pand m1, m9, m6
pand m2, m9, m7
pand m3, m9, m5
pandn m4, [stk+0x20]
pandn m6, [stk+0x30]
pandn m7, [stk+0x40]
pandn m5, [stk+0x50]
por m0, m4
por m1, m6
por m2, m7
por m3, m5
punpcklbw m4, m0, m0
punpckhbw m0, m0
punpcklbw m5, m1, m1
punpckhbw m1, m1
psraw m4, 8
psraw m0, 8
psraw m5, 8
psraw m1, 8
punpcklbw m6, m2, m2
punpckhbw m2, m2
punpcklbw m7, m3, m3
punpckhbw m3, m3
psraw m6, 8
psraw m2, 8
psraw m7, 8
psraw m3, 8
mova [stk+0x0a0], m4
mova [stk+0x0b0], m0
mova [stk+0x0c0], m5
mova [stk+0x0d0], m1
mova [stk+0x140], m6
mova [stk+0x150], m2
mova [stk+0x160], m7
mova [stk+0x170], m3
MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
mova m5, [stk+0x60]
mova m6, [stk+0x70]
mova m7, [stk+0x80]
mova m0, [stk+0x90]
mov myd, mym
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m0 ; 67a
punpckhwd m7, m0 ; 67b
mova [stk+0x60], m4
mova [stk+0x70], m5
mova [stk+0x80], m6
mova [stk+0x90], m7
mova m1, [stk+0x20]
mova m2, [stk+0x30]
mova m3, [stk+0x40]
mova m4, [stk+0x50]
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m4 ; 23a
punpckhwd m3, m4 ; 23b
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
.vloop:
mov r0, r0m
mov r5, [esp+0x1f4]
and myd, 0x3ff
mov mym, myd
xor r3, r3
shr r4, 6
lea r5, [r5+r4]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r5*8+0]
cmovnz r3, [base+subpel_filters+r5*8+4]
movd m7, r4
movd m6, r3
punpckldq m7, m6
punpcklbw m7, m7
psraw m7, 8
pshufd m4, m7, q0000
pshufd m5, m7, q1111
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m5
pmaddwd m3, m5
pshufd m6, m7, q2222
pshufd m7, m7, q3333
paddd m0, m2
paddd m1, m3
pmaddwd m2, [stk+0x60], m6
pmaddwd m3, [stk+0x70], m6
pmaddwd m4, [stk+0x80], m7
pmaddwd m5, [stk+0x90], m7
%if isput
movd m6, [esp+0x18]
%endif
paddd m0, m2
paddd m1, m3
paddd m0, vrnd_mem
paddd m1, vrnd_mem
paddd m4, m0
paddd m5, m1
%endif
%ifidn %1, put
psrad m4, m6
psrad m5, m6
packssdw m4, m5
pxor m7, m7
pmaxsw m4, m7
pminsw m4, pxmaxm
mova [dstq], m4
add dstq, dsm
%else
psrad m4, 6
psrad m5, 6
packssdw m4, m5
mova [tmpq], m4
add tmpq, tmp_stridem
%endif
dec hd
jz .hloop_prep
%if ARCH_X86_64
add myd, dyd
test myd, ~0x3ff
jz .vloop
test myd, 0x400
mov [stk+0x140], myd
mov r4d, [stk+ 0]
mov r6d, [stk+ 4]
mov r7d, [stk+ 8]
mov r9d, [stk+12]
jz .skip_line
mova m14, [base+unpckw]
movu m8, [srcq+r10*2]
movu m9, [srcq+r11*2]
movu m10, [srcq+r13*2]
movu m11, [srcq+ rX*2]
movu m4, [srcq+ r4*2]
movu m5, [srcq+ r6*2]
movu m6, [srcq+ r7*2]
movu m7, [srcq+ r9*2]
add srcq, ssq
mov myd, [stk+0x140]
mov dyd, dym
pshufd m15, m14, q1032
pshufb m0, m14 ; 0a 1a
pshufb m1, m14 ; 0b 1b
pshufb m2, m15 ; 3a 2a
pshufb m3, m15 ; 3b 2b
pmaddwd m8, [stk+0x50]
pmaddwd m9, [stk+0x60]
pmaddwd m10, [stk+0x70]
pmaddwd m11, [stk+0x80]
pmaddwd m4, [stk+0x10]
pmaddwd m5, [stk+0x20]
pmaddwd m6, [stk+0x30]
pmaddwd m7, [stk+0x40]
phaddd m8, m9
phaddd m10, m11
mova m11, hround
phaddd m4, m5
phaddd m6, m7
phaddd m8, m10
phaddd m4, m6
paddd m4, m11
paddd m8, m11
psrad m4, m12
psrad m8, m12
packssdw m4, m8
pshufb m5, [stk+0x90], m14 ; 4a 5a
pshufb m6, [stk+0xa0], m14 ; 4b 5b
pshufb m7, [stk+0xb0], m15 ; 7a 6a
pshufb m8, [stk+0xc0], m15 ; 7b 6b
punpckhwd m0, m2 ; 12a
punpckhwd m1, m3 ; 12b
punpcklwd m2, m5 ; 34a
punpcklwd m3, m6 ; 34b
punpckhwd m5, m7 ; 56a
punpckhwd m6, m8 ; 56b
punpcklwd m7, m4 ; 78a
punpckhqdq m4, m4
punpcklwd m8, m4 ; 78b
mova [stk+0x90], m5
mova [stk+0xa0], m6
mova [stk+0xb0], m7
mova [stk+0xc0], m8
jmp .vloop
.skip_line:
MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
mov myd, [stk+0x140]
mov dyd, dym
mova m0, m2 ; 01a
mova m1, m3 ; 01b
mova m2, [stk+0x90] ; 23a
mova m3, [stk+0xa0] ; 23b
mova m5, [stk+0xb0] ; 45a
mova m6, [stk+0xc0] ; 45b
punpcklwd m7, m4, m8 ; 67a
punpckhwd m4, m8 ; 67b
mova [stk+0x90], m5
mova [stk+0xa0], m6
mova [stk+0xb0], m7
mova [stk+0xc0], m4
%else
mov r0m, r0
mov myd, mym
mov r3, r3m
add myd, dym
test myd, ~0x3ff
mov mym, myd
jnz .next_line
mova m0, [stk+0x20]
mova m1, [stk+0x30]
mova m2, [stk+0x40]
mova m3, [stk+0x50]
jmp .vloop
.next_line:
test myd, 0x400
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
jz .skip_line
MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
mova m7, [base+unpckw]
pshufd m4, m7, q1032
pshufb m0, [stk+0x20], m7 ; 0a 1a
pshufb m1, [stk+0x30], m7 ; 0b 1b
pshufb m2, [stk+0x40], m4 ; 3a 2a
pshufb m3, [stk+0x50], m4 ; 3b 2b
pshufb m5, [stk+0x60], m7 ; 4a 5a
pshufb m6, [stk+0x70], m7 ; 4b 5b
pshufb m7, [stk+0x80], m4 ; 7a 6a
punpckhwd m0, m2 ; 12a
punpckhwd m1, m3 ; 12b
punpcklwd m2, m5 ; 34a
punpcklwd m3, m6 ; 34b
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
punpckhwd m5, m7 ; 56a
mova [stk+0x60], m5
pshufb m5, [stk+0x90], m4 ; 7b 6b
punpcklwd m7, [stk+0xe0] ; 78a
punpckhwd m6, m5 ; 56b
mova [stk+0x70], m6
movq m6, [stk+0xe8]
mova [stk+0x80], m7
punpcklwd m5, m6
mov myd, mym
mova [stk+0x90], m5
jmp .vloop
.skip_line:
MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
MC_8TAP_SCALED_H 0xa0, 0 ; 9
mova m7, [stk+0xe0]
mova m2, [stk+0x60] ; 23a
mova m3, [stk+0x70] ; 23b
mova m4, [stk+0x80] ; 45a
mova m5, [stk+0x90] ; 45b
punpcklwd m6, m7, m0 ; 67a
punpckhwd m7, m0 ; 67b
mova m0, [stk+0x40] ; 01a
mova m1, [stk+0x50] ; 01b
mov myd, mym
mova [stk+0x40], m2
mova [stk+0x50], m3
mova [stk+0x60], m4
mova [stk+0x70], m5
mova [stk+0x80], m6
mova [stk+0x90], m7
mova [stk+0x20], m0
mova [stk+0x30], m1
%endif
jmp .vloop
INIT_XMM ssse3
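; dy == 1024: the vertical position advances exactly one source row per
; output row, so the vertical filter phase is constant and only one new
; row needs to be loaded and filtered per iteration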
.dy1:
movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
add wq, base_reg
jmp wq
%if isput
.dy1_w2:
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
%define m8 m0
%define m9 m1
%define m14 m4
%define m15 m3
%define m11 [esp+0x00]
%define m12 [esp+0x10]
%define m13 [esp+0x20]
movzx r5, byte [esp+0x1f0]
sub srcq, 2
movd m15, r5
mov r1, r1m
%endif
pxor m9, m9
punpckldq m9, m8
paddd m14, m9 ; mx+dx*[0-1]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
pand m8, m14, m10
psrld m8, 6
paddd m15, m8
movd r4d, m15
pshufd m15, m15, q0321
%if ARCH_X86_64
movd r6d, m15
%else
movd r3d, m15
%endif
mova m5, [base+bdct_lb_q]
mova m6, [base+spel_s_shuf2]
movd m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
movd m7, [base+subpel_filters+r6*8+2]
%else
movd m7, [base+subpel_filters+r3*8+2]
%endif
pxor m2, m2
pcmpeqd m8, m2
psrld m14, 10
paddd m14, m14
%if ARCH_X86_32
mov r3, r3m
pshufb m14, m5
paddb m14, m6
mova [stk], m14
SWAP m5, m0
SWAP m6, m3
%define m15 m6
%endif
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
punpckldq m15, m7
%if ARCH_X86_64
pshufb m14, m5
paddb m14, m6
pand m9, m8
pandn m8, m15
SWAP m15, m8
por m15, m9
movu m4, [srcq+ssq*0]
movu m5, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
add srcq, ss3q
shr myd, 6
mov r4d, 64 << 24
lea myd, [t1+myq]
cmovnz r4q, [base+subpel_filters+myq*8]
%else
pand m7, m5, [base+pd_0x4000]
pandn m5, m15
por m5, m7
%define m15 m5
mov myd, mym
mov r5, [esp+0x1f4]
xor r3, r3
shr myd, 6
lea r5, [r5+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r5*8+0]
cmovnz r3, [base+subpel_filters+r5*8+4]
mov [stk+0x20], r3
mov r3, r3m
%endif
punpcklbw m15, m15
psraw m15, 8
REPX {pshufb x, m14}, m0, m1, m2, m3
REPX {pmaddwd x, m15}, m0, m1, m2, m3
%if ARCH_X86_64
REPX {pshufb x, m14}, m4, m5, m6
REPX {pmaddwd x, m15}, m4, m5, m6
phaddd m0, m1
phaddd m2, m3
phaddd m4, m5
phaddd m6, m6
REPX {paddd x, m11}, m0, m2, m4, m6
REPX {psrad x, m12}, m0, m2, m4, m6
packssdw m0, m2 ; 0 1 2 3
packssdw m4, m6 ; 4 5 6
SWAP m1, m4
movq m10, r4
%else
mova [stk+0x10], m15
phaddd m0, m1
phaddd m2, m3
movu m1, [srcq+ssq*0]
movu m7, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
add srcq, ss3q
REPX {pshufb x, m14}, m1, m7, m6
REPX {pmaddwd x, m15}, m1, m7, m6
%define m14 [stk+0x00]
%define m15 [stk+0x10]
phaddd m1, m7
phaddd m6, m6
REPX {paddd x, m11}, m0, m2, m1, m6
REPX {psrad x, m12}, m0, m2, m1, m6
packssdw m0, m2
packssdw m1, m6
%define m8 m6
%define m9 m4
%define m10 m5
movd m10, r4
movd m9, [stk+0x20]
punpckldq m10, m9
%endif
punpcklbw m10, m10
psraw m10, 8
pshufd m7, m10, q0000
pshufd m8, m10, q1111
pshufd m9, m10, q2222
pshufd m10, m10, q3333
%if ARCH_X86_32
mova [stk+0x50], m7
mova [stk+0x60], m8
mova [stk+0x70], m9
mova [stk+0x80], m10
%define m7 [stk+0x50]
%define m8 [stk+0x60]
%define m9 [stk+0x70]
%define m10 [stk+0x80]
%endif
palignr m2, m1, m0, 4 ; 1 2 3 4
punpcklwd m3, m0, m2 ; 01 12
punpckhwd m0, m2 ; 23 34
pshufd m4, m1, q2121 ; 5 6 5 6
punpcklwd m2, m1, m4 ; 45 56
%if ARCH_X86_32
mov r0, r0m
%endif
.dy1_w2_loop:
movu m1, [srcq+ssq*0]
movu m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmaddwd m5, m3, m7
mova m3, m0
pmaddwd m0, m8
pshufb m1, m14
pshufb m6, m14
pmaddwd m1, m15
pmaddwd m6, m15
phaddd m1, m6
paddd m1, m11
psrad m1, m12
packssdw m1, m1
paddd m5, m0
mova m0, m2
pmaddwd m2, m9
paddd m5, m2
palignr m2, m1, m4, 12
punpcklwd m2, m1 ; 67 78
pmaddwd m4, m2, m10
paddd m5, m13
paddd m5, m4
pxor m6, m6
mova m4, m1
pshufd m1, m12, q1032
psrad m5, m1
packssdw m5, m5
pmaxsw m5, m6
pminsw m5, pxmaxm
movd [dstq+dsq*0], m5
pshuflw m5, m5, q1032
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .dy1_w2_loop
RET
%endif
INIT_XMM ssse3
.dy1_w4:
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
mova [rsp+0x20], m12
%if isput
mova [rsp+0x30], m13
%define vrnd_mem [rsp+0x30]
%define stk rsp+0x40
%else
%define vrnd_mem [base+pd_m524256]
%define stk rsp+0x30
%endif
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
%define m10 [base+pd_0x3ff]
%define m9 [base+pd_0x4000]
%define m8 m0
%xdefine m14 m4
%define m15 m3
%if isprep
%define ssq r3
%endif
movzx r5, byte [esp+0x1f0]
sub srcq, 2
movd m15, r5
%endif
pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
pand m0, m14, m10
psrld m0, 6
paddd m15, m0
pshufd m7, m15, q1032
%if ARCH_X86_64
movd r4d, m15
movd r11d, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd r6d, m15
movd r13d, m7
mova m10, [base+bdct_lb_q+ 0]
mova m11, [base+bdct_lb_q+16]
movd m13, [base+subpel_filters+ r4*8+2]
movd m2, [base+subpel_filters+ r6*8+2]
movd m15, [base+subpel_filters+r11*8+2]
movd m4, [base+subpel_filters+r13*8+2]
%else
movd r0, m15
movd r4, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd rX, m15
movd r5, m7
mova m5, [base+bdct_lb_q+ 0]
mova m6, [base+bdct_lb_q+16]
movd m1, [base+subpel_filters+r0*8+2]
movd m2, [base+subpel_filters+rX*8+2]
movd m3, [base+subpel_filters+r4*8+2]
movd m7, [base+subpel_filters+r5*8+2]
SWAP m4, m7
%if isprep
mov r3, r3m
%endif
%define m10 m5
%define m11 m6
%define m12 m1
%define m13 m1
%endif
psrld m14, 10
paddd m14, m14
punpckldq m13, m2
punpckldq m15, m4
punpcklqdq m13, m15
pxor m2, m2
pcmpeqd m0, m2
%if ARCH_X86_64
pand m9, m0
%else
pand m2, m9, m0
%define m9 m2
SWAP m7, m4
%endif
pandn m0, m13
%if ARCH_X86_64
SWAP m13, m0
%else
%define m13 m0
%endif
por m13, m9
punpckhbw m15, m13, m13
punpcklbw m13, m13
psraw m15, 8
psraw m13, 8
pshufb m12, m14, m10
pshufb m14, m11
mova m10, [base+spel_s_shuf2]
movd r4d, m14
shr r4d, 24
%if ARCH_X86_32
mova [stk+0x40], m13
mova [stk+0x50], m15
pxor m2, m2
%endif
pshufb m7, m14, m2
psubb m14, m7
paddb m12, m10
paddb m14, m10
%if ARCH_X86_64
lea r6, [r4+ssq*1]
lea r11, [r4+ssq*2]
lea r13, [r4+ss3q ]
movu m7, [srcq+ssq*0]
movu m9, [srcq+ssq*1]
movu m8, [srcq+ssq*2]
movu m10, [srcq+ss3q ]
movu m1, [srcq+r4 ]
movu m3, [srcq+r6 ]
movu m2, [srcq+r11 ]
movu m4, [srcq+r13 ]
lea srcq, [srcq+ssq*4]
REPX {pshufb x, m12}, m7, m9, m8, m10
REPX {pmaddwd x, m13}, m7, m9, m8, m10
REPX {pshufb x, m14}, m1, m3, m2, m4
REPX {pmaddwd x, m15}, m1, m3, m2, m4
mova m5, [rsp+0x10]
movd xm6, [rsp+0x20]
phaddd m7, m1
phaddd m9, m3
phaddd m8, m2
phaddd m10, m4
movu m1, [srcq+ssq*0]
movu m2, [srcq+ssq*1]
movu m3, [srcq+ssq*2]
REPX {paddd x, m5}, m7, m9, m8, m10
REPX {psrad x, xm6}, m7, m9, m8, m10
packssdw m7, m9 ; 0 1
packssdw m8, m10 ; 2 3
movu m0, [srcq+r4 ]
movu m9, [srcq+r6 ]
movu m10, [srcq+r11 ]
add srcq, ss3q
REPX {pshufb x, m12}, m1, m2, m3
REPX {pmaddwd x, m13}, m1, m2, m3
REPX {pshufb x, m14}, m0, m9, m10
REPX {pmaddwd x, m15}, m0, m9, m10
phaddd m1, m0
phaddd m2, m9
phaddd m3, m10
shr myd, 6
mov r13d, 64 << 24
lea myd, [t1+myq]
cmovnz r13q, [base+subpel_filters+myq*8]
REPX {paddd x, m5}, m1, m2, m3
REPX {psrad x, xm6}, m1, m2, m3
packssdw m1, m2 ; 4 5
packssdw m3, m3 ; 6 6
SWAP m9, m1
shufps m4, m7, m8, q1032 ; 1 2
shufps m5, m8, m9, q1032 ; 3 4
shufps m6, m9, m3, q1032 ; 5 6
punpcklwd m0, m7, m4 ; 01
punpckhwd m7, m4 ; 12
punpcklwd m1, m8, m5 ; 23
punpckhwd m8, m5 ; 34
punpcklwd m2, m9, m6 ; 45
punpckhwd m9, m6 ; 56
movq m10, r13
mova [stk+0x00], m1
mova [stk+0x10], m8
mova [stk+0x20], m2
mova [stk+0x30], m9
mova [stk+0x40], m3
%define hrnd_mem [rsp+0x10]
%define hsh_mem [rsp+0x20]
%define vsh_mem [rsp+0x28]
%if isput
%define vrnd_mem [rsp+0x30]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%else
mova [stk+0x20], m12
mova [stk+0x30], m14
add r4, srcq
MC_4TAP_SCALED_H 0x60 ; 0 1
MC_4TAP_SCALED_H 0x70 ; 2 3
MC_4TAP_SCALED_H 0x80 ; 4 5
movu m7, [srcq]
movu m2, [r4]
add srcq, ssq
add r4, ssq
mov [stk+0xb0], r4
pshufb m7, m12
pshufb m2, m14
pmaddwd m7, m13
pmaddwd m2, m15
phaddd m7, m2
paddd m7, [esp+0x00]
psrad m7, [esp+0x10]
packssdw m7, m7 ; 6 6
mova m4, [stk+0x60]
mova m5, [stk+0x70]
mova m6, [stk+0x80]
mov myd, mym
mov rX, [esp+0x1f4]
xor r5, r5
shr myd, 6
lea rX, [rX+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+rX*8+0]
cmovnz r5, [base+subpel_filters+rX*8+4]
mov r3, r3m
shufps m1, m4, m5, q1032 ; 1 2
shufps m2, m5, m6, q1032 ; 3 4
shufps m3, m6, m7, q1032 ; 5 6
mova [stk+0xa0], m7
punpcklwd m0, m4, m1 ; 01
punpckhwd m4, m1 ; 12
punpcklwd m1, m5, m2 ; 23
punpckhwd m5, m2 ; 34
punpcklwd m2, m6, m3 ; 45
punpckhwd m6, m3 ; 56
movd m7, r4
movd m3, r5
mov r0, r0m
%if isput
mov r1, r1m
%endif
mov r4, [stk+0xb0]
mova [stk+0xc0], m4 ; 12
mova [stk+0x60], m1 ; 23
mova [stk+0x70], m2 ; 45
mova [stk+0x80], m5 ; 34
mova [stk+0x90], m6 ; 56
%define m12 [stk+0x20]
%define m14 [stk+0x30]
%define m13 [stk+0x40]
%define m15 [stk+0x50]
%define hrnd_mem [esp+0x00]
%define hsh_mem [esp+0x10]
%define vsh_mem [esp+0x18]
%if isput
%define vrnd_mem [esp+0x20]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%define m10 m7
punpckldq m10, m3
%endif
punpcklbw m10, m10
psraw m10, 8
pshufd m3, m10, q0000
pshufd m4, m10, q1111
pshufd m5, m10, q2222
pshufd m10, m10, q3333
%if ARCH_X86_32
%xdefine m8 m3
%xdefine m9 m6
%xdefine m11 m5
%xdefine m6 m4
mova [stk+0x100], m3
mova [stk+0x110], m4
mova [stk+0x120], m5
mova [stk+0x130], m10
%define m3 [stk+0x100]
%define m4 [stk+0x110]
%define m5 [stk+0x120]
%define m10 [stk+0x130]
mova m7, [stk+0xc0]
mova m8, [stk+0x80]
%endif
.dy1_w4_loop:
movu m11, [srcq+ssq*0]
movu m6, [srcq+ssq*1]
pmaddwd m0, m3
pmaddwd m7, m3
pmaddwd m1, m4
pmaddwd m8, m4
pmaddwd m2, m5
pmaddwd m9, m5
paddd m1, m0
paddd m8, m7
%if ARCH_X86_64
movu m0, [srcq+r4]
movu m7, [srcq+r6]
%else
movu m0, [r4+ssq*0]
movu m7, [r4+ssq*1]
lea r4, [r4+ssq*2]
%endif
lea srcq, [srcq+ssq*2]
paddd m1, m2
paddd m8, m9
pshufb m11, m12
pshufb m6, m12
pmaddwd m11, m13
pmaddwd m6, m13
pshufb m0, m14
pshufb m7, m14
pmaddwd m0, m15
pmaddwd m7, m15
phaddd m11, m0
phaddd m6, m7
paddd m11, hrnd_mem
paddd m6, hrnd_mem
psrad m11, hsh_mem
psrad m6, hsh_mem
packssdw m11, m6 ; 7 8
%if ARCH_X86_64
shufps m9, [stk+0x40], m11, q1032 ; 6 7
mova m0, [stk+0x00]
mova [stk+0x40], m11
%else
shufps m9, [stk+0xa0], m11, q1032 ; 6 7
mova m0, [stk+0x60]
mova [stk+0xa0], m11
%endif
punpcklwd m2, m9, m11 ; 67
punpckhwd m9, m11 ; 78
pmaddwd m6, m2, m10
pmaddwd m7, m9, m10
%if isput
movd m11, vsh_mem
%endif
paddd m1, vrnd_mem
paddd m8, vrnd_mem
paddd m1, m6
paddd m8, m7
%if ARCH_X86_64
mova m7, [stk+0x10]
%else
mova m7, [stk+0x80]
%endif
%if isput
psrad m1, m11
psrad m8, m11
%else
psrad m1, 6
psrad m8, 6
%endif
packssdw m1, m8
%if ARCH_X86_64
mova m8, [stk+0x30]
%else
mova m8, [stk+0x90]
%endif
%if isput
pxor m6, m6
pmaxsw m1, m6
pminsw m1, pxmaxm
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
%else
mova [tmpq], m1
add tmpq, 16
%endif
%if ARCH_X86_64
mova m1, [stk+0x20]
mova [stk+0x10], m8
mova [stk+0x00], m1
mova [stk+0x20], m2
mova [stk+0x30], m9
%else
mova m1, [stk+0x70]
mova [stk+0x80], m8
mova [stk+0x60], m1
mova [stk+0x70], m2
mova [stk+0x90], m9
%endif
sub hd, 2
jg .dy1_w4_loop
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy1_w8:
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy1_w_start
.dy1_w16:
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy1_w_start
.dy1_w32:
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy1_w_start
.dy1_w64:
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy1_w_start
.dy1_w128:
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
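; 8..128-wide dy1 blocks share one column loop; the vertical filter
; coefficients are computed once up front since the phase never changes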
.dy1_w_start:
mov myd, mym
%if ARCH_X86_64
%ifidn %1, put
movifnidn dsm, dsq
%endif
mova [rsp+0x10], m11
mova [rsp+0x20], m12
%define hround m11
%if isput
mova [rsp+0x30], m13
%else
mova m13, [base+pd_m524256]
%endif
shr t0d, 16
shr myd, 6
mov r4d, 64 << 24
lea myd, [t1+myq]
cmovnz r4q, [base+subpel_filters+myq*8]
movd m15, t0d
%else
%define hround [esp+0x00]
%define m12 [esp+0x10]
%define m10 [base+pd_0x3ff]
%define m8 m0
%xdefine m14 m4
%xdefine m15 m3
%if isprep
%define ssq ssm
%endif
mov r5, [esp+0x1f0]
mov r3, [esp+0x1f4]
shr r5, 16
movd m15, r5
xor r5, r5
shr myd, 6
lea r3, [r3+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r3*8+0]
cmovnz r5, [base+subpel_filters+r3*8+4]
mov r0, r0m
mov r3, r3m
%endif
sub srcq, 6
pslld m7, m8, 2 ; dx*4
pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
movq m3, r4q
%else
movd m5, r4
movd m6, r5
punpckldq m5, m6
SWAP m3, m5
%endif
punpcklbw m3, m3
psraw m3, 8
mova [stk+0x100], m7
mova [stk+0x120], m15
mov [stk+0x0f8], srcq
mov [stk+0x130], r0q ; dstq / tmpq
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
%if ARCH_X86_64
mova [stk+0x140], m0
mova [stk+0x150], m1
mova [stk+0x160], m2
mova [stk+0x170], m3
%if UNIX64
mov hm, hd
%endif
%else
mova [stk+0x180], m0
mova [stk+0x190], m1
mova [stk+0x1a0], m2
mova [stk+0x1b0], m3
SWAP m5, m3
mov r5, hm
mov [stk+0x134], r5
%endif
jmp .dy1_hloop
.dy1_hloop_prep:
dec dword [stk+0x0f0]
jz .ret
%if ARCH_X86_64
add qword [stk+0x130], 16
mov hd, hm
%else
add dword [stk+0x130], 16
mov r5, [stk+0x134]
mov r0, [stk+0x130]
%endif
mova m7, [stk+0x100]
mova m14, [stk+0x110]
%if ARCH_X86_64
mova m10, [base+pd_0x3ff]
mova m11, [rsp+0x10]
%endif
mova m15, [stk+0x120]
mov srcq, [stk+0x0f8]
%if ARCH_X86_64
mov r0q, [stk+0x130] ; dstq / tmpq
%else
mov hm, r5
mov r0m, r0
mov r3, r3m
%endif
paddd m14, m7
.dy1_hloop:
%if ARCH_X86_64
mova m9, [base+pq_0x40000000]
%else
%define m9 [base+pq_0x40000000]
%endif
pxor m1, m1
psrld m2, m14, 10
mova [stk], m2
pand m6, m14, m10
psrld m6, 6
paddd m5, m15, m6
pcmpeqd m6, m1
pshufd m2, m5, q1032
%if ARCH_X86_64
movd r4d, m5
movd r6d, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r7d, m5
movd r9d, m2
movq m0, [base+subpel_filters+r4*8]
movq m1, [base+subpel_filters+r6*8]
movhps m0, [base+subpel_filters+r7*8]
movhps m1, [base+subpel_filters+r9*8]
%else
movd r0, m5
movd rX, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r4, m5
movd r5, m2
movq m0, [base+subpel_filters+r0*8]
movq m1, [base+subpel_filters+rX*8]
movhps m0, [base+subpel_filters+r4*8]
movhps m1, [base+subpel_filters+r5*8]
%endif
paddd m14, m7 ; mx+dx*[4-7]
pand m5, m14, m10
psrld m5, 6
paddd m15, m5
pxor m2, m2
pcmpeqd m5, m2
mova [stk+0x110], m14
pshufd m4, m15, q1032
%if ARCH_X86_64
movd r10d, m15
movd r11d, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r13d, m15
movd rXd, m4
movq m2, [base+subpel_filters+r10*8]
movq m3, [base+subpel_filters+r11*8]
movhps m2, [base+subpel_filters+r13*8]
movhps m3, [base+subpel_filters+ rX*8]
psrld m14, 10
movq r11, m14
punpckhqdq m14, m14
movq rX, m14
mov r10d, r11d
shr r11, 32
mov r13d, rXd
shr rX, 32
mov r4d, [stk+ 0]
mov r6d, [stk+ 4]
mov r7d, [stk+ 8]
mov r9d, [stk+12]
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m14, m5, q1100
pshufd m5, m5, q3322
pand m7, m9, m4
pand m8, m9, m6
pand m15, m9, m14
pand m9, m9, m5
pandn m4, m0
pandn m6, m1
pandn m14, m2
pandn m5, m3
por m7, m4
por m8, m6
por m15, m14
por m9, m5
punpcklbw m0, m7, m7
punpckhbw m7, m7
punpcklbw m1, m8, m8
punpckhbw m8, m8
psraw m0, 8
psraw m7, 8
psraw m1, 8
psraw m8, 8
punpcklbw m2, m15, m15
punpckhbw m15, m15
punpcklbw m3, m9, m9
punpckhbw m9, m9
psraw m2, 8
psraw m15, 8
psraw m3, 8
psraw m9, 8
mova [stk+0x10], m0
mova [stk+0x20], m7
mova [stk+0x30], m1
mova [stk+0x40], m8
mova [stk+0x50], m2
mova [stk+0x60], m15
mova [stk+0x70], m3
mova [stk+0x80], m9
MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
mova [stk+0x90], m1
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
mova [stk+0xa0], m2
MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
mova [stk+0xb0], m3
MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
mova [stk+0xc0], m4
MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
mova [stk+0xd0], m5
MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
mova m5, [stk+0xd0]
mova m1, [stk+0x90]
mova m2, [stk+0xa0]
mova m3, [stk+0xb0]
mova m9, [stk+0xc0]
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m8 ; 67a
punpckhwd m7, m8 ; 67b
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m9 ; 23a
punpckhwd m3, m9 ; 23b
mova m10, [stk+0x140]
mova m11, [stk+0x150]
mova m14, [stk+0x160]
mova m15, [stk+0x170]
mova [stk+0x90], m4
mova [stk+0xa0], m5
mova [stk+0xb0], m6
mova [stk+0xc0], m7
%define hround [rsp+0x10]
%define shift [rsp+0x20]
%if isput
%define vround [rsp+0x30]
%else
%define vround [base+pd_m524256]
%endif
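; vertical loop (x86-64): the dy1 coefficients in m10/m11/m14/m15 stay
; resident for the whole column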
.dy1_vloop:
pmaddwd m4, m0, m10
pmaddwd m5, m1, m10
pmaddwd m6, m2, m11
pmaddwd m7, m3, m11
paddd m4, m13
paddd m5, m13
paddd m4, m6
paddd m5, m7
pmaddwd m6, [stk+0x90], m14
pmaddwd m7, [stk+0xa0], m14
pmaddwd m8, [stk+0xb0], m15
pmaddwd m9, [stk+0xc0], m15
paddd m4, m6
paddd m5, m7
%if isput
pshufd m6, m12, q1032
%endif
paddd m4, m8
paddd m5, m9
%else
movd r0, m15
movd rX, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r4, m15
movd r5, m4
mova m14, [stk+0x110]
movq m2, [base+subpel_filters+r0*8]
movq m3, [base+subpel_filters+rX*8]
movhps m2, [base+subpel_filters+r4*8]
movhps m3, [base+subpel_filters+r5*8]
psrld m14, 10
mova [stk+16], m14
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m7, m5, q1100
pshufd m5, m5, q3322
pand m0, m9, m4
pand m1, m9, m6
pand m2, m9, m7
pand m3, m9, m5
pandn m4, [stk+0x20]
pandn m6, [stk+0x30]
pandn m7, [stk+0x40]
pandn m5, [stk+0x50]
por m0, m4
por m1, m6
por m2, m7
por m3, m5
punpcklbw m4, m0, m0
punpckhbw m0, m0
punpcklbw m5, m1, m1
punpckhbw m1, m1
psraw m4, 8
psraw m0, 8
psraw m5, 8
psraw m1, 8
punpcklbw m6, m2, m2
punpckhbw m2, m2
punpcklbw m7, m3, m3
punpckhbw m3, m3
psraw m6, 8
psraw m2, 8
psraw m7, 8
psraw m3, 8
mova [stk+0x0a0], m4
mova [stk+0x0b0], m0
mova [stk+0x0c0], m5
mova [stk+0x0d0], m1
mova [stk+0x140], m6
mova [stk+0x150], m2
mova [stk+0x160], m7
mova [stk+0x170], m3
MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
mova m5, [stk+0x60]
mova m6, [stk+0x70]
mova m7, [stk+0x80]
mova m0, [stk+0x90]
mov r0, r0m
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m0 ; 67a
punpckhwd m7, m0 ; 67b
mova [stk+0x60], m4
mova [stk+0x70], m5
mova [stk+0x80], m6
mova [stk+0x90], m7
mova m1, [stk+0x20]
mova m2, [stk+0x30]
mova m3, [stk+0x40]
mova m4, [stk+0x50]
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m4 ; 23a
punpckhwd m3, m4 ; 23b
mova m4, [stk+0x180]
mova m5, [stk+0x190]
mova m6, [stk+0x1a0]
mova m7, [stk+0x1b0]
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
.dy1_vloop:
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m2
paddd m1, m3
pmaddwd m2, [stk+0x60], m6
pmaddwd m3, [stk+0x70], m6
pmaddwd m4, [stk+0x80], m7
pmaddwd m5, [stk+0x90], m7
%if isput
movd m6, [esp+0x18]
%endif
paddd m0, m2
paddd m1, m3
paddd m0, vrnd_mem
paddd m1, vrnd_mem
paddd m4, m0
paddd m5, m1
%endif
%ifidn %1, put
psrad m4, m6
psrad m5, m6
packssdw m4, m5
pxor m7, m7
pmaxsw m4, m7
pminsw m4, pxmaxm
mova [dstq], m4
add dstq, dsm
%else
psrad m4, 6
psrad m5, 6
packssdw m4, m5
mova [tmpq], m4
add tmpq, tmp_stridem
%endif
dec hd
jz .dy1_hloop_prep
%if ARCH_X86_64
movu m8, [srcq+r10*2]
movu m9, [srcq+r11*2]
movu m12, [srcq+r13*2]
movu m13, [srcq+ rX*2]
movu m4, [srcq+ r4*2]
movu m5, [srcq+ r6*2]
movu m6, [srcq+ r7*2]
movu m7, [srcq+ r9*2]
add srcq, ssq
pmaddwd m8, [stk+0x50]
pmaddwd m9, [stk+0x60]
pmaddwd m12, [stk+0x70]
pmaddwd m13, [stk+0x80]
pmaddwd m4, [stk+0x10]
pmaddwd m5, [stk+0x20]
pmaddwd m6, [stk+0x30]
pmaddwd m7, [stk+0x40]
phaddd m8, m9
phaddd m12, m13
mova m9, [base+unpckw]
mova m13, hround
phaddd m4, m5
phaddd m6, m7
phaddd m8, m12
phaddd m4, m6
pshufd m5, m9, q1032
pshufb m0, m9 ; 0a 1a
pshufb m1, m9 ; 0b 1b
pshufb m2, m5 ; 3a 2a
pshufb m3, m5 ; 3b 2b
mova m12, shift
paddd m4, m13
paddd m8, m13
psrad m4, m12
psrad m8, m12
packssdw m4, m8
pshufb m6, [stk+0x90], m9 ; 4a 5a
pshufb m7, [stk+0xa0], m9 ; 4b 5b
pshufb m8, [stk+0xb0], m5 ; 7a 6a
pshufb m13, [stk+0xc0], m5 ; 7b 6b
punpckhwd m0, m2 ; 12a
punpckhwd m1, m3 ; 12b
punpcklwd m2, m6 ; 34a
punpcklwd m3, m7 ; 34b
punpckhwd m6, m8 ; 56a
punpckhwd m7, m13 ; 56b
punpcklwd m8, m4 ; 78a
punpckhqdq m4, m4
punpcklwd m13, m4 ; 78b
mova [stk+0x90], m6
mova [stk+0xa0], m7
mova [stk+0xb0], m8
mova [stk+0xc0], m13
mova m13, vround
%else
mov r0m, r0
mov r3, r3m
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
mova m7, [base+unpckw]
pshufd m4, m7, q1032
pshufb m0, [stk+0x20], m7 ; 0a 1a
pshufb m1, [stk+0x30], m7 ; 0b 1b
pshufb m2, [stk+0x40], m4 ; 3a 2a
pshufb m3, [stk+0x50], m4 ; 3b 2b
pshufb m5, [stk+0x60], m7 ; 4a 5a
pshufb m6, [stk+0x70], m7 ; 4b 5b
pshufb m7, [stk+0x80], m4 ; 7a 6a
punpckhwd m0, m2 ; 12a
punpckhwd m1, m3 ; 12b
punpcklwd m2, m5 ; 34a
punpcklwd m3, m6 ; 34b
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
punpckhwd m5, m7 ; 56a
mova [stk+0x60], m5
pshufb m5, [stk+0x90], m4 ; 7b 6b
punpcklwd m7, [stk+0xe0] ; 78a
mova m4, [stk+0x180]
punpckhwd m6, m5 ; 56b
mova [stk+0x70], m6
movq m6, [stk+0xe8]
mova [stk+0x80], m7
mova m7, [stk+0x1b0]
punpcklwd m5, m6
mova m6, [stk+0x1a0]
mova [stk+0x90], m5
mova m5, [stk+0x190]
mov r0, r0m
%endif
jmp .dy1_vloop
INIT_XMM ssse3
%if ARCH_X86_64
%define stk rsp+0x20
%endif
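; dy == 2048: every output row steps two source rows, so two new rows
; are fetched and horizontally filtered per output row while the
; vertical filter phase stays constant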
.dy2:
movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
add wq, base_reg
jmp wq
%if isput
.dy2_w2:
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m13
%define vrnd_mem [rsp+0x10]
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
%define m8 m0
%define m9 m1
%define m14 m4
%define m15 m3
%define m11 [esp+0x00]
%define m12 [esp+0x10]
%define vrnd_mem [esp+0x20]
mov r1, r1m
movzx r5, byte [esp+0x1f0]
sub srcq, 2
movd m15, r5
%endif
pxor m9, m9
punpckldq m9, m8
paddd m14, m9 ; mx+dx*[0-1]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
pand m8, m14, m10
psrld m8, 6
paddd m15, m8
movd r4d, m15
pshufd m15, m15, q0321
%if ARCH_X86_64
movd r6d, m15
%else
movd r3d, m15
%endif
mova m5, [base+bdct_lb_q]
mova m6, [base+spel_s_shuf2]
movd m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
movd m7, [base+subpel_filters+r6*8+2]
%else
movd m7, [base+subpel_filters+r3*8+2]
%endif
pxor m2, m2
pcmpeqd m8, m2
psrld m14, 10
paddd m14, m14
%if ARCH_X86_32
mov r3, r3m
pshufb m14, m5
paddb m14, m6
mova [stk], m14
SWAP m5, m0
SWAP m6, m3
%define m15 m6
%endif
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*2]
movu m2, [srcq+ssq*4]
punpckldq m15, m7
%if ARCH_X86_64
pshufb m14, m5
paddb m14, m6
pand m9, m8
pandn m8, m15
SWAP m15, m8
por m15, m9
movu m4, [srcq+ssq*1]
movu m5, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
movu m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
shr myd, 6
mov r4d, 64 << 24
lea myd, [t1+myq]
cmovnz r4q, [base+subpel_filters+myq*8]
%else
pand m7, m5, [base+pd_0x4000]
pandn m5, m15
por m5, m7
%define m15 m5
mov myd, mym
mov r5, [esp+0x1f4]
xor r3, r3
shr myd, 6
lea r5, [r5+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r5*8+0]
cmovnz r3, [base+subpel_filters+r5*8+4]
mov [stk+0x20], r3
mov r3, r3m
%endif
punpcklbw m15, m15
psraw m15, 8
REPX {pshufb x, m14}, m0, m1, m2
REPX {pmaddwd x, m15}, m0, m1, m2
%if ARCH_X86_64
REPX {pshufb x, m14}, m4, m5, m6
REPX {pmaddwd x, m15}, m4, m5, m6
phaddd m0, m1
phaddd m1, m2
phaddd m4, m5
phaddd m5, m6
REPX {paddd x, m11}, m0, m1, m4, m5
REPX {psrad x, m12}, m0, m1, m4, m5
packssdw m0, m1 ; 0 2 2 4
packssdw m4, m5 ; 1 3 3 5
SWAP m2, m4
movq m10, r4
%else
mova [stk+0x10], m15
phaddd m0, m1
phaddd m1, m2
movu m2, [srcq+ssq*1]
movu m7, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
movu m6, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
REPX {pshufb x, m14}, m2, m7, m6
REPX {pmaddwd x, m15}, m2, m7, m6
%define m14 [stk+0x00]
%define m15 [stk+0x10]
phaddd m2, m7
phaddd m7, m6
REPX {paddd x, m11}, m0, m1, m2, m7
REPX {psrad x, m12}, m0, m1, m2, m7
packssdw m0, m1
packssdw m2, m7
%define m8 m6
%define m9 m4
%define m10 m5
movd m10, r4
movd m9, [stk+0x20]
punpckldq m10, m9
%endif
punpcklbw m10, m10
psraw m10, 8
pshufd m7, m10, q0000
pshufd m8, m10, q1111
pshufd m9, m10, q2222
pshufd m10, m10, q3333
%if ARCH_X86_32
mova [stk+0x50], m7
mova [stk+0x60], m8
mova [stk+0x70], m9
mova [stk+0x80], m10
%xdefine m13 m7
%define m7 [stk+0x50]
%define m8 [stk+0x60]
%define m9 [stk+0x70]
%define m10 [stk+0x80]
%endif
punpcklwd m1, m0, m2 ; 01 23
punpckhwd m3, m0, m2 ; 23 45
%if ARCH_X86_32
mov r4, r0m
%define dstq r4
mova [stk+0x20], m3
mova [stk+0x30], m0
%endif
.dy2_w2_loop:
movu m4, [srcq+ssq*0]
movu m5, [srcq+ssq*1]
movu m6, [srcq+ssq*2]
movu m13, [srcq+ss3q ]
lea srcq, [srcq+ssq*4]
pmaddwd m3, m8
REPX {pshufb x, m14}, m4, m5, m6, m13
REPX {pmaddwd x, m15}, m4, m5, m6, m13
phaddd m4, m5
phaddd m6, m13
pmaddwd m5, m1, m7
paddd m4, m11
paddd m6, m11
psrad m4, m12
psrad m6, m12
packssdw m4, m6 ; 6 7 8 9
paddd m5, m3
pshufd m3, m4, q2200
pshufd m4, m4, q3311
palignr m3, m0, 12 ; 4 6 6 8
palignr m4, m2, 12 ; 5 7 7 9
mova m0, m3
mova m2, m4
punpcklwd m1, m3, m4
punpckhwd m3, m4
pmaddwd m6, m1, m9
pmaddwd m4, m3, m10
paddd m5, vrnd_mem
paddd m6, m4
paddd m5, m6
pshufd m4, m12, q1032
pxor m6, m6
psrad m5, m4
packssdw m5, m5
pmaxsw m5, m6
pminsw m5, pxmaxm
movd [dstq+dsq*0], m5
pshuflw m5, m5, q1032
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .dy2_w2_loop
RET
%endif
INIT_XMM ssse3
.dy2_w4:
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
mova [rsp+0x20], m12
%if isput
mova [rsp+0x30], m13
%define vrnd_mem [rsp+0x30]
%define stk rsp+0x40
%else
%define vrnd_mem [base+pd_m524256]
%define stk rsp+0x30
%endif
movzx t0d, t0b
sub srcq, 2
movd m15, t0d
%else
%define m10 [base+pd_0x3ff]
%define m9 [base+pd_0x4000]
%define m8 m0
%xdefine m14 m4
%define m15 m3
%if isprep
%define ssq r3
%endif
movzx r5, byte [esp+0x1f0]
sub srcq, 2
movd m15, r5
%endif
pmaddwd m8, [base+rescale_mul]
%if ARCH_X86_64
mova m9, [base+pd_0x4000]
%endif
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
pand m0, m14, m10
psrld m0, 6
paddd m15, m0
pshufd m7, m15, q1032
%if ARCH_X86_64
movd r4d, m15
movd r11d, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd r6d, m15
movd r13d, m7
mova m10, [base+bdct_lb_q+ 0]
mova m11, [base+bdct_lb_q+16]
movd m13, [base+subpel_filters+ r4*8+2]
movd m2, [base+subpel_filters+ r6*8+2]
movd m15, [base+subpel_filters+r11*8+2]
movd m4, [base+subpel_filters+r13*8+2]
%else
movd r1, m15
movd r4, m7
pshufd m15, m15, q0321
pshufd m7, m7, q0321
movd r3, m15
movd r5, m7
mova m5, [base+bdct_lb_q+ 0]
mova m6, [base+bdct_lb_q+16]
movd m1, [base+subpel_filters+r1*8+2]
movd m2, [base+subpel_filters+r3*8+2]
movd m3, [base+subpel_filters+r4*8+2]
movd m7, [base+subpel_filters+r5*8+2]
SWAP m4, m7
mov r3, r3m
%if isprep
lea ss3q, [ssq*3]
%endif
%define m10 m5
%define m11 m6
%define m12 m1
%define m13 m1
%endif
psrld m14, 10
paddd m14, m14
punpckldq m13, m2
punpckldq m15, m4
punpcklqdq m13, m15
pxor m2, m2
pcmpeqd m0, m2
%if ARCH_X86_64
pand m9, m0
%else
pand m2, m9, m0
%define m9 m2
SWAP m7, m4
%endif
pandn m0, m13
%if ARCH_X86_64
SWAP m13, m0
%else
%define m13 m0
%endif
por m13, m9
punpckhbw m15, m13, m13
punpcklbw m13, m13
psraw m15, 8
psraw m13, 8
pshufb m12, m14, m10
pshufb m14, m11
mova m10, [base+spel_s_shuf2]
movd r4d, m14
shr r4d, 24
%if ARCH_X86_32
mova [stk+0x40], m13
mova [stk+0x50], m15
pxor m2, m2
%endif
pshufb m7, m14, m2
psubb m14, m7
paddb m12, m10
paddb m14, m10
%if ARCH_X86_64
lea r6, [r4+ssq*1]
lea r11, [r4+ssq*2]
lea r13, [r4+ss3q ]
movu m1, [srcq+ssq*0]
movu m8, [srcq+ssq*2]
movu m9, [srcq+ssq*1]
movu m10, [srcq+ss3q ]
movu m7, [srcq+r4 ]
movu m2, [srcq+r11 ]
movu m3, [srcq+r6 ]
movu m4, [srcq+r13 ]
lea srcq, [srcq+ssq*4]
REPX {pshufb x, m12}, m1, m9, m8, m10
REPX {pmaddwd x, m13}, m1, m9, m8, m10
REPX {pshufb x, m14}, m7, m3, m2, m4
REPX {pmaddwd x, m15}, m7, m3, m2, m4
mova m5, [rsp+0x10]
movd xm6, [rsp+0x20]
phaddd m1, m7
phaddd m8, m2
phaddd m9, m3
phaddd m10, m4
movu m2, [srcq+ssq*0]
movu m3, [srcq+ssq*1]
REPX {paddd x, m5}, m1, m9, m8, m10
REPX {psrad x, xm6}, m1, m9, m8, m10
packssdw m1, m8 ; 0 2
packssdw m9, m10 ; 1 3
movu m0, [srcq+r4 ]
movu m8, [srcq+r6 ]
lea srcq, [srcq+ssq*2]
REPX {pshufb x, m12}, m2, m3
REPX {pmaddwd x, m13}, m2, m3
REPX {pshufb x, m14}, m0, m8
REPX {pmaddwd x, m15}, m0, m8
phaddd m2, m0
phaddd m3, m8
shr myd, 6
mov r9d, 64 << 24
lea myd, [t1+myq]
cmovnz r9q, [base+subpel_filters+myq*8]
REPX {paddd x, m5}, m2, m3
REPX {psrad x, xm6}, m2, m3
packssdw m2, m3 ; 4 5
pshufd m3, m2, q1032 ; 5 _
punpcklwd m0, m1, m9 ; 01
punpckhwd m1, m9 ; 23
punpcklwd m2, m3 ; 45
movq m10, r9
%define hrnd_mem [rsp+0x10]
%define hsh_mem [rsp+0x20]
%define vsh_mem [rsp+0x28]
%if isput
%define vrnd_mem [rsp+0x30]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%else
mova [stk+0x20], m12
mova [stk+0x30], m14
add r4, srcq
MC_4TAP_SCALED_H 0x60 ; 0 1
MC_4TAP_SCALED_H 0x70 ; 2 3
MC_4TAP_SCALED_H 0x80 ; 4 5
mov [stk+0xe0], r4
mova m3, [base+spel_s_shuf8]
mova m0, [stk+0x60]
mova m1, [stk+0x70]
mova m2, [stk+0x80]
mov myd, mym
mov rX, [esp+0x1f4]
xor r5, r5
shr myd, 6
lea rX, [rX+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+rX*8+0]
cmovnz r5, [base+subpel_filters+rX*8+4]
mov r3, r3m
pshufb m0, m3 ; 01
pshufb m1, m3 ; 23
pshufb m2, m3 ; 45
movd m7, r4
movd m4, r5
mov r5, r0m
%if isput
mov r1, r1m
%endif
mov r4, [stk+0xe0]
%define dstq r5
%define tmpq r5
%define m12 [stk+0x20]
%define m14 [stk+0x30]
%define m13 [stk+0x40]
%define m15 [stk+0x50]
%define hrnd_mem [esp+0x00]
%define hsh_mem [esp+0x10]
%define vsh_mem [esp+0x18]
%if isput
%define vrnd_mem [esp+0x20]
%else
%define vrnd_mem [base+pd_m524256]
%endif
%define m10 m7
punpckldq m10, m4
%endif
punpcklbw m10, m10
psraw m10, 8
pshufd m3, m10, q0000
pshufd m4, m10, q1111
pshufd m5, m10, q2222
pshufd m10, m10, q3333
%if ARCH_X86_32
%xdefine m8 m3
%xdefine m9 m6
%xdefine m11 m5
%xdefine m6 m4
mova [stk+0x100], m3
mova [stk+0x110], m4
mova [stk+0x120], m5
mova [stk+0x130], m10
%define m3 [stk+0x100]
%define m4 [stk+0x110]
%define m5 [stk+0x120]
%define m10 [stk+0x130]
%endif
.dy2_w4_loop:
pmaddwd m8, m0, m3
pmaddwd m9, m1, m3
mova m0, m2
pmaddwd m1, m4
pmaddwd m11, m2, m4
paddd m8, vrnd_mem
paddd m9, vrnd_mem
pmaddwd m2, m5
paddd m8, m1
paddd m9, m11
paddd m8, m2
movu m6, [srcq+ssq*0]
movu m1, [srcq+ssq*2]
%if ARCH_X86_64
movu m11, [srcq+r4 ]
movu m2, [srcq+r11]
%else
movu m11, [r4+ssq*0]
movu m2, [r4+ssq*2]
%endif
pshufb m6, m12
pshufb m1, m12
pmaddwd m6, m13
pmaddwd m1, m13
pshufb m11, m14
pshufb m2, m14
pmaddwd m11, m15
pmaddwd m2, m15
phaddd m6, m11
phaddd m1, m2
paddd m6, hrnd_mem
paddd m1, hrnd_mem
psrad m6, hsh_mem
psrad m1, hsh_mem
movu m7, [srcq+ssq*1]
movu m11, [srcq+ss3q ]
packssdw m6, m1 ; 6 8
%if ARCH_X86_64
movu m2, [srcq+r6 ]
movu m1, [srcq+r13]
%else
movu m2, [r4+ssq*1]
movu m1, [r4+ss3q ]
%endif
pshufb m7, m12
pshufb m11, m12
pmaddwd m7, m13
pmaddwd m11, m13
pshufb m2, m14
pshufb m1, m14
pmaddwd m2, m15
pmaddwd m1, m15
phaddd m7, m2
phaddd m11, m1
paddd m7, hrnd_mem
paddd m11, hrnd_mem
psrad m7, hsh_mem
psrad m11, hsh_mem
packssdw m7, m11 ; 7 9
%if ARCH_X86_32
lea r4, [r4+ssq*4]
%endif
lea srcq, [srcq+ssq*4]
punpcklwd m1, m6, m7 ; 67
punpckhwd m6, m7 ; 89
mova m2, m6
pmaddwd m11, m1, m5
pmaddwd m7, m1, m10
pmaddwd m6, m10
paddd m9, m11
%if isput
movd m11, vsh_mem
%endif
paddd m8, m7
paddd m9, m6
%if isput
psrad m8, m11
psrad m9, m11
packssdw m8, m9
pxor m7, m7
pmaxsw m8, m7
pminsw m8, pxmaxm
movq [dstq+dsq*0], m8
movhps [dstq+dsq*1], m8
lea dstq, [dstq+dsq*2]
%else
psrad m8, 6
psrad m9, 6
packssdw m8, m9
mova [tmpq], m8
add tmpq, 16
%endif
sub hd, 2
jg .dy2_w4_loop
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy2_w8:
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy2_w_start
.dy2_w16:
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy2_w_start
.dy2_w32:
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy2_w_start
.dy2_w64:
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy2_w_start
.dy2_w128:
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.dy2_w_start:
mov myd, mym
%if ARCH_X86_64
%ifidn %1, put
movifnidn dsm, dsq
%endif
mova [rsp+0x10], m11
mova [rsp+0x20], m12
%define hround m11
%if isput
mova [rsp+0x30], m13
%else
mova m13, [base+pd_m524256]
%endif
shr t0d, 16
shr myd, 6
mov r4d, 64 << 24
lea myd, [t1+myq]
cmovnz r4q, [base+subpel_filters+myq*8]
movd m15, t0d
%else
%define hround [esp+0x00]
%define m12 [esp+0x10]
%define m10 [base+pd_0x3ff]
%define m8 m0
%xdefine m14 m4
%xdefine m15 m3
%if isput
%define dstq r0
%else
%define tmpq r0
%define ssq ssm
%endif
mov r5, [esp+0x1f0]
mov r3, [esp+0x1f4]
shr r5, 16
movd m15, r5
xor r5, r5
shr myd, 6
lea r3, [r3+myd]
mov r4, 64 << 24
cmovnz r4, [base+subpel_filters+r3*8+0]
cmovnz r5, [base+subpel_filters+r3*8+4]
mov r0, r0m
mov r3, r3m
%endif
sub srcq, 6
pslld m7, m8, 2 ; dx*4
pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
pshufd m15, m15, q0000
paddd m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
movq m3, r4q
%else
movd m5, r4
movd m6, r5
punpckldq m5, m6
SWAP m3, m5
%endif
punpcklbw m3, m3
psraw m3, 8
mova [stk+0x100], m7
mova [stk+0x120], m15
mov [stk+0x0f8], srcq
mov [stk+0x130], r0q ; dstq / tmpq
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
%if ARCH_X86_64
mova [stk+0x140], m0
mova [stk+0x150], m1
mova [stk+0x160], m2
mova [stk+0x170], m3
%if UNIX64
mov hm, hd
%endif
%else
mova [stk+0x180], m0
mova [stk+0x190], m1
mova [stk+0x1a0], m2
mova [stk+0x1b0], m3
SWAP m5, m3
mov r5, hm
mov [stk+0x134], r5
%endif
jmp .dy2_hloop
.dy2_hloop_prep:
dec dword [stk+0x0f0]
jz .ret
%if ARCH_X86_64
add qword [stk+0x130], 16
mov hd, hm
%else
add dword [stk+0x130], 16
mov r5, [stk+0x134]
mov r0, [stk+0x130]
%endif
mova m7, [stk+0x100]
mova m14, [stk+0x110]
%if ARCH_X86_64
mova m10, [base+pd_0x3ff]
mova m11, [rsp+0x10]
%endif
mova m15, [stk+0x120]
mov srcq, [stk+0x0f8]
%if ARCH_X86_64
mov r0q, [stk+0x130] ; dstq / tmpq
%else
mov hm, r5
mov r0m, r0
mov r3, r3m
%endif
paddd m14, m7
.dy2_hloop:
%if ARCH_X86_64
mova m9, [base+pq_0x40000000]
%else
%define m9 [base+pq_0x40000000]
%endif
pxor m1, m1
psrld m2, m14, 10
mova [stk], m2
pand m6, m14, m10
psrld m6, 6
paddd m5, m15, m6
pcmpeqd m6, m1
pshufd m2, m5, q1032
%if ARCH_X86_64
movd r4d, m5
movd r6d, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r7d, m5
movd r9d, m2
movq m0, [base+subpel_filters+r4*8]
movq m1, [base+subpel_filters+r6*8]
movhps m0, [base+subpel_filters+r7*8]
movhps m1, [base+subpel_filters+r9*8]
%else
movd r0, m5
movd rX, m2
pshufd m5, m5, q0321
pshufd m2, m2, q0321
movd r4, m5
movd r5, m2
movq m0, [base+subpel_filters+r0*8]
movq m1, [base+subpel_filters+rX*8]
movhps m0, [base+subpel_filters+r4*8]
movhps m1, [base+subpel_filters+r5*8]
%endif
paddd m14, m7 ; mx+dx*[4-7]
pand m5, m14, m10
psrld m5, 6
paddd m15, m5
pxor m2, m2
pcmpeqd m5, m2
mova [stk+0x110], m14
pshufd m4, m15, q1032
%if ARCH_X86_64
movd r10d, m15
movd r11d, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r13d, m15
movd rXd, m4
movq m2, [base+subpel_filters+r10*8]
movq m3, [base+subpel_filters+r11*8]
movhps m2, [base+subpel_filters+r13*8]
movhps m3, [base+subpel_filters+ rX*8]
psrld m14, 10
movq r11, m14
punpckhqdq m14, m14
movq rX, m14
mov r10d, r11d
shr r11, 32
mov r13d, rXd
shr rX, 32
mov r4d, [stk+ 0]
mov r6d, [stk+ 4]
mov r7d, [stk+ 8]
mov r9d, [stk+12]
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m14, m5, q1100
pshufd m5, m5, q3322
pand m7, m9, m4
pand m8, m9, m6
pand m15, m9, m14
pand m9, m9, m5
pandn m4, m0
pandn m6, m1
pandn m14, m2
pandn m5, m3
por m7, m4
por m8, m6
por m15, m14
por m9, m5
punpcklbw m0, m7, m7
punpckhbw m7, m7
punpcklbw m1, m8, m8
punpckhbw m8, m8
psraw m0, 8
psraw m7, 8
psraw m1, 8
psraw m8, 8
punpcklbw m2, m15, m15
punpckhbw m15, m15
punpcklbw m3, m9, m9
punpckhbw m9, m9
psraw m2, 8
psraw m15, 8
psraw m3, 8
psraw m9, 8
mova [stk+0x10], m0
mova [stk+0x20], m7
mova [stk+0x30], m1
mova [stk+0x40], m8
mova [stk+0x50], m2
mova [stk+0x60], m15
mova [stk+0x70], m3
mova [stk+0x80], m9
MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
mova [stk+0x90], m1
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
mova [stk+0xa0], m2
MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
mova [stk+0xb0], m3
MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
mova [stk+0xc0], m4
MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
mova [stk+0xd0], m5
MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
mova m5, [stk+0xd0]
mova m1, [stk+0x90]
mova m2, [stk+0xa0]
mova m3, [stk+0xb0]
mova m9, [stk+0xc0]
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m8 ; 67a
punpckhwd m7, m8 ; 67b
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m9 ; 23a
punpckhwd m3, m9 ; 23b
mova m10, [stk+0x140]
mova m11, [stk+0x150]
mova m14, [stk+0x160]
mova m15, [stk+0x170]
mova [stk+0x90], m4
mova [stk+0xa0], m5
mova [stk+0xb0], m6
mova [stk+0xc0], m7
%define hround [rsp+0x10]
%define shift [rsp+0x20]
%if isput
%define vround [rsp+0x30]
%else
%define vround [base+pd_m524256]
%endif
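; dy2 vertical loop (x86-64 path): each iteration emits one output row from the
; eight most recently filtered rows, then fetches two fresh source rows so the
; interleaved 01/23/45/67 pairs each slide down by two. Rough per-lane sketch
; (illustrative only):
;   acc = r01*c01 + r23*c23 + r45*c45 + r67*c67 + vround
;   put:  dst = clip(acc >> vsh, 0, pixel_max)
;   prep: tmp = acc >> 6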
.dy2_vloop:
pmaddwd m4, m0, m10
pmaddwd m5, m1, m10
pmaddwd m6, m2, m11
pmaddwd m7, m3, m11
paddd m4, m13
paddd m5, m13
paddd m4, m6
paddd m5, m7
pmaddwd m6, [stk+0x90], m14
pmaddwd m7, [stk+0xa0], m14
pmaddwd m8, [stk+0xb0], m15
pmaddwd m9, [stk+0xc0], m15
paddd m4, m6
paddd m5, m7
%if isput
pshufd m6, m12, q1032
%endif
paddd m4, m8
paddd m5, m9
%else
movd r0, m15
movd rX, m4
pshufd m15, m15, q0321
pshufd m4, m4, q0321
movd r4, m15
movd r5, m4
mova m14, [stk+0x110]
movq m2, [base+subpel_filters+r0*8]
movq m3, [base+subpel_filters+rX*8]
movhps m2, [base+subpel_filters+r4*8]
movhps m3, [base+subpel_filters+r5*8]
psrld m14, 10
mova [stk+16], m14
mov r0, [stk+ 0]
mov rX, [stk+ 4]
mov r4, [stk+ 8]
mov r5, [stk+12]
mova [stk+0x20], m0
mova [stk+0x30], m1
mova [stk+0x40], m2
mova [stk+0x50], m3
pshufd m4, m6, q1100
pshufd m6, m6, q3322
pshufd m7, m5, q1100
pshufd m5, m5, q3322
pand m0, m9, m4
pand m1, m9, m6
pand m2, m9, m7
pand m3, m9, m5
pandn m4, [stk+0x20]
pandn m6, [stk+0x30]
pandn m7, [stk+0x40]
pandn m5, [stk+0x50]
por m0, m4
por m1, m6
por m2, m7
por m3, m5
punpcklbw m4, m0, m0
punpckhbw m0, m0
punpcklbw m5, m1, m1
punpckhbw m1, m1
psraw m4, 8
psraw m0, 8
psraw m5, 8
psraw m1, 8
punpcklbw m6, m2, m2
punpckhbw m2, m2
punpcklbw m7, m3, m3
punpckhbw m3, m3
psraw m6, 8
psraw m2, 8
psraw m7, 8
psraw m3, 8
mova [stk+0x0a0], m4
mova [stk+0x0b0], m0
mova [stk+0x0c0], m5
mova [stk+0x0d0], m1
mova [stk+0x140], m6
mova [stk+0x150], m2
mova [stk+0x160], m7
mova [stk+0x170], m3
MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
mova m5, [stk+0x60]
mova m6, [stk+0x70]
mova m7, [stk+0x80]
mova m0, [stk+0x90]
mov r0, r0m
punpcklwd m4, m5, m6 ; 45a
punpckhwd m5, m6 ; 45b
punpcklwd m6, m7, m0 ; 67a
punpckhwd m7, m0 ; 67b
mova [stk+0x60], m4
mova [stk+0x70], m5
mova [stk+0x80], m6
mova [stk+0x90], m7
mova m1, [stk+0x20]
mova m2, [stk+0x30]
mova m3, [stk+0x40]
mova m4, [stk+0x50]
punpcklwd m0, m1, m2 ; 01a
punpckhwd m1, m2 ; 01b
punpcklwd m2, m3, m4 ; 23a
punpckhwd m3, m4 ; 23b
mova m4, [stk+0x180]
mova m5, [stk+0x190]
mova m6, [stk+0x1a0]
mova m7, [stk+0x1b0]
mova [stk+0x40], m2
mova [stk+0x50], m3
.dy2_vloop:
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m2
paddd m1, m3
pmaddwd m2, [stk+0x60], m6
pmaddwd m3, [stk+0x70], m6
pmaddwd m4, [stk+0x80], m7
pmaddwd m5, [stk+0x90], m7
%if isput
movd m6, [esp+0x18]
%endif
paddd m0, m2
paddd m1, m3
paddd m0, vrnd_mem
paddd m1, vrnd_mem
paddd m4, m0
paddd m5, m1
%endif
%ifidn %1, put
psrad m4, m6
psrad m5, m6
packssdw m4, m5
pxor m7, m7
pmaxsw m4, m7
pminsw m4, pxmaxm
mova [dstq], m4
add dstq, dsm
%else
psrad m4, 6
psrad m5, 6
packssdw m4, m5
mova [tmpq], m4
add tmpq, tmp_stridem
%endif
dec hd
jz .dy2_hloop_prep
%if ARCH_X86_64
MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
mova [stk+0xd0], m4
MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
mova m4, [stk+0xd0]
mova m0, m2 ; 01a
mova m1, m3 ; 01b
mova m2, [stk+0x90] ; 23a
mova m3, [stk+0xa0] ; 23b
mova m5, [stk+0xb0] ; 45a
mova m6, [stk+0xc0] ; 45b
punpcklwd m7, m4, m8 ; 67a
punpckhwd m4, m8 ; 67b
mova [stk+0x90], m5
mova [stk+0xa0], m6
mova [stk+0xb0], m7
mova [stk+0xc0], m4
%else
mov r0m, r0
mov r3, r3m
MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
MC_8TAP_SCALED_H 0xa0, 0 ; 9
mova m7, [stk+0xe0]
mova m2, [stk+0x60] ; 23a
mova m3, [stk+0x70] ; 23b
mova m4, [stk+0x80] ; 45a
mova m5, [stk+0x90] ; 45b
punpcklwd m6, m7, m0 ; 67a
punpckhwd m7, m0 ; 67b
mova m0, [stk+0x40] ; 01a
mova m1, [stk+0x50] ; 01b
mova [stk+0x40], m2
mova [stk+0x50], m3
mova [stk+0x60], m4
mova [stk+0x70], m5
mova m4, [stk+0x180]
mova m5, [stk+0x190]
mova [stk+0x80], m6
mova [stk+0x90], m7
mova m6, [stk+0x1a0]
mova m7, [stk+0x1b0]
mov r0, r0m
%endif
jmp .dy2_vloop
INIT_XMM ssse3
.ret:
MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
%define r0m [rstk+stack_offset+ 4]
%define r1m [rstk+stack_offset+ 8]
%define r2m [rstk+stack_offset+12]
%define r3m [rstk+stack_offset+16]
%endif
%undef isput
%undef isprep
%endmacro
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
mov t0d, (5*15 << 16) | 5*15
mov t1d, (5*15 << 16) | 5*15
jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro
%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN put
FN put_8tap_scaled, sharp, SHARP, SHARP
FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN put_8tap_scaled, regular, REGULAR, REGULAR
MC_8TAP_SCALED put
%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN prep
FN prep_8tap_scaled, sharp, SHARP, SHARP
FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN prep_8tap_scaled, regular, REGULAR, REGULAR
MC_8TAP_SCALED prep
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif
%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
; that by allocating 16 bytes more stack space so that the stack offsets match.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm [esp+4*66]
%else
%define dstm r0m
%define dsm r1m
%endif
%endif
%define base filterq-$$
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8t_rnd]
%else
movddup m1, [base+warp8x8t_rnd]
mov r1, r1m
add r1, r1
mova m8, m1
mov r1m, r1 ; ds *= 2
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*4]
%else
add dstq, dsm
mov dstm, dstq
%endif
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*0], m1
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
paddd m1, m8
paddd m2, m8
psrad m1, 15
psrad m2, 15
packssdw m1, m2
mova [dstq+dsq*2], m1
dec cntd
jg .loop
RET
%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
mx, tmp, alpha, beta, \
filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
filter, mx, my
%endif
mov t0d, r7m
LEA filterq, $$
shr t0d, 11
%if ARCH_X86_64
movddup m8, [base+warp8x8_rnd2+t0*8]
movd m9, r7m ; pixel_max
pshufb m9, [base+pw_256]
%else
movddup m1, [base+warp8x8_rnd2+t0*8]
movd m2, r7m ; pixel_max
pshufb m2, [base+pw_256]
mova m8, m1
mova m9, m2
%endif
call .main
jmp .start
.loop:
%if ARCH_X86_64
lea dstq, [dstq+dsq*2]
%else
add dstq, dsm
mov dstm, dstq
%endif
call .main2
.start:
%if ARCH_X86_32
mov dstq, dstm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*0], m1
call .main3
%if ARCH_X86_32
mov dstq, dstm
add dstq, dsm
%endif
psrad m1, 16
psrad m2, 16
packssdw m1, m2
pmaxsw m1, m6
pmulhrsw m1, m8
pminsw m1, m9
mova [dstq+dsq*1], m1
dec cntd
jg .loop
RET
ALIGN function_align
.main:
; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
mov deltaq, r5m
mov mxd, r6m
%endif
movd m0, [base+warp8x8_shift+t0*4]
movddup m7, [base+warp8x8_rnd1+t0*8]
add filterq, mc_warp_filter-$$
%if ARCH_X86_64
movsx alphad, word [deltaq+2*0]
movsx betad, word [deltaq+2*1]
movsx gammad, word [deltaq+2*2]
movsx deltad, word [deltaq+2*3]
lea tmpq, [ssq*3]
add mxd, 512+(64<<10)
sub srcq, tmpq ; src -= ss*3
imul tmpd, alphad, -7
mov myd, r7m
add betad, tmpd ; beta -= alpha*7
imul tmpd, gammad, -7
add myd, 512+(64<<10)
mov cntd, 4
add deltad, tmpd ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset - gprsize
%endif
mov r3d, r5m ; abcd
%if STACK_ALIGNMENT < 16
mov r0, r1m ; dst
mov r1, r2m ; ds
mov [esp+gprsize+4*65], r0
mov [esp+gprsize+4*66], r1
%endif
movsx alphad, word [r3+2*0]
movsx r2d, word [r3+2*1]
movsx gammad, word [r3+2*2]
movsx r3d, word [r3+2*3]
imul r5d, alphad, -7
add r2d, r5d ; beta -= alpha*7
imul r5d, gammad, -7
mov [esp+gprsize+4*60], r2d
add r3d, r5d ; delta -= gamma*7
mov [esp+gprsize+4*61], r3d
mov r3d, r4m ; ss
mov srcq, r3m
mov mxd, r6m
mov myd, r7m
mov dword [esp+gprsize+4*63], 4 ; cnt
mov [esp+gprsize+4*62], r3
lea r3, [r3*3]
add mxd, 512+(64<<10)
add myd, 512+(64<<10)
sub srcq, r3 ; src -= ss*3
%if STACK_ALIGNMENT < 16
%assign stack_offset stack_offset + gprsize
%endif
%endif
mova [rsp+gprsize], m0
pxor m6, m6
call .h
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 01
punpckhwd m5, m0
mova [rsp+gprsize+16* 1], m1
mova [rsp+gprsize+16* 4], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 12
punpckhwd m5, m0
mova [rsp+gprsize+16* 7], m1
mova [rsp+gprsize+16*10], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 23
punpckhwd m5, m0
mova [rsp+gprsize+16* 2], m1
mova [rsp+gprsize+16* 5], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 34
punpckhwd m5, m0
mova [rsp+gprsize+16* 8], m1
mova [rsp+gprsize+16*11], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 45
punpckhwd m5, m0
mova [rsp+gprsize+16* 3], m1
mova [rsp+gprsize+16* 6], m5
mova m5, m0
call .h
punpcklwd m1, m5, m0 ; 56
punpckhwd m5, m0
mova [rsp+gprsize+16* 9], m1
mova [rsp+gprsize+16*12], m5
mova m5, m0
.main2:
call .h
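; WARP_V filters one 8-pixel row vertically. Per output column the filter index
; is my >> 10 and my advances by gamma; the row ends by adding delta (which had
; gamma*7 pre-subtracted in .main). Illustrative sketch:
;   for (x = 0; x < 8; x++, my += gamma)
;       coef[x] = mc_warp_filter[my >> 10];
;   my += delta;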
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
lea tmpd, [myq+gammaq]
shr myd, 10
movq m4, [filterq+myq*8] ; a
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m2, [filterq+tmpq*8] ; b
lea tmpd, [myq+gammaq]
shr myd, 10
movq m3, [filterq+myq*8] ; c
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m1, [filterq+tmpq*8] ; d
lea tmpd, [myq+gammaq]
shr myd, 10
punpcklwd m4, m2
punpcklwd m3, m1
punpckldq m2, m4, m3
punpckhdq m4, m3
punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
pmaddwd m1, [rsp+gprsize+16*%1]
punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
mova m2, [rsp+gprsize+16*%2]
pmaddwd m3, m2
mova [rsp+gprsize+16*%1], m2
paddd m1, m3
punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
mova m2, [rsp+gprsize+16*%3]
pmaddwd m3, m2
mova [rsp+gprsize+16*%2], m2
paddd m1, m3
punpcklwd m3, m5, m0 ; 67
punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
pmaddwd m2, m3
mova [rsp+gprsize+16*%3], m3
paddd m1, m2
movq m4, [filterq+myq*8] ; e
lea myd, [tmpq+gammaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8] ; f
lea tmpd, [myq+gammaq]
shr myd, 10
movq m2, [filterq+myq*8] ; g
%if ARCH_X86_64
lea myd, [tmpq+deltaq] ; my += delta
%else
mov myd, [esp+gprsize+4*61]
add myd, tmpd
%endif
shr tmpd, 10
punpcklwd m4, m3
movq m3, [filterq+tmpq*8] ; h
punpcklwd m2, m3
punpckldq m3, m4, m2
punpckhdq m4, m2
punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
pmaddwd m2, [rsp+gprsize+16*%4]
punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
mova m3, [rsp+gprsize+16*%5]
pmaddwd m6, m3
mova [rsp+gprsize+16*%4], m3
pxor m3, m3
paddd m2, m6
punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
mova m6, [rsp+gprsize+16*%6]
pmaddwd m3, m6
mova [rsp+gprsize+16*%5], m6
punpckhwd m5, m0
pxor m6, m6
paddd m2, m3
punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
pmaddwd m3, m5
mova [rsp+gprsize+16*%6], m5
mova m5, m0
paddd m2, m3
%endmacro
WARP_V 1, 2, 3, 4, 5, 6
ret
.main3:
call .h
WARP_V 7, 8, 9, 10, 11, 12
ret
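; .h filters one 8-pixel source row horizontally: per pixel the filter index is
; mx >> 10 and mx advances by alpha; at the end of the row mx advances by beta
; (pre-adjusted by -alpha*7 in .main) and srcq steps to the next row. Rough
; sketch (illustrative only):
;   for (x = 0; x < 8; x++, mx += alpha)
;       h[x] = (dot8(&src[x-3], mc_warp_filter[mx >> 10]) + rnd1) >> shift;
;   mx += beta; src += ss;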
ALIGN function_align
.h:
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
punpcklbw m0, m6, m3
movu m3, [srcq-6]
pmaddwd m0, m3 ; 0
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m2, m6, m3
movu m3, [srcq-4]
pmaddwd m2, m3 ; 1
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m0, m2 ; 0 1
punpcklbw m2, m6, m3
movu m3, [srcq-2]
pmaddwd m2, m3 ; 2
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m1, m6, m3
movu m3, [srcq+0]
pmaddwd m1, m3 ; 3
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m2, m1 ; 2 3
punpcklbw m1, m6, m3
movu m3, [srcq+2]
pmaddwd m1, m3 ; 4
lea mxd, [tmpq+alphaq]
shr tmpd, 10
movq m3, [filterq+tmpq*8]
phaddd m0, m2 ; 0 1 2 3
punpcklbw m2, m6, m3
movu m3, [srcq+4]
pmaddwd m2, m3 ; 5
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
phaddd m1, m2 ; 4 5
punpcklbw m2, m6, m3
movu m3, [srcq+6]
pmaddwd m2, m3 ; 6
%if ARCH_X86_64
lea mxd, [tmpq+betaq] ; mx += beta
%else
mov mxd, [esp+gprsize*2+4*60]
add mxd, tmpd
%endif
shr tmpd, 10
movq m3, [filterq+tmpq*8]
punpcklbw m4, m6, m3
movu m3, [srcq+8]
%if ARCH_X86_64
add srcq, ssq
%else
add srcq, [esp+gprsize*2+4*62]
%endif
pmaddwd m3, m4 ; 7
phaddd m2, m3 ; 6 7
phaddd m1, m2 ; 4 5 6 7
paddd m0, m7
paddd m1, m7
psrad m0, [rsp+gprsize*2]
psrad m1, [rsp+gprsize*2]
packssdw m0, m1
ret
%macro BIDIR_FN 0
call .main
jmp wq
.w4_loop:
call .main
lea dstq, [dstq+strideq*2]
.w4:
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
sub hd, 4
jg .w4_loop
.ret:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*2]
.w8:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jne .w8_loop
RET
.w16_loop:
call .main
add dstq, strideq
.w16:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
dec hd
jg .w16_loop
RET
.w32_loop:
call .main
add dstq, strideq
.w32:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
dec hd
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
call .main
mova [dstq+16*4], m0
mova [dstq+16*5], m1
call .main
mova [dstq+16*6], m0
mova [dstq+16*7], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
mova [dstq+16* 2], m0
mova [dstq+16* 3], m1
call .main
mova [dstq+16* 4], m0
mova [dstq+16* 5], m1
call .main
mova [dstq+16* 6], m0
mova [dstq+16* 7], m1
call .main
mova [dstq+16* 8], m0
mova [dstq+16* 9], m1
call .main
mova [dstq+16*10], m0
mova [dstq+16*11], m1
call .main
mova [dstq+16*12], m0
mova [dstq+16*13], m1
call .main
mova [dstq+16*14], m0
mova [dstq+16*15], m1
dec hd
jg .w128_loop
RET
%endmacro
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
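; avg/w_avg/mask below share the BIDIR_FN dispatch: tzcnt of the block width
; indexes a per-width entry in the function's ssse3 table and "jmp wq" lands
; directly in the matching .w* store loop, with .main producing two registers
; (16 pixels) of output per call.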
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
LEA r6, avg_ssse3_table
tzcnt wd, wm
mov t0d, r6m ; pixel_max
movsxd wq, [r6+wq*4]
shr t0d, 11
movddup m2, [base+bidir_rnd+t0*8]
movddup m3, [base+bidir_mul+t0*8]
movifnidn hd, hm
add wq, r6
BIDIR_FN
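; .main: the saturating max against bidir_rnd followed by subtracting it folds
; in the rounding offset and clamps negative sums to zero, and pmulhw by
; bidir_mul performs the final downshift; both constants are selected per
; bitdepth (pixel_max >> 11 picks the 10- vs 12-bit row).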
ALIGN function_align
.main:
mova m0, [tmp1q+16*0]
paddsw m0, [tmp2q+16*0]
mova m1, [tmp1q+16*1]
paddsw m1, [tmp2q+16*1]
add tmp1q, 16*2
add tmp2q, 16*2
pmaxsw m0, m2
pmaxsw m1, m2
psubsw m0, m2
psubsw m1, m2
pmulhw m0, m3
pmulhw m1, m3
ret
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
LEA r6, w_avg_ssse3_table
tzcnt wd, wm
mov t0d, r6m ; weight
movd m6, r7m ; pixel_max
movddup m5, [base+pd_65538]
movsxd wq, [r6+wq*4]
pshufb m6, [base+pw_256]
add wq, r6
lea r6d, [t0-16]
shl t0d, 16
sub t0d, r6d ; 16-weight, weight
paddw m5, m6
mov r6d, t0d
shl t0d, 2
test dword r7m, 0x800
cmovnz r6d, t0d
movifnidn hd, hm
movd m4, r6d
pslld m5, 7
pxor m7, m7
pshufd m4, m4, q0000
BIDIR_FN
ALIGN function_align
.main:
mova m2, [tmp1q+16*0]
mova m0, [tmp2q+16*0]
punpckhwd m3, m0, m2
punpcklwd m0, m2
mova m2, [tmp1q+16*1]
mova m1, [tmp2q+16*1]
add tmp1q, 16*2
add tmp2q, 16*2
pmaddwd m3, m4
pmaddwd m0, m4
paddd m3, m5
paddd m0, m5
psrad m3, 8
psrad m0, 8
packssdw m0, m3
punpckhwd m3, m1, m2
punpcklwd m1, m2
pmaddwd m3, m4
pmaddwd m1, m4
paddd m3, m5
paddd m1, m5
psrad m3, 8
psrad m1, 8
packssdw m1, m3
pminsw m0, m6
pminsw m1, m6
pmaxsw m0, m7
pmaxsw m1, m7
ret
%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
LEA r6, mask_ssse3_table
tzcnt wd, wm
mov t0d, r7m ; pixel_max
shr t0d, 11
movsxd wq, [r6+wq*4]
movddup m6, [base+bidir_rnd+t0*8]
movddup m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
mova m8, [base+pw_64]
movifnidn hd, hm
%endif
add wq, r6
mov maskq, r6mp
BIDIR_FN
ALIGN function_align
.main:
movq m3, [maskq+8*0]
mova m0, [tmp1q+16*0]
mova m4, [tmp2q+16*0]
pxor m5, m5
punpcklbw m3, m5
punpckhwd m2, m0, m4
punpcklwd m0, m4
psubw m1, m8, m3
punpckhwd m4, m3, m1 ; m, 64-m
punpcklwd m3, m1
pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
pmaddwd m0, m3
movq m3, [maskq+8*1]
mova m1, [tmp1q+16*1]
mova m4, [tmp2q+16*1]
add maskq, 8*2
add tmp1q, 16*2
add tmp2q, 16*2
psrad m2, 5
psrad m0, 5
packssdw m0, m2
punpcklbw m3, m5
punpckhwd m2, m1, m4
punpcklwd m1, m4
psubw m5, m8, m3
punpckhwd m4, m3, m5 ; m, 64-m
punpcklwd m3, m5
pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
pmaddwd m1, m3
psrad m2, 5
psrad m1, 5
packssdw m1, m2
pmaxsw m0, m6
pmaxsw m1, m6
psubsw m0, m6
psubsw m1, m6
pmulhw m0, m7
pmulhw m1, m7
ret
cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
LEA t0, w_mask_420_ssse3_table
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movd m0, r7m ; sign
shr r6d, 11
movsxd wq, [t0+wq*4]
%if ARCH_X86_64
mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
mova m9, [base+pw_64]
movddup m10, [base+bidir_rnd+r6*8]
movddup m11, [base+bidir_mul+r6*8]
%else
mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
mova m2, [base+pw_64]
movddup m3, [base+bidir_rnd+r6*8]
movddup m4, [base+bidir_mul+r6*8]
ALLOC_STACK -16*4
mova [rsp+16*0], m1
mova [rsp+16*1], m2
mova [rsp+16*2], m3
mova [rsp+16*3], m4
%define m8 [rsp+gprsize+16*0]
%define m9 [rsp+gprsize+16*1]
%define m10 [rsp+gprsize+16*2]
%define m11 [rsp+gprsize+16*3]
%endif
movd m7, [base+pw_2]
psubw m7, m0
pshufb m7, [base+pw_256]
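; m7 = 2 - sign, broadcast: the rounding term for the 4:2:0 chroma mask. The
; .w* loops below reduce the per-pixel m values returned by .main (in m2/m3)
; to one byte per 2x2 block, roughly (sum of the four m values + 2 - sign) >> 2.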
add wq, t0
movifnidn hd, r5m
mov maskq, r6mp
call .main
jmp wq
.w4_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 4
.w4:
movq [dstq+strideq*0], m0
phaddw m2, m3
movhps [dstq+strideq*1], m0
phaddd m2, m2
lea dstq, [dstq+strideq*2]
paddw m2, m7
movq [dstq+strideq*0], m1
psrlw m2, 2
movhps [dstq+strideq*1], m1
packuswb m2, m2
movd [maskq], m2
sub hd, 4
jg .w4_loop
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 4
.w8:
mova [dstq+strideq*0], m0
paddw m2, m3
phaddw m2, m2
mova [dstq+strideq*1], m1
paddw m2, m7
psrlw m2, 2
packuswb m2, m2
movd [maskq], m2
sub hd, 2
jg .w8_loop
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 8
.w16:
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
mova [dstq+strideq*0+16*1], m1
call .main
paddw m2, [dstq+strideq*1+16*0]
paddw m3, [dstq+strideq*1+16*1]
mova [dstq+strideq*1+16*0], m0
phaddw m2, m3
mova [dstq+strideq*1+16*1], m1
paddw m2, m7
psrlw m2, 2
packuswb m2, m2
movq [maskq], m2
sub hd, 2
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 16
.w32:
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
mova [dstq+strideq*0+16*1], m1
call .main
mova [dstq+strideq*0+16*2], m0
phaddw m2, m3
mova [dstq+strideq*1+16*3], m2
mova [dstq+strideq*0+16*3], m1
call .main
paddw m2, [dstq+strideq*1+16*0]
paddw m3, [dstq+strideq*1+16*1]
mova [dstq+strideq*1+16*0], m0
phaddw m2, m3
mova [dstq+strideq*1+16*2], m2
mova [dstq+strideq*1+16*1], m1
call .main
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16*2]
paddw m2, [dstq+strideq*1+16*3]
mova [dstq+strideq*1+16*2], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16*3], m1
packuswb m3, m2
mova [maskq], m3
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 16*2
.w64:
mova [dstq+strideq*1+16*1], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*2], m3
mova [dstq+strideq*0+16*1], m1
call .main
mova [dstq+strideq*1+16*3], m2
mova [dstq+strideq*0+16*2], m0
mova [dstq+strideq*1+16*4], m3
mova [dstq+strideq*0+16*3], m1
call .main
mova [dstq+strideq*1+16*5], m2
mova [dstq+strideq*0+16*4], m0
mova [dstq+strideq*1+16*6], m3
mova [dstq+strideq*0+16*5], m1
call .main
mova [dstq+strideq*0+16*6], m0
phaddw m2, m3
mova [dstq+strideq*1+16*7], m2
mova [dstq+strideq*0+16*7], m1
call .main
paddw m2, [dstq+strideq*1+16*1]
paddw m3, [dstq+strideq*1+16*2]
mova [dstq+strideq*1+16*0], m0
phaddw m2, m3
mova [dstq+strideq*1+16*2], m2
mova [dstq+strideq*1+16*1], m1
call .main
paddw m2, [dstq+strideq*1+16*3]
paddw m3, [dstq+strideq*1+16*4]
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16*2]
mova [dstq+strideq*1+16*2], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16*3], m1
packuswb m3, m2
mova [maskq+16*0], m3
call .main
paddw m2, [dstq+strideq*1+16*5]
paddw m3, [dstq+strideq*1+16*6]
mova [dstq+strideq*1+16*4], m0
phaddw m2, m3
mova [dstq+strideq*1+16*6], m2
mova [dstq+strideq*1+16*5], m1
call .main
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16*6]
paddw m2, [dstq+strideq*1+16*7]
mova [dstq+strideq*1+16*6], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16*7], m1
packuswb m3, m2
mova [maskq+16*1], m3
sub hd, 2
jg .w64_loop
RET
.w128_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 16*4
.w128:
mova [dstq+strideq*1+16* 1], m2
mova [dstq+strideq*0+16* 0], m0
mova [dstq+strideq*1+16* 2], m3
mova [dstq+strideq*0+16* 1], m1
call .main
mova [dstq+strideq*1+16* 3], m2
mova [dstq+strideq*0+16* 2], m0
mova [dstq+strideq*1+16* 4], m3
mova [dstq+strideq*0+16* 3], m1
call .main
mova [dstq+strideq*1+16* 5], m2
mova [dstq+strideq*0+16* 4], m0
mova [dstq+strideq*1+16* 6], m3
mova [dstq+strideq*0+16* 5], m1
call .main
mova [dstq+strideq*1+16* 7], m2
mova [dstq+strideq*0+16* 6], m0
mova [dstq+strideq*1+16* 8], m3
mova [dstq+strideq*0+16* 7], m1
call .main
mova [dstq+strideq*1+16* 9], m2
mova [dstq+strideq*0+16* 8], m0
mova [dstq+strideq*1+16*10], m3
mova [dstq+strideq*0+16* 9], m1
call .main
mova [dstq+strideq*1+16*11], m2
mova [dstq+strideq*0+16*10], m0
mova [dstq+strideq*1+16*12], m3
mova [dstq+strideq*0+16*11], m1
call .main
mova [dstq+strideq*1+16*13], m2
mova [dstq+strideq*0+16*12], m0
mova [dstq+strideq*1+16*14], m3
mova [dstq+strideq*0+16*13], m1
call .main
mova [dstq+strideq*0+16*14], m0
phaddw m2, m3
mova [dstq+strideq*1+16*15], m2
mova [dstq+strideq*0+16*15], m1
call .main
paddw m2, [dstq+strideq*1+16* 1]
paddw m3, [dstq+strideq*1+16* 2]
mova [dstq+strideq*1+16* 0], m0
phaddw m2, m3
mova [dstq+strideq*1+16* 2], m2
mova [dstq+strideq*1+16* 1], m1
call .main
paddw m2, [dstq+strideq*1+16* 3]
paddw m3, [dstq+strideq*1+16* 4]
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16* 2]
mova [dstq+strideq*1+16* 2], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16* 3], m1
packuswb m3, m2
mova [maskq+16*0], m3
call .main
paddw m2, [dstq+strideq*1+16* 5]
paddw m3, [dstq+strideq*1+16* 6]
mova [dstq+strideq*1+16* 4], m0
phaddw m2, m3
mova [dstq+strideq*1+16* 6], m2
mova [dstq+strideq*1+16* 5], m1
call .main
paddw m2, [dstq+strideq*1+16* 7]
paddw m3, [dstq+strideq*1+16* 8]
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16* 6]
mova [dstq+strideq*1+16* 6], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16* 7], m1
packuswb m3, m2
mova [maskq+16*1], m3
call .main
paddw m2, [dstq+strideq*1+16* 9]
paddw m3, [dstq+strideq*1+16*10]
mova [dstq+strideq*1+16* 8], m0
phaddw m2, m3
mova [dstq+strideq*1+16*10], m2
mova [dstq+strideq*1+16* 9], m1
call .main
paddw m2, [dstq+strideq*1+16*11]
paddw m3, [dstq+strideq*1+16*12]
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16*10]
mova [dstq+strideq*1+16*10], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16*11], m1
packuswb m3, m2
mova [maskq+16*2], m3
call .main
paddw m2, [dstq+strideq*1+16*13]
paddw m3, [dstq+strideq*1+16*14]
mova [dstq+strideq*1+16*12], m0
phaddw m2, m3
mova [dstq+strideq*1+16*14], m2
mova [dstq+strideq*1+16*13], m1
call .main
phaddw m2, m3
paddw m3, m7, [dstq+strideq*1+16*14]
paddw m2, [dstq+strideq*1+16*15]
mova [dstq+strideq*1+16*14], m0
paddw m2, m7
psrlw m3, 2
psrlw m2, 2
mova [dstq+strideq*1+16*15], m1
packuswb m3, m2
mova [maskq+16*3], m3
sub hd, 2
jg .w128_loop
RET
ALIGN function_align
.main:
%macro W_MASK 2 ; dst/tmp_offset, mask
mova m%1, [tmp1q+16*%1]
mova m%2, [tmp2q+16*%1]
punpcklwd m4, m%2, m%1
punpckhwd m5, m%2, m%1
psubsw m%1, m%2
pabsw m%1, m%1
psubusw m6, m8, m%1
psrlw m6, 10 ; 64-m
psubw m%2, m9, m6 ; m
punpcklwd m%1, m6, m%2
punpckhwd m6, m%2
pmaddwd m%1, m4
pmaddwd m6, m5
psrad m%1, 5
psrad m6, 5
packssdw m%1, m6
pmaxsw m%1, m10
psubsw m%1, m10
pmulhw m%1, m11
%endmacro
W_MASK 0, 2
W_MASK 1, 3
add tmp1q, 16*2
add tmp2q, 16*2
ret
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
LEA t0, w_mask_422_ssse3_table
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movd m7, r7m ; sign
shr r6d, 11
movsxd wq, [t0+wq*4]
%if ARCH_X86_64
mova m8, [base+pw_27615]
mova m9, [base+pw_64]
movddup m10, [base+bidir_rnd+r6*8]
movddup m11, [base+bidir_mul+r6*8]
%else
mova m1, [base+pw_27615]
mova m2, [base+pw_64]
movddup m3, [base+bidir_rnd+r6*8]
movddup m4, [base+bidir_mul+r6*8]
ALLOC_STACK -16*4
mova [rsp+16*0], m1
mova [rsp+16*1], m2
mova [rsp+16*2], m3
mova [rsp+16*3], m4
%endif
pxor m0, m0
add wq, t0
pshufb m7, m0
movifnidn hd, r5m
mov maskq, r6mp
call .main
jmp wq
.w4_loop:
call .main
lea dstq, [dstq+strideq*2]
.w4:
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
sub hd, 4
jg .w4_loop
.end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*2]
.w8:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*2]
.w16:
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
mova [dstq+strideq*1+16*0], m0
mova [dstq+strideq*1+16*1], m1
sub hd, 2
jg .w16_loop
RET
.w32_loop:
call .main
add dstq, strideq
.w32:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
dec hd
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
call .main
mova [dstq+16*4], m0
mova [dstq+16*5], m1
call .main
mova [dstq+16*6], m0
mova [dstq+16*7], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
mova [dstq+16* 2], m0
mova [dstq+16* 3], m1
call .main
mova [dstq+16* 4], m0
mova [dstq+16* 5], m1
call .main
mova [dstq+16* 6], m0
mova [dstq+16* 7], m1
call .main
mova [dstq+16* 8], m0
mova [dstq+16* 9], m1
call .main
mova [dstq+16*10], m0
mova [dstq+16*11], m1
call .main
mova [dstq+16*12], m0
mova [dstq+16*13], m1
call .main
mova [dstq+16*14], m0
mova [dstq+16*15], m1
dec hd
jg .w128_loop
RET
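; 422 .main: same W_MASK blend as 420, but the mask is only halved horizontally;
; roughly mask_byte = (m0 + m1 + 1 - sign) >> 1, with psubb applying the sign
; and pavgb against zero doing the +1 and shift.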
ALIGN function_align
.main:
W_MASK 0, 2
W_MASK 1, 3
phaddw m2, m3
add tmp1q, 16*2
add tmp2q, 16*2
packuswb m2, m2
pxor m3, m3
psubb m2, m7
pavgb m2, m3
movq [maskq], m2
add maskq, 8
ret
cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
LEA t0, w_mask_444_ssse3_table
tzcnt wd, wm
mov r6d, r8m ; pixel_max
shr r6d, 11
movsxd wq, [t0+wq*4]
%if ARCH_X86_64
mova m8, [base+pw_27615]
mova m9, [base+pw_64]
movddup m10, [base+bidir_rnd+r6*8]
movddup m11, [base+bidir_mul+r6*8]
%else
mova m1, [base+pw_27615]
mova m2, [base+pw_64]
movddup m3, [base+bidir_rnd+r6*8]
movddup m7, [base+bidir_mul+r6*8]
ALLOC_STACK -16*3
mova [rsp+16*0], m1
mova [rsp+16*1], m2
mova [rsp+16*2], m3
%define m11 m7
%endif
add wq, t0
movifnidn hd, r5m
mov maskq, r6mp
call .main
jmp wq
.w4_loop:
call .main
lea dstq, [dstq+strideq*2]
.w4:
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
sub hd, 4
jg .w4_loop
.end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*2]
.w8:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*2]
.w16:
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
mova [dstq+strideq*1+16*0], m0
mova [dstq+strideq*1+16*1], m1
sub hd, 2
jg .w16_loop
RET
.w32_loop:
call .main
add dstq, strideq
.w32:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
dec hd
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
mova [dstq+16*2], m0
mova [dstq+16*3], m1
call .main
mova [dstq+16*4], m0
mova [dstq+16*5], m1
call .main
mova [dstq+16*6], m0
mova [dstq+16*7], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
mova [dstq+16* 2], m0
mova [dstq+16* 3], m1
call .main
mova [dstq+16* 4], m0
mova [dstq+16* 5], m1
call .main
mova [dstq+16* 6], m0
mova [dstq+16* 7], m1
call .main
mova [dstq+16* 8], m0
mova [dstq+16* 9], m1
call .main
mova [dstq+16*10], m0
mova [dstq+16*11], m1
call .main
mova [dstq+16*12], m0
mova [dstq+16*13], m1
call .main
mova [dstq+16*14], m0
mova [dstq+16*15], m1
dec hd
jg .w128_loop
RET
ALIGN function_align
.main:
W_MASK 0, 2
W_MASK 1, 3
packuswb m2, m3
add tmp1q, 16*2
add tmp2q, 16*2
mova [maskq], m2
add maskq, 16
ret
; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
; except m << 9 overflows int16_t when m == 64 (which is possible),
; but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
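; A rough scalar cross-check of the identity above (illustrative only, not part
; of the build; a = dst pixel, b = tmp pixel, m = mask value in [0,64]):
;   int16_t d = a - b;                            // psubw
;   int16_t w = m * -512;                         // pmullw with pw_m512
;   int     r = ((d * w + (1 << 14)) >> 15) + a;  // pmulhrsw + paddw
;   assert(r == ((a * (64 - m) + b * m + 32) >> 6));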
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
LEA r6, blend_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
movifnidn maskq, maskmp
mova m7, [base+pw_m512]
add wq, r6
lea stride3q, [strideq*3]
pxor m6, m6
jmp wq
.w4:
mova m5, [maskq]
movq m0, [dstq+strideq*0]
movhps m0, [dstq+strideq*1]
movq m1, [dstq+strideq*2]
movhps m1, [dstq+stride3q ]
psubw m2, m0, [tmpq+16*0]
psubw m3, m1, [tmpq+16*1]
add maskq, 16
add tmpq, 32
punpcklbw m4, m5, m6
punpckhbw m5, m6
pmullw m4, m7
pmullw m5, m7
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
RET
.w8:
mova m5, [maskq]
mova m0, [dstq+strideq*0]
mova m1, [dstq+strideq*1]
psubw m2, m0, [tmpq+16*0]
psubw m3, m1, [tmpq+16*1]
add maskq, 16
add tmpq, 32
punpcklbw m4, m5, m6
punpckhbw m5, m6
pmullw m4, m7
pmullw m5, m7
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8
RET
.w16:
mova m5, [maskq]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
psubw m2, m0, [tmpq+16*0]
psubw m3, m1, [tmpq+16*1]
add maskq, 16
add tmpq, 32
punpcklbw m4, m5, m6
punpckhbw m5, m6
pmullw m4, m7
pmullw m5, m7
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w16
RET
.w32:
mova m5, [maskq+16*0]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
psubw m2, m0, [tmpq+16*0]
psubw m3, m1, [tmpq+16*1]
punpcklbw m4, m5, m6
punpckhbw m5, m6
pmullw m4, m7
pmullw m5, m7
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova m5, [maskq+16*1]
mova m0, [dstq+16*2]
mova m1, [dstq+16*3]
psubw m2, m0, [tmpq+16*2]
psubw m3, m1, [tmpq+16*3]
add maskq, 32
add tmpq, 64
punpcklbw m4, m5, m6
punpckhbw m5, m6
pmullw m4, m7
pmullw m5, m7
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
dec hd
jg .w32
RET
cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
%define base r5-blend_v_ssse3_table
LEA r5, blend_v_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
jmp wq
.w2:
movd m4, [base+obmc_masks+2*2]
.w2_loop:
movd m0, [dstq+strideq*0]
movd m2, [tmpq+4*0]
movd m1, [dstq+strideq*1]
movd m3, [tmpq+4*1]
add tmpq, 4*2
psubw m2, m0
psubw m3, m1
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w2_loop
RET
.w4:
movddup m2, [base+obmc_masks+4*2]
.w4_loop:
movq m0, [dstq+strideq*0]
movhps m0, [dstq+strideq*1]
mova m1, [tmpq]
add tmpq, 8*2
psubw m1, m0
pmulhrsw m1, m2
paddw m0, m1
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w4_loop
RET
.w8:
mova m4, [base+obmc_masks+8*2]
.w8_loop:
mova m0, [dstq+strideq*0]
mova m2, [tmpq+16*0]
mova m1, [dstq+strideq*1]
mova m3, [tmpq+16*1]
add tmpq, 16*2
psubw m2, m0
psubw m3, m1
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
.w16:
mova m4, [base+obmc_masks+16*2]
movq m5, [base+obmc_masks+16*3]
.w16_loop:
mova m0, [dstq+16*0]
mova m2, [tmpq+16*0]
mova m1, [dstq+16*1]
mova m3, [tmpq+16*1]
add tmpq, 16*2
psubw m2, m0
psubw m3, m1
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w16_loop
RET
.w32:
%if WIN64
movaps [rsp+8], m6
%endif
mova m4, [base+obmc_masks+16*4]
mova m5, [base+obmc_masks+16*5]
mova m6, [base+obmc_masks+16*6]
.w32_loop:
mova m0, [dstq+16*0]
mova m2, [tmpq+16*0]
mova m1, [dstq+16*1]
mova m3, [tmpq+16*1]
psubw m2, m0
psubw m3, m1
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
mova m2, [dstq+16*2]
paddw m1, m3
mova m3, [tmpq+16*2]
add tmpq, 16*4
psubw m3, m2
pmulhrsw m3, m6
paddw m2, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
add dstq, strideq
dec hd
jg .w32_loop
%if WIN64
movaps m6, [rsp+8]
%endif
RET
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
mova m0, [dstq+16*(%1+0)]
mova m2, [tmpq+16*(%2+0)]
mova m1, [dstq+16*(%1+1)]
mova m3, [tmpq+16*(%2+1)]
%if %3
add tmpq, 16*%3
%endif
psubw m2, m0
psubw m3, m1
pmulhrsw m2, m5
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+16*(%1+0)], m0
mova [dstq+16*(%1+1)], m1
%endmacro
cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
LEA r6, blend_h_ssse3_table
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
movddup m4, [base+blend_shuf]
lea maskq, [base+obmc_masks+hq*2]
lea hd, [hq*3]
add wq, r6
shr hd, 2 ; h * 3/4
lea maskq, [maskq+hq*2]
neg hq
jmp wq
.w2:
movd m0, [dstq+dsq*0]
movd m2, [dstq+dsq*1]
movd m3, [maskq+hq*2]
movq m1, [tmpq]
add tmpq, 4*2
punpckldq m0, m2
punpcklwd m3, m3
psubw m1, m0
pmulhrsw m1, m3
paddw m0, m1
movd [dstq+dsq*0], m0
psrlq m0, 32
movd [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w2
RET
.w4:
mova m3, [base+blend_shuf]
.w4_loop:
movq m0, [dstq+dsq*0]
movhps m0, [dstq+dsq*1]
movd m2, [maskq+hq*2]
mova m1, [tmpq]
add tmpq, 8*2
psubw m1, m0
pshufb m2, m3
pmulhrsw m1, m2
paddw m0, m1
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w4_loop
RET
.w8:
movddup m5, [base+blend_shuf+8]
%if WIN64
movaps [rsp+ 8], m6
movaps [rsp+24], m7
%endif
.w8_loop:
movd m7, [maskq+hq*2]
mova m0, [dstq+dsq*0]
mova m2, [tmpq+16*0]
mova m1, [dstq+dsq*1]
mova m3, [tmpq+16*1]
add tmpq, 16*2
pshufb m6, m7, m4
psubw m2, m0
pshufb m7, m5
psubw m3, m1
pmulhrsw m2, m6
pmulhrsw m3, m7
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w8_loop
%if WIN64
movaps m6, [rsp+ 8]
movaps m7, [rsp+24]
%endif
RET
.w16:
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0, 2
add dstq, dsq
inc hq
jl .w16
RET
.w32:
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
BLEND_H_ROW 2, 2, 4
add dstq, dsq
inc hq
jl .w32
RET
.w64:
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
BLEND_H_ROW 2, 2
BLEND_H_ROW 4, 4
BLEND_H_ROW 6, 6, 8
add dstq, dsq
inc hq
jl .w64
RET
.w128:
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
BLEND_H_ROW 2, 2
BLEND_H_ROW 4, 4
BLEND_H_ROW 6, 6, 16
BLEND_H_ROW 8, -8
BLEND_H_ROW 10, -6
BLEND_H_ROW 12, -4
BLEND_H_ROW 14, -2
add dstq, dsq
inc hq
jl .w128
RET
; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh: total filled size
; iw, ih: copied block -> fill bottom, right
; x, y:   offset in bw/bh -> fill top, left
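; In outline: the source pointer is clamped into the valid iw x ih block, the
; center is copied, the first/last valid column is replicated into the left and
; right extensions, and finally whole rows are replicated downwards (bottom_ext)
; and upwards (top_ext); see the iclip comments below for the exact ranges.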
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
y, dst, dstride, src, sstride, \
bottomext, rightext, blk
; we assume that the buffer (stride) is larger than width, so we can
; safely overwrite by a few bytes
%if ARCH_X86_64
%define reg_zero r12q
%define reg_tmp r10
%define reg_src srcq
%define reg_bottomext bottomextq
%define reg_rightext rightextq
%define reg_blkm r9m
%else
%define reg_zero r6
%define reg_tmp r0
%define reg_src r1
%define reg_bottomext r0
%define reg_rightext r1
%define reg_blkm r2m
%endif
;
; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
xor reg_zero, reg_zero
lea reg_tmp, [ihq-1]
cmp yq, ihq
cmovs reg_tmp, yq
test yq, yq
cmovs reg_tmp, reg_zero
%if ARCH_X86_64
imul reg_tmp, sstrideq
add srcq, reg_tmp
%else
imul reg_tmp, sstridem
mov reg_src, srcm
add reg_src, reg_tmp
%endif
;
; ref += iclip(x, 0, iw - 1)
lea reg_tmp, [iwq-1]
cmp xq, iwq
cmovs reg_tmp, xq
test xq, xq
cmovs reg_tmp, reg_zero
lea reg_src, [reg_src+reg_tmp*2]
%if ARCH_X86_32
mov srcm, reg_src
%endif
;
; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
mov r1, r1m ; restore bh
%endif
lea reg_bottomext, [yq+bhq]
sub reg_bottomext, ihq
lea r3, [bhq-1]
cmovs reg_bottomext, reg_zero
;
DEFINE_ARGS bw, bh, iw, ih, x, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; top_ext = iclip(-y, 0, bh - 1)
neg topextq
cmovs topextq, reg_zero
cmp reg_bottomext, bhq
cmovns reg_bottomext, r3
cmp topextq, bhq
cmovg topextq, r3
%if ARCH_X86_32
mov r4m, reg_bottomext
;
; right_ext = iclip(x + bw - iw, 0, bw - 1)
mov r0, r0m ; restore bw
%endif
lea reg_rightext, [xq+bwq]
sub reg_rightext, iwq
lea r2, [bwq-1]
cmovs reg_rightext, reg_zero
DEFINE_ARGS bw, bh, iw, ih, leftext, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; left_ext = iclip(-x, 0, bw - 1)
neg leftextq
cmovs leftextq, reg_zero
cmp reg_rightext, bwq
cmovns reg_rightext, r2
%if ARCH_X86_32
mov r3m, r1
%endif
cmp leftextq, bwq
cmovns leftextq, r2
%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext
DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
topext, dst, dstride, src, sstride, \
bottomext, rightext, blk
; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
lea r3, [bottomextq+topextq]
sub centerhq, r3
%else
mov r1, centerhm ; restore r1
sub centerhq, topextq
sub centerhq, r4m
mov r1m, centerhq
%endif
;
; blk += top_ext * PXSTRIDE(dst_stride)
mov r2, topextq
%if ARCH_X86_64
imul r2, dstrideq
%else
mov r6, r6m ; restore dstq
imul r2, dstridem
%endif
add dstq, r2
mov reg_blkm, dstq ; save pointer for ext
;
; center_w = bw - left_ext - right_ext
mov centerwq, bwq
%if ARCH_X86_64
lea r3, [rightextq+leftextq]
sub centerwq, r3
%else
sub centerwq, r3m
sub centerwq, leftextq
%endif
; v_loop macro
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
%if ARCH_X86_64
%define reg_tmp r12
%else
%define reg_tmp r0
%endif
.v_loop_%3:
%if ARCH_X86_32
mov r0, r0m
mov r1, r1m
%endif
%if %1
; left extension
%if ARCH_X86_64
movd m0, [srcq]
%else
mov r3, srcm
movd m0, [r3]
%endif
pshuflw m0, m0, q0000
punpcklqdq m0, m0
xor r3, r3
.left_loop_%3:
mova [dstq+r3*2], m0
add r3, mmsize/2
cmp r3, leftextq
jl .left_loop_%3
; body
lea reg_tmp, [dstq+leftextq*2]
%endif
xor r3, r3
.body_loop_%3:
%if ARCH_X86_64
movu m0, [srcq+r3*2]
%else
mov r1, srcm
movu m0, [r1+r3*2]
%endif
%if %1
movu [reg_tmp+r3*2], m0
%else
movu [dstq+r3*2], m0
%endif
add r3, mmsize/2
cmp r3, centerwq
jl .body_loop_%3
%if %2
; right extension
%if %1
lea reg_tmp, [reg_tmp+centerwq*2]
%else
lea reg_tmp, [dstq+centerwq*2]
%endif
%if ARCH_X86_64
movd m0, [srcq+centerwq*2-2]
%else
mov r3, srcm
movd m0, [r3+centerwq*2-2]
%endif
pshuflw m0, m0, q0000
punpcklqdq m0, m0
xor r3, r3
.right_loop_%3:
movu [reg_tmp+r3*2], m0
add r3, mmsize/2
%if ARCH_X86_64
cmp r3, rightextq
%else
cmp r3, r3m
%endif
jl .right_loop_%3
%endif
%if ARCH_X86_64
add dstq, dstrideq
add srcq, sstrideq
dec centerhq
jg .v_loop_%3
%else
add dstq, dstridem
mov r0, sstridem
add srcm, r0
sub dword centerhm, 1
jg .v_loop_%3
mov r0, r0m ; restore r0
%endif
%endmacro ; v_loop macro
test leftextq, leftextq
jnz .need_left_ext
%if ARCH_X86_64
test rightextq, rightextq
jnz .need_right_ext
%else
cmp leftextq, r3m ; leftextq == 0
jne .need_right_ext
%endif
v_loop 0, 0, 0
jmp .body_done
; left/right extension variants
.need_left_ext:
%if ARCH_X86_64
test rightextq, rightextq
%else
mov r3, r3m
test r3, r3
%endif
jnz .need_left_right_ext
v_loop 1, 0, 1
jmp .body_done
.need_left_right_ext:
v_loop 1, 1, 2
jmp .body_done
.need_right_ext:
v_loop 0, 1, 3
.body_done:
; r0: bw
; r1: x loop
; r4: y loop
; r5: topextq
; r6: dstq
; r7: dstrideq
; r8: srcq
%if ARCH_X86_64
%define reg_dstride dstrideq
%else
%define reg_dstride r2
%endif
;
; bottom edge extension
%if ARCH_X86_64
test bottomextq, bottomextq
jz .top
%else
xor r1, r1
cmp r1, r4m
je .top
%endif
;
%if ARCH_X86_64
mov srcq, dstq
sub srcq, dstrideq
xor r1, r1
%else
mov r3, dstq
mov reg_dstride, dstridem
sub r3, reg_dstride
mov srcm, r3
%endif
;
.bottom_x_loop:
%if ARCH_X86_64
mova m0, [srcq+r1*2]
lea r3, [dstq+r1*2]
mov r4, bottomextq
%else
mov r3, srcm
mova m0, [r3+r1*2]
lea r3, [dstq+r1*2]
mov r4, r4m
%endif
;
.bottom_y_loop:
mova [r3], m0
add r3, reg_dstride
dec r4
jg .bottom_y_loop
add r1, mmsize/2
cmp r1, bwq
jl .bottom_x_loop
.top:
; top edge extension
test topextq, topextq
jz .end
%if ARCH_X86_64
mov srcq, reg_blkm
%else
mov r3, reg_blkm
mov reg_dstride, dstridem
%endif
mov dstq, dstm
xor r1, r1
;
.top_x_loop:
%if ARCH_X86_64
mova m0, [srcq+r1*2]
%else
mov r3, reg_blkm
mova m0, [r3+r1*2]
%endif
lea r3, [dstq+r1*2]
mov r4, topextq
;
.top_y_loop:
mova [r3], m0
add r3, reg_dstride
dec r4
jg .top_y_loop
add r1, mmsize/2
cmp r1, bwq
jl .top_x_loop
.end:
RET
%undef reg_dstride
%undef reg_blkm
%undef reg_tmp
%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
SWAP %1, %2
%endif
%endmacro
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
%endif
movifnidn dstq, dstmp
movifnidn srcq, srcmp
%if STACK_ALIGNMENT >= 16
movifnidn dst_wd, dst_wm
%endif
%if ARCH_X86_64
movifnidn hd, hm
%endif
sub dword mx0m, 4<<14
sub dword src_wm, 8
movd m4, pxmaxm
movd m7, dxm
movd m6, mx0m
movd m5, src_wm
punpcklwd m4, m4
pshufd m4, m4, q0000
pshufd m7, m7, q0000
pshufd m6, m6, q0000
pshufd m5, m5, q0000
mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
%else
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
LEA r6, $$
%define base r6-$$
%else
LEA r4, $$
%define base r4-$$
%endif
%endif
%if ARCH_X86_64
mova m12, [base+pd_64]
mova m11, [base+pd_63]
%else
%define m12 [base+pd_64]
%define m11 [base+pd_63]
%endif
pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
pslld m7, 2 ; dx*4
pslld m5, 14
paddd m6, m4 ; mx+[0..3]*dx
SCRATCH 7, 15, 0
SCRATCH 6, 14, 1
SCRATCH 5, 13, 2
pxor m1, m1
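; Per output pixel the 14-bit fractional source position is mx0 + x*dx; a rough
; sketch of the addressing below (illustrative only):
;   pos   = mx0 + x*dx
;   src_x = pos >> 14, clamped to [0, src_w - 8]
;   phase = (pos >> 8) & 63          ; selects one of 64 8-tap resize filters
; positions clamped at either edge additionally go through a resize_shuf pshufb
; for edge emulation.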
.loop_y:
xor xd, xd
mova m0, m14 ; per-line working version of mx
.loop_x:
pcmpgtd m1, m0
pandn m1, m0
psrad m2, m0, 8 ; filter offset (unmasked)
pcmpgtd m3, m13, m1
pand m1, m3
pandn m3, m13
por m1, m3
psubd m3, m0, m1 ; pshufb offset
psrad m1, 14 ; clipped src_x offset
psrad m3, 14 ; pshufb edge_emu offset
pand m2, m11 ; filter offset (masked)
; load source pixels
%if ARCH_X86_64
movd r8d, m1
pshuflw m1, m1, q3232
movd r9d, m1
punpckhqdq m1, m1
movd r10d, m1
psrlq m1, 32
movd r11d, m1
movu m4, [srcq+r8*2]
movu m5, [srcq+r9*2]
movu m6, [srcq+r10*2]
movu m7, [srcq+r11*2]
; if no emulation is required, we don't need to shuffle or emulate edges
packssdw m3, m3
movq r11, m3
test r11, r11
jz .filter
movsx r8, r11w
sar r11, 16
movsx r9, r11w
sar r11, 16
movsx r10, r11w
sar r11, 16
movu m1, [base+resize_shuf+8+r8*2]
movu m3, [base+resize_shuf+8+r9*2]
movu m8, [base+resize_shuf+8+r10*2]
movu m9, [base+resize_shuf+8+r11*2]
pshufb m4, m1
pshufb m5, m3
pshufb m6, m8
pshufb m7, m9
.filter:
movd r8d, m2
pshuflw m2, m2, q3232
movd r9d, m2
punpckhqdq m2, m2
movd r10d, m2
psrlq m2, 32
movd r11d, m2
movq m8, [base+resize_filter+r8*8]
movq m2, [base+resize_filter+r9*8]
pxor m9, m9
punpcklbw m1, m9, m8
punpcklbw m3, m9, m2
psraw m1, 8
psraw m3, 8
movq m10, [base+resize_filter+r10*8]
movq m2, [base+resize_filter+r11*8]
punpcklbw m8, m9, m10
punpcklbw m9, m2
psraw m8, 8
psraw m9, 8
pmaddwd m4, m1
pmaddwd m5, m3
pmaddwd m6, m8
pmaddwd m7, m9
phaddd m4, m5
%else
movd r3, m1
pshuflw m1, m1, q3232
movd r1, m1
punpckhqdq m1, m1
movu m4, [srcq+r3*2]
movu m5, [srcq+r1*2]
movd r3, m1
psrlq m1, 32
movd r1, m1
movu m6, [srcq+r3*2]
movu m7, [srcq+r1*2]
; if no emulation is required, we don't need to shuffle or emulate edges
pxor m1, m1
pcmpeqb m1, m3
pmovmskb r3d, m1
cmp r3d, 0xffff
je .filter
movd r3, m3
movu m1, [base+resize_shuf+8+r3*2]
pshuflw m3, m3, q3232
movd r1, m3
pshufb m4, m1
movu m1, [base+resize_shuf+8+r1*2]
punpckhqdq m3, m3
movd r3, m3
pshufb m5, m1
movu m1, [base+resize_shuf+8+r3*2]
psrlq m3, 32
movd r1, m3
pshufb m6, m1
movu m1, [base+resize_shuf+8+r1*2]
pshufb m7, m1
.filter:
mova [esp+4*16], m6
mova [esp+5*16], m7
movd r3, m2
pshuflw m2, m2, q3232
movd r1, m2
movq m6, [base+resize_filter+r3*8]
movq m7, [base+resize_filter+r1*8]
pxor m3, m3
punpcklbw m1, m3, m6
punpcklbw m3, m7
psraw m1, 8
psraw m3, 8
pmaddwd m4, m1
pmaddwd m5, m3
punpckhqdq m2, m2
movd r3, m2
psrlq m2, 32
movd r1, m2
phaddd m4, m5
movq m2, [base+resize_filter+r3*8]
movq m5, [base+resize_filter+r1*8]
mova m6, [esp+4*16]
mova m7, [esp+5*16]
pxor m3, m3
punpcklbw m1, m3, m2
punpcklbw m3, m5
psraw m1, 8
psraw m3, 8
pmaddwd m6, m1
pmaddwd m7, m3
%endif
phaddd m6, m7
phaddd m4, m6
pxor m1, m1
psubd m2, m12, m4
psrad m2, 7
packssdw m2, m2
pmaxsw m2, m1
pminsw m2, [rsp+16*3*ARCH_X86_32]
movq [dstq+xq*2], m2
paddd m0, m15
add xd, 4
%if STACK_ALIGNMENT >= 16
cmp xd, dst_wd
%else
cmp xd, dst_wm
%endif
jl .loop_x
add dstq, dst_stridemp
add srcq, src_stridemp
dec hd
jg .loop_y
RET