; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
dd 1
pw_2048: times 2 dw 2048
dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4
pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
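; jump table helpers: each table stores per-width offsets of the corresponding
; width-specialized label relative to a base symbol, so dispatch is a single
; indexed load, add and jmp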
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter
SECTION .text
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif
INIT_ZMM avx512icl
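; bilinear put: plain copy when mx == my == 0, otherwise horizontal (.h),
; vertical (.v) or separable (.hv) bilinear filtering, dispatched per width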
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
mov mxyd, r6m ; mx
lea r7, [put_avx512icl]
tzcnt t0d, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx t0d, word [r7+t0*2+table_offset(put,)]
add t0, r7
jmp t0
.put_w2:
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6d
mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6
mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], xmm0
mova [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], ym0
mova [dstq+dsq*1], ym1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+64*0], m0
mova [dstq+dsq*0+64*1], m1
mova [dstq+dsq*1+64*0], m2
mova [dstq+dsq*1+64*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w64
RET
.put_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
add srcq, ssq
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .put_w128
RET
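; .h: per pixel ((16-mx)*px0 + mx*px1 + rnd) >> 4, with the rounding constant
; picked from put_bilin_h_rnd according to bitdepth_max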
.h:
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
vpbroadcastd m4, [pw_16]
psubw m4, m5
test mxyd, mxyd
jnz .hv
; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
mov r6d, r8m ; bitdepth_max
add t0, r7
shr r6d, 11
vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
jmp t0
.h_w2:
movq xmm1, [srcq+ssq*0]
movhps xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
movq xmm0, [srcq+ssq*0+0]
movhps xmm0, [srcq+ssq*1+0]
movq xmm1, [srcq+ssq*0+2]
movhps xmm1, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xm4
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+ssq*0+0]
vinserti32x4 ym0, [srcq+ssq*1+0], 1
movu xm1, [srcq+ssq*0+2]
vinserti32x4 ym1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw ym0, ym4
pmullw ym1, ym5
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 4
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+ssq*0+0]
vinserti32x8 m0, [srcq+ssq*1+0], 1
movu ym1, [srcq+ssq*0+2]
vinserti32x8 m1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m6
paddw m0, m1
psrlw m0, 4
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
pmullw m1, m4, [srcq+ssq*1+0]
pmullw m3, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+64*0+0]
pmullw m2, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m3, m5, [srcq+64*1+2]
add srcq, ssq
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+64*0+0]
pmullw m7, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m8, m5, [srcq+64*1+2]
pmullw m2, m4, [srcq+64*2+0]
pmullw m9, m5, [srcq+64*2+2]
pmullw m3, m4, [srcq+64*3+0]
pmullw m10, m5, [srcq+64*3+2]
add srcq, ssq
REPX {paddw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psrlw x, 4}, m0, m1, m2, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .h_w128
RET
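; .v: blends row pairs as a + ((b-a)*my)/16 via pmulhrsw with my << 11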
.v:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
shl mxyd, 11
vpbroadcastw m8, mxyd
add t0, r7
jmp t0
.v_w2:
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
movd xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpckldq xmm2, xmm0, xmm1
movd xmm0, [srcq+ssq*0]
punpckldq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm0, [srcq+ssq*0]
.v_w4_loop:
movq xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq xmm2, xmm0, xmm1
movq xmm0, [srcq+ssq*0]
punpcklqdq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movu xmm0, [srcq+ssq*0]
.v_w8_loop:
vbroadcasti128 ymm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd ymm2, ymm0, ymm1, 0xf0
vbroadcasti128 ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm0, 0xf0
psubw ymm1, ymm2
pmulhrsw ymm1, ym8
paddw ymm1, ymm2
mova [dstq+dsq*0], xmm1
vextracti128 [dstq+dsq*1], ymm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
vzeroupper
RET
.v_w16:
movu ym0, [srcq+ssq*0]
.v_w16_loop:
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw ym1, ym3, ym0
pmulhrsw ym1, ym8
paddw ym1, ym0
movu ym0, [srcq+ssq*0]
psubw ym2, ym0, ym3
pmulhrsw ym2, ym8
paddw ym2, ym3
mova [dstq+dsq*0], ym1
mova [dstq+dsq*1], ym2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+ssq*0]
.v_w32_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw m1, m3, m0
pmulhrsw m1, m8
paddw m1, m0
movu m0, [srcq+ssq*0]
psubw m2, m0, m3
pmulhrsw m2, m8
paddw m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
psubw m4, m2, m0
pmulhrsw m4, m8
paddw m4, m0
movu m0, [srcq+ssq*0+64*0]
psubw m5, m3, m1
pmulhrsw m5, m8
paddw m5, m1
movu m1, [srcq+ssq*0+64*1]
psubw m6, m0, m2
pmulhrsw m6, m8
psubw m7, m1, m3
pmulhrsw m7, m8
mova [dstq+dsq*0+64*0], m4
mova [dstq+dsq*0+64*1], m5
paddw m6, m2
paddw m7, m3
mova [dstq+dsq*1+64*0], m6
mova [dstq+dsq*1+64*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*0+64*2]
movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
movu m4, [srcq+ssq*1+64*0]
movu m5, [srcq+ssq*1+64*1]
movu m6, [srcq+ssq*1+64*2]
movu m7, [srcq+ssq*1+64*3]
lea srcq, [srcq+ssq*2]
psubw m9, m4, m0
pmulhrsw m9, m8
paddw m9, m0
movu m0, [srcq+ssq*0+64*0]
psubw m10, m5, m1
pmulhrsw m10, m8
paddw m10, m1
movu m1, [srcq+ssq*0+64*1]
psubw m11, m6, m2
pmulhrsw m11, m8
paddw m11, m2
movu m2, [srcq+ssq*0+64*2]
psubw m12, m7, m3
pmulhrsw m12, m8
paddw m12, m3
movu m3, [srcq+ssq*0+64*3]
mova [dstq+dsq*0+64*0], m9
psubw m9, m0, m4
pmulhrsw m9, m8
mova [dstq+dsq*0+64*1], m10
psubw m10, m1, m5
pmulhrsw m10, m8
mova [dstq+dsq*0+64*2], m11
psubw m11, m2, m6
pmulhrsw m11, m8
mova [dstq+dsq*0+64*3], m12
psubw m12, m3, m7
pmulhrsw m12, m8
paddw m9, m4
paddw m10, m5
mova [dstq+dsq*1+64*0], m9
mova [dstq+dsq*1+64*1], m10
paddw m11, m6
paddw m12, m7
mova [dstq+dsq*1+64*2], m11
mova [dstq+dsq*1+64*3], m12
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w128_loop
RET
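; .hv: the horizontal pass only shifts by 2 to keep extra intermediate
; precision; the final pmulhrsw rounding constant (pw_2048 for 10-bit,
; pw_8192 for 12-bit) is selected by the 0x800 bitdepth test below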
.hv:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
shl mxyd, 11
vpbroadcastd m6, [pw_2]
vpbroadcastw m7, mxyd
vpbroadcastd m8, [pw_8192]
add t0, r7
test dword r8m, 0x800
jnz .hv_12bpc
psllw m4, 2
psllw m5, 2
vpbroadcastd m8, [pw_2048]
.hv_12bpc:
jmp t0
.hv_w2:
vpbroadcastq xmm1, [srcq+ssq*0]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w2_loop:
movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm2, [srcq+ssq*0]
pmullw xmm1, xmm2, xm4
psrlq xmm2, 16
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 _ 2 _
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
pmullw xmm0, xm4, [srcq+ssq*0-8]
pmullw xmm1, xm5, [srcq+ssq*0-6]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w4_loop:
movq xmm1, [srcq+ssq*1+0]
movq xmm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
movhps xmm1, [srcq+ssq*0+0]
movhps xmm2, [srcq+ssq*0+2]
pmullw xmm1, xm4
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 2
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
pmullw xmm0, xm4, [srcq+ssq*0+0]
pmullw xmm1, xm5, [srcq+ssq*0+2]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
vinserti32x4 ym0, xmm0, 1
.hv_w8_loop:
movu xm1, [srcq+ssq*1+0]
movu xm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym1, [srcq+ssq*0+0], 1
vinserti32x4 ym2, [srcq+ssq*0+2], 1
pmullw ym1, ym4
pmullw ym2, ym5
paddw ym1, ym6
paddw ym1, ym2
psrlw ym1, 2 ; 1 2
vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
mova ym0, ym1
psubw ym1, ym2
paddw ym1, ym1
pmulhw ym1, ym7
paddw ym1, ym2
pmulhrsw ym1, ym8
mova [dstq+dsq*0], xm1
vextracti32x4 [dstq+dsq*1], ym1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+ssq*0+0]
pmullw ym1, ym5, [srcq+ssq*0+2]
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+ssq*1+0]
movu ym2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x8 m1, [srcq+ssq*0+0], 1
vinserti32x8 m2, [srcq+ssq*0+2], 1
pmullw m1, m4
pmullw m2, m5
paddw m1, m6
paddw m1, m2
psrlw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m7
paddw m1, m2
pmulhrsw m1, m8
mova [dstq+dsq*0], ym1
vextracti32x8 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
.hv_w64:
.hv_w128:
movifnidn wd, wm
lea r6d, [hq+wq*8-256]
mov r4, srcq
mov r7, dstq
.hv_w32_loop0:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m1, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m1
psrlw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+ssq*1+0]
pmullw m1, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m3, m6
paddw m3, m1
psrlw m3, 2
psubw m1, m3, m0
paddw m1, m1
pmulhw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m2
psrlw m0, 2
psubw m2, m0, m3
paddw m2, m2
pmulhw m2, m7
paddw m2, m3
pmulhrsw m1, m8
pmulhrsw m2, m8
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w32_loop
add r4, 64
add r7, 64
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
jg .hv_w32_loop0
RET
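; bilinear prep: same filtering as put_bilin, but samples are scaled by
; prep_mul (per bitdepth) and biased by -8192 into signed intermediate range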
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea r6, [prep_avx512icl]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
movzx wd, word [r6+wq*2+table_offset(prep,)]
mov r5d, r7m ; bitdepth_max
vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
add wq, r6
shr r5d, 11
vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
movq xmm0, [srcq+strideq*0]
movhps xmm0, [srcq+strideq*1]
vpbroadcastq ymm1, [srcq+strideq*2]
vpbroadcastq ymm2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd ymm0, ymm1, 0x30
vpblendd ymm0, ymm2, 0xc0
pmullw ymm0, ym4
psubw ymm0, ym5
mova [tmpq], ymm0
add tmpq, 32
sub hd, 4
jg .prep_w4
vzeroupper
RET
.prep_w8:
movu xm0, [srcq+strideq*0]
vinserti32x4 ym0, [srcq+strideq*1], 1
vinserti32x4 m0, [srcq+strideq*2], 2
vinserti32x4 m0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
psubw m0, m5
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1
movu ym1, [srcq+strideq*2]
vinserti32x8 m1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m4
psubw m0, m5
psubw m1, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 4
jg .prep_w16
RET
.prep_w32:
pmullw m0, m4, [srcq+strideq*0]
pmullw m1, m4, [srcq+strideq*1]
pmullw m2, m4, [srcq+strideq*2]
pmullw m3, m4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 4
jg .prep_w32
RET
.prep_w64:
pmullw m0, m4, [srcq+strideq*0+64*0]
pmullw m1, m4, [srcq+strideq*0+64*1]
pmullw m2, m4, [srcq+strideq*1+64*0]
pmullw m3, m4, [srcq+strideq*1+64*1]
lea srcq, [srcq+strideq*2]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 2
jg .prep_w64
RET
.prep_w128:
pmullw m0, m4, [srcq+64*0]
pmullw m1, m4, [srcq+64*1]
pmullw m2, m4, [srcq+64*2]
pmullw m3, m4, [srcq+64*3]
add srcq, strideq
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .prep_w128
RET
.h:
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
vpbroadcastd m4, [pw_16]
vpbroadcastd m6, [pw_32766]
psubw m4, m5
test dword r7m, 0x800
jnz .h_12bpc
psllw m4, 2
psllw m5, 2
.h_12bpc:
test mxyd, mxyd
jnz .hv
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
movu xm1, [srcq+strideq*0]
vinserti32x4 ym1, [srcq+strideq*2], 1
movu xm2, [srcq+strideq*1]
vinserti32x4 ym2, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
punpcklqdq ym0, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym0, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+strideq*0+0]
movu xm1, [srcq+strideq*0+2]
vinserti32x4 ym0, [srcq+strideq*1+0], 1
vinserti32x4 ym1, [srcq+strideq*1+2], 1
vinserti32x4 m0, [srcq+strideq*2+0], 2
vinserti32x4 m1, [srcq+strideq*2+2], 2
vinserti32x4 m0, [srcq+stride3q +0], 3
vinserti32x4 m1, [srcq+stride3q +2], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+strideq*0+0]
vinserti32x8 m0, [srcq+strideq*1+0], 1
movu ym1, [srcq+strideq*0+2]
vinserti32x8 m1, [srcq+strideq*1+2], 1
lea srcq, [srcq+strideq*2]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
pmullw m1, m4, [srcq+strideq*1+0]
pmullw m3, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
add srcq, strideq
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m7, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m8, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m9, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m10, m5, [srcq+194]
add srcq, strideq
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psraw x, 2}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .h_w128
RET
.v:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
vpbroadcastw m9, mxyd
vpbroadcastd m8, [pw_16]
vpbroadcastd m10, [pw_32766]
add wq, r6
lea stride3q, [strideq*3]
psubw m8, m9
test dword r7m, 0x800
jnz .v_12bpc
psllw m8, 2
psllw m9, 2
.v_12bpc:
jmp wq
.v_w4:
movq xmm0, [srcq+strideq*0]
.v_w4_loop:
vpbroadcastq xmm2, [srcq+strideq*1]
vpbroadcastq ymm1, [srcq+strideq*2]
vpbroadcastq ymm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd ymm2, ymm1, 0x30
vpblendd ymm2, ymm3, 0xc0
vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
movq xmm0, [srcq+strideq*0]
valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
pmullw ymm1, ym8
pmullw ymm2, ym9
psubw ymm1, ym10
paddw ymm1, ymm2
psraw ymm1, 2
mova [tmpq], ymm1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
vzeroupper
RET
.v_w8:
movu xm0, [srcq+strideq*0]
.v_w8_loop:
vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
vinserti32x4 m1, [srcq+strideq*2], 2
vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
lea srcq, [srcq+strideq*4]
movu xm0, [srcq+strideq*0]
valignq m2, m0, m1, 2 ; 1 2 3 4
pmullw m1, m8
pmullw m2, m9
psubw m1, m10
paddw m1, m2
psraw m1, 2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
movu ym0, [srcq+strideq*0]
.v_w16_loop:
vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
movu ym3, [srcq+strideq*2]
vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
lea srcq, [srcq+strideq*4]
movu ym0, [srcq+strideq*0]
vshufi32x4 m3, m1, m3, q1032 ; 1 2
vshufi32x4 m4, m2, m0, q1032 ; 3 4
pmullw m1, m8
pmullw m2, m8
pmullw m3, m9
pmullw m4, m9
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+strideq*0]
.v_w32_loop:
movu m3, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m1, m8, m0
movu m0, [srcq+strideq*0]
pmullw m2, m8, m3
pmullw m3, m9
pmullw m4, m9, m0
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w64_loop:
add srcq, strideq
pmullw m2, m8, m0
movu m0, [srcq+64*0]
pmullw m3, m8, m1
movu m1, [srcq+64*1]
pmullw m4, m9, m0
pmullw m5, m9, m1
psubw m2, m10
psubw m3, m10
paddw m2, m4
paddw m3, m5
psraw m2, 2
psraw m3, 2
mova [tmpq+64*0], m2
mova [tmpq+64*1], m3
add tmpq, 64*2
dec hd
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
.v_w128_loop:
add srcq, strideq
pmullw m4, m8, m0
movu m0, [srcq+64*0]
pmullw m5, m8, m1
movu m1, [srcq+64*1]
pmullw m6, m8, m2
movu m2, [srcq+64*2]
pmullw m7, m8, m3
movu m3, [srcq+64*3]
pmullw m11, m9, m0
pmullw m12, m9, m1
pmullw m13, m9, m2
pmullw m14, m9, m3
REPX {psubw x, m10}, m4, m5, m6, m7
paddw m4, m11
paddw m5, m12
paddw m6, m13
paddw m7, m14
REPX {psraw x, 2}, m4, m5, m6, m7
mova [tmpq+64*0], m4
mova [tmpq+64*1], m5
mova [tmpq+64*2], m6
mova [tmpq+64*3], m7
add tmpq, 64*4
dec hd
jg .v_w128_loop
RET
.hv:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
vpbroadcastw m7, mxyd
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
movq xmm0, [srcq+strideq*0+0]
movq xmm1, [srcq+strideq*0+2]
pmullw xmm0, xm4
pmullw xmm1, xm5
psubw xmm0, xm6
paddw xmm0, xmm1
psraw xmm0, 2
vpbroadcastq ym0, xmm0
.hv_w4_loop:
movu xm1, [srcq+strideq*1]
vinserti128 ym1, [srcq+stride3q ], 1
movu xm2, [srcq+strideq*2]
lea srcq, [srcq+strideq*4]
vinserti128 ym2, [srcq+strideq*0], 1
punpcklqdq ym3, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym3, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym3, ym6
paddw ym1, ym3
psraw ym1, 2 ; 1 2 3 4
valignq ym2, ym1, ym0, 3 ; 0 1 2 3
mova ym0, ym1
psubw ym1, ym2
pmulhrsw ym1, ym7
paddw ym1, ym2
mova [tmpq], ym1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
pmullw xm0, xm4, [srcq+strideq*0+0]
pmullw xm1, xm5, [srcq+strideq*0+2]
psubw xm0, xm6
paddw xm0, xm1
psraw xm0, 2
vinserti32x4 m0, xm0, 3
.hv_w8_loop:
movu xm1, [srcq+strideq*1+0]
movu xm2, [srcq+strideq*1+2]
vinserti32x4 ym1, [srcq+strideq*2+0], 1
vinserti32x4 ym2, [srcq+strideq*2+2], 1
vinserti32x4 m1, [srcq+stride3q +0], 2
vinserti32x4 m2, [srcq+stride3q +2], 2
lea srcq, [srcq+strideq*4]
vinserti32x4 m1, [srcq+strideq*0+0], 3
vinserti32x4 m2, [srcq+strideq*0+2], 3
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2 3 4
valignq m2, m1, m0, 6 ; 0 1 2 3
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+strideq*0+0]
pmullw ym1, ym5, [srcq+strideq*0+2]
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+strideq*1+0]
movu ym2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
vinserti32x8 m1, [srcq+strideq*0+0], 1
vinserti32x8 m2, [srcq+strideq*0+2], 1
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m1, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m1
psraw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+strideq*1+0]
pmullw m1, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m3, m6
paddw m3, m1
psraw m3, 2
psubw m1, m3, m0
pmulhrsw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m2
psraw m0, 2
psubw m2, m0, m3
pmulhrsw m2, m7
paddw m2, m3
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .hv_w32_loop
RET
.hv_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
.hv_w64_loop:
add srcq, strideq
pmullw m2, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m3, m4, [srcq+64]
pmullw m9, m5, [srcq+66]
psubw m2, m6
psubw m3, m6
paddw m2, m8
paddw m3, m9
psraw m2, 2
psraw m3, 2
psubw m8, m2, m0
psubw m9, m3, m1
pmulhrsw m8, m7
pmulhrsw m9, m7
paddw m8, m0
mova m0, m2
paddw m9, m1
mova m1, m3
mova [tmpq+64*0], m8
mova [tmpq+64*1], m9
add tmpq, 64*2
dec hd
jg .hv_w64_loop
RET
.hv_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m9, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m10, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m11, m5, [srcq+194]
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m8
paddw m1, m9
paddw m2, m10
paddw m3, m11
REPX {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
add srcq, strideq
pmullw m8, m4, [srcq+ 0]
pmullw m12, m5, [srcq+ 2]
pmullw m9, m4, [srcq+ 64]
pmullw m13, m5, [srcq+ 66]
pmullw m10, m4, [srcq+128]
pmullw m14, m5, [srcq+130]
pmullw m11, m4, [srcq+192]
pmullw m15, m5, [srcq+194]
REPX {psubw x, m6}, m8, m9, m10, m11
paddw m8, m12
paddw m9, m13
paddw m10, m14
paddw m11, m15
REPX {psraw x, 2}, m8, m9, m10, m11
psubw m12, m8, m0
psubw m13, m9, m1
psubw m14, m10, m2
psubw m15, m11, m3
REPX {pmulhrsw x, m7}, m12, m13, m14, m15
paddw m12, m0
mova m0, m8
paddw m13, m1
mova m1, m9
mova [tmpq+64*0], m12
mova [tmpq+64*1], m13
paddw m14, m2
mova m2, m10
paddw m15, m3
mova m3, m11
mova [tmpq+64*2], m14
mova [tmpq+64*3], m15
add tmpq, 64*4
dec hd
jg .hv_w128_loop
RET
; int8_t subpel_filters[5][15][8]
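; each FILTER_* constant packs (8-tap filter set << 16) | 4-tap filter set as
; row offsets into that table; the low half is used for small block sizes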
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
cglobal %1_8tap_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
%endif
%endmacro
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif
MC_8TAP_FN put, sharp, SHARP, SHARP
MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
MC_8TAP_FN put, regular, REGULAR, REGULAR
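; shared body for all put_8tap_<type> entry points; fractional mx/my
; (the 0xf00 bits) select the .h, .v or .hv subpel paths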
cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movifnidn wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
%if WIN64
pop r8
%endif
jmp wq
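; .h: 8-tap horizontal filter; the per-bitdepth constant from put_8tap_h_rnd
; seeds the vpdpwssd accumulators, results are shifted down by 6 and clamped
; to the pixel maximum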
.h_w2:
movzx mxd, mxb
sub srcq, 2
mova ym2, [spel_h_shuf2a]
pmovsxbw xmm4, [base+subpel_filters+mxq*8]
pshufd xmm3, xmm4, q1111
pshufd xmm4, xmm4, q2222
.h_w2_loop:
movu xm1, [srcq+ssq*0]
vinserti32x4 ym1, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova xmm0, xm8
vpermb ym1, ym2, ym1
vpdpwssd xmm0, xmm3, xm1
vextracti32x4 xm1, ym1, 1
vpdpwssd xmm0, xmm4, xm1
psrad xmm0, 6
packusdw xmm0, xmm0
pminsw xmm0, xm9
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
movzx mxd, mxb
sub srcq, 2
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
vbroadcasti32x4 ym4, [spel_h_shufA]
vbroadcasti32x4 ym5, [spel_h_shufB]
pshufd xmm0, xmm0, q2211
vpbroadcastq ym6, xmm0
vpermq ym7, ymm0, q1111
.h_w4_loop:
movu xm2, [srcq+ssq*0]
vinserti32x4 ym2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova ym0, ym8
pshufb ym1, ym2, ym4
vpdpwssd ym0, ym6, ym1
pshufb ym2, ym5
vpdpwssd ym0, ym7, ym2
psrad ym0, 6
vextracti32x4 xm1, ym0, 1
packusdw xm0, xm1
pminsw xmm0, xm0, xm9
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h:
test myd, 0xf00
jnz .hv
mov r7d, r8m
vpbroadcastw m9, r8m
shr r7d, 11
vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
cmp wd, 4
je .h_w4
jl .h_w2
shr mxd, 16
sub srcq, 6
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mova [buf], xmm0
vpbroadcastd m10, xmm0
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
sub wd, 16
je .h_w16
jg .h_w32
.h_w8:
mova m4, [spel_h_shufA]
movu m5, [spel_h_shufB]
movu m6, [spel_h_shufC]
mova m7, [spel_h_shufD]
.h_w8_loop:
movu ym2, [srcq+ssq*0]
vinserti32x8 m2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
vpermb m1, m4, m2
vpdpwssd m0, m10, m1
vpermb m1, m5, m2
vpdpwssd m0, m11, m1
vpermb m1, m6, m2
vpdpwssd m0, m12, m1
vpermb m1, m7, m2
vpdpwssd m0, m13, m1
psrad m0, 6
vextracti32x8 ym1, m0, 1
packusdw ym0, ym1
pminsw ym0, ym9
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8_loop
RET
.h_w16:
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
.h_w16_loop:
movu ym2, [srcq+ssq*0+ 0]
vinserti32x8 m2, [srcq+ssq*1+ 0], 1
movu ym3, [srcq+ssq*0+16]
vinserti32x8 m3, [srcq+ssq*1+16], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m12, m4 ; b2
pshufb m4, m2, m7
vpdpwssd m0, m11, m4 ; a1
pshufb m4, m3, m7
vpdpwssd m1, m13, m4 ; b3
shufpd m2, m3, 0x55
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a2
vpdpwssd m1, m10, m4 ; b0
pshufb m2, m7
vpdpwssd m0, m13, m2 ; a3
vpdpwssd m1, m11, m2 ; b1
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m9
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
vbroadcasti32x4 m6, [spel_h_shufA]
lea dstq, [dstq+wq*2]
vbroadcasti32x4 m7, [spel_h_shufB]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+ 8]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m10, m4 ; b0
vpdpwssd m0, m12, m4 ; a2
movu m4, [srcq+r6*2+16]
pshufb m3, m7
vpdpwssd m1, m11, m3 ; b1
vpdpwssd m0, m13, m3 ; a3
pshufb m3, m4, m6
vpdpwssd m1, m12, m3 ; b2
pshufb m2, m7
vpdpwssd m0, m11, m2 ; a1
pshufb m4, m7
vpdpwssd m1, m13, m4 ; b3
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m9
mova [dstq+r6*2], m0
add r6, 32
jl .h_w32_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w32_loop0
RET
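; .v: 8-tap vertical filter on interleaved row pairs (two taps per vpdpwssd),
; with pd_32 rounding the final >> 6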
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastd m10, [pd_32]
pmovsxbw xmm0, [base+subpel_filters+myq*8]
tzcnt r7d, wd
vpbroadcastw m11, r8m
lea r6, [ssq*3]
movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
sub srcq, r6
mova [rsp+stack_offset+8], xmm0
vpbroadcastd m12, xmm0
add r7, r8
vpbroadcastd m13, [rsp+stack_offset+12]
vpbroadcastd m14, [rsp+stack_offset+16]
vpbroadcastd m15, [rsp+stack_offset+20]
jmp r7
.v_w2:
movd xmm2, [srcq+ssq*0]
pinsrd xmm2, [srcq+ssq*1], 1
pinsrd xmm2, [srcq+ssq*2], 2
add srcq, r6
pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
movd xmm3, [srcq+ssq*1]
vpbroadcastd xmm1, [srcq+ssq*2]
add srcq, r6
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm3, xmm1, 0x02 ; 4 5
vpblendd xmm1, xmm0, 0x02 ; 5 6
palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
punpcklwd xmm3, xmm1 ; 45 56
punpcklwd xmm1, xmm2, xmm4 ; 01 12
punpckhwd xmm2, xmm4 ; 23 34
.v_w2_loop:
vpbroadcastd xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova xmm5, xm10
vpdpwssd xmm5, xm12, xmm1 ; a0 b0
mova xmm1, xmm2
vpdpwssd xmm5, xm13, xmm2 ; a1 b1
mova xmm2, xmm3
vpdpwssd xmm5, xm14, xmm3 ; a2 b2
vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm4, xmm0, 0x02 ; 7 8
punpcklwd xmm3, xmm4 ; 67 78
vpdpwssd xmm5, xm15, xmm3 ; a3 b3
psrad xmm5, 6
packusdw xmm5, xmm5
pminsw xmm5, xm11
movd [dstq+dsq*0], xmm5
pextrd [dstq+dsq*1], xmm5, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm1, [srcq+ssq*0]
vpbroadcastq ymm0, [srcq+ssq*1]
vpbroadcastq ymm2, [srcq+ssq*2]
add srcq, r6
vpbroadcastq ymm4, [srcq+ssq*0]
vpbroadcastq ymm3, [srcq+ssq*1]
vpbroadcastq ymm5, [srcq+ssq*2]
add srcq, r6
vpblendd ymm1, ymm0, 0x30
vpblendd ymm0, ymm2, 0x30
punpcklwd ymm1, ymm0 ; 01 12
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm3, 0x30
punpcklwd ymm2, ymm4 ; 23 34
vpblendd ymm3, ymm5, 0x30
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 45 56
.v_w4_loop:
vpbroadcastq ymm5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova ymm4, ym10
vpdpwssd ymm4, ym12, ymm1 ; a0 b0
mova ymm1, ymm2
vpdpwssd ymm4, ym13, ymm2 ; a1 b1
mova ymm2, ymm3
vpdpwssd ymm4, ym14, ymm3 ; a2 b2
vpblendd ymm3, ymm0, ymm5, 0x30
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 67 78
vpdpwssd ymm4, ym15, ymm3 ; a3 b3
psrad ymm4, 6
vextracti128 xmm5, ymm4, 1
packusdw xmm4, xmm5
pminsw xmm4, xm11
movq [dstq+dsq*0], xmm4
movhps [dstq+dsq*1], xmm4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
vzeroupper
RET
.v_w8:
vbroadcasti32x4 m2, [srcq+ssq*2]
vinserti32x4 m1, m2, [srcq+ssq*0], 0
vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
add srcq, r6
vinserti32x4 ym2, [srcq+ssq*0], 1
vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
mova m6, [spel_v_shuf8]
movu xm0, [srcq+ssq*1]
vinserti32x4 ym0, [srcq+ssq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
vpermb m1, m6, m1 ; 01 12
vpermb m2, m6, m2 ; 23 34
vpermb m3, m6, m0 ; 45 56
.v_w8_loop:
vinserti32x4 m0, [srcq+ssq*1], 3
lea srcq, [srcq+ssq*2]
movu xm5, [srcq+ssq*0]
mova m4, m10
vpdpwssd m4, m12, m1 ; a0 b0
mova m1, m2
vshufi32x4 m0, m5, q1032 ; 6 7 8
vpdpwssd m4, m13, m2 ; a1 b1
mova m2, m3
vpdpwssd m4, m14, m3 ; a2 b2
vpermb m3, m6, m0 ; 67 78
vpdpwssd m4, m15, m3 ; a3 b3
psrad m4, 6
vextracti32x8 ym5, m4, 1
packusdw ym4, ym5
pminsw ym4, ym11
mova [dstq+dsq*0], xm4
vextracti32x4 [dstq+dsq*1], ym4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m1, [srcq+ssq*1]
vinserti32x8 m0, m1, [srcq+ssq*0], 0
vinserti32x8 m1, [srcq+ssq*2], 1
mova m8, [spel_v_shuf16]
add srcq, r6
movu ym3, [srcq+ssq*0]
vinserti32x8 m3, [srcq+ssq*1], 1
movu ym5, [srcq+ssq*2]
add srcq, r6
vinserti32x8 m5, [srcq+ssq*0], 1
vpermb m0, m8, m0 ; 01
vpermb m1, m8, m1 ; 12
vpermb m3, m8, m3 ; 34
vpermb m5, m8, m5 ; 56
mova m9, [deint_q_shuf]
vpshrdd m2, m1, m3, 16 ; 23
vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
mova m6, m10
mova m7, m10
vpdpwssd m6, m12, m0 ; a0
mova m0, m2
vpdpwssd m7, m12, m1 ; b0
mova m1, m3
vpdpwssd m6, m13, m2 ; a1
mova m2, m4
vpdpwssd m7, m13, m3 ; b1
mova m3, m5
vpdpwssd m6, m14, m4 ; a2
mova m4, m5
vpdpwssd m7, m14, m5 ; b2
movu ym5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m5, [srcq+ssq*0], 1
vpermb m5, m8, m5 ; 78
vpshrdd m4, m5, 16 ; 67
vpdpwssd m6, m15, m4 ; a3
vpdpwssd m7, m15, m5 ; b3
psrad m6, 6
psrad m7, 6
packusdw m6, m7
pminsw m6, m11
vpermq m6, m9, m6
mova [dstq+dsq*0], ym6
vextracti32x8 [dstq+dsq*1], m6, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
%if WIN64
movaps [rsp+stack_offset+8], xmm6
%endif
lea wd, [hq+wq*8-256]
mov r7, srcq
mov r8, dstq
.v_w32_loop0:
movu m16, [srcq+ssq*0]
movu m17, [srcq+ssq*1]
movu m18, [srcq+ssq*2]
add srcq, r6
movu m19, [srcq+ssq*0]
movu m20, [srcq+ssq*1]
movu m21, [srcq+ssq*2]
add srcq, r6
movu m22, [srcq+ssq*0]
punpcklwd m0, m16, m17 ; 01l
punpckhwd m16, m17 ; 01h
punpcklwd m1, m17, m18 ; 12l
punpckhwd m17, m18 ; 12h
punpcklwd m2, m18, m19 ; 23l
punpckhwd m18, m19 ; 23h
punpcklwd m3, m19, m20 ; 34l
punpckhwd m19, m20 ; 34h
punpcklwd m4, m20, m21 ; 45l
punpckhwd m20, m21 ; 45h
punpcklwd m5, m21, m22 ; 56l
punpckhwd m21, m22 ; 56h
.v_w32_loop:
mova m6, m10
vpdpwssd m6, m12, m0 ; a0l
mova m8, m10
vpdpwssd m8, m12, m16 ; a0h
mova m7, m10
vpdpwssd m7, m12, m1 ; b0l
mova m9, m10
vpdpwssd m9, m12, m17 ; b0h
mova m0, m2
vpdpwssd m6, m13, m2 ; a1l
mova m16, m18
vpdpwssd m8, m13, m18 ; a1h
mova m1, m3
vpdpwssd m7, m13, m3 ; b1l
mova m17, m19
vpdpwssd m9, m13, m19 ; b1h
mova m2, m4
vpdpwssd m6, m14, m4 ; a2l
mova m18, m20
vpdpwssd m8, m14, m20 ; a2h
mova m3, m5
vpdpwssd m7, m14, m5 ; b2l
mova m19, m21
vpdpwssd m9, m14, m21 ; b2h
movu m21, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklwd m4, m22, m21 ; 67l
punpckhwd m20, m22, m21 ; 67h
movu m22, [srcq+ssq*0]
vpdpwssd m6, m15, m4 ; a3l
vpdpwssd m8, m15, m20 ; a3h
punpcklwd m5, m21, m22 ; 78l
punpckhwd m21, m22 ; 78h
vpdpwssd m7, m15, m5 ; b3l
vpdpwssd m9, m15, m21 ; b3h
REPX {psrad x, 6}, m6, m8, m7, m9
packusdw m6, m8
packusdw m7, m9
pminsw m6, m11
pminsw m7, m11
mova [dstq+dsq*0], m6
mova [dstq+dsq*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
add r7, 64
add r8, 64
movzx hd, wb
mov srcq, r7
mov dstq, r8
sub wd, 1<<8
jg .v_w32_loop0
%if WIN64
movaps xmm6, [rsp+stack_offset+8]
%endif
vzeroupper
RET
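; .hv: 8-tap in both directions; horizontal results are repacked into 16-bit
; intermediates via the spel_shuf* tables and the vertical pass rounds with a
; final >> 10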
.hv:
vpbroadcastw m11, r8m
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [ssq*3]
sub srcq, 2
sub srcq, r6
test dword r8m, 0x800
jnz .hv_12bit
vpbroadcastd m10, [pd_2176]
psllw xmm0, 6
jmp .hv_main
.hv_12bit:
vpbroadcastd m10, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_main:
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m8, [buf+ 4]
vpbroadcastd m9, [buf+ 8]
vpbroadcastd ym12, xmm1
vpbroadcastd ym13, [buf+20]
vpbroadcastd ym14, [buf+24]
vpbroadcastd ym15, [buf+28]
movu xm4, [srcq+ssq*0]
vinserti32x4 ym4, [srcq+ssq*1], 1
vinserti32x4 m4, [srcq+ssq*2], 2
add srcq, r6
vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
movu xm0, [srcq+ssq*1]
vinserti32x4 ym0, [srcq+ssq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
cmp wd, 4
je .hv_w4
vbroadcasti32x4 m2, [spel_h_shufA]
mova m3, [spel_h_shuf2b]
mova ym6, [spel_h_shuf2a]
mova xm7, [spel_shuf2]
mova m1, m10
pshufb m4, m2
pshufb m0, m2
punpcklqdq m2, m4, m0
vpdpwssd m1, m8, m2 ; 04 15 26 3_
punpckhqdq m4, m0
vpdpwssd m1, m9, m4
vpermb m1, m3, m1 ; 01 12
vextracti32x4 xm2, ym1, 1 ; 23 34
vextracti32x4 xm3, m1, 2 ; 45 56
.hv_w2_loop:
movu xm5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym5, [srcq+ssq*0], 1
mova xm4, xm10
vpermb ym5, ym6, ym5
pmaddwd xmm0, xm12, xm1 ; a0 b0
vpdpwssd xm4, xm8, xm5
vextracti32x4 xm5, ym5, 1
mova xm1, xm2
vpdpwssd xmm0, xm13, xm2 ; a1 b1
vpdpwssd xm4, xm9, xm5 ; 7 8
mova xm2, xm3
vpdpwssd xmm0, xm14, xm3 ; a2 b2
vpermt2b xm3, xm7, xm4 ; 67 78
vpdpwssd xmm0, xm15, xm3 ; a3 b3
psrad xmm0, 10
packusdw xmm0, xmm0
pminsw xmm0, xm11
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
vbroadcasti32x4 m19, [spel_h_shufA]
vbroadcasti32x4 m20, [spel_h_shufB]
mova ym6, [spel_shuf4a]
mova ym7, [spel_shuf4b]
mova m2, m10
mova m3, m10
pshufb m1, m4, m19
vpdpwssd m2, m8, m1
pshufb m1, m0, m19
vpdpwssd m3, m8, m1
pshufb m4, m20
vpdpwssd m2, m9, m4
pshufb m0, m20
vpdpwssd m3, m9, m0
vpermb m1, m6, m2 ; 01 12
vshufi32x4 m2, m3, q1032
vpermb m3, m6, m3 ; 45 56
vpermb m2, m6, m2 ; 23 34
.hv_w4_loop:
movu xm18, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti128 ym18, [srcq+ssq*0], 1
mova ym4, ym10
pshufb ym17, ym18, ym19
pmaddwd ym16, ym12, ym1 ; a0 b0
vpdpwssd ym4, ym8, ym17
pshufb ym18, ym20
mova ym1, ym2
vpdpwssd ym16, ym13, ym2 ; a1 b1
vpdpwssd ym4, ym9, ym18 ; 7 8
mova ym2, ym3
vpdpwssd ym16, ym14, ym3 ; a2 b2
vpermt2b ym3, ym7, ym4 ; 67 78
vpdpwssd ym16, ym15, ym3 ; a3 b3
psrad ym16, 10
vextracti128 xm17, ym16, 1
packusdw xm16, xm17
pminsw xm16, xm11
movq [dstq+dsq*0], xm16
movhps [dstq+dsq*1], xm16
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
vzeroupper
RET
.hv_w8:
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [ssq*3]
sub srcq, 6
sub srcq, r6
test dword r8m, 0x800
jnz .hv_w8_12bit
vpbroadcastd m10, [pd_2176]
psllw xmm0, 6
jmp .hv_w8_main
.hv_w8_12bit:
vpbroadcastd m10, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_w8_main:
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m12, xmm0
vpbroadcastd m13, [buf+ 4]
vpbroadcastd m14, [buf+ 8]
vpbroadcastd m15, [buf+12]
vpbroadcastd m16, xmm1
vpbroadcastd m17, [buf+20]
vpbroadcastd m18, [buf+24]
vpbroadcastd m19, [buf+28]
cmp wd, 16
je .hv_w16
jg .hv_w32
mova m5, [spel_h_shufA]
movu ym0, [srcq+ssq*0]
vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
movu ym9, [srcq+ssq*2]
add srcq, r6
vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
movu ym20, [srcq+ssq*1]
vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
add srcq, r6
movu ym21, [srcq+ssq*0] ; 6
movu m6, [spel_h_shufB]
movu m7, [spel_h_shufC]
vpermb m8, m5, m0
mova m1, m10
vpdpwssd m1, m12, m8 ; a0 b0
vpermb m8, m5, m9
mova m2, m10
vpdpwssd m2, m12, m8 ; c0 d0
vpermb m8, m5, m20
mova m3, m10
vpdpwssd m3, m12, m8 ; e0 f0
vpermb m8, m5, m21
mova m4, m10
vpdpwssd m4, m12, m8 ; g0
vpermb m8, m6, m0
vpdpwssd m1, m13, m8 ; a1 b1
vpermb m8, m6, m9
vpdpwssd m2, m13, m8 ; c1 d1
vpermb m8, m6, m20
vpdpwssd m3, m13, m8 ; e1 f1
vpermb m8, m6, m21
vpdpwssd m4, m13, m8 ; g1
vpermb m8, m7, m0
vpdpwssd m1, m14, m8 ; a2 b2
vpermb m8, m7, m9
vpdpwssd m2, m14, m8 ; c2 d2
vpermb m8, m7, m20
vpdpwssd m3, m14, m8 ; e2 f2
vpermb m8, m7, m21
vpdpwssd m4, m14, m8 ; g2
mova m8, [spel_h_shufD]
vpermb m0, m8, m0
vpdpwssd m1, m15, m0 ; a3 b3
mova m0, [spel_shuf8a]
vpermb m9, m8, m9
vpdpwssd m2, m15, m9 ; c3 d3
mova m9, [spel_shuf8b]
vpermb m20, m8, m20
vpdpwssd m3, m15, m20 ; e3 f3
vpermb m21, m8, m21
vpdpwssd m4, m15, m21 ; g3
vpermt2b m1, m0, m2 ; 01 12
vpermt2b m2, m0, m3 ; 23 34
vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
movu ym0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m0, [srcq+ssq*0], 1
mova m4, m10
vpermb m21, m5, m0
vpdpwssd m4, m12, m21 ; h0 i0
vpermb m21, m6, m0
pmaddwd m20, m16, m1 ; A0 B0
vpdpwssd m4, m13, m21 ; h1 i1
vpermb m21, m7, m0
mova m1, m2
vpdpwssd m20, m17, m2 ; A1 B1
vpdpwssd m4, m14, m21 ; h2 i2
vpermb m21, m8, m0
mova m2, m3
vpdpwssd m20, m18, m3 ; A2 B2
vpdpwssd m4, m15, m21 ; h3 i3
vpermt2b m3, m9, m4 ; 67 78
vpdpwssd m20, m19, m3 ; A3 B3
psrad m20, 10
vextracti32x8 ym21, m20, 1
packusdw ym20, ym21
pminsw ym20, ym11
mova [dstq+dsq*0], xm20
vextracti128 [dstq+dsq*1], ym20, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
vzeroupper
RET
.hv_w16:
WIN64_SPILL_XMM 26
vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
movu ym6, [srcq+ssq*1+ 0]
movu ym7, [srcq+ssq*1+16]
vinserti32x8 m6, [srcq+ssq*2+ 0], 1
vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
add srcq, r6
movu ym22, [srcq+ssq*0+ 0]
movu ym23, [srcq+ssq*0+16]
vinserti32x8 m22, [srcq+ssq*1+ 0], 1
vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4
movu ym24, [srcq+ssq*2+ 0]
movu ym25, [srcq+ssq*2+16]
add srcq, r6
vinserti32x8 m24, [srcq+ssq*0+ 0], 1
vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
mova m9, [spel_shuf16]
pshufb m0, m4, m20
mova m1, m10
vpdpwssd m1, m12, m0 ; a0
pshufb m0, m6, m20
mova m2, m10
vpdpwssd m2, m12, m0 ; b0
pshufb m0, m7, m20
mova m3, m10
vpdpwssd m3, m14, m0 ; c2
pshufb m0, m4, m21
vpdpwssd m1, m13, m0 ; a1
pshufb m0, m6, m21
vpdpwssd m2, m13, m0 ; b1
pshufb m0, m7, m21
vpdpwssd m3, m15, m0 ; c3
pshufb m0, m5, m20
vpdpwssd m1, m14, m0 ; a2
shufpd m6, m7, 0x55
pshufb m7, m6, m20
vpdpwssd m2, m14, m7 ; b2
vpdpwssd m3, m12, m7 ; c0
pshufb m5, m21
vpdpwssd m1, m15, m5 ; a3
pshufb m6, m21
vpdpwssd m2, m15, m6 ; b3
vpdpwssd m3, m13, m6 ; c1
pshufb m0, m22, m20
mova m4, m10
vpdpwssd m4, m12, m0 ; d0
pshufb m0, m23, m20
mova m5, m10
vpdpwssd m5, m14, m0 ; e2
pshufb m0, m24, m20
mova m6, m10
vpdpwssd m6, m12, m0 ; f0
pshufb m0, m25, m20
mova m7, m10
vpdpwssd m7, m14, m0 ; g2
pshufb m0, m22, m21
vpdpwssd m4, m13, m0 ; d1
pshufb m0, m23, m21
vpdpwssd m5, m15, m0 ; e3
pshufb m0, m24, m21
vpdpwssd m6, m13, m0 ; f1
pshufb m0, m25, m21
vpdpwssd m7, m15, m0 ; g3
shufpd m22, m23, 0x55
pshufb m23, m22, m20
vpdpwssd m4, m14, m23 ; d2
vpdpwssd m5, m12, m23 ; e0
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m6, m14, m25 ; f2
vpdpwssd m7, m12, m25 ; g0
pshufb m22, m21
vpdpwssd m4, m15, m22 ; d3
vpdpwssd m5, m13, m22 ; e1
pshufb m24, m21
vpdpwssd m6, m15, m24 ; f3
vpdpwssd m7, m13, m24 ; g1
pslldq m1, 1
vpermt2b m2, m9, m3 ; 12
vpermt2b m4, m9, m5 ; 34
vpermt2b m6, m9, m7 ; 56
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
movu ym24, [srcq+ssq*1+ 0]
movu ym25, [srcq+ssq*1+16]
lea srcq, [srcq+ssq*2]
vinserti32x8 m24, [srcq+ssq*0+ 0], 1
vinserti32x8 m25, [srcq+ssq*0+16], 1
mova m7, m10
mova m8, m10
pshufb m0, m24, m20
vpdpwssd m7, m12, m0 ; h0
pshufb m0, m25, m20
vpdpwssd m8, m14, m0 ; i2
pmaddwd m22, m16, m1 ; A0
mova m1, m3
pmaddwd m23, m16, m2 ; B0
mova m2, m4
pshufb m0, m24, m21
vpdpwssd m7, m13, m0 ; h1
pshufb m0, m25, m21
vpdpwssd m8, m15, m0 ; i3
vpdpwssd m22, m17, m3 ; A1
mova m3, m5
vpdpwssd m23, m17, m4 ; B1
mova m4, m6
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m7, m14, m25 ; h2
vpdpwssd m8, m12, m25 ; i0
vpdpwssd m22, m18, m5 ; A2
vpdpwssd m23, m18, m6 ; B2
pshufb m24, m21
vpdpwssd m7, m15, m24 ; h3
vpdpwssd m8, m13, m24 ; i1
vpermt2b m7, m9, m8 ; 78
vpshrdd m5, m6, m7, 16 ; 67
vpdpwssd m22, m19, m5 ; A3
vpdpwssd m23, m19, m7 ; B3
mova m6, m7
psrad m22, 10
psrad m23, 10
vshufi32x4 m0, m22, m23, q3232
vinserti32x8 m22, ym23, 1
packusdw m22, m0
pminsw m22, m11
mova [dstq+dsq*0], ym22
vextracti32x8 [dstq+dsq*1], m22, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
WIN64_SPILL_XMM 32
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
mova m22, [spel_shuf32]
lea wd, [hq+wq*8-256]
mov r7, srcq
mov r8, dstq
.hv_w32_loop0:
movu m6, [srcq+ssq*0+ 0]
movu m7, [srcq+ssq*0+ 8]
movu m8, [srcq+ssq*0+16]
mova m0, m10
mova m23, m10
pshufb m9, m6, m20
vpdpwssd m0, m12, m9 ; a0l
pshufb m9, m7, m20
vpdpwssd m23, m12, m9 ; a0h
vpdpwssd m0, m14, m9 ; a2l
pshufb m7, m21
vpdpwssd m23, m13, m7 ; a1h
vpdpwssd m0, m15, m7 ; a3l
pshufb m7, m8, m20
vpdpwssd m23, m14, m7 ; a2h
pshufb m6, m21
vpdpwssd m0, m13, m6 ; a1l
pshufb m8, m21
vpdpwssd m23, m15, m8 ; a3h
%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2]
movu m6, [srcq+%3*%4+ 0]
movu m7, [srcq+%3*%4+ 8]
movu m8, [srcq+%3*%4+16]
%if %4 == 2
add srcq, r6
%endif
movu m29, [srcq+%3*%5+ 0]
movu m30, [srcq+%3*%5+ 8]
movu m31, [srcq+%3*%5+16]
%if %5 == 2
add srcq, r6
%endif
mova m%1, m10
mova m9, m10
pshufb m%2, m6, m20
vpdpwssd m%1, m12, m%2 ; x0l
pshufb m%2, m29, m20
vpdpwssd m9, m12, m%2 ; y0l
pshufb m6, m21
vpdpwssd m%1, m13, m6 ; x1l
pshufb m29, m21
vpdpwssd m9, m13, m29 ; y1l
pshufb m6, m7, m20
mova m%2, m10
vpdpwssd m%2, m12, m6 ; x0h
pshufb m29, m30, m20
vpdpwssd m%1, m14, m6 ; y2l
mova m6, m10
vpdpwssd m6, m12, m29 ; x0h
pshufb m7, m21
vpdpwssd m9, m14, m29 ; y2l
pshufb m30, m21
vpdpwssd m%2, m13, m7 ; x1h
vpdpwssd m%1, m15, m7 ; x3l
pshufb m7, m8, m20
vpdpwssd m6, m13, m30 ; y1h
vpdpwssd m9, m15, m30 ; y3l
pshufb m30, m31, m20
vpdpwssd m%2, m14, m7 ; x2h
pshufb m8, m21
vpdpwssd m6, m14, m30 ; y2h
pshufb m31, m21
vpdpwssd m%2, m15, m8 ; x3h
vpdpwssd m6, m15, m31 ; y3h
%if %1 == 1
vpermt2b m0, m22, m%1 ; 01l
vpermt2b m23, m22, m%2 ; 01h
%endif
vpermt2b m%1, m22, m9 ; xyl
vpermt2b m%2, m22, m6 ; xyh
%endmacro
PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12
PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34
PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56
vpshrdd m2, m1, m3, 16 ; 23l
vpshrdd m25, m24, m26, 16 ; 23h
vpshrdd m4, m3, m5, 16 ; 45l
vpshrdd m27, m26, m28, 16 ; 45h
.hv_w32_loop:
movu m7, [srcq+ssq*1+ 0]
movu m9, [srcq+ssq*2+ 0]
movu m6, [srcq+ssq*1+ 8]
movu m8, [srcq+ssq*2+ 8]
mova m29, m10
mova m31, m10
pshufb m30, m7, m20
vpdpwssd m29, m12, m30 ; h0l
pshufb m30, m9, m20
vpdpwssd m31, m12, m30 ; i0l
pshufb m7, m21
vpdpwssd m29, m13, m7 ; h1l
pshufb m9, m21
vpdpwssd m31, m13, m9 ; i1l
pshufb m7, m6, m20
vpdpwssd m29, m14, m7 ; h2l
pshufb m9, m8, m20
vpdpwssd m31, m14, m9 ; i2l
pshufb m6, m21
vpdpwssd m29, m15, m6 ; h3l
pshufb m8, m21
vpdpwssd m31, m15, m8 ; i3l
mova m30, m10
vpdpwssd m30, m12, m7 ; h0h
movu m7, [srcq+ssq*1+16]
lea srcq, [srcq+ssq*2]
vpermt2b m29, m22, m31 ; 78l
mova m31, m10
vpdpwssd m31, m12, m9 ; i0h
movu m9, [srcq+ssq*0+16]
vpdpwssd m30, m13, m6 ; h1h
pshufb m6, m7, m20
vpdpwssd m31, m13, m8 ; i1h
pshufb m8, m9, m20
vpdpwssd m30, m14, m6 ; h2h
pmaddwd m6, m16, m0 ; A0l
pshufb m7, m21
vpdpwssd m31, m14, m8 ; i2h
pmaddwd m8, m16, m23 ; A0h
pshufb m9, m21
vpdpwssd m30, m15, m7 ; h3h
pmaddwd m7, m16, m1 ; B0l
vpdpwssd m31, m15, m9 ; i3h
pmaddwd m9, m16, m24 ; B0h
mova m0, m2
vpdpwssd m6, m17, m2 ; A1l
mova m23, m25
vpdpwssd m8, m17, m25 ; A1h
mova m1, m3
vpdpwssd m7, m17, m3 ; B1l
mova m24, m26
vpdpwssd m9, m17, m26 ; B1h
vpermt2b m30, m22, m31 ; 78h
vpdpwssd m6, m18, m4 ; A2l
mova m2, m4
vpdpwssd m8, m18, m27 ; A2h
mova m25, m27
vpdpwssd m7, m18, m5 ; B2l
mova m3, m5
vpdpwssd m9, m18, m28 ; B2h
mova m26, m28
vpshrdd m4, m5, m29, 16 ; 67l
vpdpwssd m6, m19, m4 ; A3l
vpshrdd m27, m28, m30, 16 ; 67h
vpdpwssd m8, m19, m27 ; A3h
mova m5, m29
vpdpwssd m7, m19, m29 ; B3l
mova m28, m30
vpdpwssd m9, m19, m30 ; B3h
REPX {psrad x, 10}, m6, m8, m7, m9
packusdw m6, m8
packusdw m7, m9
pminsw m6, m11
pminsw m7, m11
mova [dstq+dsq*0], m6
mova [dstq+dsq*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w32_loop
add r7, 64
add r8, 64
movzx hd, wb
mov srcq, r7
mov dstq, r8
sub wd, 1<<8
jg .hv_w32_loop0
RET
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
MC_8TAP_FN prep, sharp, SHARP, SHARP
MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
MC_8TAP_FN prep, regular, REGULAR, REGULAR
cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
%define base r7-prep_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
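; mxd/myd now pack three indices each: byte 0 holds the 4-tap filter index,
; byte 2 the 8-tap index (extracted below with shr ..., 16) and byte 1 the
; raw subpel fraction, which the following test against 0xf00 checks to see
; whether filtering is needed in that direction.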
lea r7, [prep_avx512icl]
mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
tzcnt wd, wd
mov r5d, r7m ; bitdepth_max
vpbroadcastd m5, [pw_8192]
movzx wd, word [r7+wq*2+table_offset(prep,)]
shr r5d, 11
vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
add wq, r7
lea r6, [strideq*3]
%if WIN64
pop r7
%endif
jmp wq
.h_w4:
movzx mxd, mxb
sub srcq, 2
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mov r5d, r7m
vbroadcasti32x4 m4, [spel_h_shufA]
vbroadcasti32x4 m5, [spel_h_shufB]
shr r5d, 11
mova ym9, [prep_endA]
psllw xmm0, [base+prep_hv_shift+r5*8]
mova [tmpq], xmm0
vpbroadcastd m6, [tmpq+4]
vpbroadcastd m7, [tmpq+8]
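; filters four 4-pixel rows per iteration: the rows occupy the four 128-bit
; lanes of one zmm, spel_h_shufA/B form the overlapping pixel pairs and the
; two middle tap pairs (m6/m7) suffice for the 4-tap case; vpermb with
; prep_endA then extracts the packed 16-bit results from the dword sums.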
.h_w4_loop:
movu xm2, [srcq+strideq*0]
vinserti32x4 ym2, [srcq+strideq*1], 1
vinserti32x4 m2, [srcq+strideq*2], 2
vinserti32x4 m2, [srcq+r6 ], 3
lea srcq, [srcq+strideq*4]
mova m0, m10
pshufb m1, m2, m4
vpdpwssd m0, m6, m1
pshufb m2, m5
vpdpwssd m0, m7, m2
vpermb m0, m9, m0
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
.h:
test myd, 0xf00
jnz .hv
vpbroadcastd m10, [prep_8tap_rnd]
lea r6, [strideq*3]
cmp wd, 4
je .h_w4
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mov r5d, r7m
sub srcq, 6
shr r5d, 11
psllw xmm0, [base+prep_hv_shift+r5*8]
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
cmp wd, 16
je .h_w16
jg .h_w32
.h_w8:
mova m6, [spel_h_shufA]
movu m7, [spel_h_shufB]
movu m8, [spel_h_shufC]
mova m9, [spel_h_shufD]
mova m11, [prep_endB]
.h_w8_loop:
movu ym4, [srcq+strideq*0]
vinserti32x8 m4, [srcq+strideq*1], 1
movu ym5, [srcq+strideq*2]
vinserti32x8 m5, [srcq+r6 ], 1
lea srcq, [srcq+strideq*4]
mova m0, m10
mova m1, m10
vpermb m2, m6, m4
vpermb m3, m6, m5
vpdpwssd m0, m12, m2
vpdpwssd m1, m12, m3
vpermb m2, m7, m4
vpermb m3, m7, m5
vpdpwssd m0, m13, m2
vpdpwssd m1, m13, m3
vpermb m2, m8, m4
vpermb m3, m8, m5
vpdpwssd m0, m14, m2
vpdpwssd m1, m14, m3
vpermb m2, m9, m4
vpermb m3, m9, m5
vpdpwssd m0, m15, m2
vpdpwssd m1, m15, m3
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8_loop
RET
.h_w16:
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
mova m11, [prep_endC]
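; each 16-pixel row is filtered as two 8-pixel halves: the loads at +0 and
; +16 provide taps 0/1 of the first half and taps 2/3 of the second, while
; the middle pixels (shufpd 0x55 of the two loads) provide the remaining
; taps of both halves.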
.h_w16_loop:
movu ym2, [srcq+strideq*0+ 0]
vinserti32x8 m2, [srcq+strideq*1+ 0], 1
movu ym3, [srcq+strideq*0+16]
vinserti32x8 m3, [srcq+strideq*1+16], 1
lea srcq, [srcq+strideq*2]
mova m0, m10
mova m1, m10
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m14, m4 ; b2
pshufb m4, m2, m7
vpdpwssd m0, m13, m4 ; a1
pshufb m4, m3, m7
vpdpwssd m1, m15, m4 ; b3
shufpd m2, m3, 0x55
pshufb m4, m2, m6
vpdpwssd m0, m14, m4 ; a2
vpdpwssd m1, m12, m4 ; b0
pshufb m2, m7
vpdpwssd m0, m15, m2 ; a3
vpdpwssd m1, m13, m2 ; b1
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
vbroadcasti32x4 m6, [spel_h_shufA]
lea srcq, [srcq+wq*2]
vbroadcasti32x4 m7, [spel_h_shufB]
neg wq
mova m11, [prep_endC]
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+ 8]
mova m0, m10
mova m1, m10
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m12, m4 ; b0
vpdpwssd m0, m14, m4 ; a2
movu m4, [srcq+r6*2+16]
pshufb m3, m7
vpdpwssd m1, m13, m3 ; b1
vpdpwssd m0, m15, m3 ; a3
pshufb m3, m4, m6
vpdpwssd m1, m14, m3 ; b2
pshufb m2, m7
vpdpwssd m0, m13, m2 ; a1
pshufb m4, m7
vpdpwssd m1, m15, m4 ; b3
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
add r6, 32
jl .h_w32_loop
add srcq, strideq
dec hd
jg .h_w32_loop0
RET
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
vpbroadcastd m10, [prep_8tap_rnd]
pmovsxbw xmm0, [base+subpel_filters+myq*8]
tzcnt r6d, wd
shr r5d, 11
movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
psllw xmm0, [base+prep_hv_shift+r5*8]
add r7, r6
lea r6, [strideq*3]
sub srcq, r6
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
jmp r7
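; the vertical filters work on interleaved row pairs (01, 12, 23, ...):
; each vpdpwssd multiplies one dword (two vertically adjacent pixels) by
; one tap pair, so four dot products per output cover all eight taps, and
; the pairs are rotated through the registers between iterations.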
.v_w4:
movq xmm1, [srcq+strideq*0]
vpbroadcastq ymm0, [srcq+strideq*1]
vpbroadcastq ymm2, [srcq+strideq*2]
add srcq, r6
vpbroadcastq ymm4, [srcq+strideq*0]
vpbroadcastq ymm3, [srcq+strideq*1]
vpbroadcastq ymm5, [srcq+strideq*2]
mova xm11, [prep_endA]
add srcq, r6
vpblendd ymm1, ymm0, 0x30
vpblendd ymm0, ymm2, 0x30
punpcklwd ymm1, ymm0 ; 01 12
vpbroadcastq ymm0, [srcq+strideq*0]
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm3, 0x30
punpcklwd ymm2, ymm4 ; 23 34
vpblendd ymm3, ymm5, 0x30
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 45 56
.v_w4_loop:
vpbroadcastq ymm5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
mova ymm4, ym10
vpdpwssd ymm4, ym12, ymm1 ; a0 b0
mova ymm1, ymm2
vpdpwssd ymm4, ym13, ymm2 ; a1 b1
mova ymm2, ymm3
vpdpwssd ymm4, ym14, ymm3 ; a2 b2
vpblendd ymm3, ymm0, ymm5, 0x30
vpbroadcastq ymm0, [srcq+strideq*0]
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 67 78
vpdpwssd ymm4, ym15, ymm3 ; a3 b3
vpermb ymm4, ym11, ymm4
mova [tmpq], xmm4
add tmpq, 16
sub hd, 2
jg .v_w4_loop
vzeroupper
RET
.v_w8:
vbroadcasti32x4 m2, [srcq+strideq*2]
vinserti32x4 m1, m2, [srcq+strideq*0], 0
vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
add srcq, r6
vinserti32x4 ym2, [srcq+strideq*0], 1
vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4
mova m6, [spel_v_shuf8]
movu xm0, [srcq+strideq*1]
vinserti32x4 ym0, [srcq+strideq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
mova ym11, [prep_endB]
vpermb m1, m6, m1 ; 01 12
vpermb m2, m6, m2 ; 23 34
vpermb m3, m6, m0 ; 45 56
.v_w8_loop:
vinserti32x4 m0, [srcq+strideq*1], 3
lea srcq, [srcq+strideq*2]
movu xm5, [srcq+strideq*0]
mova m4, m10
vpdpwssd m4, m12, m1 ; a0 b0
mova m1, m2
vshufi32x4 m0, m5, q1032 ; 6 7 8
vpdpwssd m4, m13, m2 ; a1 b1
mova m2, m3
vpdpwssd m4, m14, m3 ; a2 b2
vpermb m3, m6, m0 ; 67 78
vpdpwssd m4, m15, m3 ; a3 b3
vpermb m4, m11, m4
mova [tmpq], ym4
add tmpq, 32
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m1, [srcq+strideq*1]
vinserti32x8 m0, m1, [srcq+strideq*0], 0
vinserti32x8 m1, [srcq+strideq*2], 1
mova m8, [spel_v_shuf16]
add srcq, r6
movu ym3, [srcq+strideq*0]
vinserti32x8 m3, [srcq+strideq*1], 1
movu ym5, [srcq+strideq*2]
add srcq, r6
vinserti32x8 m5, [srcq+strideq*0], 1
mova m11, [prep_endA]
vpermb m0, m8, m0 ; 01
vpermb m1, m8, m1 ; 12
vpermb m3, m8, m3 ; 34
vpermb m5, m8, m5 ; 56
vpshrdd m2, m1, m3, 16 ; 23
vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
mova m6, m10
mova m7, m10
vpdpwssd m6, m12, m0 ; a0
mova m0, m2
vpdpwssd m7, m12, m1 ; b0
mova m1, m3
vpdpwssd m6, m13, m2 ; a1
mova m2, m4
vpdpwssd m7, m13, m3 ; b1
mova m3, m5
vpdpwssd m6, m14, m4 ; a2
mova m4, m5
vpdpwssd m7, m14, m5 ; b2
movu ym5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vinserti32x8 m5, [srcq+strideq*0], 1
vpermb m5, m8, m5 ; 78
vpshrdd m4, m5, 16 ; 67
vpdpwssd m6, m15, m4 ; a3
vpdpwssd m7, m15, m5 ; b3
vpermt2b m6, m11, m7
mova [tmpq], m6
add tmpq, 64
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
%if WIN64
PUSH r8
movaps [rsp+stack_offset+8], xmm6
%endif
lea r5, [hq+wq*8-256]
mov r7, srcq
mov r8, tmpq
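; wider blocks are handled in 32-pixel-wide columns: r5 keeps the row count
; in its low byte and the remaining column count (times 256) above it, which
; the movzx/sub pair at the end of each column unpacks and decrements.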
.v_w32_loop0:
movu m16, [srcq+strideq*0]
movu m17, [srcq+strideq*1]
movu m18, [srcq+strideq*2]
add srcq, r6
movu m19, [srcq+strideq*0]
movu m20, [srcq+strideq*1]
movu m21, [srcq+strideq*2]
add srcq, r6
movu m22, [srcq+strideq*0]
mova m11, [prep_endC]
punpcklwd m0, m16, m17 ; 01l
punpckhwd m16, m17 ; 01h
punpcklwd m1, m17, m18 ; 12l
punpckhwd m17, m18 ; 12h
punpcklwd m2, m18, m19 ; 23l
punpckhwd m18, m19 ; 23h
punpcklwd m3, m19, m20 ; 34l
punpckhwd m19, m20 ; 34h
punpcklwd m4, m20, m21 ; 45l
punpckhwd m20, m21 ; 45h
punpcklwd m5, m21, m22 ; 56l
punpckhwd m21, m22 ; 56h
.v_w32_loop:
mova m6, m10
vpdpwssd m6, m12, m0 ; a0l
mova m8, m10
vpdpwssd m8, m12, m16 ; a0h
mova m7, m10
vpdpwssd m7, m12, m1 ; b0l
mova m9, m10
vpdpwssd m9, m12, m17 ; b0h
mova m0, m2
vpdpwssd m6, m13, m2 ; a1l
mova m16, m18
vpdpwssd m8, m13, m18 ; a1h
mova m1, m3
vpdpwssd m7, m13, m3 ; b1l
mova m17, m19
vpdpwssd m9, m13, m19 ; b1h
mova m2, m4
vpdpwssd m6, m14, m4 ; a2l
mova m18, m20
vpdpwssd m8, m14, m20 ; a2h
mova m3, m5
vpdpwssd m7, m14, m5 ; b2l
mova m19, m21
vpdpwssd m9, m14, m21 ; b2h
movu m21, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
punpcklwd m4, m22, m21 ; 67l
punpckhwd m20, m22, m21 ; 67h
movu m22, [srcq+strideq*0]
vpdpwssd m6, m15, m4 ; a3l
vpdpwssd m8, m15, m20 ; a3h
punpcklwd m5, m21, m22 ; 78l
punpckhwd m21, m22 ; 78h
vpdpwssd m7, m15, m5 ; b3l
vpdpwssd m9, m15, m21 ; b3h
vpermt2b m6, m11, m8
vpermt2b m7, m11, m9
mova [tmpq+wq*0], m6
mova [tmpq+wq*2], m7
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w32_loop
add r7, 64
add r8, 64
movzx hd, r5b
mov srcq, r7
mov tmpq, r8
sub r5d, 1<<8
jg .v_w32_loop0
%if WIN64
movaps xmm6, [rsp+stack_offset+8]
POP r8
%endif
vzeroupper
RET
.hv:
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [strideq*3]
sub srcq, 2
shr r5d, 11
sub srcq, r6
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
vpbroadcastd m10, [prep_8tap_rnd]
vpbroadcastd ym11, [pd_128]
mova xm21, [prep_endA]
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m8, [tmpq+ 4]
vpbroadcastd m9, [tmpq+ 8]
vpbroadcastd ym12, xmm1
vpbroadcastd ym13, [tmpq+20]
vpbroadcastd ym14, [tmpq+24]
vpbroadcastd ym15, [tmpq+28]
movu xm4, [srcq+strideq*0]
vinserti32x4 ym4, [srcq+strideq*1], 1
vinserti32x4 m4, [srcq+strideq*2], 2
add srcq, r6
vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3
movu xm0, [srcq+strideq*1]
vinserti32x4 ym0, [srcq+strideq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
vbroadcasti32x4 m19, [spel_h_shufA]
vbroadcasti32x4 m20, [spel_h_shufB]
mova ym6, [spel_shuf4a]
mova ym7, [spel_shuf4b]
mova m2, m10
mova m3, m10
pshufb m1, m4, m19
vpdpwssd m2, m8, m1
pshufb m1, m0, m19
vpdpwssd m3, m8, m1
pshufb m4, m20
vpdpwssd m2, m9, m4
pshufb m0, m20
vpdpwssd m3, m9, m0
vpermb m1, m6, m2 ; 01 12
vshufi32x4 m2, m3, q1032
vpermb m3, m6, m3 ; 45 56
vpermb m2, m6, m2 ; 23 34
.hv_w4_loop:
movu xm18, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vinserti128 ym18, [srcq+strideq*0], 1
mova ym16, ym11
mova ym4, ym10
pshufb ym17, ym18, ym19
vpdpwssd ym16, ym12, ym1 ; a0 b0
vpdpwssd ym4, ym8, ym17
pshufb ym18, ym20
mova ym1, ym2
vpdpwssd ym16, ym13, ym2 ; a1 b1
vpdpwssd ym4, ym9, ym18 ; 7 8
mova ym2, ym3
vpdpwssd ym16, ym14, ym3 ; a2 b2
vpermt2b ym3, ym7, ym4 ; 67 78
vpdpwssd ym16, ym15, ym3 ; a3 b3
vpermb ym16, ym21, ym16
mova [tmpq], xm16
add tmpq, 16
sub hd, 2
jg .hv_w4_loop
vzeroupper
RET
.hv_w8:
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [strideq*3]
sub srcq, 6
shr r5d, 11
sub srcq, r6
vpbroadcastd m10, [prep_8tap_rnd]
vpbroadcastd m11, [pd_128]
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
vpbroadcastd m16, xmm1
vpbroadcastd m17, [tmpq+20]
vpbroadcastd m18, [tmpq+24]
vpbroadcastd m19, [tmpq+28]
cmp wd, 16
je .hv_w16
jg .hv_w32
WIN64_SPILL_XMM 23
mova m5, [spel_h_shufA]
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
movu ym9, [srcq+strideq*2]
add srcq, r6
vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
movu ym20, [srcq+strideq*1]
vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
add srcq, r6
movu ym21, [srcq+strideq*0] ; 6
movu m6, [spel_h_shufB]
movu m7, [spel_h_shufC]
mova ym22, [prep_endB]
vpermb m8, m5, m0
mova m1, m10
vpdpwssd m1, m12, m8 ; a0 b0
vpermb m8, m5, m9
mova m2, m10
vpdpwssd m2, m12, m8 ; c0 d0
vpermb m8, m5, m20
mova m3, m10
vpdpwssd m3, m12, m8 ; e0 f0
vpermb m8, m5, m21
mova m4, m10
vpdpwssd m4, m12, m8 ; g0
vpermb m8, m6, m0
vpdpwssd m1, m13, m8 ; a1 b1
vpermb m8, m6, m9
vpdpwssd m2, m13, m8 ; c1 d1
vpermb m8, m6, m20
vpdpwssd m3, m13, m8 ; e1 f1
vpermb m8, m6, m21
vpdpwssd m4, m13, m8 ; g1
vpermb m8, m7, m0
vpdpwssd m1, m14, m8 ; a2 b2
vpermb m8, m7, m9
vpdpwssd m2, m14, m8 ; c2 d2
vpermb m8, m7, m20
vpdpwssd m3, m14, m8 ; e2 f2
vpermb m8, m7, m21
vpdpwssd m4, m14, m8 ; g2
mova m8, [spel_h_shufD]
vpermb m0, m8, m0
vpdpwssd m1, m15, m0 ; a3 b3
mova m0, [spel_shuf8a]
vpermb m9, m8, m9
vpdpwssd m2, m15, m9 ; c3 d3
mova m9, [spel_shuf8b]
vpermb m20, m8, m20
vpdpwssd m3, m15, m20 ; e3 f3
vpermb m21, m8, m21
vpdpwssd m4, m15, m21 ; g3
vpermt2b m1, m0, m2 ; 01 12
vpermt2b m2, m0, m3 ; 23 34
vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
movu ym0, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vinserti32x8 m0, [srcq+strideq*0], 1
mova m4, m10
mova m20, m11
vpermb m21, m5, m0
vpdpwssd m4, m12, m21 ; h0 i0
vpermb m21, m6, m0
vpdpwssd m20, m16, m1 ; A0 B0
vpdpwssd m4, m13, m21 ; h1 i1
vpermb m21, m7, m0
mova m1, m2
vpdpwssd m20, m17, m2 ; A1 B1
vpdpwssd m4, m14, m21 ; h2 i2
vpermb m21, m8, m0
mova m2, m3
vpdpwssd m20, m18, m3 ; A2 B2
vpdpwssd m4, m15, m21 ; h3 i3
vpermt2b m3, m9, m4 ; 67 78
vpdpwssd m20, m19, m3 ; A3 B3
vpermb m20, m22, m20
mova [tmpq], ym20
add tmpq, 32
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
WIN64_SPILL_XMM 27
vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
movu ym6, [srcq+strideq*1+ 0]
movu ym7, [srcq+strideq*1+16]
vinserti32x8 m6, [srcq+strideq*2+ 0], 1
vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
add srcq, r6
movu ym22, [srcq+strideq*0+ 0]
movu ym23, [srcq+strideq*0+16]
vinserti32x8 m22, [srcq+strideq*1+ 0], 1
vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4
movu ym24, [srcq+strideq*2+ 0]
movu ym25, [srcq+strideq*2+16]
add srcq, r6
vinserti32x8 m24, [srcq+strideq*0+ 0], 1
vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
mova m9, [spel_shuf16]
mova m26, [prep_endB]
pshufb m0, m4, m20
mova m1, m10
vpdpwssd m1, m12, m0 ; a0
pshufb m0, m6, m20
mova m2, m10
vpdpwssd m2, m12, m0 ; b0
pshufb m0, m7, m20
mova m3, m10
vpdpwssd m3, m14, m0 ; c2
pshufb m0, m4, m21
vpdpwssd m1, m13, m0 ; a1
pshufb m0, m6, m21
vpdpwssd m2, m13, m0 ; b1
pshufb m0, m7, m21
vpdpwssd m3, m15, m0 ; c3
pshufb m0, m5, m20
vpdpwssd m1, m14, m0 ; a2
shufpd m6, m7, 0x55
pshufb m7, m6, m20
vpdpwssd m2, m14, m7 ; b2
vpdpwssd m3, m12, m7 ; c0
pshufb m5, m21
vpdpwssd m1, m15, m5 ; a3
pshufb m6, m21
vpdpwssd m2, m15, m6 ; b3
vpdpwssd m3, m13, m6 ; c1
pshufb m0, m22, m20
mova m4, m10
vpdpwssd m4, m12, m0 ; d0
pshufb m0, m23, m20
mova m5, m10
vpdpwssd m5, m14, m0 ; e2
pshufb m0, m24, m20
mova m6, m10
vpdpwssd m6, m12, m0 ; f0
pshufb m0, m25, m20
mova m7, m10
vpdpwssd m7, m14, m0 ; g2
pshufb m0, m22, m21
vpdpwssd m4, m13, m0 ; d1
pshufb m0, m23, m21
vpdpwssd m5, m15, m0 ; e3
pshufb m0, m24, m21
vpdpwssd m6, m13, m0 ; f1
pshufb m0, m25, m21
vpdpwssd m7, m15, m0 ; g3
shufpd m22, m23, 0x55
pshufb m23, m22, m20
vpdpwssd m4, m14, m23 ; d2
vpdpwssd m5, m12, m23 ; e0
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m6, m14, m25 ; f2
vpdpwssd m7, m12, m25 ; g0
pshufb m22, m21
vpdpwssd m4, m15, m22 ; d3
vpdpwssd m5, m13, m22 ; e1
pshufb m24, m21
vpdpwssd m6, m15, m24 ; f3
vpdpwssd m7, m13, m24 ; g1
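; row 0's 16-bit result sits in bits 8-23 of each dword of m1; pslldq moves
; it into the high words so that the vpshrdd below pairs it with row 1 to
; form the 01 register.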
pslldq m1, 1
vpermt2b m2, m9, m3 ; 12
vpermt2b m4, m9, m5 ; 34
vpermt2b m6, m9, m7 ; 56
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
movu ym24, [srcq+strideq*1+ 0]
movu ym25, [srcq+strideq*1+16]
lea srcq, [srcq+strideq*2]
vinserti32x8 m24, [srcq+strideq*0+ 0], 1
vinserti32x8 m25, [srcq+strideq*0+16], 1
mova m7, m10
mova m8, m10
pshufb m0, m24, m20
vpdpwssd m7, m12, m0 ; h0
mova m22, m11
pshufb m0, m25, m20
vpdpwssd m8, m14, m0 ; i2
mova m23, m11
vpdpwssd m22, m16, m1 ; A0
mova m1, m3
vpdpwssd m23, m16, m2 ; B0
mova m2, m4
pshufb m0, m24, m21
vpdpwssd m7, m13, m0 ; h1
pshufb m0, m25, m21
vpdpwssd m8, m15, m0 ; i3
vpdpwssd m22, m17, m3 ; A1
mova m3, m5
vpdpwssd m23, m17, m4 ; B1
mova m4, m6
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m7, m14, m25 ; h2
vpdpwssd m8, m12, m25 ; i0
vpdpwssd m22, m18, m5 ; A2
vpdpwssd m23, m18, m6 ; B2
pshufb m24, m21
vpdpwssd m7, m15, m24 ; h3
vpdpwssd m8, m13, m24 ; i1
vpermt2b m7, m9, m8 ; 78
vpshrdd m5, m6, m7, 16 ; 67
vpdpwssd m22, m19, m5 ; A3
vpdpwssd m23, m19, m7 ; B3
mova m6, m7
vpermt2b m22, m26, m23
mova [tmpq], m22
add tmpq, 64
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
%if WIN64
PUSH r8
%assign regs_used regs_used + 1
WIN64_SPILL_XMM 32
%endif
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
mova m22, [spel_shuf32]
lea r5d, [hq+wq*8-256]
mov r7, srcq
mov r8, tmpq
.hv_w32_loop0:
movu m6, [srcq+strideq*0+ 0]
movu m7, [srcq+strideq*0+ 8]
movu m8, [srcq+strideq*0+16]
mova m0, m10
mova m23, m10
pshufb m9, m6, m20
vpdpwssd m0, m12, m9 ; a0l
pshufb m9, m7, m20
vpdpwssd m23, m12, m9 ; a0h
vpdpwssd m0, m14, m9 ; a2l
pshufb m7, m21
vpdpwssd m23, m13, m7 ; a1h
vpdpwssd m0, m15, m7 ; a3l
pshufb m7, m8, m20
vpdpwssd m23, m14, m7 ; a2h
pshufb m6, m21
vpdpwssd m0, m13, m6 ; a1l
pshufb m8, m21
vpdpwssd m23, m15, m8 ; a3h
PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12
PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34
PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56
vpshrdd m2, m1, m3, 16 ; 23l
vpshrdd m25, m24, m26, 16 ; 23h
vpshrdd m4, m3, m5, 16 ; 45l
vpshrdd m27, m26, m28, 16 ; 45h
.hv_w32_loop:
movu m7, [srcq+strideq*1+ 0]
movu m9, [srcq+strideq*2+ 0]
movu m6, [srcq+strideq*1+ 8]
movu m8, [srcq+strideq*2+ 8]
mova m29, m10
mova m31, m10
pshufb m30, m7, m20
vpdpwssd m29, m12, m30 ; h0l
pshufb m30, m9, m20
vpdpwssd m31, m12, m30 ; i0l
pshufb m7, m21
vpdpwssd m29, m13, m7 ; h1l
pshufb m9, m21
vpdpwssd m31, m13, m9 ; i1l
pshufb m7, m6, m20
vpdpwssd m29, m14, m7 ; h2l
pshufb m9, m8, m20
vpdpwssd m31, m14, m9 ; i2l
pshufb m6, m21
vpdpwssd m29, m15, m6 ; h3l
pshufb m8, m21
vpdpwssd m31, m15, m8 ; i3l
mova m30, m10
vpdpwssd m30, m12, m7 ; h0h
movu m7, [srcq+strideq*1+16]
lea srcq, [srcq+strideq*2]
vpermt2b m29, m22, m31 ; 78l
mova m31, m10
vpdpwssd m31, m12, m9 ; i0h
movu m9, [srcq+strideq*0+16]
vpdpwssd m30, m13, m6 ; h1h
pshufb m6, m7, m20
vpdpwssd m31, m13, m8 ; i1h
pshufb m8, m9, m20
vpdpwssd m30, m14, m6 ; h2h
mova m6, m11
vpdpwssd m6, m16, m0 ; A0l
pshufb m7, m21
vpdpwssd m31, m14, m8 ; i2h
mova m8, m11
vpdpwssd m8, m16, m23 ; A0h
pshufb m9, m21
vpdpwssd m30, m15, m7 ; h3h
mova m7, m11
vpdpwssd m7, m16, m1 ; B0l
vpdpwssd m31, m15, m9 ; i3h
mova m9, m11
vpdpwssd m9, m16, m24 ; B0h
mova m0, m2
vpdpwssd m6, m17, m2 ; A1l
mova m23, m25
vpdpwssd m8, m17, m25 ; A1h
mova m1, m3
vpdpwssd m7, m17, m3 ; B1l
mova m24, m26
vpdpwssd m9, m17, m26 ; B1h
vpermt2b m30, m22, m31 ; 78h
mova m31, [prep_endC]
vpdpwssd m6, m18, m4 ; A2l
mova m2, m4
vpdpwssd m8, m18, m27 ; A2h
mova m25, m27
vpdpwssd m7, m18, m5 ; B2l
mova m3, m5
vpdpwssd m9, m18, m28 ; B2h
mova m26, m28
vpshrdd m4, m5, m29, 16 ; 67l
vpdpwssd m6, m19, m4 ; A3l
vpshrdd m27, m28, m30, 16 ; 67h
vpdpwssd m8, m19, m27 ; A3h
mova m5, m29
vpdpwssd m7, m19, m29 ; B3l
mova m28, m30
vpdpwssd m9, m19, m30 ; B3h
vpermt2b m6, m31, m8
vpermt2b m7, m31, m9
mova [tmpq+wq*0], m6
mova [tmpq+wq*2], m7
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .hv_w32_loop
add r7, 64
add r8, 64
movzx hd, r5b
mov srcq, r7
mov tmpq, r8
sub r5d, 1<<8
jg .hv_w32_loop0
RET
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
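; warp_affine_8x8t stores 8x8 signed 16-bit intermediates (vertical sums
; >> 15); warp_affine_8x8 stores clamped pixels (>> 13 followed by the
; per-bitdepth bidir shift). both share .main/.main2/.end below.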
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
mov t0d, r7m
lea r6, [pd_0to7]
shr t0d, 11
vpbroadcastd m8, [base+warp_8x8t_rnd_v]
vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
psrad m14, m16, 15
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
psrad m16, 15
packssdw m14, m16
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
psrad m15, m16, 15
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
add tsq, tsq
psrad m16, 15
packssdw m15, m16
jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
mov t0d, r7m ; pixel_max
lea r6, [pd_0to7]
shr t0d, 11
vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
call .main
psrad m14, m16, 13
call .main2
psrad m16, 13
packusdw m14, m16
call .main2
psrad m15, m16, 13
call .main2
vpbroadcastd m0, [base+bidir_shift+t0*4]
vpsrlvw m14, m0
psrad m16, 13
packusdw m15, m16
vpsrlvw m15, m0
.end:
mova m0, [base+warp8x8_end]
vpermb m16, m0, m14
lea r2, [dsq*3]
mova [dstq+dsq*0], xm16
vextracti128 [dstq+dsq*1], ym16, 1
vextracti32x4 [dstq+dsq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
vpermb m16, m0, m15
lea dstq, [dstq+dsq*4]
mova [dstq+dsq*0], xm16
vextracti128 [dstq+dsq*1], ym16, 1
vextracti32x4 [dstq+dsq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
RET
.main:
vpbroadcastd ym3, [base+pd_512]
%if WIN64
mov abcdq, r5mp
vpaddd ym18, ym3, r6m {1to8} ; mx
%else
add r5d, 512
vpbroadcastd ym18, r5d
%endif
vpaddd ym20, ym3, r7m {1to8} ; my
mova ym16, [base+pd_0to7]
vpbroadcastd ym19, [abcdq+4*0] ; alpha
vpbroadcastd ym21, [abcdq+4*1] ; gamma
lea r4, [ssq*3+6]
vpdpwssd ym18, ym19, ym16 ; tmx
vpdpwssd ym20, ym21, ym16 ; tmy
sub srcq, r4
mova m10, [base+warp8x8_permA]
lea r4, [mc_warp_filter+64*8]
vbroadcasti32x4 m12, [base+warp8x8_permC]
kxnorb k1, k1, k1
vbroadcasti32x4 m13, [base+warp8x8_permD]
movu ym5, [srcq+0]
vinserti32x8 m5, [srcq+8], 1
psrad ym17, ym18, 10
mova m11, [base+warp8x8_permB]
kmovb k2, k1
vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
psrad ym19, 16 ; beta
psrad ym21, 16 ; delta
paddd ym18, ym19
vpermb m4, m10, m5
vpbroadcastq m9, [base+warp_shift_h+t0*8]
pshufd m3, m3, q3120
paddd m7, m1, m1
pshufb m2, m3, m12
vpdpwssd m1, m4, m2
vpermb m5, m11, m5
vshufi32x4 m4, m5, q1021
pshufb m3, m13
vpdpwssd m1, m4, m3
call .h
psllq m2, m1, 32
paddd m1, m2
vpmultishiftqb m1, m9, m1
vpshrdq m1, m0, 48 ; 01 12
call .h
vpshrdq m2, m1, m0, 48 ; 23 34
call .h
vpshrdq m3, m2, m0, 48 ; 45 56
.main2:
call .h
psrad ym6, ym20, 10
kmovb k1, k2
paddd ym17, ym20, ym21 ; my += delta
vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
psrad ym16, ym17, 10
kmovb k2, k1
vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
shufps m5, m20, m6, q2020
mova m16, m8
pshufb m4, m5, m12
vpdpwssd m16, m1, m4 ; a0 b0
pshufb m5, m13
mova m1, m2
vpdpwssd m16, m2, m5 ; a1 b1
shufps m6, m20, m6, q3131
paddd ym20, ym17, ym21
pshufb m4, m6, m12
mova m2, m3
vpdpwssd m16, m3, m4 ; a2 b2
vpshrdq m3, m0, 48 ; 67 78
pshufb m6, m13
vpdpwssd m16, m3, m6 ; a3 b3
ret
ALIGN function_align
.h:
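; filters the next two rows horizontally: the per-column x positions were
; seeded with alpha in .main, each call adds beta per row and gathers the
; corresponding filters from mc_warp_filter by position >> 10; the final
; vpmultishiftqb applies the per-bitdepth shift, leaving the two rows
; interleaved as "a a b b" for the vertical pass.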
movu ym16, [srcq+ssq*1]
psrad ym6, ym18, 10
lea srcq, [srcq+ssq*2]
vinserti32x8 m5, m16, [srcq+ssq*0], 1
kmovb k1, k2
paddd ym17, ym18, ym19 ; mx += beta
vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
psrad ym16, ym17, 10
kmovb k2, k1
vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
vpermb m4, m10, m5
shufps m16, m18, m6, q2020
shufps m6, m18, m6, q3131
mova m0, m7
pshufb m18, m16, m12
vpdpwssd m0, m4, m18 ; a0 b0
vpermb m5, m11, m5
pshufb m18, m6, m13
vpdpwssd m0, m5, m18 ; a3 b3
paddd ym18, ym17, ym19
vshufi32x4 m17, m4, m5, q1021
pshufb m16, m13
vpdpwssd m0, m17, m16 ; a1 b1
vshufi32x4 m4, m5, q2132
pshufb m6, m12
vpdpwssd m0, m4, m6 ; a2 b2
vpmultishiftqb m0, m9, m0 ; a a b b
ret
%macro BIDIR_FN 0
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq ], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq ], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm0, ym1, 1
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
vextracti32x4 xm0, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
%endmacro
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
mov t0d, r6m ; pixel_max
movsxd wq, [r6+wq*4]
shr t0d, 11
vpbroadcastd m2, [base+avg_round+t0*4]
vpbroadcastd m3, [base+avg_shift+t0*4]
movifnidn hd, hm
add wq, r6
BIDIR_FN
ALIGN function_align
.main:
mova m0, [tmp1q+64*0]
paddsw m0, [tmp2q+64*0]
mova m1, [tmp1q+64*1]
paddsw m1, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
pmaxsw m0, m2
pmaxsw m1, m2
psubsw m0, m2
psubsw m1, m2
vpsrlvw m0, m3
vpsrlvw m1, m3
ret
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
lea r6, [w_avg_avx512icl_table]
tzcnt wd, wm
mov t0d, r7m ; pixel_max
shr t0d, 11
movsxd wq, [r6+wq*4]
vpbroadcastd m5, [base+w_avg_round+t0*4]
vpbroadcastd m7, [base+bidir_shift+t0*4]
add wq, r6
mov r6d, r6m ; weight
lea t0d, [r6-16]
shl r6d, 16
sub r6d, t0d ; 16-weight, weight
movifnidn hd, hm
vpbroadcastd m6, r6d
BIDIR_FN
ALIGN function_align
.main:
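; tmp2/tmp1 words are interleaved so that each vpdpwssd yields
; tmp1*weight + tmp2*(16-weight) + rounding in one dword; >> 2 plus the
; bidir shift then maps the result back to pixel range.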
mova m3, [tmp1q+64*0]
mova m1, [tmp2q+64*0]
mova m0, [tmp1q+64*1]
mova m4, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
punpcklwd m2, m1, m3
punpckhwd m1, m3
punpcklwd m3, m4, m0
punpckhwd m4, m0
mova m0, m5
vpdpwssd m0, m6, m2
mova m2, m5
vpdpwssd m2, m6, m1
mova m1, m5
vpdpwssd m1, m6, m3
mova m3, m5
vpdpwssd m3, m6, m4
REPX {psrad x, 2}, m0, m2, m1, m3
packusdw m0, m2
packusdw m1, m3
vpsrlvw m0, m7
vpsrlvw m1, m7
ret
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
lea r7, [mask_avx512icl_table]
tzcnt wd, wm
mov r6d, r7m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_64]
vpbroadcastd m9, [base+mask_round+r6*4]
vpbroadcastd m10, [base+bidir_shift+r6*4]
mov maskq, maskmp
add wq, r7
BIDIR_FN
ALIGN function_align
.main:
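; blend with an external 6-bit mask: each output is
; (tmp1*m + tmp2*(64-m) + rounding) >> 4, followed by the bidir shift.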
pmovzxbw m1, [maskq+32*0]
mova m4, [tmp1q+64*0]
mova m2, [tmp2q+64*0]
pmovzxbw m6, [maskq+32*1]
mova m5, [tmp1q+64*1]
mova m3, [tmp2q+64*1]
add maskq, 32*2
add tmp1q, 64*2
add tmp2q, 64*2
punpcklwd m7, m4, m2
punpckhwd m4, m2
psubw m0, m8, m1
punpcklwd m2, m1, m0 ; m, 64-m
punpckhwd m1, m0
mova m0, m9
vpdpwssd m0, m7, m2
mova m2, m9
vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
punpcklwd m7, m5, m3
punpckhwd m5, m3
psubw m1, m8, m6
punpcklwd m3, m6, m1
punpckhwd m6, m1
mova m1, m9
vpdpwssd m1, m7, m3
mova m3, m9
vpdpwssd m3, m5, m6
REPX {psrad x, 4}, m0, m2, m1, m3
packusdw m0, m2
packusdw m1, m3
vpsrlvw m0, m10
vpsrlvw m1, m10
ret
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
lea r7, [w_mask_420_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m11, [base+pw_64]
vpbroadcastd m12, [base+mask_round+r6*4]
vpbroadcastd m13, [base+bidir_shift+r6*4]
mov r6d, r7m ; sign
vpbroadcastd m14, [base+w_mask_round+r6*4]
mova ym15, [w_mask_end42x]
mov maskq, maskmp
add wq, r7
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
mova m4, [w_mask_shuf4]
vpermt2b m2, m4, m3
mova m3, m14
vpdpbusd m3, m2, [pb_64] {1to16}
vpermb m3, m15, m3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
mova [maskq], xm3
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8:
mova m8, [w_mask_shuf8]
vpbroadcastd m9, [pb_64]
jmp .w8_start
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 16
.w8_start:
vpermt2b m2, m8, m3
mova m3, m14
vpdpbusd m3, m2, m9
vpermb m3, m15, m3
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
mova [maskq], xm3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16:
mova m8, [w_mask_shuf16]
vpbroadcastd m9, [pb_64]
jmp .w16_start
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 16
.w16_start:
vpermt2b m2, m8, m3
mova m3, m14
vpdpbusd m3, m2, m9
vpermb m3, m15, m3
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
mova [maskq], xm3
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 32
.w32:
paddw m2, m3
mova m8, m14
vpdpwssd m8, m11, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
call .main
paddw m2, m3
mova m3, m14
vpdpwssd m3, m11, m2
vpermt2b m8, m15, m3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
mova [maskq], ym8
sub hd, 4
jg .w32_loop
RET
.w64_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 32
.w64:
mova m8, m2
mova m9, m3
mova [dstq+strideq*0+64*0], m0
mova [dstq+strideq*0+64*1], m1
call .main
paddw m8, m2
paddw m9, m3
mova m2, m14
vpdpwssd m2, m11, m8
mova m3, m14
vpdpwssd m3, m11, m9
vpermt2b m2, m15, m3
mova [dstq+strideq*1+64*0], m0
mova [dstq+strideq*1+64*1], m1
mova [maskq], ym2
sub hd, 2
jg .w64_loop
RET
.w128_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 64
.w128:
mova m16, m2
mova m8, m3
mova [dstq+strideq*0+64*0], m0
mova [dstq+strideq*0+64*1], m1
call .main
mova m17, m2
mova m9, m3
mova [dstq+strideq*0+64*2], m0
mova [dstq+strideq*0+64*3], m1
call .main
paddw m2, m16
paddw m3, m8
mova m16, m14
vpdpwssd m16, m11, m2
mova m8, m14
vpdpwssd m8, m11, m3
mova [dstq+strideq*1+64*0], m0
mova [dstq+strideq*1+64*1], m1
call .main
paddw m2, m17
paddw m3, m9
mova m17, m14
vpdpwssd m17, m11, m2
mova m9, m14
vpdpwssd m9, m11, m3
vpermt2b m16, m15, m8
vpermt2b m17, m15, m9
mova [dstq+strideq*1+64*2], m0
mova [dstq+strideq*1+64*3], m1
mova [maskq+32*0], ym16
mova [maskq+32*1], ym17
sub hd, 2
jg .w128_loop
vzeroupper
RET
ALIGN function_align
.main:
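; derives the mask from the operand difference: 64-m is
; saturate(27615 - |tmp1-tmp2|) >> 10, i.e. m = min(38 + ((|tmp1-tmp2| + 32)
; >> 10), 64). the blend is (tmp1*m + tmp2*(64-m) + rounding) >> 4 followed
; by the bidir shift; m is returned in m2/m3 for the callers to fold 2x2
; groups into the subsampled mask.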
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m10, m6
psrlw m6, 10 ; 64-m
psubw m2, m11, m6 ; m
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m12
vpdpwssd m0, m5, m1
mova m1, m12
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m10, m5
psrlw m5, 10
psubw m3, m11, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m12
vpdpwssd m1, m6, m4
mova m4, m12
vpdpwssd m4, m7, m5
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpsrlvw m0, m13
vpsrlvw m1, m13
ret
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
lea r7, [w_mask_422_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m9, [base+pw_64]
vpbroadcastd m10, [base+mask_round+r6*4]
vpbroadcastd m11, [base+bidir_shift+r6*4]
mov r6d, r7m ; sign
vpbroadcastd m12, [base+w_mask_round+r6*4]
mova ym13, [w_mask_end42x]
mov maskq, maskmp
add wq, r7
paddw m14, m9, m9 ; pw_128
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
ALIGN function_align
.main:
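; same blend as the 420 variant, but the mask is only subsampled
; horizontally: pairs of m values are summed via a pw_128 dot product with
; the sign-dependent rounding (m12) and 32 mask bytes are written per call.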
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m8, m6
psrlw m6, 10
psubw m2, m9, m6
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m10
vpdpwssd m0, m5, m1
mova m1, m10
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m8, m5
psrlw m5, 10
psubw m3, m9, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m10
vpdpwssd m1, m6, m4
mova m4, m10
vpdpwssd m4, m7, m5
mova m5, m12
vpdpwssd m5, m14, m2
mova m2, m12
vpdpwssd m2, m14, m3
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpermt2b m5, m13, m2
vpsrlvw m0, m11
vpsrlvw m1, m11
mova [maskq], ym5
add maskq, 32
ret
cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
lea r7, [w_mask_444_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m9, [base+pw_64]
vpbroadcastd m10, [base+mask_round+r6*4]
mova m11, [w_mask_end444]
vpbroadcastd m12, [base+bidir_shift+r6*4]
mov maskq, maskmp
add wq, r7
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
ALIGN function_align
.main:
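; same blend again with the mask kept at full resolution: the m values are
; packed straight to bytes via w_mask_end444 and 64 of them stored per call.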
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m8, m6
psrlw m6, 10
psubw m2, m9, m6
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m10
vpdpwssd m0, m5, m1
mova m1, m10
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m8, m5
psrlw m5, 10
psubw m3, m9, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m10
vpdpwssd m1, m6, m4
mova m4, m10
vpdpwssd m4, m7, m5
vpermt2b m2, m11, m3
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpsrlvw m0, m12
vpsrlvw m1, m12
mova [maskq], m2
add maskq, 64
ret
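; blend: dst = (dst*(64-m) + tmp*m + 32) >> 6, computed as
; dst + pmulhrsw(dst - tmp, m * -512) so that the 6-bit mask is applied
; with a single rounding multiply.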
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
lea r6, [blend_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
movifnidn maskq, maskmp
vpbroadcastd m6, [base+pw_m512]
add wq, r6
lea r6, [dsq*3]
jmp wq
.w4:
pmovzxbw ym19, [maskq]
movq xm16, [dstq+dsq*0]
movhps xm16, [dstq+dsq*1]
vpbroadcastq ym17, [dstq+dsq*2]
vpbroadcastq ym18, [dstq+r6 ]
pmullw ym19, ym6
vpblendd ym16, ym17, 0x30
vpblendd ym16, ym18, 0xc0
psubw ym17, ym16, [tmpq]
add maskq, 16
add tmpq, 32
pmulhrsw ym17, ym19
paddw ym16, ym17
vextracti128 xm17, ym16, 1
movq [dstq+dsq*0], xm16
movhps [dstq+dsq*1], xm16
movq [dstq+dsq*2], xm17
movhps [dstq+r6 ], xm17
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w4
vzeroupper
RET
.w8:
pmovzxbw m2, [maskq]
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
vinserti32x4 m0, [dstq+dsq*2], 2
vinserti32x4 m0, [dstq+r6 ], 3
pmullw m2, m6
psubw m1, m0, [tmpq]
add maskq, 32
add tmpq, 64
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
vextracti32x4 [dstq+dsq*2], m0, 2
vextracti32x4 [dstq+r6 ], m0, 3
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w8
RET
.w16:
pmovzxbw m4, [maskq+32*0]
pmovzxbw m5, [maskq+32*1]
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
mova ym1, [dstq+dsq*2]
vinserti32x8 m1, [dstq+r6 ], 1
pmullw m4, m6
pmullw m5, m6
psubw m2, m0, [tmpq+64*0]
psubw m3, m1, [tmpq+64*1]
add maskq, 32*2
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
mova [dstq+dsq*2], ym1
vextracti32x8 [dstq+r6 ], m1, 1
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w16
RET
.w32:
pmovzxbw m4, [maskq+32*0]
pmovzxbw m5, [maskq+32*1]
mova m0, [dstq+dsq*0]
mova m1, [dstq+dsq*1]
pmullw m4, m6
pmullw m5, m6
psubw m2, m0, [tmpq+ 64*0]
psubw m3, m1, [tmpq+ 64*1]
add maskq, 32*2
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w32
RET
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
lea r5, [blend_v_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
jmp wq
.w2:
vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
.w2_loop:
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
movq xmm1, [tmpq]
add tmpq, 4*2
psubw xmm1, xmm0, xmm1
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w2_loop
RET
.w4:
vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
.w4_loop:
movq xmm0, [dstq+dsq*0]
movhps xmm0, [dstq+dsq*1]
psubw xmm1, xmm0, [tmpq]
add tmpq, 8*2
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_loop
RET
.w8:
vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
.w8_loop:
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
psubw ym1, ym0, [tmpq]
add tmpq, 16*2
pmulhrsw ym1, ym2
paddw ym0, ym1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_loop
RET
.w16:
vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
.w16_loop:
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
psubw m1, m0, [tmpq]
add tmpq, 32*2
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w16_loop
RET
.w32:
mova m4, [obmc_masks_avx2+32*2]
.w32_loop:
mova m0, [dstq+dsq*0]
psubw m2, m0, [tmpq+ 64*0]
mova m1, [dstq+dsq*1]
psubw m3, m1, [tmpq+ 64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w32_loop
RET
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
lea r6, [$$]
tzcnt wd, wm
mov hd, hm
movsxd wq, [base+blend_h_avx512icl_table+wq*4]
lea maskq, [base+obmc_masks_avx2+hq*2]
lea hd, [hq*3]
lea wq, [base+blend_h_avx512icl_table+wq]
shr hd, 2 ; h * 3/4
lea maskq, [maskq+hq*2]
neg hq
jmp wq
.w2:
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
movd xmm2, [maskq+hq*2]
movq xmm1, [tmpq]
add tmpq, 4*2
punpcklwd xmm2, xmm2
psubw xmm1, xmm0, xmm1
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w2
RET
.w4:
mova xmm3, [blend_shuf]
.w4_loop:
movq xmm0, [dstq+dsq*0]
movhps xmm0, [dstq+dsq*1]
movd xmm2, [maskq+hq*2]
psubw xmm1, xmm0, [tmpq]
add tmpq, 8*2
pshufb xmm2, xmm3
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w4_loop
RET
.w8:
vbroadcasti32x4 ym3, [blend_shuf]
shufpd ym3, ym3, 0x0c
.w8_loop:
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
vpbroadcastd ym2, [maskq+hq*2]
psubw ym1, ym0, [tmpq]
add tmpq, 16*2
pshufb ym2, ym3
pmulhrsw ym1, ym2
paddw ym0, ym1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w8_loop
RET
.w16:
vbroadcasti32x4 m3, [blend_shuf]
shufpd m3, m3, 0xf0
.w16_loop:
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
vpbroadcastd m2, [maskq+hq*2]
psubw m1, m0, [tmpq]
add tmpq, 32*2
pshufb m2, m3
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w16_loop
RET
.w32:
vpbroadcastw m4, [maskq+hq*2]
vpbroadcastw m5, [maskq+hq*2+2]
mova m0, [dstq+dsq*0]
psubw m2, m0, [tmpq+ 64*0]
mova m1, [dstq+dsq*1]
psubw m3, m1, [tmpq+ 64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w32
RET
.w64:
vpbroadcastw m4, [maskq+hq*2]
mova m0, [dstq+64*0]
psubw m2, m0, [tmpq+64*0]
mova m1, [dstq+64*1]
psubw m3, m1, [tmpq+64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
inc hq
jl .w64
RET
.w128:
vpbroadcastw m8, [maskq+hq*2]
mova m0, [dstq+64*0]
psubw m4, m0, [tmpq+64*0]
mova m1, [dstq+64*1]
psubw m5, m1, [tmpq+64*1]
mova m2, [dstq+64*2]
psubw m6, m2, [tmpq+64*2]
mova m3, [dstq+64*3]
psubw m7, m3, [tmpq+64*3]
add tmpq, 64*4
REPX {pmulhrsw x, m8}, m4, m5, m6, m7
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
inc hq
jl .w128
RET
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
sub dword mx0m, 4<<14
sub dword src_wm, 8
mov r6, ~0
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
kmovq k6, r6
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pd_16384]
vpbroadcastd m7, [base+pd_63]
mova m24, [base+resize_permA]
mova m25, [base+resize_permB]
mova m26, [base+resize_permC]
mova m27, [base+resize_permD]
vbroadcasti32x4 m28, [base+resize_shufA]
vbroadcasti32x4 m29, [base+resize_shufB]
mova m30, [base+resize_permE]
vpbroadcastw ym31, pxmaxm
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
pslld m5, 4 ; dx*16
pslld m6, 14
pxor m2, m2
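; per group of 16 output pixels: source positions are mx >> 14, clipped to
; [0, src_w-8], with a 6-bit filter fraction from mx >> 8; if any position
; was clipped the slow path gathers the 8-pixel windows with edge
; replication via resize_shuf, otherwise four dword gathers load them
; directly. mx advances by dx per output pixel (m5 holds dx*16).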
.loop_y:
xor xd, xd
mova m4, m8 ; per-line working version of mx
.loop_x:
pmaxsd m0, m4, m2
psrad m9, m4, 8 ; filter offset (unmasked)
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
psubd m1, m4, m0 ; pshufb offset
psrad m0, 14 ; clipped src_x offset
psrad m1, 14 ; pshufb edge_emu offset
vptestmd k5, m1, m1
pand m9, m7 ; filter offset (masked)
ktestw k5, k5
jz .load
vpbroadcastq m14, [base+pd_0_4]
vpermq m10, m0, q1100
vpermq m11, m0, q3322
vpermq m20, m1, q1100
vpermq m21, m1, q3322
punpckldq m10, m10
punpckldq m11, m11
punpckldq m20, m20
punpckldq m21, m21
paddd m10, m14
paddd m11, m14
paddd m20, m14
paddd m21, m14
vextracti32x8 ym12, m10, 1
vextracti32x8 ym13, m11, 1
vextracti32x8 ym22, m20, 1
vextracti32x8 ym23, m21, 1
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
pshufb m16, m0
pshufb m17, m1
pshufb m18, m14
pshufb m19, m15
mova m20, m24
mova m22, m24
mova m21, m25
mova m23, m25
vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
mova m15, m26
mova m17, m26
mova m16, m27
mova m18, m27
vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
kmovq k1, k6
kmovq k2, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
jmp .filter
.load:
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
vpgatherdd m15{k3}, [srcq+m0*2+ 0]
vpgatherdd m16{k4}, [srcq+m0*2+ 4]
kmovq k1, k6
kmovq k2, k6
vpgatherdd m17{k1}, [srcq+m0*2+ 8]
vpgatherdd m18{k2}, [srcq+m0*2+12]
.filter:
mova m14, m2
vpdpwssd m14, m15, m10
vpdpwssd m14, m16, m11
vpdpwssd m14, m17, m12
vpdpwssd m14, m18, m13
psubd m14, m3, m14
psrad m14, 15
packusdw m14, m14
vpermq m14, m30, m14
pminsw ym14, ym31
mova [dstq+xq*2], ym14
paddd m4, m5
add xd, 16
cmp xd, dst_wd
jl .loop_x
add dstq, dst_strideq
add srcq, src_strideq
dec hd
jg .loop_y
RET
%endif ; ARCH_X86_64