1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
;-----------------------------------------------------------------------
; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred,
;                                   int pitch, unsigned char *dst,
;                                   int stride)
;
; Syntax: ARM armasm (EXPORT/AREA/PROC). Uses ARMv6 media instructions
;         (SMULxy/SMULWy, PKHBT/PKHTB, UADD16/USUB16, USAT, SXTH).
;
; Purpose: two-pass 4x4 inverse transform of 16 signed 16-bit
;          coefficients; the result is added to a 4x4 block of
;          prediction bytes, clamped to 0..255 and written to dst.
;   pass 1 (loop1_dual): 1-D transform down the columns, two columns per
;          iteration. Results are stored back into the input buffer.
;   pass 2 (loop2_dual): 1-D transform along the rows, two rows per
;          iteration. Each value is rounded as (x + 4) >> 3, added to
;          the matching pred byte, saturated to 8 bits and stored.
;
; In:   r0   = input  (16 shorts, row-major; read and OVERWRITTEN)
;       r1   = pred   (4 rows of 4 bytes, rows 'pitch' bytes apart)
;       r2   = pitch  (byte step between pred rows)
;       r3   = dst    (4 rows of 4 bytes, rows 'stride' bytes apart)
;       [sp] = stride (byte step between dst rows; 5th argument)
; Out:  none (dst written; input buffer holds pass-1 intermediates)
; Regs: r4-r11 are saved/restored. r0, r1, r3, r12 and lr (r14) are
;       modified; r2 is only read. NZCV is written by the two SUBS.
; Stack: 9 registers pushed (36 bytes) + one 4-byte slot, so inside the
;       body the caller's 'stride' argument is at [sp, #40].
;
; NOTE(review): input and dst are accessed with word LDR/STR; this
;       presumably relies on 4-byte alignment or on unaligned access
;       being enabled - confirm against the callers.
;
; Notation: "X | Y" means X in bits 31:16 and Y in bits 15:0.
;           Nc = (ip[N] * cospi8sqrt2minus1) >> 16
;           Ns = (ip[N] * sinpi8sqrt2) >> 16
;-----------------------------------------------------------------------
|vp8_short_idct4x4llm_v6_dual| PROC
; save callee-saved registers and the return address
stmdb sp!, {r4-r11, lr}
; reserve a 4-byte slot at [sp] for the saved input pointer
sub sp, sp, #4
mov r4, #0x00008A00 ; r4 = 0x8A8C = 35468, built in two steps
orr r4, r4, #0x0000008C ; sinpi8sqrt2 (~ sqrt(2)*sin(pi/8) * 65536)
mov r5, #0x00004E00 ; low half of r5 = 0x4E7B = 20091
orr r5, r5, #0x0000007B ; cospi8sqrt2minus1 (~ (sqrt(2)*cos(pi/8) - 1) * 65536)
orr r5, r5, #1<<31 ; loop counter on top bit
;
; Loop control (both loops): bit 31 of r5 is a two-iteration counter.
;   "subs r5, r5, #1<<31" with bit 31 set   -> no borrow, C = 1, loop again
;   "subs r5, r5, #1<<31" with bit 31 clear -> borrow,    C = 0, fall out
; The second subtract also sets bit 31 again, which re-arms the counter
; for loop2_dual. Only the bottom half of r5 is used as a multiplicand
; (SMULBx), so the counter bit never affects the arithmetic. No
; instruction between each SUBS and its BCS writes NZCV.
;
; ---- pass 1: columns, two columns (c, c+1) per iteration ----
loop1_dual
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12, [r0, #(12*2)] ; i13|i12
ldr r14, [r0, #(8*2)] ; i9 | i8
smulbt r9, r5, r6 ; ip[5] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
smulbb r7, r5, r6 ; ip[4] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
smulbt r11, r5, r12 ; ip[13] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
pkhtb r7, r9, r7, asr #16 ; 5c | 4c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
smulbb r9, r5, r12 ; ip[12] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
subs r5, r5, #1<<31 ; i-- (sets C for the bcs at the end of the loop)
pkhtb r9, r11, r9, asr #16 ; 13c | 12c
ldr r11, [r0] ; i1 | i0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c = (5s | 4s) - (13c+13 | 12c+12)
uadd16 r6, r6, r10 ; d = (5c+5 | 4c+4) + (13s | 12s)
uadd16 r10, r11, r14 ; a = (i1 | i0) + (i9 | i8)
usub16 r8, r11, r14 ; b = (i1 | i0) - (i9 | i8)
uadd16 r9, r10, r6 ; a+d -> row 0
usub16 r10, r10, r6 ; a-d -> row 3
uadd16 r6, r8, r7 ; b+c -> row 1
usub16 r7, r8, r7 ; b-c -> row 2
; use input buffer to store intermediate results
str r6, [r0, #(4*2)] ; o5 | o4
str r7, [r0, #(8*2)] ; o9 | o8
str r10,[r0, #(12*2)] ; o13|o12
str r9, [r0], #4 ; o1 | o0, then r0 += 4 (next two columns)
bcs loop1_dual
sub r0, r0, #8 ; undo the two post-increments: r0 = start of buffer
; save the buffer pointer: loop2_dual reuses r0 as a data register
str r0, [sp]
;
; ---- pass 2: rows, two rows (r, r+1) per iteration ----
; Packed values hold row r in the top half and row r+1 in the bottom
; half. The ip[] indices in the comments are relative to the current r0.
loop2_dual
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12,[r0, #(2*2)] ; i3 | i2
ldr r14,[r0, #(6*2)] ; i7 | i6
ldr r0, [r0, #(0*2)] ; i1 | i0 (r0 no longer holds the pointer)
smulbt r9, r5, r6 ; ip[5] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
smulbt r7, r5, r0 ; ip[1] * cospi8sqrt2minus1 (32-bit product; top half kept by pkhtb)
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
pkhbt r11, r6, r0, lsl #16 ; i0 | i4
pkhtb r7, r7, r9, asr #16 ; 1c | 5c
pkhtb r0, r0, r6, asr #16 ; i1 | i5
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s (first operand of c)
uadd16 r0, r7, r0 ; 1c+1 | 5c+5 (first operand of d)
pkhbt r9, r14, r12, lsl #16 ; i2 | i6
uadd16 r10, r11, r9 ; a = (i0 | i4) + (i2 | i6)
usub16 r9, r11, r9 ; b = (i0 | i4) - (i2 | i6)
pkhtb r6, r12, r14, asr #16 ; i3 | i7
subs r5, r5, #1<<31 ; i-- (sets C for the bcs at the end of the loop)
smulbt r7, r5, r6 ; ip[3] * cospi8sqrt2minus1 (32-bit product; top half kept by pkhtb)
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
smulbb r12, r5, r6 ; ip[7] * cospi8sqrt2minus1 (32-bit product; >>16 done by pkhtb)
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
pkhtb r7, r7, r12, asr #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s (second operand of d)
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 (second operand of c)
usub16 r12, r8, r6 ; c = (1s | 5s) - (3c+3 | 7c+7)
uadd16 r6, r11, r0 ; d = (3s | 7s) + (1c+1 | 5c+5)
uadd16 r7, r10, r6 ; a+d
mov r8, #4 ; set up 4's (rounding term for the >> 3)
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d
uadd16 r6, r6, r8 ; a-d+4, 3|7
uadd16 r7, r7, r8 ; a+d+4, 0|4
uadd16 r10, r9, r12 ; b+c
usub16 r0, r9, r12 ; b-c
uadd16 r10, r10, r8 ; b+c+4, 1|5
uadd16 r8, r0, r8 ; b-c+4, 2|6
; -- first row of the pair: values live in the top halfwords, so
;    "asr #19" is a signed (top half) >> 3
ldr lr, [sp, #40] ; dst stride (5th argument, see stack layout above)
ldrb r0, [r1] ; pred p0
ldrb r11, [r1, #1] ; pred p1
ldrb r12, [r1, #2] ; pred p2
add r0, r0, r7, asr #19 ; p0 + o0
add r11, r11, r10, asr #19 ; p1 + o1
add r12, r12, r8, asr #19 ; p2 + o2
usat r0, #8, r0 ; d0 = clip8(p0 + o0)
usat r11, #8, r11 ; d1 = clip8(p1 + o1)
usat r12, #8, r12 ; d2 = clip8(p2 + o2)
add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
ldrb r11, [r1, #3] ; pred p3
add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
add r11, r11, r6, asr #19 ; p3 + o3
; -- second row of the pair: sign-extend the bottom halfwords so a
;    plain "asr #3" can be used below
sxth r7, r7 ; o4 term (a+d+4), sign-extended
sxth r10, r10 ; o5 term (b+c+4), sign-extended
usat r11, #8, r11 ; d3 = clip8(p3 + o3)
sxth r8, r8 ; o6 term (b-c+4), sign-extended
sxth r6, r6 ; o7 term (a-d+4), sign-extended
add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
ldrb r12, [r1, r2]! ; pred p4 (writeback: r1 += pitch)
; store the first row of the pair, then dst += stride
str r0, [r3], lr
ldrb r11, [r1, #1] ; pred p5
add r12, r12, r7, asr #3 ; p4 + o4
add r11, r11, r10, asr #3 ; p5 + o5
usat r12, #8, r12 ; d4 = clip8(p4 + o4)
usat r11, #8, r11 ; d5 = clip8(p5 + o5)
ldrb r7, [r1, #2] ; pred p6
ldrb r10, [r1, #3] ; pred p7
add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
add r7, r7, r8, asr #3 ; p6 + o6
add r10, r10, r6, asr #3 ; p7 + o7
ldr r0, [sp] ; reload saved input pointer (start of buffer)
usat r7, #8, r7 ; d6 = clip8(p6 + o6)
usat r10, #8, r10 ; d7 = clip8(p7 + o7)
add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
; store the second row of the pair, then dst += stride
str r12, [r3], lr
; r0 = buffer start + 16 bytes = rows 2 and 3 for the second iteration
add r0, r0, #16
add r1, r1, r2 ; pred + pitch (r1 now points at the next row pair)
bcs loop2_dual
add sp, sp, #4 ; release the 4-byte saved-pointer slot
ldmia sp!, {r4 - r11, pc} ; restore callee-saved registers and return
ENDP
END
|