| ; |
| ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| ; |
| ; Use of this source code is governed by a BSD-style license |
| ; that can be found in the LICENSE file in the root of the source |
| ; tree. An additional intellectual property rights grant can be found |
| ; in the file PATENTS. All contributing project authors may |
| ; be found in the AUTHORS file in the root of the source tree. |
| ; |
| |
| |
| EXPORT |vp8_sixtap_predict8x4_armv6| |
| |
| AREA |.text|, CODE, READONLY ; place the routine in the read-only .text code area |
| ;------------------------------------- |
| ; r0 unsigned char *src_ptr, source block origin; reads begin two rows above it (and, when xoffset != 0, two pixels to its left) |
| ; r1 int src_pixels_per_line, byte distance between consecutive source rows |
| ; r2 int xoffset, horizontal filter index into filter8_coeff (16 bytes per entry); 0 skips the horizontal filter |
| ; r3 int yoffset, vertical filter index into filter8_coeff (16 bytes per entry); 0 skips the vertical filter |
| ; stack unsigned char *dst_ptr, destination origin; 8 columns by 4 rows of bytes are written |
| ; stack int dst_pitch byte distance between consecutive destination rows |
| ;------------------------------------- |
| ;note: The first pass filters 9 source rows horizontally (or just copies them when xoffset=0) and stores the 8 results of each row transposed on the stack as 16-bit values. 184 bytes of stack are reserved: one word holding yoffset followed by the temporary buffer. |
| ;Each of the 8 buffer lines is 20 bytes apart: 9 halfwords (one per source row) plus 2 bytes to keep every line 4-byte aligned. In the second pass one buffer line at a time is loaded from the stack, |
| ;filtered vertically (or just copied when yoffset=0) and written down one destination column, which transposes the data back. xoffset/yoffset are not range-checked; the table visible in this file has 8 entries. |
| |vp8_sixtap_predict8x4_armv6| PROC |
| stmdb sp!, {r4 - r11, lr} |
| str r3, [sp, #-184]! ;reserve 184 bytes of stack for temporary storage and store yoffset in the lowest word, at [sp] |
| |
| cmp r2, #0 ;skip first_pass filter if xoffset=0 (the beq below uses these flags; the add in between leaves them alone) |
| add lr, sp, #4 ;lr = start of the temporary buffer, just above the saved yoffset |
| beq skip_firstpass_filter |
| |
| ;first-pass filter: r12 = address of the coefficient table, read through the _filter8_coeff_ pointer word; r0 is moved up two source rows |
| ldr r12, _filter8_coeff_ |
| sub r0, r0, r1, lsl #1 |
| |
| add r2, r12, r2, lsl #4 ;calculate filter location: table address + xoffset * 16 |
| add r0, r0, #3 ;adjust src only for loading convenience: the first loads below use offsets -5..-1, i.e. from 2 pixels left of the row start to 2 pixels right of it |
| |
| ldr r3, [r2] ; load up packed filter coefficients: r3 = taps 0,1; r4 = taps 2,3; r5 = taps 4,5 (even tap in the low halfword) |
| ldr r4, [r2, #4] |
| ldr r5, [r2, #8] |
| |
| mov r2, #0x90000 ; height=9 is top part of counter (rows left in bits 16 and up, inner-loop count in the low byte) |
| |
| sub r1, r1, #8 |
| |
| |first_pass_hloop_v6| |
| ldrb r6, [r0, #-5] ; load source data: the first five pixels of the filter window for this row |
| ldrb r7, [r0, #-4] |
| ldrb r8, [r0, #-3] |
| ldrb r9, [r0, #-2] |
| ldrb r10, [r0, #-1] |
| |
| orr r2, r2, #0x4 ; construct loop counter. width=8=4x2: 4 inner iterations producing 2 pixels each |
| |
| pkhbt r6, r6, r7, lsl #16 ; r7 | r6 (later pixel in the top halfword, earlier pixel in the bottom halfword) |
| pkhbt r7, r7, r8, lsl #16 ; r8 | r7 |
| |
| pkhbt r8, r8, r9, lsl #16 ; r9 | r8 |
| pkhbt r9, r9, r10, lsl #16 ; r10 | r9 |
| |
| |first_pass_wloop_v6| |
| smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]; r11 accumulates the first output pixel of the pair, r12 the second |
| smuad r12, r7, r3 |
| |
| ldrb r6, [r0], #1 |
| |
| smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3] |
| ldrb r7, [r0], #1 |
| smlad r12, r9, r4, r12 |
| |
| pkhbt r10, r10, r6, lsl #16 ; r6 | r10 (r6 is the pixel just loaded) |
| pkhbt r6, r6, r7, lsl #16 ; r7 | r6 (both just loaded) |
| smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5] |
| smlad r12, r6, r5, r12 |
| |
| sub r2, r2, #1 |
| |
| add r11, r11, #0x40 ; round_shift_and_clamp: add 64, shift right by 7, saturate to 0..255 |
| tst r2, #0xff ; test loop counter; these flags drive the movne group (which slides the pixel window on by two pixels) and the bne below |
| usat r11, #8, r11, asr #7 |
| add r12, r12, #0x40 |
| strh r11, [lr], #20 ; result is transposed and stored, which is why lr steps by one 20-byte buffer line per pixel |
| usat r12, #8, r12, asr #7 |
| |
| strh r12, [lr], #20 |
| |
| movne r11, r6 |
| movne r12, r7 |
| |
| movne r6, r8 |
| movne r7, r9 |
| movne r8, r10 |
| movne r9, r11 |
| movne r10, r12 |
| |
| bne first_pass_wloop_v6 |
| |
| ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines |
| ;;IF ARCHITECTURE=6 |
| ;pld [src, ppl] |
| ;;pld [src, r9] |
| ;;ENDIF |
| |
| subs r2, r2, #0x10000 |
| |
| sub lr, lr, #158 |
| |
| add r0, r0, r1 ; move to next input line (r1 = stride - 8, as r0 already advanced 8 bytes in the row); lr was stepped back by 158 = 8*20 - 2, i.e. to the next halfword of the first buffer line; the bne below uses the flags from the subs on the row count |
| |
| bne first_pass_hloop_v6 |
| |
| ;second pass filter (also entered from the skip_firstpass_filter path) |
| secondpass_filter |
| ldr r3, [sp], #4 ; load back yoffset (0 selects the copy-only path below); sp now points at the temporary buffer |
| ldr r0, [sp, #216] ; load dst address from stack 180+36 (180 bytes of temporary area still reserved + 36 bytes of saved registers) |
| ldr r1, [sp, #220] ; load dst stride from stack 180+40 |
| |
| cmp r3, #0 |
| beq skip_secondpass_filter |
| |
| ldr r12, _filter8_coeff_ |
| add lr, r12, r3, lsl #4 ;calculate filter location: table address + yoffset * 16 |
| |
| mov r2, #0x00080000 |
| |
| ldr r3, [lr] ; load up packed filter coefficients (r3 = taps 0,1; r4 = taps 2,3; r5 = taps 4,5) |
| ldr r4, [lr, #4] |
| ldr r5, [lr, #8] |
| |
| pkhbt r12, r4, r3 ; pack the filter differently: r12 = tap 1 | tap 2 and r11 = tap 3 | tap 4, used with smladx for the second output of each pair |
| pkhbt r11, r5, r4 |
| |
| second_pass_hloop_v6 |
| ldr r6, [sp] ; load the data: the first four halfwords of the current buffer line |
| ldr r7, [sp, #4] |
| |
| orr r2, r2, #2 ; loop counter: 2 inner iterations of 2 outputs each; bits 16 and up of r2 count the 8 columns |
| |
| second_pass_wloop_v6 |
| smuad lr, r3, r6 ; apply filter: lr accumulates one output, r10 the output one row further down |
| smulbt r10, r3, r6 |
| |
| ldr r8, [sp, #8] |
| |
| smlad lr, r4, r7, lr |
| smladx r10, r12, r7, r10 |
| |
| ldrh r9, [sp, #12] |
| |
| smlad lr, r5, r8, lr |
| smladx r10, r11, r8, r10 |
| |
| add sp, sp, #4 |
| smlatb r10, r5, r9, r10 |
| |
| sub r2, r2, #1 |
| |
| add lr, lr, #0x40 ; round_shift_and_clamp: add 64, shift right by 7, saturate to 0..255 |
| tst r2, #0xff |
| usat lr, #8, lr, asr #7 |
| add r10, r10, #0x40 |
| strb lr, [r0], r1 ; the result is transposed back and stored: consecutive outputs go to consecutive destination rows |
| usat r10, #8, r10, asr #7 |
| |
| strb r10, [r0],r1 |
| |
| movne r6, r7 |
| movne r7, r8 |
| |
| bne second_pass_wloop_v6 |
| |
| subs r2, r2, #0x10000 |
| add sp, sp, #12 ; update src for next loop (20-8): the inner loop already advanced sp by 8 of the 20 bytes in a buffer line |
| sub r0, r0, r1, lsl #2 |
| add r0, r0, #1 |
| |
| bne second_pass_hloop_v6 |
| |
| add sp, sp, #20 |
| ldmia sp!, {r4 - r11, pc} |
| |
| ;-------------------- xoffset=0: copy 9 source rows of 8 pixels into the transposed buffer without filtering |
| skip_firstpass_filter |
| sub r0, r0, r1, lsl #1 |
| sub r1, r1, #8 |
| mov r2, #9 |
| |
| skip_firstpass_hloop |
| ldrb r4, [r0], #1 ; load data: 8 consecutive source bytes per row, interleaved with the stores |
| subs r2, r2, #1 |
| ldrb r5, [r0], #1 |
| strh r4, [lr], #20 ; store it to the intermediate buffer as a halfword, one 20-byte buffer line per pixel |
| ldrb r6, [r0], #1 ; load data |
| strh r5, [lr], #20 |
| ldrb r7, [r0], #1 |
| strh r6, [lr], #20 |
| ldrb r8, [r0], #1 |
| strh r7, [lr], #20 |
| ldrb r9, [r0], #1 |
| strh r8, [lr], #20 |
| ldrb r10, [r0], #1 |
| strh r9, [lr], #20 |
| ldrb r11, [r0], #1 |
| strh r10, [lr], #20 |
| add r0, r0, r1 ; move to next input line (r1 = stride - 8) |
| strh r11, [lr], #20 |
| |
| sub lr, lr, #158 ; move over to next column: back 8 buffer lines (160) and forward one halfword (2); the bne below uses the flags from the subs on r2 |
| bne skip_firstpass_hloop |
| |
| b secondpass_filter |
| |
| ;-------------------- yoffset=0: copy the buffer to the destination without filtering |
| skip_secondpass_filter |
| mov r2, #8 |
| add sp, sp, #4 ;start from src[0] instead of src[-2]: skip the two halfwords that came from the rows above the block |
| |
| skip_secondpass_hloop |
| ldr r6, [sp], #4 |
| subs r2, r2, #1 |
| ldr r8, [sp], #4 |
| |
| mov r7, r6, lsr #16 ; unpack: each word holds two halfword pixels; strb stores the low byte of the register |
| strb r6, [r0], r1 |
| mov r9, r8, lsr #16 |
| strb r7, [r0], r1 |
| add sp, sp, #12 ; 20-8: advance to the next buffer line |
| strb r8, [r0], r1 |
| strb r9, [r0], r1 |
| |
| sub r0, r0, r1, lsl #2 |
| add r0, r0, #1 |
| |
| bne skip_secondpass_hloop |
| |
| add sp, sp, #16 ; 180 - (160 +4): release the rest of the temporary area |
| |
| ldmia sp!, {r4 - r11, pc} |
| |
| ENDP |
| |
| ;----------------- |
| AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default; nothing in this file writes to the table |
| ;Data area named subpelfilters8_dat. _filter8_coeff_ is one word holding the address of filter8_coeff; the code in this file loads that word to locate the table. |
| ;filter8_coeff holds 8 entries of 4 words (16 bytes) each: three words of packed 16-bit filter taps followed by a zero word. |
| ;Within each word the even-numbered tap is in the low halfword and the odd-numbered tap in the high halfword; the unpacked tap values are listed in the commented-out table below. |
| _filter8_coeff_ |
| DCD filter8_coeff |
| filter8_coeff |
| DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 |
| DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 |
| DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 |
| DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 |
| DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 |
| DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 |
| DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 |
| DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 |
| |
| ;DCD 0, 0, 128, 0, 0, 0 |
| ;DCD 0, -6, 123, 12, -1, 0 |
| ;DCD 2, -11, 108, 36, -8, 1 |
| ;DCD 0, -9, 93, 50, -6, 0 |
| ;DCD 3, -16, 77, 77, -16, 3 |
| ;DCD 0, -6, 50, 93, -9, 0 |
| ;DCD 1, -8, 36, 108, -11, 2 |
| ;DCD 0, -1, 12, 123, -6, 0 |
| |
| END |