;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x4_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code
;-------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Two-pass 6-tap prediction of an 8-wide x 4-high block (ARM state, ARMv6
; media instructions).
;
; Arguments:
;   r0    unsigned char *src_ptr,
;   r1    int  src_pixels_per_line,
;   r2    int  xoffset,   selects the horizontal filter (0 = no horizontal filter)
;   r3    int  yoffset,   selects the vertical filter   (0 = no vertical filter)
;   stack unsigned char *dst_ptr,
;   stack int  dst_pitch
;   xoffset/yoffset index the 8-entry table filter8_coeff (16 bytes per entry);
;   values outside 0..7 would read past the table -- callers presumably never
;   pass them (TODO confirm).
;
; Source bytes read: rows -2..+6 relative to src_ptr; columns -2..+10 when
; xoffset != 0, columns 0..7 when xoffset == 0.
; Output: 8 columns x 4 rows of bytes at dst_ptr, row stride dst_pitch.
; r4-r11 are saved and restored; r0-r3, r12 and flags are clobbered.
;
; Stack frame (S = sp after the 184-byte reservation):
;   [S + 0]          yoffset
;   [S + 4 + 20*c]   line c (c = 0..7, one per output column): 9 halfwords, the
;                    first-pass results for source rows -2..+6, plus 2 bytes of
;                    padding so every line starts 4-byte aligned
;   [S + 184 + 36]   dst_ptr   (36 = the nine registers pushed on entry)
;   [S + 184 + 40]   dst_pitch
;
; The first pass stores its results transposed (one line per column), so the
; second pass can fetch vertically adjacent samples as packed halfword pairs
; with word loads; the second pass writes down each dst column, transposing back.
;
; sp is never moved above live data: the first pass walks the buffer with lr
; while sp stays at S, because memory below sp may be overwritten
; asynchronously (e.g. by an interrupt or signal handler using this stack).
; In the second pass sp itself walks the buffer, which is safe since
; everything still to be read lies at or above sp.
;-------------------------------------
|vp8_sixtap_predict8x4_armv6| PROC
    stmdb       sp!, {r4 - r11, lr}
    sub         sp, sp, #184                ; reserve the frame described above: 20x(8+1) + 4

    cmp         r2, #0                      ; xoffset == 0 -> no horizontal filtering
    str         r3, [sp]                    ; keep yoffset at [S]; sp is left unchanged
    add         lr, sp, #4                  ; lr = write pointer into the transposed buffer
    beq         skip_firstpass_filter       ; (str/add leave the flags from cmp intact)

;first-pass filter (horizontal), 9 source rows x 8 columns
    ldr         r12, _filter8_coeff_        ; r12 = address of filter8_coeff
    sub         r0, r0, r1, lsl #1          ; start two rows above the block

    add         r2, r12, r2, lsl #4         ; r2 = &filter8_coeff[xoffset] (16 bytes per entry)
    add         r0, r0, #3                  ; bias src by 3 so the loop's post-increment loads
                                            ; fetch columns +3, +4, ...

    ldr         r3, [r2]                    ; r3 = tap1:tap0 (packed signed halfwords)
    ldr         r4, [r2, #4]                ; r4 = tap3:tap2
    ldr         r5, [r2, #8]                ; r5 = tap5:tap4

    mov         r2, #0x90000                ; bits 16+ : rows remaining (9)

    sub         r1, r1, #8                  ; r1 = stride - 8 (r0 advances 8 per row)

|first_pass_hloop_v6|
    ldrb        r6, [r0, #-5]               ; pixels at columns -2..+2 of this row
    ldrb        r7, [r0, #-4]
    ldrb        r8, [r0, #-3]
    ldrb        r9, [r0, #-2]
    ldrb        r10, [r0, #-1]

    orr         r2, r2, #0x4                ; bits 0-7 : inner iterations (4 x 2 pixels = 8)

    pkhbt       r6, r6, r7, lsl #16         ; r6 = p[-1]:p[-2]
    pkhbt       r7, r7, r8, lsl #16         ; r7 = p[0]:p[-1]

    pkhbt       r8, r8, r9, lsl #16         ; r8 = p[1]:p[0]
    pkhbt       r9, r9, r10, lsl #16        ; r9 = p[2]:p[1]   (r10 = p[2])

|first_pass_wloop_v6|
    ; r11 accumulates the output for column x, r12 the output for column x+1
    smuad       r11, r6, r3                 ; taps 0,1
    smuad       r12, r7, r3

    ldrb        r6, [r0], #1                ; next pixel, p[3]

    smlad       r11, r8, r4, r11            ; taps 2,3
    ldrb        r7, [r0], #1                ; next pixel, p[4]
    smlad       r12, r9, r4, r12

    pkhbt       r10, r10, r6, lsl #16       ; r10 = p[3]:p[2]
    pkhbt       r6, r6, r7, lsl #16         ; r6  = p[4]:p[3]
    smlad       r11, r10, r5, r11           ; taps 4,5
    smlad       r12, r6, r5, r12

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round, shift right by 7, clamp to 0..255
    tst         r2, #0xff                   ; any inner iterations left? (flags used below)
    usat        r11, #8, r11, asr #7
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; transposed store: each column goes to its own
    usat        r12, #8, r12, asr #7        ; 20-byte line

    strh        r12, [lr], #20

    ; slide the pixel window right by two for the next pair of outputs
    movne       r11, r6                     ; temp = p[4]:p[3]
    movne       r12, r7                     ; temp = p[4]

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one source row done

    sub         lr, lr, #158                ; back to line 0, next halfword slot: 8*20 - 2

    add         r0, r0, r1                  ; move to next input line

    bne         first_pass_hloop_v6

;second pass filter (vertical)
secondpass_filter
    ldr         r3, [sp], #4                ; r3 = yoffset; sp now points at line 0 of the buffer
    ldr         r0, [sp, #216]              ; dst address from stack: 180 + 36
    ldr         r1, [sp, #220]              ; dst stride from stack:  180 + 40

    cmp         r3, #0
    beq         skip_secondpass_filter

    ldr         r12, _filter8_coeff_        ; r12 = address of filter8_coeff
    add         lr, r12, r3, lsl #4         ; lr = &filter8_coeff[yoffset]

    mov         r2, #0x00080000             ; bits 16+ : buffer lines (dst columns) remaining

    ldr         r3, [lr]                    ; r3 = tap1:tap0
    ldr         r4, [lr, #4]                ; r4 = tap3:tap2
    ldr         r5, [lr, #8]                ; r5 = tap5:tap4

    pkhbt       r12, r4, r3                 ; r12 = tap1:tap2, for the odd-aligned output
    pkhbt       r11, r5, r4                 ; r11 = tap3:tap4

second_pass_hloop_v6
    ldr         r6, [sp]                    ; r6 = d1:d0 (first two samples of this line)
    ldr         r7, [sp, #4]                ; r7 = d3:d2

    orr         r2, r2, #2                  ; bits 0-7 : inner iterations (2 x 2 outputs = 4)

second_pass_wloop_v6
    ; lr  accumulates the output starting at the even sample (d0..d5),
    ; r10 accumulates the output starting one sample later    (d1..d6)
    smuad       lr, r3, r6                  ; tap0*d0 + tap1*d1
    smulbt      r10, r3, r6                 ; tap0*d1

    ldr         r8, [sp, #8]                ; r8 = d5:d4

    smlad       lr, r4, r7, lr              ; + tap2*d2 + tap3*d3
    smladx      r10, r12, r7, r10           ; + tap2*d3 + tap1*d2

    ldrh        r9, [sp, #12]               ; r9 = d6

    smlad       lr, r5, r8, lr              ; + tap4*d4 + tap5*d5
    smladx      r10, r11, r8, r10           ; + tap4*d5 + tap3*d4

    add         sp, sp, #4                  ; advance two samples along the line
    smlatb      r10, r5, r9, r10            ; + tap5*d6

    sub         r2, r2, #1

    add         lr, lr, #0x40               ; round, shift right by 7, clamp to 0..255
    tst         r2, #0xff
    usat        lr, #8, lr, asr #7
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; store down the dst column (transposing back)
    usat        r10, #8, r10, asr #7

    strb        r10, [r0],r1

    movne       r6, r7                      ; slide the sample window by two
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one dst column done
    add         sp, sp, #12                 ; advance to the next 20-byte line (20 - 8)
    sub         r0, r0, r1, lsl #2          ; back up the four rows just written
    add         r0, r0, #1                  ; and step to the next dst column

    bne         second_pass_hloop_v6

    add         sp, sp, #20                 ; release the rest of the frame: 184 - (4 + 160)
    ldmia       sp!, {r4 - r11, pc}

;--------------------
; xoffset == 0: copy source columns 0..7 of rows -2..+6 into the transposed
; buffer unfiltered, widening each byte to a halfword.
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start two rows above the block
    sub         r1, r1, #8                  ; r1 = stride - 8 (r0 advances 8 per row)
    mov         r2, #9                      ; rows to copy

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags consumed by the bne at the loop end
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to the intermediate buffer
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; back to line 0, next halfword slot: 8*20 - 2
    bne         skip_firstpass_hloop

    b           secondpass_filter

;--------------------
; yoffset == 0: copy the samples for rows 0..3 of each buffer line straight
; to dst, one dst column per line.
skip_secondpass_filter
    mov         r2, #8                      ; buffer lines (dst columns) to copy
    add         sp, sp, #4                  ; start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [sp], #4                ; r6 = row1:row0
    subs        r2, r2, #1                  ; flags consumed by the bne at the loop end
    ldr         r8, [sp], #4                ; r8 = row3:row2

    mov         r7, r6, lsr #16             ; unpack
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; advance to the next 20-byte line (20 - 8)
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up the four rows just written
    add         r0, r0, #1                  ; and step to the next dst column

    bne         skip_secondpass_hloop

    add         sp, sp, #16                 ; release the rest of the frame: 184 - (4 + 4 + 160)

    ldmia       sp!, {r4 - r11, pc}

    ENDP

;-----------------
    AREA    subpelfilters8_dat, DATA, READWRITE         ; data area holding the six-tap filter table
;_filter8_coeff_ is a single word holding the address of filter8_coeff; the
;code loads that word to obtain the table base.
;filter8_coeff holds 8 filter entries of 4 words (16 bytes) each, so the code
;locates an entry with base + (index << 4) and reads its first three words.
;Each of those words packs two signed 16-bit taps, lower-numbered tap in the
;low halfword:
;   word 0 = tap1:tap0,  word 1 = tap3:tap2,  word 2 = tap5:tap4,
;   word 3 = 0, padding that keeps the entry size at 16 bytes.
_filter8_coeff_
    DCD     filter8_coeff
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000

    ;Unpacked tap values (tap0 .. tap5) of the eight entries above, for reference:
    ;DCD        0,  0,  128,    0,   0,  0
    ;DCD        0, -6,  123,   12,  -1,  0
    ;DCD        2, -11, 108,   36,  -8,  1
    ;DCD        0, -9,   93,   50,  -6,  0
    ;DCD        3, -16,  77,   77, -16,  3
    ;DCD        0, -6,   50,   93,  -9,  0
    ;DCD        1, -8,   36,  108, -11,  2
    ;DCD        0, -1,   12,  123,  -6,  0

    END
