/*
 * Copyright © 2008 Mozilla Corporation
 * Copyright © 2010 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Jeff Muizelaar (jeff@infidigm.net)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
        .func fname
        .global fname
#ifdef __ELF__
        .hidden fname
        .type fname, %function
#endif
fname:
.endm

/*
 * The code below was generated by gcc 4.3.4 from the commented-out
 * functions in the 'pixman-arm-simd.c' file, with the following
 * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
 *
 * TODO: replace the gcc-generated code with hand-tuned versions, because
 * the code quality is not very good, and introduce symbolic register
 * aliases for better readability and maintainability.
 */
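
/*
 * For reference, a rough C sketch of the per-pixel math the generated
 * functions below implement (an illustration only, not the original
 * commented-out code from 'pixman-arm-simd.c'; the helper names here are
 * made up):
 *
 *     static inline uint8_t add_8 (uint8_t d, uint8_t s)
 *     {
 *         int t = d + s;                // UQADD8 does 4 of these at once
 *         return t > 255 ? 255 : t;
 *     }
 *
 *     static inline uint8_t mul_div_255 (uint8_t x, uint8_t a)
 *     {
 *         int t = x * a + 0x80;         // the 0x00800080 rounding bias
 *         return (t + (t >> 8)) >> 8;   // x * a / 255, rounded
 *     }
 *
 *     // OVER: dst = src + dst * (255 - src_alpha) / 255, per component
 *     static inline uint8_t over_8 (uint8_t s, uint8_t d, uint8_t sa)
 *     {
 *         return add_8 (s, mul_div_255 (d, 255 - sa));
 *     }
 *
 * The UXTB16/UXTAB16 pairs in the code spread the four 8-bit components of
 * a pixel across two registers as 16-bit lanes, so that two components can
 * be multiplied and renormalized at a time.
 */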
pixman_asm_function pixman_composite_add_8_8_asm_armv6
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        mov     r10, r1
        sub     sp, sp, #4
        subs    r10, r10, #1
        mov     r11, r0
        mov     r8, r2
        str     r3, [sp]
        ldr     r7, [sp, #36]
        bcc     0f
6:      cmp     r11, #0
        beq     1f
        orr     r3, r8, r7
        tst     r3, #3
        beq     2f
        mov     r1, r8
        mov     r0, r7
        mov     r12, r11
        b       3f
5:      tst     r3, #3
        beq     4f
3:      ldrb    r2, [r0], #1
        subs    r12, r12, #1
        ldrb    r3, [r1]
        uqadd8  r3, r2, r3
        strb    r3, [r1], #1
        orr     r3, r1, r0
        bne     5b
1:      ldr     r3, [sp]
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
10:     subs    r10, r10, #1
        bcs     6b
0:      add     sp, sp, #4
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
2:      mov     r12, r11
        mov     r1, r8
        mov     r0, r7
4:      cmp     r12, #3
        subgt   r6, r12, #4
        movgt   r9, r12
        lsrgt   r5, r6, #2
        addgt   r3, r5, #1
        movgt   r12, #0
        lslgt   r4, r3, #2
        ble     7f
8:      ldr     r3, [r0, r12]
        ldr     r2, [r1, r12]
        uqadd8  r3, r3, r2
        str     r3, [r1, r12]
        add     r12, r12, #4
        cmp     r12, r4
        bne     8b
        sub     r3, r9, #4
        bic     r3, r3, #3
        add     r3, r3, #4
        subs    r12, r6, r5, lsl #2
        add     r1, r1, r3
        add     r0, r0, r3
        beq     1b
7:      mov     r4, #0
9:      ldrb    r3, [r1, r4]
        ldrb    r2, [r0, r4]
        uqadd8  r3, r2, r3
        strb    r3, [r1, r4]
        add     r4, r4, #1
        cmp     r4, r12
        bne     9b
        ldr     r3, [sp]
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
        b       10b
.endfunc

pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #20
        cmp     r1, #0
        mov     r12, r2
        str     r1, [sp, #12]
        str     r0, [sp, #16]
        ldr     r2, [sp, #52]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp]
        ldr     r3, [sp, #56]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
6:      ldr     r11, [sp, #8]
1:      ldr     r9, [sp]
        mov     r0, r12
        add     r12, r12, r9
        mov     r1, r2
        str     r12, [sp, #4]
        add     r2, r2, r11
        ldr     r12, [sp, #16]
        ldr     r3, =0x00800080
        ldr     r9, =0xff00ff00
        mov     r11, #255
        cmp     r12, #0
        beq     4f
5:      ldr     r5, [r1], #4
        ldr     r4, [r0]
        sub     r8, r11, r5, lsr #24
        uxtb16  r6, r4
        uxtb16  r7, r4, ror #8
        mla     r6, r6, r8, r3
        mla     r7, r7, r8, r3
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        and     r7, r7, r9
        uxtab16 r6, r7, r6, ror #8
        uqadd8  r5, r6, r5
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     5b
4:      ldr     r3, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r3
        ldr     r12, [sp, #4]
        bne     6b
0:      add     sp, sp, #20
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc

pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        str     r1, [sp, #12]
        ldrb    r1, [sp, #71]
        mov     r12, r2
        str     r0, [sp, #16]
        ldr     r2, [sp, #60]
        str     r1, [sp, #24]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #20]
        ldr     r3, [sp, #64]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
5:      ldr     r11, [sp, #8]
1:      ldr     r4, [sp, #20]
        mov     r0, r12
        mov     r1, r2
        add     r12, r12, r4
        add     r2, r2, r11
        str     r12, [sp]
        str     r2, [sp, #4]
        ldr     r12, [sp, #16]
        ldr     r2, =0x00800080
        ldr     r3, [sp, #24]
        mov     r11, #255
        cmp     r12, #0
        beq     3f
4:      ldr     r5, [r1], #4
        ldr     r4, [r0]
        uxtb16  r6, r5
        uxtb16  r7, r5, ror #8
        mla     r6, r6, r3, r2
        mla     r7, r7, r3, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4
        uxtb16  r7, r4, ror #8
        sub     r8, r11, r5, lsr #24
        mla     r6, r6, r8, r2
        mla     r7, r7, r8, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r1, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r1
        ldr     r12, [sp]
        ldr     r2, [sp, #4]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc

pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        ldr     r9, [sp, #60]
        str     r1, [sp, #12]
        bic     r1, r9, #-16777216
        str     r1, [sp, #20]
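        /*
         * Interleaved with the argument shuffling around it, the solid
         * source color from r9 is split into two halfword pairs,
         * 0x00rr00bb (kept at [sp, #20]) and 0x00aa00gg (kept at
         * [sp, #4]), so that each MLA in the pixel loop can scale two
         * color components by the 8-bit mask value at once.
         */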
        mov     r12, r2
        lsr     r1, r9, #8
        ldr     r2, [sp, #20]
        bic     r1, r1, #-16777216
        bic     r2, r2, #65280
        bic     r1, r1, #65280
        str     r2, [sp, #20]
        str     r0, [sp, #16]
        str     r1, [sp, #4]
        ldr     r2, [sp, #68]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #24]
        mov     r0, #0
        b       1f
5:      ldr     r3, [sp, #24]
1:      ldr     r4, [sp, #72]
        mov     r10, r12
        mov     r1, r2
        add     r12, r12, r3
        add     r2, r2, r4
        str     r12, [sp, #8]
        str     r2, [sp]
        ldr     r12, [sp, #16]
        ldr     r11, =0x00800080
        ldr     r2, [sp, #4]
        ldr     r3, [sp, #20]
        cmp     r12, #0
        beq     3f
4:      ldrb    r5, [r1], #1
        ldr     r4, [r10]
        mla     r6, r3, r5, r11
        mla     r7, r2, r5, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4
        uxtb16  r7, r4, ror #8
        mvn     r8, r5
        lsr     r8, r8, #24
        mla     r6, r6, r8, r11
        mla     r7, r7, r8, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r10], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r4, [sp, #12]
        add     r0, r0, #1
        cmp     r0, r4
        ldr     r12, [sp, #8]
        ldr     r2, [sp]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc

/*
 * Note: This code uses only armv5te instructions (not even armv6), but is
 * scheduled for the ARM Cortex-A8 pipeline, so it might need to be split
 * into a few variants, each tuned for its microarchitecture.
 *
 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
 * have efficient write combining), it needs to be changed to use 16-byte
 * aligned writes using the STM instruction.
 *
 * The nearest scanline scaler macro template takes the following arguments:
 *  fname                     - name of the function to generate
 *  bpp_shift                 - (1 << bpp_shift) is the size of a pixel
 *                              in bytes
 *  t                         - type suffix for the LDR/STR instructions
 *  prefetch_distance         - prefetch in the source image by that many
 *                              pixels ahead
 *  prefetch_braking_distance - stop prefetching when that many pixels are
 *                              remaining before the end of scanline
 *
 * A rough C equivalent of a generated function is sketched in a comment at
 * the end of this file.
 */
.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
                                      prefetch_distance,        \
                                      prefetch_braking_distance

pixman_asm_function fname
        W       .req    r0
        DST     .req    r1
        SRC     .req    r2
        VX      .req    r3
        UNIT_X  .req    ip
        TMP1    .req    r4
        TMP2    .req    r5
        VXMASK  .req    r6
        PF_OFFS .req    r7

        ldr     UNIT_X, [sp]
        push    {r4, r5, r6, r7}
        mvn     VXMASK, #((1 << bpp_shift) - 1)

        /* define helper macro */
        .macro  scale_2_pixels
                ldr&t   TMP1, [SRC, TMP1]
                and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP1, [DST], #(1 << bpp_shift)

                ldr&t   TMP2, [SRC, TMP2]
                and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP2, [DST], #(1 << bpp_shift)
        .endm

        /* now do the scaling */
        and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
        add     VX, VX, UNIT_X
        subs    W, W, #(8 + prefetch_braking_distance)
        blt     2f
        /* calculate prefetch offset */
        mov     PF_OFFS, #prefetch_distance
        mla     PF_OFFS, UNIT_X, PF_OFFS, VX
1:      /* main loop, process 8 pixels per iteration with prefetch */
        subs    W, W, #8
        add     PF_OFFS, UNIT_X, lsl #3
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
        bge     1b
2:
        subs    W, W, #(4 - 8 - prefetch_braking_distance)
        blt     2f
1:      /* process the remaining pixels */
        scale_2_pixels
        scale_2_pixels
        subs    W, W, #4
        bge     1b
2:
        tst     W, #2
        beq     2f
        scale_2_pixels
2:
        tst     W, #1
        ldrne&t TMP1, [SRC, TMP1]
        strne&t TMP1, [DST]
        /* cleanup helper macro */
        .purgem scale_2_pixels
        .unreq  DST
        .unreq  SRC
        .unreq  W
        .unreq  VX
        .unreq  UNIT_X
        .unreq  TMP1
        .unreq  TMP2
        .unreq  VXMASK
        .unreq  PF_OFFS
        /* return */
        pop     {r4, r5, r6, r7}
        bx      lr
.endfunc
.endm

/* r5g6b5: 2-byte pixels, accessed with LDRH/STRH ('h' type suffix) */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
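/* a8r8g8b8: 4-byte pixels, so plain LDR/STR (empty type suffix) */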
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
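
/*
 * For reference, a rough C equivalent of a generated scanline function
 * (illustrative only; the name and argument types below are made up, and
 * the real code adds source prefetch and loop unrolling on top):
 *
 *     void scanline_0565_SRC (int w, uint16_t *dst, uint16_t *src,
 *                             uint32_t vx, uint32_t unit_x)
 *     {
 *         while (w--)
 *         {
 *             *dst++ = src[vx >> 16];  // vx is a 16.16 fixed-point
 *             vx += unit_x;            // source coordinate
 *         }
 *     }
 *
 * Instead of computing the pixel index 'vx >> 16' and then scaling it to a
 * byte offset, the assembly produces the byte offset in one step as
 * (vx >> (16 - bpp_shift)) & ~((1 << bpp_shift) - 1); the inverted mask is
 * what VXMASK holds. PF_OFFS tracks a source coordinate prefetch_distance
 * pixels ahead of VX, and the PLD on it hides part of the memory latency.
 */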