请教ARM中的预取命令PLD的使用

sno_guo 2012-11-24 09:27:32

我现在在看 Android 2.3.3 提供的 ARM 平台上 __memcmp16 这个函数的实现代码，它是用汇编编写的，如下：

   .text



    .global __memcmp16

    .type __memcmp16, %function

    .align 4



/*
 * __memcmp16: optimized memcmp16() for ARM9.
 *
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 half-words (32 bytes) per iteration
 * (2) The loads are scheduled in a way they won't stall
 *
 * Register usage, as read from the code below:
 *   r0      in:  pointer to the first buffer
 *           out: 0 when no difference was found, otherwise the first
 *                differing half-word of the first buffer minus the
 *                corresponding half-word of the second buffer
 *   r1      in:  pointer to the second buffer
 *   r2      in:  length, counted in half-words (it is decremented by 1
 *                for every ldrh pair that is compared)
 *   r3      working copy of the first pointer (r0 is reused for data)
 *   ip, lr  scratch registers holding the data being compared
 *   r4, lr  pushed on the stack on the large-block path and popped again
 *           before returning; r4 is not otherwise referenced here
 *
 * In the original comments below, "words" mostly means the 16-bit units
 * counted by r2; they have been reworded as "half-words" where the code
 * makes that unambiguous.
 *
 * NOTE(review): the ldr/ldrh accesses presumably require both pointers to
 * be at least half-word aligned -- confirm against the callers.
 * NOTE(review): PLD is a macro defined outside this file; see the note at
 * the head of the main loop.
 */



__memcmp16:

        .fnstart

        PLD         (r0, #0)

        PLD         (r1, #0)



        /* take care of the case where the length is zero or the buffers are
         * the same: the cmpne only runs when the pointers differ, so Z is
         * set if either condition holds, and we return 0
         */

        cmp         r0, r1

        cmpne       r2, #0

        moveq       r0, #0

        bxeq        lr



        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */



        mov         r3, r0



         /* make sure we have at least 12 half-words; this simplifies things
          * below and avoids some overhead for small blocks
          */



        cmp         r2, #12

        bpl         0f



        /* small blocks (fewer than 12 half-words): compare one half-word
         * at a time and return straight through lr, nothing was pushed
         */

        PLD         (r0, #32)

        PLD         (r1, #32)



1:      ldrh        r0, [r3], #2

        ldrh        ip, [r1], #2

        subs        r0, r0, ip

        bxne        lr        

        subs        r2, r2, #1

        bne         1b

        /* all half-words matched: r0 is 0 from the last subs */

        bx          lr





        .save {r4, lr}

        /* large blocks: save registers (lr is used as a data register below) */

0:      stmfd       sp!, {r4, lr}

        

        /* align first pointer to word boundary: if bit 1 of r3 is set,
         * compare a single half-word first
         */

        tst         r3, #2

        beq         0f

        

        ldrh        r0, [r3], #2

        ldrh        ip, [r1], #2

        sub         r2, r2, #1

        subs        r0, r0, ip

        /* on a mismatch, restore registers and return the difference */

        ldmnefd     sp!, {r4, lr}

        bxne        lr

        /* NOTE(review): .fnend appears here although the code of this routine
         * continues below -- confirm that this unwind-region placement is
         * intended.
         */

        .fnend







0:      /* here the first pointer is word aligned, and at least 11
         * half-words are left to process (12 minimum, minus the one
         * possibly consumed above).
         */



        /* see if the pointers are congruent, i.e. whether bit 1 of r3 and
         * r1 is the same
         */

        eor         r0, r3, r1

        ands        r0, r0, #2

        bne         5f



        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         *
         * ip is preloaded with the current word of the second buffer, and
         * r2 is biased by -(16 + 2) for the duration of the loop.
         */



        ldr         ip, [r1]

        subs        r2, r2, #(16 + 2)

        bmi         1f

        

0: /// PLD is a macro: when the ARM instruction set supports the pld instruction it expands to pld, otherwise it expands to nothing

  /// (original poster's question) Why is the data at address r3+64 fetched here? The data at r3+64 does not appear to be used anywhere below. Could someone please help analyze this?

        /* NOTE(review): the PLD macro is defined outside this file. If it
         * expands to the ARM pld preload hint, it does not load any register:
         * it only asks the memory system to start bringing the addressed
         * cache line into the data cache, so that the ldr instructions of a
         * later iteration find their data already cached. Each iteration of
         * this loop reads 32 bytes from each buffer (8 word loads), so the
         * #64 offset targets data two iterations ahead of the current
         * pointers -- confirm against the ARM architecture manual.
         */

        PLD         (r3, #64)  

        PLD         (r1, #64) /// (original poster's question) why add 64 here?

        /* Eight word compares. Each step loads the next word of the first
         * buffer into r0 (post-increment) and reads one word ahead in the
         * second buffer (pre-index with writeback), alternating between lr
         * and ip, then XORs r0 with the second-buffer word loaded by the
         * previous step. All steps after the first are conditional on eq,
         * so they are skipped as soon as one eors finds a difference.
         */

        ldr         r0, [r3], #4

        ldr         lr, [r1, #4]!

        eors        r0, r0, ip

        ldreq       r0, [r3], #4

        ldreq       ip, [r1, #4]!

        eoreqs      r0, r0, lr

        ldreq       r0, [r3], #4

        ldreq       lr, [r1, #4]!

        eoreqs      r0, r0, ip

        ldreq       r0, [r3], #4

        ldreq       ip, [r1, #4]!

        eoreqs      r0, r0, lr

        ldreq       r0, [r3], #4

        ldreq       lr, [r1, #4]!

        eoreqs      r0, r0, ip

        ldreq       r0, [r3], #4

        ldreq       ip, [r1, #4]!

        eoreqs      r0, r0, lr

        ldreq       r0, [r3], #4

        ldreq       lr, [r1, #4]!

        eoreqs      r0, r0, ip

        ldreq       r0, [r3], #4

        ldreq       ip, [r1, #4]!

        eoreqs      r0, r0, lr

        /* a difference was found: both pointers are 4 bytes past that word */

        bne         2f        

        subs        r2, r2, #16

        bhs         0b



        /* do we have at least 2 half-words left? r2 still carries the
         * -(16 + 2) bias here; adding 16 leaves it at (remaining - 2)
         */

1:      adds        r2, r2, #(16 - 2 + 2)

        bmi         4f

        

        /* finish off 2 half-words (one word) at a time */

3:      ldr         r0, [r3], #4

        ldr         ip, [r1], #4

        eors        r0, r0, ip

        bne         2f

        subs        r2, r2, #2

        bhs         3b



        /* are we done? remove the remaining -2 bias; a non-zero result
         * means there is still data to compare half-word by half-word
         */

4:      adds        r2, r2, #2

        bne         8f

        /* restore registers and return 0 (equal) */

        mov         r0, #0

        ldmfd       sp!, {r4, lr}

        bx          lr



2:      /* the last word compared (2 half-words) differs: redo it one
         * half-word at a time so that r0 becomes the difference of the
         * first differing half-words. r3 and r1 are both 4 bytes past the
         * word in question.
         */

        ldrh        r0, [r3, #-4]

        ldrh        ip, [r1, #-4]

        subs        r0, r0, ip

        ldreqh      r0, [r3, #-2]

        ldreqh      ip, [r1, #-2]

        subeqs      r0, r0, ip

        /* restore registers and return */

        ldmfd       sp!, {r4, lr}

        bx          lr



        /* process the last few half-words one at a time; r2 holds the
         * number that is left
         */

8:      ldrh        r0, [r3], #2

        ldrh        ip, [r1], #2

        subs        r0, r0, ip

        bne         9f

        subs        r2, r2, #1

        bne         8b



9:      /* restore registers and return; r0 already holds the result */

        ldmfd       sp!, {r4, lr}

        bx          lr





5:      /*************** non-congruent case ***************/

        /* r3 is word aligned but r1 differs from it in bit 1. */



        /* align the unaligned pointer: round r1 down to a word boundary and
         * preload that word into lr; r2 is biased by -8 for the loop
         */

        bic         r1, r1, #3

        ldr         lr, [r1], #4

        sub         r2, r2, #8



6:

        /* Four word compares (8 half-words) per iteration. Each step
         * builds the second-buffer value in ip from the upper half of the
         * previously loaded word (lr, lsr #16) and the lower half of the
         * newly loaded word (lr, lsl #16), then XORs it with the word from
         * the first buffer. Steps after the first are conditional on eq.
         * NOTE(review): this merge presumably relies on little-endian byte
         * order -- confirm for the target.
         */

        PLD         (r3, #64)

        PLD         (r1, #64)

        mov         ip, lr, lsr #16

        ldr         lr, [r1], #4

        ldr         r0, [r3], #4

        orr         ip, ip, lr, lsl #16

        eors        r0, r0, ip

        moveq       ip, lr, lsr #16

        ldreq       lr, [r1], #4

        ldreq       r0, [r3], #4

        orreq       ip, ip, lr, lsl #16

        eoreqs      r0, r0, ip

        moveq       ip, lr, lsr #16

        ldreq       lr, [r1], #4

        ldreq       r0, [r3], #4

        orreq       ip, ip, lr, lsl #16

        eoreqs      r0, r0, ip

        moveq       ip, lr, lsr #16

        ldreq       lr, [r1], #4

        ldreq       r0, [r3], #4

        orreq       ip, ip, lr, lsl #16

        eoreqs      r0, r0, ip

        bne         7f

        subs        r2, r2, #8

        bhs         6b

        /* r1 was rounded down and then kept one word ahead; step it back by
         * 2 bytes before the half-word loop at 8 uses it
         */

        sub         r1, r1, #2

        /* are we done? remove the -8 bias from r2 */

        adds        r2, r2, #8

        moveq       r0, #0

        beq         9b

        /* finish off the remaining half-words */

        b           8b



7:      /* a difference was found: fix up r1 (same 2-byte adjustment as
         * above) and reuse the half-word recheck at 2
         */

        sub         r1, r1, #2

        b           2b

...全文

665 7 打赏收藏转发到动态举报

写回复

用AI写文章

7 条回复

切换为时间正序

请发表友善的回复…

发表回复

sno_guo 2013-01-06

打赏
举报

哦，谢谢，这个我在 ARM 网站上的论坛里问过，大概功能明白了。可是还有一点不明白：pld 命令只说明了从哪个地址预取数据，却没有说明一次取多少数据，怎么知道一次预取多少数据呢？

求佛_ce123 2013-01-06

打赏
举报

http://blog.csdn.net/ce123/article/details/8471614 可以看看这篇博客，希望对你有帮助

求佛_ce123 2013-01-06

打赏
举报

Cache块的大小需要看一下arm内核的数据手册

求佛_ce123 2013-01-06

打赏
举报

Cache未命中时的分配策略有两种：读操作分配策略和读/写分配策略。 a). 读操作分配策略：当Cache未命中时，只有进行存储器读操作时，才分配Cache行。如果被替换的Cache行包含有效数据，那么在该行被新的数据填充之前，要先把原来的内容写入到主存中去。采用读操作分配策略时，存储器写操作不会更新Cache行，除非相关的Cache行恰好是前一个主存读操作刚分配的。 b). 读/写分配策略：无论存储器读还是写操作，在Cache未命中时，都将分配Cache行。对于存储器写操作，如果Cache未命中，将分配一个Cache行。如果被替换的Cache行中包含有效数据，控制器会先将该行数据写入主存，再用从主存读取的数据将该行Cache覆盖，最后把内核数据写入该Cache行中。如果采用Cache直写策略，内核数据将会同时被写入到主存中。因此：一次性取多少数据取决于Cache行的大小，也就是Cache块的大小。但我有一个疑问：数据已经在寄存器中了，为什么还要预读取？

求佛_ce123 2013-01-06