diff options
Diffstat (limited to 'plugins/MirOTR/Libgcrypt/mpi/pentium4/mmx/mpih-rshift.S')
-rw-r--r-- | plugins/MirOTR/Libgcrypt/mpi/pentium4/mmx/mpih-rshift.S | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/plugins/MirOTR/Libgcrypt/mpi/pentium4/mmx/mpih-rshift.S b/plugins/MirOTR/Libgcrypt/mpi/pentium4/mmx/mpih-rshift.S new file mode 100644 index 0000000000..e3374e3ba3 --- /dev/null +++ b/plugins/MirOTR/Libgcrypt/mpi/pentium4/mmx/mpih-rshift.S @@ -0,0 +1,453 @@ +/* Intel Pentium-4 mpn_rshift -- right shift. + * + * Copyright 2001, 2002 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + */ + + +#include "sysdep.h" +#include "asm-syntax.h" + + +/******************* + * mpi_limb_t + * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) + * mpi_ptr_t up, (sp + 8) + * mpi_size_t usize, (sp + 12) + * unsigned cnt) (sp + 16) + * + * P4 Willamette, Northwood: 1.75 cycles/limb + * P4 Prescott: 2.0 cycles/limb + */ + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(_gcry_mpih_rshift) +C_SYMBOL_NAME(_gcry_mpih_rshift:) + pushl %ebx + pushl %edi + + + movl 20(%esp), %eax + movl 12(%esp), %edx + + movl 16(%esp), %ebx + movl 24(%esp), %ecx + + cmp $5, %eax + jae .Lunroll + + decl %eax + movl (%ebx), %edi + + jnz .Lsimple + + shrdl %cl, %edi, %eax + + shrl %cl, %edi + + movl %edi, (%edx) + popl %edi + + popl %ebx + + ret + + + + + + .align 8, 0x90 +.Lsimple: + + + + + + + + + + movd (%ebx), %mm5 + leal (%ebx,%eax,4), %ebx + + movd %ecx, %mm6 + leal -4(%edx,%eax,4), %edx + + psllq $32, %mm5 + negl %eax + + + + + + + +.Lsimple_top: + + + + + + + + + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz .Lsimple_top + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + + + + + .align 8, 0x90 +.Lunroll: + + + + + + + + + + movd (%ebx), %mm5 + movl $4, %edi + + movd %ecx, %mm6 + testl %edi, %ebx + + psllq $32, %mm5 + jz .Lstart_src_aligned + + + + + + + + + + + + + + + + + movq (%ebx), %mm0 + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +.Lstart_src_aligned: + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 + jz .Lstart_dst_aligned + + + + + + + + + + + + + + + + + + movq %mm1, %mm0 + addl $32, %ecx + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +.Lstart_dst_aligned: + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax + + por %mm1, %mm3 + negl %eax + + jns .Lfinish + + + + + + + + + + + + + + + + .align 8, 0x90 +.Lunroll_loop: + + + + + + + + + + + + + + + + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 + psrlq %mm6, %mm1 + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 + + psllq %mm7, %mm3 + addl $4, %eax + + por %mm1, %mm3 + js .Lunroll_loop + + +.Lfinish: + + + testb $2, %al + + jnz .Lfinish_no_two + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +.Lfinish_no_two: + + + + + + + + testb $1, %al + popl %edi + + movd %mm5, %eax + jnz .Lfinish_zero + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz .Lfinish_one_unaligned + + + movd %mm1, 16(%edx) +.Lfinish_one_unaligned: + + movq %mm0, 8(%edx) + + emms + + ret + + + + +.Lfinish_zero: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz .Lfinish_zero_unaligned + + movq %mm2, 12(%edx) +.Lfinish_zero_unaligned: + + emms + + ret |