diff options
Diffstat (limited to 'open-vm-tools/lib/include/vm_basic_asm_x86.h')
-rw-r--r-- | open-vm-tools/lib/include/vm_basic_asm_x86.h | 166 |
1 files changed, 68 insertions, 98 deletions
diff --git a/open-vm-tools/lib/include/vm_basic_asm_x86.h b/open-vm-tools/lib/include/vm_basic_asm_x86.h index e4f4d5ba..cf6cd881 100644 --- a/open-vm-tools/lib/include/vm_basic_asm_x86.h +++ b/open-vm-tools/lib/include/vm_basic_asm_x86.h @@ -1,5 +1,5 @@ /********************************************************* - * Copyright (C) 1998-2003 VMware, Inc. All rights reserved. + * Copyright (C) 1998-2015 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published @@ -232,7 +232,7 @@ XRSTOR_AMD_ES0(const void *load, uint64 mask) * Use this function if you are certain that: * o Either the quotient will fit in 32 bits, * o Or your code is ready to handle a #DE exception indicating overflow. - * If that is not the case, then use Div643264(). --hpreg + * If that is not the case, then use Div643264(). * * Results: * Quotient and remainder @@ -251,7 +251,6 @@ Div643232(uint64 dividend, // IN uint32 *quotient, // OUT uint32 *remainder) // OUT { - /* Checked against the Intel manual and GCC --hpreg */ __asm__( "divl %4" : "=a" (*quotient), @@ -271,7 +270,6 @@ Div643232(uint64 dividend, // IN uint32 *quotient, // OUT uint32 *remainder) // OUT { - /* Written and tested by mann, checked by dbudko and hpreg */ __asm { mov eax, DWORD PTR [dividend] mov edx, DWORD PTR [dividend+4] @@ -297,7 +295,7 @@ Div643232(uint64 dividend, // IN * Unsigned integer division: * The dividend is 64-bit wide * The divisor is 32-bit wide - * The quotient is 64-bit wide --hpreg + * The quotient is 64-bit wide * * Results: * Quotient and remainder @@ -317,7 +315,6 @@ Div643264(uint64 dividend, // IN uint32 hQuotient; uint32 lQuotient; - /* Checked against the Intel manual and GCC --hpreg */ __asm__( "divl %5" "\n\t" "movl %%eax, %0" "\n\t" @@ -342,27 +339,20 @@ Div643264(uint64 dividend, // IN * * Mul64x3264 -- * - * Unsigned integer by fixed point multiplication: + * Unsigned integer by fixed point multiplication, with rounding: + * result = floor(multiplicand * multiplier * 2**(-shift) + 0.5) + * * Unsigned 64-bit integer multiplicand. * Unsigned 32-bit fixed point multiplier, represented as - * multiplier >> shift, where shift < 64. - * Unsigned 64-bit integer product. - * - * Implementation: - * Multiply 64x32 bits to yield a full 96-bit product. - * Shift right by shift. - * Return the low-order 64 bits of the result. + * (multiplier, shift), where shift < 64. * * Result: - * Product - * - * Side effects: - * None + * Unsigned 64-bit integer product. * *----------------------------------------------------------------------------- */ -#if defined(__GNUC__) +#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 4) && !defined(MUL64_NO_ASM) static INLINE uint64 Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift) @@ -371,64 +361,35 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift) uint32 tmp1, tmp2; // ASSERT(shift >= 0 && shift < 64); - /* - * Written and tested by mann, improved with suggestions by hpreg. - * - * The main improvement over the previous version is that the test - * of shift against 32 is moved out of the asm and into C code. - * This lets the compiler delete the test and one of the - * alternative code sequences in the case where shift is a - * constant. It also lets us use the best code sequence in each - * alternative, rather than a compromise. The downside is that in - * the non-constant case, this version takes slightly more code - * space. - * - * Note on the constraints: We don't really want multiplicand to - * start in %edx:%eax as the =A constraint dictates; in fact, we'd - * prefer any *other* two registers. But gcc doesn't have - * constraint syntax for any other register pair, and trying to - * constrain ((uint32) multiplicand) to one place and (multiplicand - * >> 32) to another generates *really* bad code -- gcc is just not - * smart enough, at least in the version we are currently using. - */ - if (shift < 32) { - __asm__("mov %%eax, %2 \n\t" // Save lo(multiplicand) in tmp2 - "mov %%edx, %%eax \n\t" // Get hi(multiplicand) - "mull %4 \n\t" // p2 = hi(multiplicand) * multiplier - "xchg %%eax, %2 \n\t" // Save lo(p2) in tmp2, get lo(multiplicand) - "mov %%edx, %1 \n\t" // Save hi(p2) in tmp1 - "mull %4 \n\t" // p1 = lo(multiplicand) * multiplier - "addl %2, %%edx \n\t" // hi(p1) += lo(p2) - "adcl $0, %1 \n\t" // hi(p2) += carry from previous step - "shrdl %%edx, %%eax \n\t" // result = hi(p2):hi(p1):lo(p1) >> shift - "shrdl %1, %%edx" - : "=A" (result), - "=&r" (tmp1), // use in shrdl requires it to be a register - "=&r" (tmp2) // could be "=&rm" but "m" is slower - : "0" (multiplicand), - "rm" (multiplier), - "c" (shift) - : "cc" - ); - } else { - __asm__("mov %%edx, %2 \n\t" // Save hi(multiplicand) in tmp2 - "mull %4 \n\t" // p1 = lo(multiplicand) * multiplier - "mov %%edx, %1 \n\t" // Save hi(p1) in tmp1 - "mov %2, %%eax \n\t" // Discard lo(p1), get hi(multiplicand) - "mull %4 \n\t" // p2 = hi(multiplicand) * multiplier - "addl %1, %%eax \n\t" // lo(p2) += hi(p1) - "adcl $0, %%edx \n\t" // hi(p2) += carry from previous step - "shrdl %%edx, %%eax \n\t" // result = p2 >> (shift & 31) - "shrl %%cl, %%edx" - : "=A" (result), - "=&r" (tmp1), // could be "=&rm" but "m" is slower - "=&r" (tmp2) // could be "=&rm" but "m" is slower - : "0" (multiplicand), - "rm" (multiplier), - "c" (shift) - : "cc" - ); - } + __asm__("mov %%eax, %2\n\t" // Save lo(multiplicand) + "mov %%edx, %%eax\n\t" // Get hi(multiplicand) + "mull %4\n\t" // p2 = hi(multiplicand) * multiplier + "xchg %%eax, %2\n\t" // Save lo(p2), get lo(multiplicand) + "mov %%edx, %1\n\t" // Save hi(p2) + "mull %4\n\t" // p1 = lo(multiplicand) * multiplier + "addl %2, %%edx\n\t" // hi(p1) += lo(p2) + "adcl $0, %1\n\t" // hi(p2) += carry from previous step + "cmpl $32, %%ecx\n\t" // shift < 32? + "jl 2f\n\t" // Go if so + "shll $1, %%eax\n\t" // Save lo(p1) bit 31 in CF in case shift=32 + "mov %%edx, %%eax\n\t" // result = hi(p2):hi(p1) >> (shift & 31) + "mov %1, %%edx\n\t" + "shrdl %%edx, %%eax\n\t" + "mov $0, %2\n\t" + "adcl $0, %2\n\t" // Get highest order bit shifted out, from CF + "shrl %%cl, %%edx\n\t" + "jmp 3f\n" + "2:\n\t" + "xor %2, %2\n\t" + "shrdl %%edx, %%eax\n\t" // result = hi(p2):hi(p1):lo(p1) >> shift + "adcl $0, %2\n\t" // Get highest order bit shifted out, from CF + "shrdl %1, %%edx\n" + "3:\n\t" + "addl %2, %%eax\n\t" // result += highest order bit shifted out + "adcl $0, %%edx" + : "=A" (result), "=&r" (tmp1), "=&r" (tmp2) + : "0" (multiplicand), "rm" (multiplier), "c" (shift) + : "cc"); return result; } @@ -440,7 +401,6 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift) { // ASSERT(shift >= 0 && shift < 64); - /* Written and tested by mann, checked by dbudko and hpreg */ __asm { mov eax, DWORD PTR [multiplicand+4] // Get hi(multiplicand) mul DWORD PTR [multiplier] // p2 = hi(multiplicand) * multiplier @@ -453,22 +413,30 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift) mov ecx, DWORD PTR [shift] // Get shift cmp ecx, 32 // shift < 32? jl SHORT l2 // Go if so + shl eax, 1 // Save lo(p1) bit 31 in CF in case shift=32 mov eax, edx // result = hi(p2):hi(p1) >> (shift & 31) mov edx, ebx shrd eax, edx, cl + mov esi, 0 + adc esi, 0 // Get highest order bit shifted out, from CF shr edx, cl jmp SHORT l3 l2: + xor esi, esi shrd eax, edx, cl // result = hi(p2):hi(p1):lo(p1) >> shift + adc esi, 0 // Get highest order bit shifted out, from CF shrd edx, ebx, cl l3: + add eax, esi // result += highest order bit shifted out + adc edx, 0 } // return with result in edx:eax } #pragma warning(default: 4035) #else -#error No compiler defined for Mul64x3264 +#define MUL64_NO_ASM 1 +#include "mul64.h" #endif /* @@ -476,27 +444,20 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift) * * Muls64x32s64 -- * - * Signed integer by fixed point multiplication: + * Signed integer by fixed point multiplication, with rounding: + * result = floor(multiplicand * multiplier * 2**(-shift) + 0.5) + * * Signed 64-bit integer multiplicand. * Unsigned 32-bit fixed point multiplier, represented as - * multiplier >> shift, where shift < 64. - * Signed 64-bit integer product. - * - * Implementation: - * Multiply 64x32 bits to yield a full 96-bit product. - * Shift right by the location of the binary point. - * Return the low-order 64 bits of the result. + * (multiplier, shift), where shift < 64. * * Result: - * Product - * - * Side effects: - * None + * Signed 64-bit integer product. * *----------------------------------------------------------------------------- */ -#if defined(__GNUC__) +#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 4) && !defined(MUL64_NO_ASM) static INLINE int64 Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift) @@ -505,8 +466,6 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift) uint32 tmp1, tmp2; // ASSERT(shift >= 0 && shift < 64); - /* Written and tested by mann, checked by dbudko and hpreg */ - /* XXX hpreg suggested some improvements that we haven't converged on yet */ __asm__("mov %%eax, %2\n\t" // Save lo(multiplicand) "mov %%edx, %%eax\n\t" // Get hi(multiplicand) "test %%eax, %%eax\n\t" // Check sign of multiplicand @@ -524,22 +483,29 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift) "adcl $0, %1\n\t" // hi(p2) += carry from previous step "cmpl $32, %%ecx\n\t" // shift < 32? "jl 2f\n\t" // Go if so + "shll $1, %%eax\n\t" // Save lo(p1) bit 31 in CF in case shift=32 "mov %%edx, %%eax\n\t" // result = hi(p2):hi(p1) >> (shift & 31) "mov %1, %%edx\n\t" "shrdl %%edx, %%eax\n\t" + "mov $0, %2\n\t" + "adcl $0, %2\n\t" // Get highest order bit shifted out from CF "sarl %%cl, %%edx\n\t" "jmp 3f\n" "2:\n\t" + "xor %2, %2\n\t" "shrdl %%edx, %%eax\n\t" // result = hi(p2):hi(p1):lo(p1) >> shift + "adcl $0, %2\n\t" // Get highest order bit shifted out from CF "shrdl %1, %%edx\n" "3:\n\t" + "addl %2, %%eax\n\t" // result += highest order bit shifted out + "adcl $0, %%edx" : "=A" (result), "=&r" (tmp1), "=&rm" (tmp2) : "0" (multiplicand), "rm" (multiplier), "c" (shift) : "cc"); return result; } -#elif defined _MSC_VER +#elif defined(_MSC_VER) #pragma warning(disable: 4035) static INLINE int64 @@ -547,7 +513,6 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift) { //ASSERT(shift >= 0 && shift < 64); - /* Written and tested by mann, checked by dbudko and hpreg */ __asm { mov eax, DWORD PTR [multiplicand+4] // Get hi(multiplicand) test eax, eax // Check sign of multiplicand @@ -567,22 +532,27 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift) mov ecx, DWORD PTR [shift] // Get shift cmp ecx, 32 // shift < 32? jl SHORT l2 // Go if so + shl eax, 1 // Save lo(p1) bit 31 in CF in case shift=32 mov eax, edx // result = hi(p2):hi(p1) >> (shift & 31) mov edx, ebx shrd eax, edx, cl + mov esi, 0 + adc esi, 0 // Get highest order bit shifted out, from CF sar edx, cl jmp SHORT l3 l2: + xor esi, esi shrd eax, edx, cl // result = hi(p2):hi(p1):lo(p1) << shift + adc esi, 0 // Get highest order bit shifted out, from CF shrd edx, ebx, cl l3: + add eax, esi // result += highest order bit shifted out + adc edx, 0 } // return with result in edx:eax } #pragma warning(default: 4035) -#else -#error No compiler defined for Muls64x32s64 #endif |