Diffstat (limited to 'open-vm-tools/lib/include/vm_basic_asm_x86.h')
-rw-r--r--  open-vm-tools/lib/include/vm_basic_asm_x86.h  166
1 file changed, 68 insertions, 98 deletions
diff --git a/open-vm-tools/lib/include/vm_basic_asm_x86.h b/open-vm-tools/lib/include/vm_basic_asm_x86.h
index e4f4d5ba..cf6cd881 100644
--- a/open-vm-tools/lib/include/vm_basic_asm_x86.h
+++ b/open-vm-tools/lib/include/vm_basic_asm_x86.h
@@ -1,5 +1,5 @@
/*********************************************************
- * Copyright (C) 1998-2003 VMware, Inc. All rights reserved.
+ * Copyright (C) 1998-2015 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
@@ -232,7 +232,7 @@ XRSTOR_AMD_ES0(const void *load, uint64 mask)
* Use this function if you are certain that:
* o Either the quotient will fit in 32 bits,
* o Or your code is ready to handle a #DE exception indicating overflow.
- * If that is not the case, then use Div643264(). --hpreg
+ * If that is not the case, then use Div643264().
*
* Results:
* Quotient and remainder
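For reference, a minimal caller sketch (hypothetical values; safe only because the quotient is known to fit in 32 bits):

    uint64 dividend = 100000;
    uint32 quotient, remainder;

    Div643232(dividend, 33, &quotient, &remainder);
    /* quotient == 3030, remainder == 10.  A call whose quotient needs more
       than 32 bits (e.g. dividend = 1ULL << 40, divisor = 1) raises #DE
       instead, per the note above. */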
@@ -251,7 +251,6 @@ Div643232(uint64 dividend, // IN
uint32 *quotient, // OUT
uint32 *remainder) // OUT
{
- /* Checked against the Intel manual and GCC --hpreg */
__asm__(
"divl %4"
: "=a" (*quotient),
@@ -271,7 +270,6 @@ Div643232(uint64 dividend, // IN
uint32 *quotient, // OUT
uint32 *remainder) // OUT
{
- /* Written and tested by mann, checked by dbudko and hpreg */
__asm {
mov eax, DWORD PTR [dividend]
mov edx, DWORD PTR [dividend+4]
@@ -297,7 +295,7 @@ Div643232(uint64 dividend, // IN
* Unsigned integer division:
* The dividend is 64-bit wide
* The divisor is 32-bit wide
- * The quotient is 64-bit wide --hpreg
+ * The quotient is 64-bit wide
*
* Results:
* Quotient and remainder
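For reference, the full 64-bit quotient comes from two chained 32-bit divides, which is the structure the inline assembly below uses; a portable sketch (hypothetical helper, not part of this header):

    static INLINE void
    Div643264_Sketch(uint64 dividend,   // IN
                     uint32 divisor,    // IN
                     uint64 *quotient,  // OUT
                     uint32 *remainder) // OUT
    {
       uint32 hi = (uint32)(dividend >> 32);
       uint32 lo = (uint32)dividend;
       uint32 hQuot = hi / divisor;                        // High 32 quotient bits
       uint64 rest  = ((uint64)(hi % divisor) << 32) | lo;
       uint32 lQuot = (uint32)(rest / divisor);            // Cannot overflow 32 bits

       *quotient  = ((uint64)hQuot << 32) | lQuot;
       *remainder = (uint32)(rest % divisor);
    }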
@@ -317,7 +315,6 @@ Div643264(uint64 dividend, // IN
uint32 hQuotient;
uint32 lQuotient;
- /* Checked against the Intel manual and GCC --hpreg */
__asm__(
"divl %5" "\n\t"
"movl %%eax, %0" "\n\t"
@@ -342,27 +339,20 @@ Div643264(uint64 dividend, // IN
*
* Mul64x3264 --
*
- * Unsigned integer by fixed point multiplication:
+ * Unsigned integer by fixed point multiplication, with rounding:
+ * result = floor(multiplicand * multiplier * 2**(-shift) + 0.5)
+ *
* Unsigned 64-bit integer multiplicand.
* Unsigned 32-bit fixed point multiplier, represented as
- * multiplier >> shift, where shift < 64.
- * Unsigned 64-bit integer product.
- *
- * Implementation:
- * Multiply 64x32 bits to yield a full 96-bit product.
- * Shift right by shift.
- * Return the low-order 64 bits of the result.
+ * (multiplier, shift), where shift < 64.
*
* Result:
- * Product
- *
- * Side effects:
- * None
+ * Unsigned 64-bit integer product.
*
*-----------------------------------------------------------------------------
*/
-#if defined(__GNUC__)
+#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 4) && !defined(MUL64_NO_ASM)
static INLINE uint64
Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift)
@@ -371,64 +361,35 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift)
uint32 tmp1, tmp2;
// ASSERT(shift >= 0 && shift < 64);
- /*
- * Written and tested by mann, improved with suggestions by hpreg.
- *
- * The main improvement over the previous version is that the test
- * of shift against 32 is moved out of the asm and into C code.
- * This lets the compiler delete the test and one of the
- * alternative code sequences in the case where shift is a
- * constant. It also lets us use the best code sequence in each
- * alternative, rather than a compromise. The downside is that in
- * the non-constant case, this version takes slightly more code
- * space.
- *
- * Note on the constraints: We don't really want multiplicand to
- * start in %edx:%eax as the =A constraint dictates; in fact, we'd
- * prefer any *other* two registers. But gcc doesn't have
- * constraint syntax for any other register pair, and trying to
- * constrain ((uint32) multiplicand) to one place and (multiplicand
- * >> 32) to another generates *really* bad code -- gcc is just not
- * smart enough, at least in the version we are currently using.
- */
- if (shift < 32) {
- __asm__("mov %%eax, %2 \n\t" // Save lo(multiplicand) in tmp2
- "mov %%edx, %%eax \n\t" // Get hi(multiplicand)
- "mull %4 \n\t" // p2 = hi(multiplicand) * multiplier
- "xchg %%eax, %2 \n\t" // Save lo(p2) in tmp2, get lo(multiplicand)
- "mov %%edx, %1 \n\t" // Save hi(p2) in tmp1
- "mull %4 \n\t" // p1 = lo(multiplicand) * multiplier
- "addl %2, %%edx \n\t" // hi(p1) += lo(p2)
- "adcl $0, %1 \n\t" // hi(p2) += carry from previous step
- "shrdl %%edx, %%eax \n\t" // result = hi(p2):hi(p1):lo(p1) >> shift
- "shrdl %1, %%edx"
- : "=A" (result),
- "=&r" (tmp1), // use in shrdl requires it to be a register
- "=&r" (tmp2) // could be "=&rm" but "m" is slower
- : "0" (multiplicand),
- "rm" (multiplier),
- "c" (shift)
- : "cc"
- );
- } else {
- __asm__("mov %%edx, %2 \n\t" // Save hi(multiplicand) in tmp2
- "mull %4 \n\t" // p1 = lo(multiplicand) * multiplier
- "mov %%edx, %1 \n\t" // Save hi(p1) in tmp1
- "mov %2, %%eax \n\t" // Discard lo(p1), get hi(multiplicand)
- "mull %4 \n\t" // p2 = hi(multiplicand) * multiplier
- "addl %1, %%eax \n\t" // lo(p2) += hi(p1)
- "adcl $0, %%edx \n\t" // hi(p2) += carry from previous step
- "shrdl %%edx, %%eax \n\t" // result = p2 >> (shift & 31)
- "shrl %%cl, %%edx"
- : "=A" (result),
- "=&r" (tmp1), // could be "=&rm" but "m" is slower
- "=&r" (tmp2) // could be "=&rm" but "m" is slower
- : "0" (multiplicand),
- "rm" (multiplier),
- "c" (shift)
- : "cc"
- );
- }
+ __asm__("mov %%eax, %2\n\t" // Save lo(multiplicand)
+ "mov %%edx, %%eax\n\t" // Get hi(multiplicand)
+ "mull %4\n\t" // p2 = hi(multiplicand) * multiplier
+ "xchg %%eax, %2\n\t" // Save lo(p2), get lo(multiplicand)
+ "mov %%edx, %1\n\t" // Save hi(p2)
+ "mull %4\n\t" // p1 = lo(multiplicand) * multiplier
+ "addl %2, %%edx\n\t" // hi(p1) += lo(p2)
+ "adcl $0, %1\n\t" // hi(p2) += carry from previous step
+ "cmpl $32, %%ecx\n\t" // shift < 32?
+ "jl 2f\n\t" // Go if so
+ "shll $1, %%eax\n\t" // Save lo(p1) bit 31 in CF in case shift=32
+ "mov %%edx, %%eax\n\t" // result = hi(p2):hi(p1) >> (shift & 31)
+ "mov %1, %%edx\n\t"
+ "shrdl %%edx, %%eax\n\t"
+ "mov $0, %2\n\t"
+ "adcl $0, %2\n\t" // Get highest order bit shifted out, from CF
+ "shrl %%cl, %%edx\n\t"
+ "jmp 3f\n"
+ "2:\n\t"
+ "xor %2, %2\n\t"
+ "shrdl %%edx, %%eax\n\t" // result = hi(p2):hi(p1):lo(p1) >> shift
+ "adcl $0, %2\n\t" // Get highest order bit shifted out, from CF
+ "shrdl %1, %%edx\n"
+ "3:\n\t"
+ "addl %2, %%eax\n\t" // result += highest order bit shifted out
+ "adcl $0, %%edx"
+ : "=A" (result), "=&r" (tmp1), "=&r" (tmp2)
+ : "0" (multiplicand), "rm" (multiplier), "c" (shift)
+ : "cc");
return result;
}
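For readers without the Intel manual at hand, a portable C sketch of what the asm above now computes: form the 96-bit product from two 32x32->64 partial products, shift right, and add back the highest-order bit shifted out as the rounding term (hypothetical helper, not the mul64.h fallback):

    static INLINE uint64
    Mul64x3264_Sketch(uint64 multiplicand, uint32 multiplier, uint32 shift)
    {
       uint64 p1 = (uint64)(uint32)multiplicand * multiplier;      // lo(multiplicand) * multiplier
       uint64 p2 = (multiplicand >> 32) * multiplier + (p1 >> 32); // product bits 32..95
       uint32 lo = (uint32)p1;                                     // product bits 0..31
       uint64 result, roundBit;

       if (shift == 0) {
          return (p2 << 32) | lo;                // Nothing shifted out
       }
       if (shift < 32) {
          result   = (p2 << (32 - shift)) | (lo >> shift);
          roundBit = (lo >> (shift - 1)) & 1;
       } else if (shift == 32) {
          result   = p2;
          roundBit = lo >> 31;
       } else {
          result   = p2 >> (shift - 32);
          roundBit = (p2 >> (shift - 33)) & 1;
       }
       return result + roundBit;   // floor(x * 2**(-shift) + 0.5), low 64 bits
    }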
@@ -440,7 +401,6 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift)
{
// ASSERT(shift >= 0 && shift < 64);
- /* Written and tested by mann, checked by dbudko and hpreg */
__asm {
mov eax, DWORD PTR [multiplicand+4] // Get hi(multiplicand)
mul DWORD PTR [multiplier] // p2 = hi(multiplicand) * multiplier
@@ -453,22 +413,30 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift)
mov ecx, DWORD PTR [shift] // Get shift
cmp ecx, 32 // shift < 32?
jl SHORT l2 // Go if so
+ shl eax, 1 // Save lo(p1) bit 31 in CF in case shift=32
mov eax, edx // result = hi(p2):hi(p1) >> (shift & 31)
mov edx, ebx
shrd eax, edx, cl
+ mov esi, 0
+ adc esi, 0 // Get highest order bit shifted out, from CF
shr edx, cl
jmp SHORT l3
l2:
+ xor esi, esi
shrd eax, edx, cl // result = hi(p2):hi(p1):lo(p1) >> shift
+ adc esi, 0 // Get highest order bit shifted out, from CF
shrd edx, ebx, cl
l3:
+ add eax, esi // result += highest order bit shifted out
+ adc edx, 0
}
// return with result in edx:eax
}
#pragma warning(default: 4035)
#else
-#error No compiler defined for Mul64x3264
+#define MUL64_NO_ASM 1
+#include "mul64.h"
#endif
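A quick numeric check of the new rounding term (hypothetical values):

    uint64 r = Mul64x3264(3, 1, 1);
    /* 96-bit product = 3; shifted right by 1 it is 1 with a 1 bit shifted
       out, so r == 2 == floor(3 * 2**(-1) + 0.5).  The previous, truncating
       version returned 1. */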
/*
@@ -476,27 +444,20 @@ Mul64x3264(uint64 multiplicand, uint32 multiplier, uint32 shift)
*
* Muls64x32s64 --
*
- * Signed integer by fixed point multiplication:
+ * Signed integer by fixed point multiplication, with rounding:
+ * result = floor(multiplicand * multiplier * 2**(-shift) + 0.5)
+ *
* Signed 64-bit integer multiplicand.
* Unsigned 32-bit fixed point multiplier, represented as
- * multiplier >> shift, where shift < 64.
- * Signed 64-bit integer product.
- *
- * Implementation:
- * Multiply 64x32 bits to yield a full 96-bit product.
- * Shift right by the location of the binary point.
- * Return the low-order 64 bits of the result.
+ * (multiplier, shift), where shift < 64.
*
* Result:
- * Product
- *
- * Side effects:
- * None
+ * Signed 64-bit integer product.
*
*-----------------------------------------------------------------------------
*/
-#if defined(__GNUC__)
+#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 4) && !defined(MUL64_NO_ASM)
static INLINE int64
Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift)
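The rounding formula holds for a signed multiplicand as well; two concrete values (hypothetical) that follow directly from it:

    int64 a = Muls64x32s64(-7, 3, 2);  // floor(-21/4 + 0.5) = floor(-4.75) = -5
    int64 b = Muls64x32s64(-7, 2, 2);  // floor(-14/4 + 0.5) = floor(-3.0)  = -3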
@@ -505,8 +466,6 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift)
uint32 tmp1, tmp2;
// ASSERT(shift >= 0 && shift < 64);
- /* Written and tested by mann, checked by dbudko and hpreg */
- /* XXX hpreg suggested some improvements that we haven't converged on yet */
__asm__("mov %%eax, %2\n\t" // Save lo(multiplicand)
"mov %%edx, %%eax\n\t" // Get hi(multiplicand)
"test %%eax, %%eax\n\t" // Check sign of multiplicand
@@ -524,22 +483,29 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift)
"adcl $0, %1\n\t" // hi(p2) += carry from previous step
"cmpl $32, %%ecx\n\t" // shift < 32?
"jl 2f\n\t" // Go if so
+ "shll $1, %%eax\n\t" // Save lo(p1) bit 31 in CF in case shift=32
"mov %%edx, %%eax\n\t" // result = hi(p2):hi(p1) >> (shift & 31)
"mov %1, %%edx\n\t"
"shrdl %%edx, %%eax\n\t"
+ "mov $0, %2\n\t"
+ "adcl $0, %2\n\t" // Get highest order bit shifted out from CF
"sarl %%cl, %%edx\n\t"
"jmp 3f\n"
"2:\n\t"
+ "xor %2, %2\n\t"
"shrdl %%edx, %%eax\n\t" // result = hi(p2):hi(p1):lo(p1) >> shift
+ "adcl $0, %2\n\t" // Get highest order bit shifted out from CF
"shrdl %1, %%edx\n"
"3:\n\t"
+ "addl %2, %%eax\n\t" // result += highest order bit shifted out
+ "adcl $0, %%edx"
: "=A" (result), "=&r" (tmp1), "=&rm" (tmp2)
: "0" (multiplicand), "rm" (multiplier), "c" (shift)
: "cc");
return result;
}
-#elif defined _MSC_VER
+#elif defined(_MSC_VER)
#pragma warning(disable: 4035)
static INLINE int64
@@ -547,7 +513,6 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift)
{
//ASSERT(shift >= 0 && shift < 64);
- /* Written and tested by mann, checked by dbudko and hpreg */
__asm {
mov eax, DWORD PTR [multiplicand+4] // Get hi(multiplicand)
test eax, eax // Check sign of multiplicand
@@ -567,22 +532,27 @@ Muls64x32s64(int64 multiplicand, uint32 multiplier, uint32 shift)
mov ecx, DWORD PTR [shift] // Get shift
cmp ecx, 32 // shift < 32?
jl SHORT l2 // Go if so
+ shl eax, 1 // Save lo(p1) bit 31 in CF in case shift=32
mov eax, edx // result = hi(p2):hi(p1) >> (shift & 31)
mov edx, ebx
shrd eax, edx, cl
+ mov esi, 0
+ adc esi, 0 // Get highest order bit shifted out, from CF
sar edx, cl
jmp SHORT l3
l2:
+ xor esi, esi
shrd eax, edx, cl // result = hi(p2):hi(p1):lo(p1) >> shift
+ adc esi, 0 // Get highest order bit shifted out, from CF
shrd edx, ebx, cl
l3:
+ add eax, esi // result += highest order bit shifted out
+ adc edx, 0
}
// return with result in edx:eax
}
#pragma warning(default: 4035)
-#else
-#error No compiler defined for Muls64x32s64
#endif