From 170d13ca3a2fdaaa0283399247631b76b441cca2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 4 Jan 2019 17:52:49 -0800 Subject: x86: re-introduce non-generic memcpy_{to,from}io This has been broken forever, and nobody ever really noticed because it's purely a performance issue. Long long ago, in commit 6175ddf06b61 ("x86: Clean up mem*io functions") Brian Gerst simplified the memory copies to and from iomem, since on x86, the instructions to access iomem are exactly the same as the regular instructions. That is technically true, and things worked, and nobody said anything. Besides, back then the regular memcpy was pretty simple and worked fine. Nobody noticed except for David Laight, that is. David has a testing a TLP monitor he was writing for an FPGA, and has been occasionally complaining about how memcpy_toio() writes things one byte at a time. Which is completely unacceptable from a performance standpoint, even if it happens to technically work. The reason it's writing one byte at a time is because while it's technically true that accesses to iomem are the same as accesses to regular memory on x86, the _granularity_ (and ordering) of accesses matter to iomem in ways that they don't matter to regular cached memory. In particular, when ERMS is set, we default to using "rep movsb" for larger memory copies. That is indeed perfectly fine for real memory, since the whole point is that the CPU is going to do cacheline optimizations and executes the memory copy efficiently for cached memory. With iomem? Not so much. With iomem, "rep movsb" will indeed work, but it will copy things one byte at a time. Slowly and ponderously. Now, originally, back in 2010 when commit 6175ddf06b61 was done, we didn't use ERMS, and this was much less noticeable. Our normal memcpy() was simpler in other ways too. Because in fact, it's not just about using the string instructions. Our memcpy() these days does things like "read and write overlapping values" to handle the last bytes of the copy. Again, for normal memory, overlapping accesses isn't an issue. For iomem? It can be. So this re-introduces the specialized memcpy_toio(), memcpy_fromio() and memset_io() functions. It doesn't particularly optimize them, but it tries to at least not be horrid, or do overlapping accesses. In fact, this uses the existing __inline_memcpy() function that we still had lying around that uses our very traditional "rep movsl" loop followed by movsw/movsb for the final bytes. Somebody may decide to try to improve on it, but if we've gone almost a decade with only one person really ever noticing and complaining, maybe it's not worth worrying about further, once it's not _completely_ broken? Reported-by: David Laight Signed-off-by: Linus Torvalds --- arch/x86/include/asm/io.h | 8 ++++++++ arch/x86/include/asm/string_64.h | 18 ----------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/iomem.c | 42 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 18 deletions(-) create mode 100644 arch/x86/lib/iomem.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 832da8229cc7..686247db3106 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -221,6 +221,14 @@ extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ +void memcpy_fromio(void *, const volatile void __iomem *, size_t); +void memcpy_toio(volatile void __iomem *, const void *, size_t); +void memset_io(volatile void __iomem *, int, size_t); + +#define memcpy_fromio memcpy_fromio +#define memcpy_toio memcpy_toio +#define memset_io memset_io + #include /* diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 7ad41bfcc16c..4e4194e21a09 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -7,24 +7,6 @@ /* Written 2002 by Andi Kleen */ -/* Only used for special circumstances. Stolen from i386/string.h */ -static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n) -{ - unsigned long d0, d1, d2; - asm volatile("rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from) - : "memory"); - return to; -} - /* Even with __builtin_ the compiler may decide to use the out of line function. */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 25a972c61b0a..ce28829f1281 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -30,6 +30,7 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o +obj-y += iomem.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c new file mode 100644 index 000000000000..66894675f3c8 --- /dev/null +++ b/arch/x86/lib/iomem.c @@ -0,0 +1,42 @@ +#include +#include +#include + +/* Originally from i386/string.h */ +static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n) +{ + unsigned long d0, d1, d2; + asm volatile("rep ; movsl\n\t" + "testb $2,%b4\n\t" + "je 1f\n\t" + "movsw\n" + "1:\ttestb $1,%b4\n\t" + "je 2f\n\t" + "movsb\n" + "2:" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from) + : "memory"); +} + +void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n) +{ + __iomem_memcpy(to, (const void *)from, n); +} +EXPORT_SYMBOL(memcpy_fromio); + +void memcpy_toio(volatile void __iomem *to, const void *from, size_t n) +{ + __iomem_memcpy((void *)to, (const void *) from, n); +} +EXPORT_SYMBOL(memcpy_toio); + +void memset_io(volatile void __iomem *a, int b, size_t c) +{ + /* + * TODO: memset can mangle the IO patterns quite a bit. + * perhaps it would be better to use a dumb one: + */ + memset((void *)a, b, c); +} +EXPORT_SYMBOL(memset_io); -- cgit v1.2.3