author    Michael Kelley <mhklinux@outlook.com>  2024-07-08 12:41:00 -0700
committer Christoph Hellwig <hch@lst.de>  2024-07-10 07:59:03 +0200
commit    7296f2301a057493e97b07739213c6e864f76891 (patch)
tree      ed08a5d890b1c2530b9b881d2732df7f94a72389 /kernel/dma
parent    54624acf8843375a6de3717ac18df3b5104c39c5 (diff)
swiotlb: reduce swiotlb pool lookups
With CONFIG_SWIOTLB_DYNAMIC enabled, each round-trip map/unmap pair
in the swiotlb results in 6 calls to swiotlb_find_pool(). In multiple
places, the pool is found and used in one function, and then must be
found again in the next function that is called because only the
tlb_addr is passed as an argument. These are the six call sites:

dma_direct_map_page:
1. swiotlb_map -> swiotlb_tbl_map_single -> swiotlb_bounce

dma_direct_unmap_page:
2. dma_direct_sync_single_for_cpu -> is_swiotlb_buffer
3. dma_direct_sync_single_for_cpu -> swiotlb_sync_single_for_cpu ->
   swiotlb_bounce
4. is_swiotlb_buffer
5. swiotlb_tbl_unmap_single -> swiotlb_del_transient
6. swiotlb_tbl_unmap_single -> swiotlb_release_slots

Reduce the number of calls by finding the pool at a higher level, and
passing it as an argument instead of searching again. A key change is
for is_swiotlb_buffer() to return a pool pointer instead of a boolean,
and then pass this pool pointer to subsequent swiotlb functions.

There are 9 occurrences of is_swiotlb_buffer() used to test if a buffer
is a swiotlb buffer before calling a swiotlb function. To reduce code
duplication in getting the pool pointer and passing it as an argument,
introduce inline wrappers for this pattern. The generated code is
essentially unchanged.

Since is_swiotlb_buffer() no longer returns a boolean, rename some
functions to reflect the change:
* swiotlb_find_pool() becomes __swiotlb_find_pool()
* is_swiotlb_buffer() becomes swiotlb_find_pool()
* is_xen_swiotlb_buffer() becomes xen_swiotlb_find_pool()

With these changes, a round-trip map/unmap pair requires only 2 pool
lookups (listed using the new names and wrappers):

dma_direct_unmap_page:
1. dma_direct_sync_single_for_cpu -> swiotlb_find_pool
2. swiotlb_tbl_unmap_single -> swiotlb_find_pool

These changes come from noticing the inefficiencies in a code review,
not from performance measurements. With CONFIG_SWIOTLB_DYNAMIC,
__swiotlb_find_pool() is not trivial, and it uses an RCU read lock, so
avoiding the redundant calls helps performance in a hot path. When
CONFIG_SWIOTLB_DYNAMIC is *not* set, the code size reduction is minimal
and the perf benefits are likely negligible, but no harm is done.

No functional change is intended.

Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
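[Editor's note: the inline wrappers mentioned above live outside kernel/dma
(presumably in include/linux/swiotlb.h) and so are not part of the diff
below, which is limited to kernel/dma. A minimal sketch of the wrapper
pattern, assuming the renamed API described in the message; the body shown
here is illustrative, not copied from the patch:

static inline void swiotlb_sync_single_for_cpu(struct device *dev,
		phys_addr_t addr, size_t size, enum dma_data_direction dir)
{
	/*
	 * swiotlb_find_pool() (the former is_swiotlb_buffer()) returns
	 * the pool pointer, or NULL when @addr is not a bounce buffer,
	 * so the "is it swiotlb?" test and the pool lookup become a
	 * single search.
	 */
	struct io_tlb_pool *pool = swiotlb_find_pool(dev, addr);

	if (unlikely(pool))
		__swiotlb_sync_single_for_cpu(dev, addr, size, dir, pool);
}

The same shape covers the other is_swiotlb_buffer() occurrences: one lookup
feeds both the test and the double-underscore worker, which is why the
generated code is essentially unchanged.]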
Diffstat (limited to 'kernel/dma')
-rw-r--r--  kernel/dma/direct.c   | 10
-rw-r--r--  kernel/dma/direct.h   |  9
-rw-r--r--  kernel/dma/swiotlb.c  | 67
3 files changed, 42 insertions, 44 deletions
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4d543b1e9d57..4480a3cd92e0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -404,9 +404,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, sg->length,
@@ -430,9 +428,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(paddr, sg->length, dir);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
- dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, sg->length);
@@ -640,7 +636,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
{
return !dev_is_dma_coherent(dev) ||
- is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr));
+ swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr));
}
/**
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 18d346118fe8..d2c0b7e632fc 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -58,8 +58,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
{
phys_addr_t paddr = dma_to_phys(dev, addr);
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_device(dev, paddr, size, dir);
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
@@ -75,8 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
- if (unlikely(is_swiotlb_buffer(dev, paddr)))
- swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, size);
@@ -121,8 +119,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
- if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir,
- attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index fe1ccb53596f..b4d8167b2f8d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -763,16 +763,18 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
}
/**
- * swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * __swiotlb_find_pool() - find the IO TLB pool for a physical address
* @dev: Device which has mapped the DMA buffer.
* @paddr: Physical address within the DMA buffer.
*
* Find the IO TLB memory pool descriptor which contains the given physical
- * address, if any.
+ * address, if any. This function is for use only when the dev is known to
+ * be using swiotlb. Use swiotlb_find_pool() for the more general case
+ * when this condition is not met.
*
* Return: Memory pool which contains @paddr, or %NULL if none.
*/
-struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
@@ -855,9 +857,8 @@ static unsigned int swiotlb_align_offset(struct device *dev,
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir, struct io_tlb_pool *mem)
{
- struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
@@ -1243,7 +1244,7 @@ found:
* that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
* atomicity).
*
- * See also the comment in is_swiotlb_buffer().
+ * See also the comment in swiotlb_find_pool().
*/
smp_mb();
@@ -1435,13 +1436,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* hardware behavior. Use of swiotlb is supposed to be transparent,
* i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
*/
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool);
return tlb_addr;
}
-static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *mem)
{
- struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
unsigned long flags;
unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
int index, nslots, aindex;
@@ -1505,11 +1506,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
*
* Return: %true if @tlb_addr belonged to a transient pool that was released.
*/
-static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *pool)
{
- struct io_tlb_pool *pool;
-
- pool = swiotlb_find_pool(dev, tlb_addr);
if (!pool->transient)
return false;
@@ -1522,7 +1521,7 @@ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
#else /* !CONFIG_SWIOTLB_DYNAMIC */
static inline bool swiotlb_del_transient(struct device *dev,
- phys_addr_t tlb_addr)
+ phys_addr_t tlb_addr, struct io_tlb_pool *pool)
{
return false;
}
@@ -1532,36 +1531,39 @@ static inline bool swiotlb_del_transient(struct device *dev,
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
-void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs, struct io_tlb_pool *pool)
{
/*
* First, sync the memory before unmapping the entry
*/
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size,
+ DMA_FROM_DEVICE, pool);
- if (swiotlb_del_transient(dev, tlb_addr))
+ if (swiotlb_del_transient(dev, tlb_addr, pool))
return;
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
}
-void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool);
else
BUG_ON(dir != DMA_FROM_DEVICE);
}
-void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
{
if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool);
else
BUG_ON(dir != DMA_TO_DEVICE);
}
@@ -1585,8 +1587,9 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
/* Ensure that the address returned is DMA'ble */
dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
- attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC,
+ swiotlb_find_pool(dev, swiotlb_addr));
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
@@ -1764,7 +1767,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
&tlb_addr);
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
return NULL;
}
@@ -1774,11 +1777,13 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
bool swiotlb_free(struct device *dev, struct page *page, size_t size)
{
phys_addr_t tlb_addr = page_to_phys(page);
+ struct io_tlb_pool *pool;
- if (!is_swiotlb_buffer(dev, tlb_addr))
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool)
return false;
- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
return true;
}