path: root/drivers/fpga/altera-cvp.c
blob: 00e73d28077cee3732b61973222422f152ebb033
/*
 * FPGA Manager Driver for Altera Arria/Cyclone/Stratix CvP
 *
 * Copyright (C) 2017 DENX Software Engineering
 *
 * Anatolij Gustschin <agust@denx.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Manage Altera FPGA firmware using PCIe CvP.
 * Firmware must be in binary "rbf" format.
 */

#include <linux/delay.h>
#include <linux/device.h>
#include <linux/fpga/fpga-mgr.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/sizes.h>

#define CVP_BAR		0	/* BAR used for data transfer in memory mode */
#define CVP_DUMMY_WR	244	/* dummy writes to clear CvP state machine */
#define TIMEOUT_US	2000	/* CVP STATUS timeout for USERMODE polling */

/* Vendor Specific Extended Capability Registers */
#define VSE_PCIE_EXT_CAP_ID		0x200
#define VSE_PCIE_EXT_CAP_ID_VAL		0x000b	/* 16bit */

#define VSE_CVP_STATUS			0x21c	/* 32bit */
#define VSE_CVP_STATUS_CFG_RDY		BIT(18)	/* CVP_CONFIG_READY */
#define VSE_CVP_STATUS_CFG_ERR		BIT(19)	/* CVP_CONFIG_ERROR */
#define VSE_CVP_STATUS_CVP_EN		BIT(20)	/* ctrl block is enabling CVP */
#define VSE_CVP_STATUS_USERMODE		BIT(21)	/* USERMODE */
#define VSE_CVP_STATUS_CFG_DONE		BIT(23)	/* CVP_CONFIG_DONE */
#define VSE_CVP_STATUS_PLD_CLK_IN_USE	BIT(24)	/* PLD_CLK_IN_USE */

#define VSE_CVP_MODE_CTRL		0x220	/* 32bit */
#define VSE_CVP_MODE_CTRL_CVP_MODE	BIT(0)	/* CVP (1) or normal mode (0) */
#define VSE_CVP_MODE_CTRL_HIP_CLK_SEL	BIT(1)	/* PMA (1) or fabric clock (0) */
#define VSE_CVP_MODE_CTRL_NUMCLKS_OFF	8	/* NUMCLKS bits offset */
#define VSE_CVP_MODE_CTRL_NUMCLKS_MASK	GENMASK(15, 8)

#define VSE_CVP_DATA			0x228	/* 32bit */
#define VSE_CVP_PROG_CTRL		0x22c	/* 32bit */
#define VSE_CVP_PROG_CTRL_CONFIG	BIT(0)
#define VSE_CVP_PROG_CTRL_START_XFER	BIT(1)

#define VSE_UNCOR_ERR_STATUS		0x234	/* 32bit */
#define VSE_UNCOR_ERR_CVP_CFG_ERR	BIT(5)	/* CVP_CONFIG_ERROR_LATCHED */

#define DRV_NAME		"altera-cvp"
#define ALTERA_CVP_MGR_NAME	"Altera CvP FPGA Manager"

/* Optional CvP config error status check for debugging */
static bool altera_cvp_chkcfg;

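/*
 * struct altera_cvp_conf - per-device CvP configuration state
 * @mgr:	FPGA manager associated with this device
 * @pci_dev:	Altera PCI device being configured
 * @map:	mapped CvP BAR, or NULL if config-space writes are used
 * @write_data:	method used to write one 32-bit data word
 * @mgr_name:	unique manager name, includes the PCI device name
 * @numclks:	clock-to-data ratio written to CVP_NUMCLKS
 */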
struct altera_cvp_conf {
	struct fpga_manager	*mgr;
	struct pci_dev		*pci_dev;
	void __iomem		*map;
	void			(*write_data)(struct altera_cvp_conf *, u32);
	char			mgr_name[64];
	u8			numclks;
};

static enum fpga_mgr_states altera_cvp_state(struct fpga_manager *mgr)
{
	struct altera_cvp_conf *conf = mgr->priv;
	u32 status;

	pci_read_config_dword(conf->pci_dev, VSE_CVP_STATUS, &status);

	if (status & VSE_CVP_STATUS_CFG_DONE)
		return FPGA_MGR_STATE_OPERATING;

	if (status & VSE_CVP_STATUS_CVP_EN)
		return FPGA_MGR_STATE_POWER_UP;

	return FPGA_MGR_STATE_UNKNOWN;
}

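/*
 * Two ways to feed data to the CvP block: through the memory-mapped
 * CvP BAR (fast path) or through the VSE_CVP_DATA config-space
 * register (fallback when the BAR cannot be mapped).
 */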
static void altera_cvp_write_data_iomem(struct altera_cvp_conf *conf, u32 val)
{
	writel(val, conf->map);
}

static void altera_cvp_write_data_config(struct altera_cvp_conf *conf, u32 val)
{
	pci_write_config_dword(conf->pci_dev, VSE_CVP_DATA, val);
}

/* switches between CvP clock and internal clock */
static void altera_cvp_dummy_write(struct altera_cvp_conf *conf)
{
	unsigned int i;
	u32 val;

	/* set 1 CVP clock cycle for every CVP Data Register Write */
	pci_read_config_dword(conf->pci_dev, VSE_CVP_MODE_CTRL, &val);
	val &= ~VSE_CVP_MODE_CTRL_NUMCLKS_MASK;
	val |= 1 << VSE_CVP_MODE_CTRL_NUMCLKS_OFF;
	pci_write_config_dword(conf->pci_dev, VSE_CVP_MODE_CTRL, val);

	for (i = 0; i < CVP_DUMMY_WR; i++)
		conf->write_data(conf, 0); /* dummy data, could be any value */
}

static int altera_cvp_wait_status(struct altera_cvp_conf *conf, u32 status_mask,
				  u32 status_val, int timeout_us)
{
	unsigned int retries;
	u32 val;

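	/* poll in ~10us steps, rounding the timeout up to a full step */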
	retries = timeout_us / 10;
	if (timeout_us % 10)
		retries++;

	do {
		pci_read_config_dword(conf->pci_dev, VSE_CVP_STATUS, &val);
		if ((val & status_mask) == status_val)
			return 0;

		/* use small usleep value to re-check and break early */
		usleep_range(10, 11);
	} while (--retries);

	return -ETIMEDOUT;
}

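/*
 * CvP configuration steps 12-15: clear START_XFER and CVP_CONFIG,
 * switch back from the CvP clock to the internal clock and wait for
 * CVP_CONFIG_READY to deassert.
 */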
static int altera_cvp_teardown(struct fpga_manager *mgr,
			       struct fpga_image_info *info)
{
	struct altera_cvp_conf *conf = mgr->priv;
	struct pci_dev *pdev = conf->pci_dev;
	int ret;
	u32 val;

	/* STEP 12 - reset START_XFER bit */
	pci_read_config_dword(pdev, VSE_CVP_PROG_CTRL, &val);
	val &= ~VSE_CVP_PROG_CTRL_START_XFER;
	pci_write_config_dword(pdev, VSE_CVP_PROG_CTRL, val);

	/* STEP 13 - reset CVP_CONFIG bit */
	val &= ~VSE_CVP_PROG_CTRL_CONFIG;
	pci_write_config_dword(pdev, VSE_CVP_PROG_CTRL, val);

	/*
	 * STEP 14
	 * - set CVP_NUMCLKS to 1 and then issue CVP_DUMMY_WR dummy
	 *   writes to the HIP
	 */
	altera_cvp_dummy_write(conf); /* from CVP clock to internal clock */

	/* STEP 15 - poll CVP_CONFIG_READY bit for 0 with 10us timeout */
	ret = altera_cvp_wait_status(conf, VSE_CVP_STATUS_CFG_RDY, 0, 10);
	if (ret)
		dev_err(&mgr->dev, "CFG_RDY == 0 timeout\n");

	return ret;
}

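/*
 * CvP configuration steps 1-8: check that CvP is enabled, switch the
 * control block into CvP mode and prepare the bitstream transfer.
 */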
static int altera_cvp_write_init(struct fpga_manager *mgr,
				 struct fpga_image_info *info,
				 const char *buf, size_t count)
{
	struct altera_cvp_conf *conf = mgr->priv;
	struct pci_dev *pdev = conf->pci_dev;
	u32 iflags, val;
	int ret;

	iflags = info ? info->flags : 0;

	if (iflags & FPGA_MGR_PARTIAL_RECONFIG) {
		dev_err(&mgr->dev, "Partial reconfiguration not supported.\n");
		return -EINVAL;
	}

	/* Determine allowed clock to data ratio */
	if (iflags & FPGA_MGR_COMPRESSED_BITSTREAM)
		conf->numclks = 8; /* ratio for all compressed images */
	else if (iflags & FPGA_MGR_ENCRYPTED_BITSTREAM)
		conf->numclks = 4; /* for uncompressed and encrypted images */
	else
		conf->numclks = 1; /* for uncompressed and unencrypted images */

	/* STEP 1 - read CVP status and check CVP_EN flag */
	pci_read_config_dword(pdev, VSE_CVP_STATUS, &val);
	if (!(val & VSE_CVP_STATUS_CVP_EN)) {
		dev_err(&mgr->dev, "CVP mode off: 0x%04x\n", val);
		return -ENODEV;
	}

	if (val & VSE_CVP_STATUS_CFG_RDY) {
		dev_warn(&mgr->dev, "CvP already started, teardown first\n");
		ret = altera_cvp_teardown(mgr, info);
		if (ret)
			return ret;
	}

	/*
	 * STEP 2
	 * - set HIP_CLK_SEL and CVP_MODE (must be set in the order mentioned)
	 */
	/* switch from fabric to PMA clock */
	pci_read_config_dword(pdev, VSE_CVP_MODE_CTRL, &val);
	val |= VSE_CVP_MODE_CTRL_HIP_CLK_SEL;
	pci_write_config_dword(pdev, VSE_CVP_MODE_CTRL, val);

	/* set CVP mode */
	pci_read_config_dword(pdev, VSE_CVP_MODE_CTRL, &val);
	val |= VSE_CVP_MODE_CTRL_CVP_MODE;
	pci_write_config_dword(pdev, VSE_CVP_MODE_CTRL, val);

	/*
	 * STEP 3
	 * - set CVP_NUMCLKS to 1 and issue CVP_DUMMY_WR dummy writes to the HIP
	 */
	altera_cvp_dummy_write(conf);

	/* STEP 4 - set CVP_CONFIG bit */
	pci_read_config_dword(pdev, VSE_CVP_PROG_CTRL, &val);
	/* request control block to begin transfer using CVP */
	val |= VSE_CVP_PROG_CTRL_CONFIG;
	pci_write_config_dword(pdev, VSE_CVP_PROG_CTRL, val);

	/* STEP 5 - poll CVP_CONFIG_READY for 1 with 10us timeout */
	ret = altera_cvp_wait_status(conf, VSE_CVP_STATUS_CFG_RDY,
				     VSE_CVP_STATUS_CFG_RDY, 10);
	if (ret) {
		dev_warn(&mgr->dev, "CFG_RDY == 1 timeout\n");
		return ret;
	}

	/*
	 * STEP 6
	 * - set CVP_NUMCLKS to 1 and issue CVP_DUMMY_WR dummy writes to the HIP
	 */
	altera_cvp_dummy_write(conf);

	/* STEP 7 - set START_XFER */
	pci_read_config_dword(pdev, VSE_CVP_PROG_CTRL, &val);
	val |= VSE_CVP_PROG_CTRL_START_XFER;
	pci_write_config_dword(pdev, VSE_CVP_PROG_CTRL, val);

	/* STEP 8 - start transfer (set CVP_NUMCLKS for bitstream) */
	pci_read_config_dword(pdev, VSE_CVP_MODE_CTRL, &val);
	val &= ~VSE_CVP_MODE_CTRL_NUMCLKS_MASK;
	val |= conf->numclks << VSE_CVP_MODE_CTRL_NUMCLKS_OFF;
	pci_write_config_dword(pdev, VSE_CVP_MODE_CTRL, val);

	return 0;
}

static inline int altera_cvp_chk_error(struct fpga_manager *mgr, size_t bytes)
{
	struct altera_cvp_conf *conf = mgr->priv;
	u32 val;

	/* STEP 10 (optional) - check CVP_CONFIG_ERROR flag */
	pci_read_config_dword(conf->pci_dev, VSE_CVP_STATUS, &val);
	if (val & VSE_CVP_STATUS_CFG_ERR) {
		dev_err(&mgr->dev, "CVP_CONFIG_ERROR after %zu bytes!\n",
			bytes);
		return -EPROTO;
	}
	return 0;
}

static int altera_cvp_write(struct fpga_manager *mgr, const char *buf,
			    size_t count)
{
	struct altera_cvp_conf *conf = mgr->priv;
	const u32 *data;
	size_t done, remaining;
	int status = 0;
	u32 mask;

	/* STEP 9 - write 32-bit data from RBF file to CVP data register */
	data = (const u32 *)buf;
	remaining = count;
	done = 0;

	while (remaining >= 4) {
		conf->write_data(conf, *data++);
		done += 4;
		remaining -= 4;

		/*
		 * STEP 10 (optional) and STEP 11
		 * - check error flag
		 * - loop until data transfer completed
		 * Config images can be huge (more than 40 MiB), so
		 * only check after a new 4k data block has been written.
		 * This reduces the number of checks and speeds up the
		 * configuration process.
		 */
		if (altera_cvp_chkcfg && !(done % SZ_4K)) {
			status = altera_cvp_chk_error(mgr, done);
			if (status < 0)
				return status;
		}
	}

	/*
	 * Write up to 3 trailing bytes, if any. The mask keeps only the
	 * valid low-order bytes of the final 32-bit word; for
	 * remaining == 0 the mask is 0 and no extra write is issued.
	 */
	mask = BIT(remaining * 8) - 1;
	if (mask)
		conf->write_data(conf, *data & mask);

	if (altera_cvp_chkcfg)
		status = altera_cvp_chk_error(mgr, count);

	return status;
}

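/*
 * CvP configuration steps 12-18: tear down the transfer, check the
 * latched error bit, leave CvP mode and wait for the FPGA to enter
 * user mode.
 */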
static int altera_cvp_write_complete(struct fpga_manager *mgr,
				     struct fpga_image_info *info)
{
	struct altera_cvp_conf *conf = mgr->priv;
	struct pci_dev *pdev = conf->pci_dev;
	int ret;
	u32 mask;
	u32 val;

	ret = altera_cvp_teardown(mgr, info);
	if (ret)
		return ret;

	/* STEP 16 - check CVP_CONFIG_ERROR_LATCHED bit */
	pci_read_config_dword(pdev, VSE_UNCOR_ERR_STATUS, &val);
	if (val & VSE_UNCOR_ERR_CVP_CFG_ERR) {
		dev_err(&mgr->dev, "detected CVP_CONFIG_ERROR_LATCHED!\n");
		return -EPROTO;
	}

	/* STEP 17 - reset CVP_MODE and HIP_CLK_SEL bit */
	pci_read_config_dword(pdev, VSE_CVP_MODE_CTRL, &val);
	val &= ~VSE_CVP_MODE_CTRL_HIP_CLK_SEL;
	val &= ~VSE_CVP_MODE_CTRL_CVP_MODE;
	pci_write_config_dword(pdev, VSE_CVP_MODE_CTRL, val);

	/* STEP 18 - poll PLD_CLK_IN_USE and USER_MODE bits */
	mask = VSE_CVP_STATUS_PLD_CLK_IN_USE | VSE_CVP_STATUS_USERMODE;
	ret = altera_cvp_wait_status(conf, mask, mask, TIMEOUT_US);
	if (ret)
		dev_err(&mgr->dev, "PLD_CLK_IN_USE|USERMODE timeout\n");

	return ret;
}

static const struct fpga_manager_ops altera_cvp_ops = {
	.state		= altera_cvp_state,
	.write_init	= altera_cvp_write_init,
	.write		= altera_cvp_write,
	.write_complete	= altera_cvp_write_complete,
};
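
/*
 * The ops above are invoked by the FPGA manager core. A hypothetical
 * in-kernel consumer (not part of this driver) would program the FPGA
 * roughly like this, assuming the contemporary FPGA manager API:
 *
 *	struct fpga_image_info info = {};
 *	struct fpga_manager *mgr = fpga_mgr_get(&pdev->dev);
 *
 *	if (!IS_ERR(mgr)) {
 *		fpga_mgr_firmware_load(mgr, &info, "fpga-image.rbf");
 *		fpga_mgr_put(mgr);
 *	}
 */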

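/*
 * "chkcfg" driver attribute: when set, CVP_CONFIG_ERROR is checked
 * after every 4KiB of configuration data. Typically reachable as
 * /sys/bus/pci/drivers/altera-cvp/chkcfg (exact path depends on how
 * the driver is registered).
 */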
static ssize_t chkcfg_show(struct device_driver *drv, char *buf)
{
	return snprintf(buf, 3, "%d\n", altera_cvp_chkcfg);
}

static ssize_t chkcfg_store(struct device_driver *drv, const char *buf,
			    size_t count)
{
	int ret;

	ret = kstrtobool(buf, &altera_cvp_chkcfg);
	if (ret)
		return ret;

	return count;
}

static DRIVER_ATTR_RW(chkcfg);

static int altera_cvp_probe(struct pci_dev *pdev,
			    const struct pci_device_id *dev_id);
static void altera_cvp_remove(struct pci_dev *pdev);

#define PCI_VENDOR_ID_ALTERA	0x1172

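/*
 * Match any Altera PCI device; probe() verifies the CvP-specific
 * vendor extended capability before binding.
 */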
static const struct pci_device_id altera_cvp_id_tbl[] = {
	{ PCI_VDEVICE(ALTERA, PCI_ANY_ID) },
	{ }
};
MODULE_DEVICE_TABLE(pci, altera_cvp_id_tbl);

static struct pci_driver altera_cvp_driver = {
	.name   = DRV_NAME,
	.id_table = altera_cvp_id_tbl,
	.probe  = altera_cvp_probe,
	.remove = altera_cvp_remove,
};

static int altera_cvp_probe(struct pci_dev *pdev,
			    const struct pci_device_id *dev_id)
{
	struct altera_cvp_conf *conf;
	u16 cmd, val;
	int ret;

	/*
	 * First check if this is the expected FPGA device. PCI config
	 * space access works without enabling the PCI device, memory
	 * space access is enabled further down.
	 */
	pci_read_config_word(pdev, VSE_PCIE_EXT_CAP_ID, &val);
	if (val != VSE_PCIE_EXT_CAP_ID_VAL) {
		dev_err(&pdev->dev, "Wrong EXT_CAP_ID value 0x%x\n", val);
		return -ENODEV;
	}

	conf = devm_kzalloc(&pdev->dev, sizeof(*conf), GFP_KERNEL);
	if (!conf)
		return -ENOMEM;

	/*
	 * Enable memory BAR access. We cannot use pci_enable_device() here
	 * because it will make the driver unusable with FPGA devices that
	 * have additional big IOMEM resources (e.g. 4GiB BARs) on 32-bit
	 * platform. Such BARs will not have an assigned address range and
	 * pci_enable_device() will fail, complaining about not claimed BAR,
	 * even if the concerned BAR is not needed for FPGA configuration
	 * at all. Thus, enable the device via PCI config space command.
	 */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY)) {
		cmd |= PCI_COMMAND_MEMORY;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = pci_request_region(pdev, CVP_BAR, "CVP");
	if (ret) {
		dev_err(&pdev->dev, "Requesting CVP BAR region failed\n");
		goto err_disable;
	}

	conf->pci_dev = pdev;
	conf->write_data = altera_cvp_write_data_iomem;

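	/*
	 * Prefer fast data writes through the memory-mapped CvP BAR;
	 * fall back to (slower) PCI config-space writes if the BAR
	 * cannot be mapped.
	 */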
	conf->map = pci_iomap(pdev, CVP_BAR, 0);
	if (!conf->map) {
		dev_warn(&pdev->dev, "Mapping CVP BAR failed\n");
		conf->write_data = altera_cvp_write_data_config;
	}

	snprintf(conf->mgr_name, sizeof(conf->mgr_name), "%s @%s",
		 ALTERA_CVP_MGR_NAME, pci_name(pdev));

	ret = fpga_mgr_register(&pdev->dev, conf->mgr_name,
				&altera_cvp_ops, conf);
	if (ret)
		goto err_unmap;

	ret = driver_create_file(&altera_cvp_driver.driver,
				 &driver_attr_chkcfg);
	if (ret) {
		dev_err(&pdev->dev, "Can't create sysfs chkcfg file\n");
		fpga_mgr_unregister(&pdev->dev);
		goto err_unmap;
	}

	return 0;

err_unmap:
	if (conf->map)
		pci_iounmap(pdev, conf->map);
	pci_release_region(pdev, CVP_BAR);
err_disable:
	cmd &= ~PCI_COMMAND_MEMORY;
	pci_write_config_word(pdev, PCI_COMMAND, cmd);
	return ret;
}

static void altera_cvp_remove(struct pci_dev *pdev)
{
	struct fpga_manager *mgr = pci_get_drvdata(pdev);
	struct altera_cvp_conf *conf = mgr->priv;
	u16 cmd;

	driver_remove_file(&altera_cvp_driver.driver, &driver_attr_chkcfg);
	fpga_mgr_unregister(&pdev->dev);
	if (conf->map)
		pci_iounmap(pdev, conf->map);
	pci_release_region(pdev, CVP_BAR);
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	cmd &= ~PCI_COMMAND_MEMORY;
	pci_write_config_word(pdev, PCI_COMMAND, cmd);
}

module_pci_driver(altera_cvp_driver);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Anatolij Gustschin <agust@denx.de>");
MODULE_DESCRIPTION("Module to load Altera FPGA over CvP");