summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu/topology.c
blob: 621a151ccf7d0a89699a9db2bd619b4fa92bfd47 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU/APIC topology
 *
 * The APIC IDs describe the system topology in multiple domain levels.
 * The CPUID topology parser provides the information which part of the
 * APIC ID is associated to the individual levels:
 *
 * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
 *
 * The root space contains the package (socket) IDs.
 *
 * Not enumerated levels consume 0 bits space, but conceptually they are
 * always represented. If e.g. only CORE and THREAD levels are enumerated
 * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
 *
 * If SMT is not supported, then the THREAD domain is still used. It then
 * has the same physical ID as the CORE domain and is the only child of
 * the core domain.
 *
 * This allows a unified view on the system independent of the enumerated
 * domain levels without requiring any conditionals in the code.
 */
#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>

#include <xen/xen.h>

#include <asm/apic.h>
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/smp.h>

#include "cpu.h"

/*
 * Map cpu index to physical APIC ID and to the ACPI ID. The per CPU
 * entries start out as BAD_APICID respectively CPU_ACPIID_INVALID and are
 * filled in by topo_set_cpuids().
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);

/* Bitmap of physically present CPUs. Indexed by APIC ID. */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;

/*
 * Used for CPU number allocation and parallel CPU bringup. Indexed by CPU
 * number; slots without an assigned APIC ID contain BAD_APICID.
 */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };

/*
 * Bitmaps to mark registered APICs at each topology domain. The bit index
 * in the map of a domain is the APIC ID converted by topo_apicid() for
 * that domain.
 */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;

/*
 * Keep track of assigned, disabled and rejected CPUs. nr_assigned_cpus is
 * preset to 1 as CPU #0 is reserved for the boot CPU.
 *
 * nr_assigned_cpus:	Number of CPU numbers which have been handed out
 * nr_disabled_cpus:	Number of registered APICs which are not present
 * nr_rejected_cpus:	Number of APICs which were refused at registration
 * boot_cpu_apic_id:	APIC ID registered via topology_register_boot_apic()
 * real_bsp_apic_id:	APIC ID recorded as real BSP by check_for_real_bsp()
 *
 * Both APIC ID fields are BAD_APICID until they have been set.
 */
static struct {
	unsigned int		nr_assigned_cpus;
	unsigned int		nr_disabled_cpus;
	unsigned int		nr_rejected_cpus;
	u32			boot_cpu_apic_id;
	u32			real_bsp_apic_id;
} topo_info __ro_after_init = {
	.nr_assigned_cpus	= 1,
	.boot_cpu_apic_id	= BAD_APICID,
	.real_bsp_apic_id	= BAD_APICID,
};

/* Number of bits set in the registration bitmap of topology domain @_dom */
#define domain_weight(_dom)	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)

/*
 * Check whether @phys_id equals the APIC ID which is recorded for CPU
 * number @cpu in cpuid_to_apicid[].
 */
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
	u64 apicid = cpuid_to_apicid[cpu];

	return apicid == phys_id;
}

#ifdef CONFIG_SMP
/*
 * Set @cpu in __cpu_primary_thread_mask when none of the APIC ID bits
 * covered by (__max_threads_per_core - 1) is set in @apicid.
 *
 * NOTE(review): the mask only selects the thread bits if
 * __max_threads_per_core is a power of two - confirm.
 */
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
	unsigned int thread_bits = apicid & (__max_threads_per_core - 1);

	if (thread_bits)
		return;

	cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
#else
/* !SMP: nothing to mark */
static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif

/*
 * Translate @apicid into the ID of the topology domain @dom. The SMT
 * domain uses the APIC ID unmodified. For all other domains the bits
 * below the domain level, i.e. the low dom_shifts[@dom - 1] bits, are
 * cleared.
 */
static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
	u32 level_mask;

	if (dom == TOPO_SMT_DOMAIN)
		return apicid;

	level_mask = UINT_MAX << x86_topo_system.dom_shifts[dom - 1];
	return apicid & level_mask;
}

/*
 * Find the CPU number which has been assigned to @apic_id.
 *
 * Only the CPU numbers handed out so far (topo_info.nr_assigned_cpus) are
 * searched. Returns the CPU number or -ENODEV if @apic_id has none.
 */
static int topo_lookup_cpuid(u32 apic_id)
{
	unsigned int cpu = 0;

	/* CPU# to APICID mapping is persistent once it is established */
	while (cpu < topo_info.nr_assigned_cpus) {
		if (cpuid_to_apicid[cpu] == apic_id)
			return cpu;
		cpu++;
	}

	return -ENODEV;
}

/*
 * Return the CPU number for @apic_id. An already established mapping is
 * reused, otherwise the next free CPU number is consumed.
 */
static __init int topo_get_cpunr(u32 apic_id)
{
	int cpu = topo_lookup_cpuid(apic_id);

	/* No mapping yet: hand out the next unassigned CPU number */
	if (cpu < 0)
		cpu = topo_info.nr_assigned_cpus++;

	return cpu;
}

/*
 * Store @apic_id and @acpi_id in the per CPU maps of @cpu and mark @cpu
 * present. The per CPU maps are only written when the kernel is built
 * with CONFIG_SMP or CONFIG_X86_64.
 */
static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_X86_64) || defined(CONFIG_SMP)
	early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
	early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
#endif
	set_cpu_present(cpu, true);
}

/*
 * Evaluate whether the enumerated @apic_id belongs to the real BSP while
 * the kernel is running on a different CPU.
 *
 * Returns true only when @apic_id was recorded as the real BSP and the
 * boot CPU is not the BSP. The caller rejects the APIC in that case. In
 * all other cases, including every invocation after
 * topo_info.real_bsp_apic_id has been set, the function returns false.
 *
 * Side effects: May set topo_info.real_bsp_apic_id, may limit the number
 * of CPUs to 1 via set_nr_cpu_ids() and emits warnings. On the path which
 * limits the number of CPUs to 1, topo_info.real_bsp_apic_id stays
 * BAD_APICID, so the evaluation takes place again for the next APIC.
 */
static __init bool check_for_real_bsp(u32 apic_id)
{
	bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
	u64 msr;

	/*
	 * There is no real good way to detect whether this is a kdump
	 * kernel, but except on the Voyager SMP monstrosity which is no
	 * longer supported, the real BSP APIC ID is the first one which is
	 * enumerated by firmware. That allows to detect whether the boot
	 * CPU is the real BSP. If it is not, then do not register the APIC
	 * because sending INIT to the real BSP would reset the whole
	 * system.
	 *
	 * The first APIC ID which is enumerated by firmware is detectable
	 * because the boot CPU APIC ID is registered before that without
	 * invoking this code.
	 */
	if (topo_info.real_bsp_apic_id != BAD_APICID)
		return false;

	/*
	 * Check whether the enumeration order is broken by evaluating the
	 * BSP bit in the APICBASE MSR. If the CPU does not have the
	 * APICBASE MSR then the BSP detection is not possible and the
	 * kernel must rely on the firmware enumeration order.
	 */
	if (has_apic_base) {
		rdmsrl(MSR_IA32_APICBASE, msr);
		is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
	}

	if (apic_id == topo_info.boot_cpu_apic_id) {
		/*
		 * If the boot CPU has the APIC BSP bit set then the
		 * firmware enumeration is agreeing. If the CPU does not
		 * have the APICBASE MSR then the only choice is to trust
		 * the enumeration order.
		 */
		if (is_bsp || !has_apic_base) {
			topo_info.real_bsp_apic_id = apic_id;
			return false;
		}
		/*
		 * If the boot APIC is enumerated first, but the APICBASE
		 * MSR does not have the BSP bit set, then there is no way
		 * to discover the real BSP here. Assume a crash kernel and
		 * limit the number of CPUs to 1 as an INIT to the real BSP
		 * would reset the machine.
		 */
		pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
		pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
		set_nr_cpu_ids(1);
		goto fwbug;
	}

	/* The first enumerated APIC ID is not the one of the boot CPU */
	pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
		topo_info.boot_cpu_apic_id, apic_id);

	if (is_bsp) {
		/*
		 * The boot CPU has the APIC BSP bit set. Use it and complain
		 * about the broken firmware enumeration.
		 */
		topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
		goto fwbug;
	}

	pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");

	/* Record the first enumerated APIC as real BSP and let the caller reject it */
	topo_info.real_bsp_apic_id = apic_id;
	return true;

fwbug:
	pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
	return false;
}

/*
 * Count the bits which are set in @map within the APIC ID range that
 * starts at @lvlid and spans (1 << dom_shifts[@at_level]) IDs.
 */
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
				    unsigned long *map)
{
	/* The end of the range is exclusive */
	unsigned int last = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
	unsigned int bit = find_next_bit(map, last, lvlid);
	unsigned int units = 0;

	/* Unfortunately there is no bitmap_weight_range() */
	while (bit < last) {
		units++;
		bit = find_next_bit(map, last, bit + 1);
	}

	return units;
}

/*
 * Register @apic_id in the topology domain maps.
 *
 * A present APIC is additionally marked in phys_cpu_present_map and gets
 * a CPU number assigned. A non-present APIC is accounted as disabled
 * unless it is rejected because its package already contains present
 * APICs on bare metal.
 */
static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	int cpunr, level;

	if (!present) {
		u32 pkg = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
		bool native = hypervisor_is_type(X86_HYPER_NATIVE);

		/*
		 * Check for present APICs in the same package when running
		 * on bare metal. Allow the bogosity in a guest.
		 */
		if (native && topo_unit_count(pkg, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
			pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
				     apic_id);
			topo_info.nr_rejected_cpus++;
			return;
		}

		topo_info.nr_disabled_cpus++;
	} else {
		set_bit(apic_id, phys_cpu_present_map);

		/*
		 * Double registration is valid in case of the boot CPU
		 * APIC because that is registered before the enumeration
		 * of the APICs via firmware parsers or VM guest
		 * mechanisms.
		 */
		cpunr = (apic_id == topo_info.boot_cpu_apic_id) ? 0 : topo_get_cpunr(apic_id);

		cpuid_to_apicid[cpunr] = apic_id;
		topo_set_cpuids(cpunr, apic_id, acpi_id);
	}

	/*
	 * Register present and possible CPUs in the domain
	 * maps. cpu_possible_map will be updated in
	 * topology_init_possible_cpus() after enumeration is done.
	 */
	level = TOPO_SMT_DOMAIN;
	while (level < TOPO_MAX_DOMAIN) {
		set_bit(topo_apicid(apic_id, level), apic_maps[level].map);
		level++;
	}
}

/**
 * topology_register_apic - Register an APIC in early topology maps
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated to the APIC
 * @present:	True if the corresponding CPU is present
 *
 * The APIC is rejected and accounted in topo_info.nr_rejected_cpus when
 * @apic_id is not below MAX_LOCAL_APIC, when check_for_real_bsp() asks for
 * rejection, or when @apic_id is not the boot CPU APIC ID and the CPU
 * numbers are exhausted.
 */
void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	if (apic_id >= MAX_LOCAL_APIC) {
		pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
		goto reject;
	}

	if (check_for_real_bsp(apic_id))
		goto reject;

	/* CPU numbers exhausted? */
	if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
		pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
		goto reject;
	}

	topo_register_apic(apic_id, acpi_id, present);
	return;

reject:
	topo_info.nr_rejected_cpus++;
}

/**
 * topology_register_boot_apic - Register the boot CPU APIC
 * @apic_id:	The APIC ID to set up
 *
 * Separate so CPU #0 can be assigned. Warns once if a boot CPU APIC ID has
 * been registered before. The APIC is registered as present with an
 * invalid ACPI ID.
 */
void __init topology_register_boot_apic(u32 apic_id)
{
	bool already_set = topo_info.boot_cpu_apic_id != BAD_APICID;

	WARN_ON_ONCE(already_set);

	topo_info.boot_cpu_apic_id = apic_id;
	topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
}

/**
 * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
 * @apicid:		The APIC ID for which to lookup the logical ID
 * @at_level:		The topology domain level to use
 *
 * @apicid must be a full APIC ID, not the normalized variant. The bits
 * below the domain level specified by @at_level are allowed to be clear,
 * so both real APIC IDs and backshifted normalized APIC IDs work
 * correctly.
 *
 * The logical ID is the number of registered IDs in the map of @at_level
 * which are smaller than the level ID of @apicid.
 *
 * Returns:
 *  - >= 0:	The requested logical ID
 *  - -ERANGE:	@apicid is out of range
 *  - -ENODEV:	@apicid is not registered
 */
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
{
	/* Strip the bits below @at_level to get the level ID of @apicid */
	unsigned int id = topo_apicid(apicid, at_level);

	if (id >= MAX_LOCAL_APIC)
		return -ERANGE;

	/* The logical ID is the number of set bits before @id */
	if (test_bit(id, apic_maps[at_level].map))
		return bitmap_weight(apic_maps[at_level].map, id);

	return -ENODEV;
}
EXPORT_SYMBOL_GPL(topology_get_logical_id);

/**
 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
 * @apicid:		The APIC ID which specifies the search range
 * @which_units:	The domain level specifying the units to count
 * @at_level:		The domain level at which @which_units have to be counted
 *
 * This returns the number of possible units according to the enumerated
 * information.
 *
 * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
 * counts the number of possible cores in the package to which @apicid
 * belongs.
 *
 * @at_level must be greater than @which_units to produce useful results.
 * If @at_level is equal to @which_units the result is 1. If @at_level is
 * less than @which_units the result is undefined by definition and the
 * function returns 0. The function also returns 0 when the level ID of
 * @apicid is out of range or not registered at @at_level.
 */
unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
				 enum x86_topology_domains at_level)
{
	/* Strip the bits below @at_level to get the level ID of @apicid */
	unsigned int id = topo_apicid(apicid, at_level);

	/* Out of range or not registered at @at_level */
	if (id >= MAX_LOCAL_APIC || !test_bit(id, apic_maps[at_level].map))
		return 0;

	/* Nothing to search when @which_units is not below @at_level */
	if (which_units >= at_level)
		return which_units == at_level ? 1 : 0;

	return topo_unit_count(id, at_level, apic_maps[which_units].map);
}

#ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated to the APIC
 *
 * Returns:
 *  - >= 0:	The CPU number which is associated to @apic_id
 *  - -EINVAL:	@apic_id is not below MAX_LOCAL_APIC
 *  - -ENODEV:	@apic_id was not registered during enumeration
 *  - -ENOSPC:	No CPU number is assigned to @apic_id
 */
int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
{
	int cpunr;

	if (apic_id >= MAX_LOCAL_APIC)
		return -EINVAL;

	/* Only APIC IDs which were registered during enumeration are accepted */
	if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
		return -ENODEV;

	cpunr = topo_lookup_cpuid(apic_id);
	if (cpunr >= 0) {
		set_bit(apic_id, phys_cpu_present_map);
		topo_set_cpuids(cpunr, apic_id, acpi_id);
		cpu_mark_primary_thread(cpunr, apic_id);
		return cpunr;
	}

	return -ENOSPC;
}

/**
 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
 * @cpu:	The CPU number for which the APIC ID is removed
 *
 * Does nothing when no APIC ID is recorded for @cpu in cpuid_to_apicid[].
 * Otherwise the per CPU APIC ID is invalidated, the APIC ID is removed
 * from phys_cpu_present_map and @cpu is marked not present. The
 * cpuid_to_apicid[] entry itself is left untouched.
 */
void topology_hotunplug_apic(unsigned int cpu)
{
	const u32 apic_id = cpuid_to_apicid[cpu];

	if (apic_id != BAD_APICID) {
		per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
		clear_bit(apic_id, phys_cpu_present_map);
		set_cpu_present(cpu, false);
	}
}
#endif

#ifdef CONFIG_X86_LOCAL_APIC
/* Limit set by 'possible_cpus=N' on the command line. Defaults to NR_CPUS. */
static unsigned int max_possible_cpus __initdata = NR_CPUS;

/**
 * topology_apply_cmdline_limits_early - Apply topology command line limits early
 *
 * Ensure that command line limits are in effect before firmware parsing
 * takes place. nr_cpu_ids is only ever lowered here, never raised.
 */
void __init topology_apply_cmdline_limits_early(void)
{
	unsigned int limit;

	/* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */
	if (setup_max_cpus && !ioapic_is_disabled && !apic_is_disabled)
		limit = nr_cpu_ids;
	else
		limit = 1;

	/* 'possible_cpus=N' */
	if (max_possible_cpus < limit)
		limit = max_possible_cpus;

	/* Nothing to do unless the limit is below the current number of CPU IDs */
	if (limit >= nr_cpu_ids)
		return;

	pr_info("Limiting to %u possible CPUs\n", limit);
	set_nr_cpu_ids(limit);
}

/*
 * Decide whether the system has to be restricted to a single CPU.
 *
 * That is the case when no SMP configuration was found or the IO/APIC is
 * disabled. Otherwise the restriction depends on the local APIC being
 * disabled, except for XEN PV which is never restricted on that account.
 */
static __init bool restrict_to_up(void)
{
	if (smp_found_config && !ioapic_is_disabled) {
		/*
		 * XEN PV is special as it does not advertise the local APIC
		 * properly, but provides a fake topology for it so that the
		 * infrastructure works. So don't apply the restrictions vs.
		 * APIC here.
		 */
		return !xen_pv_domain() && apic_is_disabled;
	}

	return true;
}

/*
 * Finalize the CPU accounting after the APIC enumeration is done.
 *
 * Computes the number of allowed CPUs, updates nr_cpu_ids and total_cpus,
 * derives the maximum topology numbers from the domain bitmaps, assigns
 * CPU numbers to registered but not present APICs and finally sets up the
 * possible and present CPU masks.
 */
void __init topology_init_possible_cpus(void)
{
	unsigned int assigned = topo_info.nr_assigned_cpus;
	unsigned int disabled = topo_info.nr_disabled_cpus;
	unsigned int cnta, cntb, cpu, allowed = 1;
	unsigned int total = assigned + disabled;
	u32 apicid, firstid;

	/*
	 * If there was no APIC registered, then fake one so that the
	 * topology bitmap is populated. That ensures that the code below
	 * is valid and the various query interfaces can be used
	 * unconditionally. This does not affect the actual APIC code in
	 * any way because either the local APIC address has not been
	 * registered or the local APIC was disabled on the command line.
	 */
	if (topo_info.boot_cpu_apic_id == BAD_APICID)
		topology_register_boot_apic(0);

	/* @allowed stays 1 when the system is restricted to UP */
	if (!restrict_to_up()) {
		if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
			disabled += assigned - nr_cpu_ids;
			assigned = nr_cpu_ids;
		}
		allowed = min_t(unsigned int, total, nr_cpu_ids);
	}

	if (total > allowed)
		pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);

	/* Clamp the counts so that assigned + disabled equals @allowed */
	assigned = min_t(unsigned int, allowed, assigned);
	disabled = allowed - assigned;

	topo_info.nr_assigned_cpus = assigned;
	topo_info.nr_disabled_cpus = disabled;

	total_cpus = allowed;
	set_nr_cpu_ids(allowed);

	/* Package and die limits are derived from the registered domain IDs */
	cnta = domain_weight(TOPO_PKG_DOMAIN);
	cntb = domain_weight(TOPO_DIE_DOMAIN);
	__max_logical_packages = cnta;
	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

	pr_info("Max. logical packages: %3u\n", cnta);
	pr_info("Max. logical dies:     %3u\n", cntb);
	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);

	cnta = domain_weight(TOPO_CORE_DOMAIN);
	cntb = domain_weight(TOPO_SMT_DOMAIN);
	/*
	 * Can't use order delta here as order(cnta) can be equal
	 * order(cntb) even if cnta != cntb.
	 */
	__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
	pr_info("Max. threads per core: %3u\n", __max_threads_per_core);

	/* The per package numbers are taken from the package of the lowest registered APIC ID */
	firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
	__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. cores per package:   %3u\n", __num_cores_per_package);
	__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. threads per package: %3u\n", __num_threads_per_package);

	pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
	if (topo_info.nr_rejected_cpus)
		pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);

	/* Reset both masks to CPU #0 only. They are populated again below. */
	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	/*
	 * Assign CPU numbers to non-present CPUs, i.e. to APIC IDs which are
	 * registered in the SMT domain map but not set in
	 * phys_cpu_present_map. Stops after @disabled assignments or when no
	 * such APIC ID is left.
	 */
	for (apicid = 0; disabled; disabled--, apicid++) {
		apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
					      MAX_LOCAL_APIC, apicid);
		if (apicid >= MAX_LOCAL_APIC)
			break;
		cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
	}

	/*
	 * All allowed CPU numbers are possible. CPU numbers with an assigned
	 * APIC ID additionally get their primary thread state marked and
	 * their present state set according to phys_cpu_present_map.
	 */
	for (cpu = 0; cpu < allowed; cpu++) {
		apicid = cpuid_to_apicid[cpu];

		set_cpu_possible(cpu, true);

		if (apicid == BAD_APICID)
			continue;

		cpu_mark_primary_thread(cpu, apicid);
		set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
	}
}

/*
 * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
 *
 * Shrinks the present and possible masks to CPU #0 and empties
 * phys_cpu_present_map except for the boot CPU APIC ID, if one has been
 * registered.
 */
void __init topology_reset_possible_cpus_up(void)
{
	u32 boot_id = topo_info.boot_cpu_apic_id;

	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);

	if (boot_id == BAD_APICID)
		return;

	set_bit(boot_id, phys_cpu_present_map);
}

/*
 * Parse the 'possible_cpus=N' command line option into max_possible_cpus.
 *
 * The value is parsed into a signed local because get_option() operates
 * on an int. Missing, non-numeric, zero and negative values are ignored
 * with a warning so that max_possible_cpus keeps its previous value. A
 * zero limit must never be stored because
 * topology_apply_cmdline_limits_early() hands the resulting minimum to
 * set_nr_cpu_ids().
 *
 * Always returns 0.
 */
static int __init setup_possible_cpus(char *str)
{
	int limit = 0;

	if (!get_option(&str, &limit) || limit <= 0) {
		pr_warn("Ignoring invalid possible_cpus= argument\n");
		return 0;
	}

	max_possible_cpus = limit;
	return 0;
}
early_param("possible_cpus", setup_possible_cpus);
#endif