1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
|
Before 1.1:
* Move perfcounter.h into sysprof namespace
* Check for existence and presense of __NR_perf_counter_open in
syscall.h
Before 1.2:
* Filter out kernel stacktrace entries from the interrupts.
(This can be done better in the kernel).
* Make sure errors are reported properly.
* Fork GtkLabel and change it to only request a resize when the new size is
actually larger than the old one. It could also be special-purposed for
the "Samples: <number>" format with the number left-aligned.
* When should the samples label be updated? On one hand we don't want
it to be the only thing that shows up on the profile. On the other,
when there are things going on, it should update quickly.
It is also desirable that it updates slowly if there is slow
activity going on; for example if you are moving the mouse cursor
around.
Cost of updating the samples label: s (in samples)
Base rate: f
If we update the samples label c times per second, the frequency is
c * s
If we update the samples label for every k samples.
(k - s)/f is the time it takes before updating
So if the update rate should be proportional to the base rate, then
we get
(k - s)/f = d/f
which implies k = d + s. So we should pick some constant d and only
update when that many samples have arrived.
* The counters seem to not be disabled when looking at the
profiles.
* Build system
- Create RPM package? See fedora-packaging-list for information
about how to package kernel modules. Lots of threads in
June 2005 and forward.
See also http://www.fedoraproject.org/wiki/Extras/KernelModuleProposal
Someone already did create a package - should be googlable.
* The hrtimer in the kernel currently generates an event every time
the timer fires. There are two problems with this:
- Essentially all the events are idle events and exclude_idle is
ignored.
- If you make it obey exclude_idle, it still generates activity
based on what is running currently. Unfortunately, the thing that
is running will very often be the userspace tracker because it was
handling the last sample generated. So this degenerates into an
infinite loop of sorts. Or maybe the GUI is just too slow, but
then why doesn't it happen with the real counters?
I think the solution here is to make the hrtimer fire at some
reasonable interval, like 100000 ns. When the timer fires, if the
current task is not the idle taks, it increments a counter by
cpu clock frequency * 100000 ns
If that overflows the sample period, an event is generated.
This is closer to the idea of a fake CPU cycle counter.
Also, for reasons I don't understand, it stops generating events
completely if a process is running that spins on all CPUs. So this
interface is not usable in its present state, but fortunately all
CPUs we care about have hardware cycle counters.
* With more than one CPU, we can get events out of order, so userspace
will have to deal with that. With serial numbers we could do it
correctly, but even without them we can do a pretty reasonable job
of putting them back in order. If a fork happens "soon" after a
sample, it probably happened before the sample; if an mmap happens
"soon" after a sample that would otherwise be unmapped, it probably
happened before the sample. All we need is a way to determine what
"soon" is.
Serial numbers would be useful to make "soon" an accurate measure.
There is also the issue of pid reuse, but that can probably be
ignored.
If we ignore pid reuse, we can sort the event buffer where two
events compare equal, unless both have the same pid and one is a
fork and the other is not.
A system-wide serial number could be expensive to maintain though,
so maybe time events would be just as good.
* Another issue is processes that exit during the initial scan of
/proc. Such a process will not cause sample events by itself, but it
may fork a child that will. There is no way to get maps for that
child.
A possible solution would be to optionally generate mmap event after
forks. Userspace would turn this off when it was done with the
initial gathering of processes.
Also, exec() events will delete the maps from a process, but all we
get is 'comm' events which is not quite the same thing.
* Find out why the busy cursor stays active after you hit start
* Kernel binary when available, is better than kallsyms.
* GTK+ bugs:
- Misposition window after click
- Find out why gtk_tree_view_columns_autosize() apparently doesn't
work on empty tree views.
- Write my own tree model? There is still performance issues in
FooTreeStore.
* Counters must not be destroyed during tracker setup. They have to
exist but be disabled so that we can track process creation.
* Check that we don't use too much memory (in particular with the
timeline).
* Fix names. "new process" is really "exec". (What does "comm"
actually stand for? Command?)
* Fix ugly flash when double clicking in descendants view
* Find out what's up with weird two-step flash when you hit start when
a profile is loaded.
* Make tracker creation faster. (Faster populating mainly)
* Share map reading code between vdso stuff in binfile.c and tracker.c
* Get rid of remaining gulongs (use uint64_t instead)
* Move binfile hash table to state_t.
* Get rid of process.c
* On 32 bit, NMI stackframes are not filtered out which leads to wrong
kernel traces
* Can we track kernel modules being inserted/removed?
* Does it make sense to try and locate a kernel binary, or can we
always just use kallsyms.
* open_inode() would be useful in many cases
* Is the double mapping of the ring buffer ok?
* Why do we get EINVAL when we try to track forks?
* Sometimes it gets samples for unknown processes. This may be due to
forking without execing.
* Give an informative error message if not run as root
* What are the events generated for pid 0 and pid 1? They have no
stacktrace, and an eip in the kernel.
* Delete the binparser stuff and fix the elf parser to just deal with
32 bits vs 64 bits. Or use C++ like behdad does in harfbuzz?
* We often get "No map". I suspect this is because the vdso stackframe
is strange.
* Hack to disable recursion for binaries without symbols causes the
symbols to not work the way other symbols do. A better approach is
probably to simply generate a new symbol for every appearance except
leaf nodes, which should still be considered one symbol (or maybe be
considered the same symbol if they have the same parent). In fact
"has same parent" may be the correct criterion in all cases. (done:
master now doesn't fold those recursions anymore)
* See if we can make "In file <blah>" not be treated as a recursive
function. Maybe simply treat each individual address in the file
as a function. Or try to parse the machine code. Positions that
are called are likely to be functions.
- Treat identical addresses as one function
- Treat all addresses within a library that don't have children
are treated as one function.
This will have the effect of coalescing adjacent siblings without
children. Which is what you want since you can't tell them apart
anyway. It will never be a great experience though.
* Make sure that labels look decent in case of "No Map" etc.
* Elf bugs:
- error handling for bin_parser is necessary.
* Find out why all apps have an "In file /usr/bin/<app binary>" below
_libc_main. If possible, maybe make up a name for it.
* vdso stuff:
- the "[vdso]" string should be #defined somewhere
- Does get_vdso_bytes() belong in process.c?
- Is basing on "[vdso]" always correct?
* Convert things like [heap] and [stack] to more understandable labels.
* Strategies for taking reliable stacktraces.
Three different kinds of files
- kernel
- vdso
- regular elf files
- kernel
- eh_frame annotations, in kernel or in kernel debug
- /proc/kallsyms
- userspace can look at _stext and _etext to determine
start and end of kernel text segment
- copying kernel stack to userspace
- it's always 4096 bytes these days
- heuristically determine functions based on address
- callbacks on the stack can be identified
by having an offset of 0.
- even so there is a lot of false positives.
- is eh_frame usually loaded into memory during normal
operation? It is mapped, but probably not paged in,
so we will be taking a few major page faults when we
first profile something.
Unless of course, we store the entire stack in
the stackstash. This may use way too much memory though.
- Locking, possibly useful code:
/* In principle we should use get_task_mm() but
* that will use task_lock() leading to deadlock
* if somebody already has the lock
*/
if (spin_is_locked (¤t->alloc_lock))
printk ("alreadylocked\n");
{
struct mm_struct *mm = current->mm;
if (mm)
{
printk (KERN_ALERT "stack size: %d (%d)\n",
mm->start_stack - regs->REG_STACK_PTR,
current->pid);
stacksize = mm->start_stack - regs->REG_STACK_PTR;
}
else
stacksize = 1;
}
- regular elf
- usually have eh_frame section which is mapped into memory
during normal operation
- do stackwalk in kernel based on eh_frame
- eh_frame section is usually mapped into memory, so
no file reading in kernel would be necessary.
- do stackwalk in userland based on eh_frame
- do ebp based stackwalk in kernel
- do ebp based stackwalk in userland
- do heuristic stackwalk in kernel
- do heuristic stackwalk in userland
- Send heuristic stack trace to user space, along with
location on the stack. Then, in userspace analyze the
machine code to determine the size of the stack frame at any
point. The instructions that would need to be recognized are:
subl <constant>, %esp
addl <constant>, %esp
leave
jcc
push
pop
GCC is unlikely to have different stack sizes at the entry
to a basic block.
We can often find a vmlinux in /lib/modules/<uname-r>/build.
* "Expand all" is horrendously slow because update_screenshot gets called
for every "expanded" signal. In fact even normal expanding is really
slow. It's probably hopeless to get decent performance out of GtkTreeView,
so we will have to store a list of expanded objects and keep that uptodate
as the rows expands and unexpands.
* Give more sensible 'error messages'. Eg., if you get permission denied for
a file, put "Permission denied" instead of "No map"
* crc32 checking probably doesn't belong in elfparser.c
* Missing things in binparser.[ch]
- it's inconvenient that you have to pass in both a parser _and_
a record. The record should just contain a pointer to the parser.
On the other hand, the result does depend on the parser->offset.
So it's a bit confusing that it's not passed in.
- the bin_parser_seek_record (..., 1); idiom is a little dubious
- Add error checking
Also need to add error checking.
- "native endian" is probably not useful. Maybe go back to just
having big/little endian.
Should probably rethink the whole thing. It's just not very convenient to use, even
for simple things like ELF files.
* Rename stack_stash_foreach_by_address() to stack_stash_foreach_unique(),
or maybe not ...
Which things are we actually using from stack stash now?
* Maybe report idle time? Although this would come for free with the
timelines.
* Fix (deleted) problem. But more generally, whenever we can't display a
symbol, display an error message instead, ie.,
- 'Binary file <xxxx> was deleted/replaced'
- 'No mapping for address'
- 'No symbols in binary file'
- 'Address has no corrresponding symbol in file'
- etc.
done: HEAD will not load files with the wrong inode now.
* Consider whether ProfileDescendant can be done with a StackStash We
need the "go-back-on-recursion" behavior. That could be added of
course ... the functions are otherwise very similar.
* Add spew infrastructure to make remote debugging easier.
* Make it compile and work on x86-64
- make the things we put in a stackstash real
objects so that
- we can save them
- they will know how to delete the presentation
names and themselves (through a virtual function)
- they can contain markup etc.
- The unique_dup() scheme is somewhat confusing.
- a more pragmatic approach might be to just walk the tree and
save it.
- plug all the leaks
- loading and saving probably leak right now
- make it use less memory:
- StackNodes are dominating
- fold 'toplevel' into 'size'
- allocate in blocks
- this will need to be coordinated with
profile.c which also creates stacknodes.
- maybe simply make stackstashes able to
save themselves.
- rethink loading and saving. Goals
- Can load 1.0 profiles
- Don't export too much of stackstashes to the rest of the
app
Features:
* Have a compatibility module that can load 1.0 modules
- reads in the call tree, then generate a stack stash
by repeatedly adding traces.
* Make stackstash able to save themselves using callbacks.
* If loading a file fails, then try loading it using the
1.0 loader.
- optimization: make sure we fail immediately if we
see an unknown object.
* Add versioning support to sfile.[ch]:
- sformat_new() should take doctype and version strings:
like "sysprof-profile" "version 1.2"
- there should be sfile_sniff() functionality that will
return the doctype and version of a file. Or None
if there aren't any.
* At this point, make the loader first check if the file has a version
if it doesn't, load as 1.0, otherwise as whatever the version claims
it is.
* Make provisions for forward compatibility: maybe it should be
possible to load records with more fields than specified.
* Figure out how to make sfile.[ch] use less memory.
- In general clean sfile.[ch] up a little:
- split out dfa in its own generic class
- make a generic representation of xml files with quarks for strings:
struct item {
int begin/end/text;
quark text: -> begin/end/value
int id; -> for begins that are pointed to
}
perhaps even with iterators. Should be compact and suitable for both
input and output. As a first cut, perhaps just split out the
Instruction code.
(done, somewhat).
- make the api saner; add format/content structs
Idea: all types have to be declared:
SFormat *format = sformat_new()
SType *object_list = stype_new (format);
SType *object = stype_new (format);
...
stype_define_list (object_list, "objects", object);
stype_define_record (object, "object",
name, total, self, NULL);
stype_define_pointer (...);
* See if the auto-expanding can be made more intelligent
- "Everything" should be expanded exactly one level
- all trees should be expanded at least one level
* Send entire stack to user space, then do stackwalking there. That would
allow us to do more complex algorithms, like dwarf, in userspace. Though
we'd lose the ability to do non-racy file naming. We could pass a list
of the process mappings with each stack though. Doing this would also solve
the problem of not being able to get maps of processes running as root.
Might be too expensive though. User stacks seem to be on the order
of 100K usually, which for 200 times a second means a bandwidth of
20MB/s, which is probably too much. One question is how much of it
usually changes.
Actually it seems that the _interesting_ part of the stack
(ie., from the stack pointer and up) is not that big in many cases. The
average stacksize seemed to be about 7700 bytes for gcc compiling gtk+.
Even deeply recursive apps like sysprof only generate about 55K stacks.
Other possibilities:
- Do heuristic stack walking where it lists all words on the stack
that look like they might be return addresses.
- Somehow map the application's stack pages into the client. This
is likely difficult or impossible.
- Another idea: copy all addresses that look like they could be
return addresses, along with the location on the stack. This
just might be enough for a userspace stack walker.
- Yet another: krh suggests hashing blocks of the stack, then
only sending the blocks that changed since last time.
- every time you send a stackblock, also send a cookie.
- whenever you *don't* send a stackblock, send the cookie
instead. That way you always get a complete stacktrace
conceptually.
- also, that would allow the kernel to just have a simple
hashtable containing the known blocks. Though, that could
become large. Actually there is no reason to store the
blocks; you can just send the hashcode. That way you
would only need to store a list of hashcodes that we
have generated previously.
- One problem with doing DWARF walking is that the debug code
will have to be faulted in. This can be a substantial amount
of disk access which is undesirable to have during a
profiling run. Even if we only have to fault in the
.eh_frame_hdr section, that's still 18 pages for gtk+. The
.eh_frame section for gtk+ is 72 pages.
A possibility may be to consider two stacktraces identical
if the only differing values are *outside* the text
segments. This may work since stack frames tend to be the
same size. Is there a way of determining the location of
text segments without reading the ELF files? Maybe just
check if it's inside an executable mappign.
It is then sufficient in user space to only store one
representative for each set of considered-identical stack
traces.
User space storage: Use the stackstash tree. When a new trace
is added, just skip over nodes that differ, but where none of
them points to text segments. Two possibilities then:
- when two traces are determined to differ, store them
in completely separate trees. This ensures that we
will never run the dwarf algorithm on an invalid
stack trace, but also means that we won't get shared
prefixes for stacktraces.
- when two traces are determined to differ, branch off
as currently. This will share more data, but the
dwarf algorithm could be run on invalid traces. It
may work in practice though if the compiler
generally uses fixed stack frames.
A twist on is to mark the complete stack traces as
"complete". Then after running the DWARF algorithm,
the generated stack trace can be saved with it. This
way incomplete stack traces branching off a complete
one can be completed using the DWARF information for
the shared part.
* Notes on heuristic stack walking
- We can reject addresses that point exactly to the beginning of a
function since these are likely callbacks. Note though that the
first time a function in a shared library is called, it goes
through dynamic linker resolution which will cause the stack to
contain a callback of the function. This needs to be investigated
in more detail.
- We are already rejecting addresses outside the text section
(addresses of global variables and the like).
* How to get the user stack:
/* In principle we should use get_task_mm() but
* that will use task_lock() leading to deadlock
* if somebody already has the lock
*/
if (spin_is_locked (¤t->alloc_lock))
printk ("alreadylocked\n");
{
struct mm_struct *mm = current->mm;
if (mm)
{
printk (KERN_ALERT "stack size: %d (%d)\n",
mm->start_stack - regs->REG_STACK_PTR,
current->pid);
stacksize = mm->start_stack - regs->REG_STACK_PTR;
}
else
stacksize = 1;
}
* If interrupt happens in kernel mode, send both
kernel stack and user space stack, have userspace stitch them
together. well, they could be stitched together in the kernel.
Already done: we now take a stacktrace of the user space process
when the interrupt happens in kernel mode. (Unfortunately, this
causes lockups on many kernels (though I haven't seen that for
a while)).
We don't take any stacktraces of the kernel though. Things that
need to be
investigated:
- does the kernel come with dwarf debug information?
- does the kernel come with some other debug info
- is there a place where the vmlinux binary is usually
placed? (We should avoid any "location of vmlinux" type
questions if at all possible).
We do now copy the kernel stack to userspace and do a
heuristic stack walk there. It may be better at some point to
use dump_trace() in the kernel since that will do some
filtering by itself.
Notes about kernel symbol lookup:
- /proc/kallsym is there, but it includes things like labels.
There is no way to tell them from functions
- We can search for a kernel binary with symbols. If the
kernel-debug package is installed, or if the user compiled
his own kernel, we will find one. This is a regular elf file.
It also includes labels, but we can tell by the STT_INFO field
which is which.
Note though that for some labels we do actually want to
treat them as functions. For example the "page_fault" label,
which is function by itself. We can recognize these by the
fact that their symbols have a size. However, the _start
function in normal applications does not have a size, so the
heuristic should be something like this:
- If the address is inside the range of some symbol, use
that symbol
- Otherwise, if the closest symbol is a function with
size 0, use that function.
This means the datastructure will probably have to be done a
little differently.
- See if there is a way to make it distcheck
- grep "FIXME - not10"
- grep FIXME
- translation should be hooked up
- Consider adding "at least 5% inclusive cost" filter
- consider having the ability to group a function together with its nearest
neighbours. That way we can eliminate some of the effect of
"one function taking 10% of the time"
vs.
"the same function broken into ten functions each taking 1%"
Not clear what the UI looks like though.
- Ability to generate "screenshots" suitable for mail/blog/etc
UI: "generate screenshot" menu item pops up a window with
a text area + a radio buttons "text/html". When you flick
them, the text area is automatically updated.
- beginning in CVS:
- why does the window not remember its position when
you close it with the close button, but does remember
it when you use the wm button or the menu item? It actually
seems that it only forgets the position when you click the
button with the mouse. But not if you use the keyboard ...
This is a gtk+ bug.
- Find out how gdb does backtraces; they may have a better way. Also
find out what dwarf2 is and how to use it. Look into libunwind.
It seems gdb is capable of doing backtraces of code that neither has
a framepointer nor has debug info. It appears gdb uses the contents
of the ".eh_frame" section. There is also an ".eh_frame_hdr" section.
http://www.linuxbase.org/spec/booksets/LSB-Embedded/LSB-Embedded/ehframe.html
look in dwarf2-frame.[ch] in the gdb distribution.
Also look at bozo-profiler
http://cutebugs.net/bozo-profiler/
which has an elf32 parser/debugger
- Make busy cursors more intelligent
- when you click something in the main list and we don't respond
within 50ms (or perhaps when we expect to not be able to do
so (can we know the size in advance?))
- instead of what we do now: set the busy cursor unconditionally
- Consider adding ability to show more than one function at a time. Algorithm:
Find all relevant nodes;
For each relevant node
best_so_far = relevant node
walk towards root
if node is relevant,
best_so_far = relevant
add best_so_far to interesting
for each interesting
list leaves
for each leaf
add trace to tree (leaf, interesting)
- Consider adding KDE-style nested callgraph view
- probably need a dependency on gtk+ 2.8 (cairo) for this.
- Matthias has code for something like this.
- See http://www.marzocca.net/linux/baobab.html
- Also see http://www.cs.umd.edu/hcil/treemap-history/index.shtml
- Add support for line numbers within functions
- Possibly a special "view details" mode, assuming that
the details of a function are not that interesting
together with a tree. (Could add radio buttons somewhere in
in the right pane). Or tabs.
- Open a new window for the function.
- Add view->ancestors/descendants menu items
- rethink caller list, not terribly useful at the moment. Federico suggested
listing all ancestors.
Done: implemented this idea in CVS HEAD. If we keep it that way,
should do a globale s/callers/ancestors on the code.
- not sure it's an improvement. Often it is more interesting to
find the immediate callers.
- Now it's back to just listing the immediate callers.
- Figure out how Google's pprof script works. Then add real call graph
drawing. (google's script is really simple; uses dot from graphviz).
KCacheGrind also uses dot to do graph drawing.
- hide internal stuff in ProfileDescendant
Later:
- Multithreading is possible in a number of places.
- If the stack trace ends in a memory access instruction, send the
vma information to userspace. Then have user space
produce statistics on what types of memory are accessed.
- somehow get access to VSEnterprise profiler and see how it works.
somehow get access to vtune and see how it works.
- On SMP systems interrupts happen unpredictably, including when another
one is running. Right now we are ignoring any interrupts that happen
when another one is running, but we should probably just save the data
in a per-cpu buffer.
- Find out if sysprof accurately reports time spent handling pagefaults.
There is evidence that it doesn't:
- a version of sysprof took 10 seconds to load a certain profile.
Profiling itself it appeared that most of the time was spent
in the GMarkup parser
- a newer version of sysprof with significantly more compact
Instructions structure took about 5 seconds, but the profile
looked about the same.
The difference between the two versions has to be in page faults/
memory speed, but the profiles looked similar.
Try and reproduce this in a more controlled experiment.
- See if it is possible to group the X server activity under the process that
generated it.
- .desktop file
[Is this worth it? You will often want to start it as root,
and you will need to insert the module from the comman line]
- Applications should be able to say "start profiling", "stop profiling"
so that you can limit the profiling to specific areas.
Idea:
Add a new kernel interface that applications uses to say
begin/end.
Then add a timeline where you can mark interesting regions,
for example those that applications have marked interesting.
- Find out how to hack around gtk+ bug causing multiple double clicks
to get eaten.
- Consider what it would take to take stacktraces of other languages such
as perl, python, java, ruby, or bash. Or scheme.
Possible solution is for the script binaries to have a function
called something like
__sysprof__generate_stacktrace (char **functions, int *n_functions);
that the sysprof kernel module could call (and make return to the kernel).
This function would behave essentially like a signal handler: couldn't
call malloc(), couldn't call printf(), etc.
Note though that scripting languages will generally have a stack with
both script-binary-stack, script stack, and library stacks. We wouldn't
want scripts to need to parse dwarf. Also if we do that thing with
sending the entire stack to userspace, things will be further
complicated.
Also note languages like scheme that uses heap allocated activation
records.
- Consider this usecase:
Someone is considering replacing malloc()/free() with a freelist
for a certain data structure. All use of this data structure is
confined to one function, foo(). It is now interesting to know
how much time that particular function spends on malloc() and free()
combined.
Possible UI:
- Select foo(),
- find an instance of malloc()
- shift-click on it,
- all traces with malloc are removed
- a new item "..." appears immeidately below foo()
- malloc is added below "..."
- same for free
- at this point, the desired data can be read at comulative
for "..."
Actually, with this UI, you could potentially get rid of the
caller list: Just present the call tree under an <everything> root,
and use ... to single out the stuff you are interested in.
Maybe also get rid of 'callers' by having a new "show details"
dialog or something.
The complete solution here degenerates into "expressions":
"foo" and ("malloc" or "free")
Having that would also take care of the "multiple functions"
above. Noone would understand it, though.
- figure out a way to deal with both disk and CPU. Need to make sure that
things that are UNINTERRUPTIBLE while there are RUNNING tasks are not
considered bad. Also figure out how to deal with more than one CPU/core.
Not entirely clear that the sysprof visualization is right for disk.
Maybe assign a size of n to traces with n *unique* disk access (ie.
disk accesses that are not required by any other stack trace).
Or assign values to nodes in the calltree based on how many diskaccesses
are contained in that tree. Ie., if I get rid of this branch, how many
disk accesses would that get rid of.
Or turn it around and look at individual disk accesses and see what it
would take to get rid of it. Ie., a number of traces are associated with
any given diskaccess. Just show those.
Or for a given tree with contained disk accesses, figure out what *other*
traces has the same diskaccesses.
Or visualize a set of squares with a color that is more saturated depending
on the number of unique stack traces that access it. Then look for the
lightly saturated ones.
The input to the profiler would basically be
(stack trace, badness, cookie)
For CPU: badness=10ms, cookie=<a new one always>
For Disk: badness=<calculated based on previous disk accesses>, cookie=<the accessed disk block>
For Memory: badness=<cache line size not in cache>, cookie=<the address>
Cookies are used to figure out whether an access is really the same, ie., for two identical
cookies, the size is still just one, however
Memory is different from disk because you can't reasonably assume
that stuff that has been read will stay in cache (for short profile
runs you can assume that with disk, but not for long ones).
- Perhaps show a timeline with CPU in one color and disk in one
color. Allow people to look at at subintervals of this
timeline. Is it useful to look at both CPU and disk at the same
time? Probably not. See also marker discussion above. UI should
probably allow double clicking on a marked section and all
instances of that one would be marked.
- This also allows us to show how well multicore CPUs are being used.
- Other variation on the timeline idea: Instead of a disk timeline you could have a
list of individual diskaccesses, and be able to select the ones you wanted to
get rid of.
- The existing sysprof visualization is not terribly bad, the "self" column is
more useful now.
- See what files are accessed so that you can get a getter idea of what
the system is doing.
- Optimization usecases:
- A lot of stuff is read synchronously, but it is possible to read
it asynchronously.
Visualization: A timeline with alternating CPU/disk activity.
- What function is doing all the synchronous reading, and what
files/offsets is it reading. Visualization: lots of reads across
different files out of one function
- A piece of the program is doing disk I/O. We can drop that
entire piece of code. Sysprof visualization is ok, although seeing
the files accessed is useful so that we can tell if those files are
not just going to be used in other places. (Gnumeric plugin_init()).
- A function is reading a file synchronously, but there is other
(CPU/disk) stuff that could be done at the same time. Visualization:
A piece of the timeline is diskbound with little or no CPU used.
- Want to improve code locality of library or binary. Visualization:
no GUI, just produce a list of functions that should be put first in
the file. Then run the program again until the list converges.
(Valgrind may be more useful here).
- Nautilus reads a ton of files, icons + all the files in the
homedirectory. Normal sysprof visualization is probably useful
enough.
- Profiling a login session.
- Many applications are running at the same time, doing IPC. It would
be useful if we could figure out what other things a given process
is waiting on. Eg., in poll, find out what processes have the other
ends of the fd's open.
Visualization: multiple lines on a graph. Lines join up where
one process is blocking on another. That would show processes holding
up the progress very clearly.
This was suggested by Federico.
- Need to report stat() as well. (Where do inode data end up? In the
buffer-cache?) Also open() may cause disk reads (seeks).
- To generate the timeline we need to know when a disk request is
issued and when it is completed. This way we can assign blame to all
applications that have issued a disk request at a given point in time.
The disk timeline should probably vary in intensity with the number
of outstanding disk requests.
-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ALREADY DONE: -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
* Find out what is going on with kernel threads:
[(ksoftirqd/0)] 0.00 0.03
No map ([(ksoftirqd/0)]) 0.00 0.03
kernel 0.00 0.03
do_softirq 0.00 0.03
__do_softirq 0.00 0.03
* Make sure there aren't leftover stacktraces from last time when
profiling begins.
* Is the move-to-front in process_locate_map() really worth it?
* Whenever we fail to lock the atomic variable, track this, and send the
information to userspace as an indication of the overhead of the profiling.
Although there is inherent aliasing here since stack scanning happens at
regular intervals.
* Apparently, if you upgrade the kernel, then don't re-run configure,
the kernel Makefile will delete all of /lib/modules/<release>/kernel
if you run make install in the module directory. Need to find out what
is going on.
* Performance:
Switching between descendant views is a slow:
- gtk_tree_store_get_path() is O(n^2) and accounts
for 43% of the time.
- GObject signal emission overhead accounts for 18% of
the time.
Consider adding a forked version of GtkTreeStore with
performance and bug fixes.
* If we end up believing the kernel's own stacktraces, maybe
/proc/kallsyms shouldn't be parsed until the user hits profile.
* Make it compilable against a non-running kernel.
* With kernel module not installed, select Profiler->Start, then dismiss
the alert. This causes the start button to appear prelighted. Probably
just another gtk+ bug.
- Fix bugs/performance issues:
- add_trace_to_tree() might be a little slow when dealing with deeply
recursive profiles. Hypothesis: seen_nodes can grow large, and the
algorithm is O(n^2) in the length of the trace.
- Have kernel module report the file the address was found in
Should avoid a lot of potential broken/raciness with dlopen etc.
Probably better to send a list of maps with each trace. Which
shouldn't really be that expensive. We already walk the list of
maps in process_ensure_map() on every trace. And we can do hashing
if it turns out to be a problem.
Maybe replace the SysprofStackTrace with a union, so that
it can be either a list of maps, or a stacktrace. Both map lists and
stacktraces would come with a hashcode.allowing userspac. This avoids
the problem that maps could take up a lot of extra bandwidth.
struct MapInfo
{
long start;
long end;
long offset;
long inode;
}
struct Maps
{
int hash_code;
int n_maps;
MapInfo info [127];
char filenames [2048];
}
- possibly add dependency on glib 2.8 if it is released at that point.
(g_file_replace())
* Some notes about timer interrupt handling in Linux
On an SMP system APIC is used - the interesting file is arch/i386/kernel/apic.c
On UP systems, the normal IRQ0 is used
When the interrupt happens,
entry.S
calls do_IRQ, which sets up the special interrupt stack,
and calls __do_IRQ, which is in /kernel/irq/handle.c.
This calls the corresponding irqaction, which has previously
been setup by arch/i386/mach-default/setup.c to point to
timer_interrupt, which is in arch/i386/kernel/time.c.
This calls do_timer_interrupt_hooks() which is defined in
/include/asm-i386/mach-default/do_timer.h. This function
then calls profile_tick().
Note when the CPU switches from user mode to kernel mode, it
pushes SS/ESP on top of the kernel stack, but when it switches
from kernel mode to kernel mode, it does _not_ push SS/ESP.
It does in both cases push EIP though.
* Rename sysprof-text to sysprof-cli
* Find out why the samples label won't right adjust
* It crashes sometimes.
I haven't seen any crashes in a long time
* Find out why the strings
_ZL11DisplayLineP20nsDisplayListBuilderRK6nsRectS3_R19nsLineList_iteratoriRiRK16nsDisplayListSetP12nsBlockFrame
_ZL11DisplayRowsP20nsDisplayListBuilderP7nsFrameRK6nsRectRK16nsDisplayListSet _ZL11DrawBordersP10gfxContextR7gfxRectS2_PhPdS4_PjPP14nsBorderColorsijiP6nsRect _ZL11HandleEventP10nsGUIEvent
_ZL12IsContentLEQP13nsDisplayItemS0_Pv
_ZL15expose_event_cbP10_GtkWidgetP15_GdkEventExpose
do not get demangled.
* For glibc, the debug files do not contain .strtab and .symtab, but
the original files do. The algorithm in binfile.c must be modified
accordingly.
* If we profile something that is not very CPU bound, sysprof itself
seems to get a disproportionate amount of the samples. Should look
into this. Fixed by only returning from poll when there is more
than eight traces available.
* regarding crossing system call barriers: Find out about the virtual dso
that linux uses to do fast system calls:
http://lkml.org/lkml/2002/12/18/218
and what that actually makes the stack look like. (We may want to just
special case this fake dso in the symbol lookup code).
Maybe get_user_pages() is the way forward at least for some stuff.
note btw. that the kernel pages are only one or two pages, so we
could easily just dump them to userspace.
* In profile.c, change "non_recursive" to "cumulative", and
"marked_non_recursive" to a boolean "charged". This is tricky code,
so be careful. Possibly make it a two-pass operation:
- first add the new trace
- then walk from the leaf, charging nodes
That would allow us to get rid of the marked field altogether. In fact,
maybe the descendants tree could become a stackstash. We'll just have
to make stack_stash_add_trace() return the leaf.
DONE: the name is now "cumulative"
* vdso
- assume its the same across processes, just look at
sysprof's own copy.
Done: vdso is done now
- send copy of it to userspace once, or for every
sample.
* Various:
- decorate_node should be done lazily
- Find out why we sometimes get completely ridicoulous stacktraces,
where main seems to be called from within Xlib etc. This happens
even after restarting everything.
- It looks like the stackstash-reorg code confuses "main" from
unrelated processes. - currently it looks like if multiple
"main"s are present, only one gets listed in the object list.
Seems to mostly happen when multiple processes are
involved.
- Numbers in caller view are completely screwed up.
- It looks like it sometimes gets confused with similar but different
processes: Something like:
process a spends 80% in foo() called from bar()
process b spends 1% in foo() called from baz()
we get reports of baz() using > 80% of the time.
Or something.
* commandline version should check that the output file is writable
before starting the profiling.
* See if we can reproduce the problem where libraries didn't get correctly
reloaded after new versions were installed.
This is just the (deleted) problem. Turns out that the kernel
doesn't print (deleted) in all cases. Some possibilities:
- check that the inodes of the mapped file and the disk file
are the same (done in HEAD).
- check that the file was not modified after being mapped?
(Can we get the time it was mapped or opened?) If it was
modified you'd expect the inode to change, right?
* Find out if the first sort order of a GtkTreeView column can be
changed programmatically. It can't (and the GTK+ bug was wontfixed).
A workaround is possible though. (Someone, please write a
GtkTreeView replacement!)
* Missing things in binparser.[ch]
- maybe convert BIN_UINT32 => { BIN_UINT, 4 }
we already have the width in the struct.
* Rethink binparser. Maybe the default mode should be:
- there is a current offset
- you can move the cursor
- _goto()
- _align()
- you can read structs with "begin_struct (format) / end_struct()"
Or maybe just "set_format()" that would accept NULL?
- when you are reading a struct, you can skip records with _index()
- you can read fields with get_string/get_uint by passing a name.
- you can read anonymous strings and uints by passing NULL for name
This is allowed even when not reading structs. Or maybe this
should be separate functions. Advantages:
- they can skip ahead, unlike fields accessors
- you can access specific types (8,16,32,64)
- there is no "name" field
Disadvantage:
- the field accesors would need renaming.
bin_parser_get_uint_field ()
is not really that bad though.
Maybe begin_record() could return a structure you could
use to access that particular record? Would nicely solve
the problems with "goto" and "index".
bin_record_get_uint();
What should begin/end be called? They will have different
objects passed.
bin_parser_get_record (parser) -> record
bin_record_free (record);
- Maybe support for indirect strings? Ie., get_string() in elfparser
- This will require endianness to be a per-parser property. Which is
probably just fine. Although d-bus actually has
per-message endianness. Maybe there could be a settable
"endianness" property.
* Don't look in $(libdir) for separate debug files (since $libdir is
the libdir for sysprof, not a system wide libdir). Tim Rowley.
Fix is probably to hardcode /usr/lib, and also look in $libdir.
* Consider deleting cmdline hack in process.c and replace with something at
the symbol resolution level. Will require more memory though. DONE: in
head, processes are no longer coalesced based on cmdline. Need to add something
at the symbol level.
* don't loop infinitely if there are cycles in the debuglink graph.
* Add "sysprof --version"
* Fix (potential) performance issues in symbol lookup.
- when an elf file is read, it should be checked that the various
sections are of the right type. For example the debug information
for emacs is just a stub file where all the sections are NOBITS.
* Try reproducing crash when profiling xrender demo
- it looks like it crashes when it attempts to read /usr/bin/python
- apparently what's going on is that one of the symbols in python's
dynamic symbol table has a completely crazy 'st_name' offset.
DONE: we didn't actually need to read the name at all,
but still should find out why that value is so weird.
It looks like there is something strange going on with that file.
All the dynsyms have weird info/type values, yet nm and readelf
have no problems displaying it.
- Can .gnu_debuglink recurse?
yes, it can, and we should probably not crash if there are
cycles in the graph.
* Find out why we are getting bogus symbols reported for /usr/bin/Xorg
Like this:
Everything 0.00 100.00
[/usr/bin/Xorg] 0.00 94.79
GetScratchPixmapHeader 0.00 94.79
__libc_start_main 0.00 94.79
FindAllClientResources 0.00 94.79
FreeFontPath 0.00 94.79
SProcRenderCreateConicalGradient 0.00 94.56
ProcRenderTrapezoids 0.00 94.56
AllocatePicture 0.00 94.56
__glXDispatch 0.00 0.16
__glXVendorPrivate 0.00 0.08
__glXRender 0.00 0.08
SmartScheduleStartTimer 0.00 0.08
[./sysprof] 0.00 2.76
[sshd: ssp@pts/0] 0.00 2.37
[compiz] 0.00 0.08
What's going on here is that the computed load address for the X server
binary is different for the debug binary. The lowest allocated address
is 0x08047134 for the normal, and 0x8048134 for the debug. But it looks
like the addresses are still the same for the symbols.
The root of this problem may be that we should ignore the load
address of the debug binary, and just lookup the address computed.
The *text* segments have the same address though. Everything from
"gnu version" on has the same address.
So:
- find out where in memory the text segment is
- take an address and compute its offset to the text segment
- in elf parser, find address of text segment
- add offset
- lookup resulting address
So basically, "load address" should really be text address. Except of course
that load_address is not used in process.c - instead the 'load address' of the
first part of the file is computed and assumed to be equivalent to the
load address. So to lookup something you probably actually need
to know the load/offset at the elf parser level:
lookup_symbol (elf, map, offset, address)
then,
real load address of text (lta) = map - offset + text_offset
offset of called func (ocf): addr - lta
thing to lookup in table: ocf + text_addr.loadaddr in debug
addr - map - offset + text_offset
hmmm ...
* plug all the leaks
- don't leak the presentation strings/objects
- maybe add stack_stash_set_free_func() or something
* Delete elf_parser_new() and rename elf_parser_new_from_file()
* Add demangling again
* Restore filename => binfile cache.
* It is apparently possible to get another timer interrupt in the middle
of timer_notify. If this happens the circular buffer gets screwed up and
you get crashes. Note this gets much worse on SMP (in fact how did this
work at all previously?)
Possible fixes
- have a "in timer notify" variable, then simply reject nested
interrupts
- keep a "ghost head" that timers can use to allocate new traces,
then update the real head whenever one of them completes. Note
though, that the traces will get generated in the wrong order
due to the nesting. In fact, only the outermost timernotify
can update the real head, and it should update it to ghost_head.
- do correct locking? Nah, that's crazy talk
Also note: a race is a race, and on SMP we probably can't even make it
unlikely enough to not matter.
Fixed by ignoring the nested interrupts using an atomic variable.
* When you load a file from the commandline, there is a weird flash of the toolbar.
What's going on is this:
- this file is loaded into a tree model
- the tree model is set for the function list
- this causes the selection changed signal to be emitted
- the callback for that signal process updates
- somehow in that update process, the toolbar flashes.
- turns out to be a gtk+ issue: 350517
- screenshot window must be cleared when you press start.
- Formats should become first-class, stand-alone objects that offers
help with parsing and nothing else.
ParseContext* format_get_parse_context (format, err);
gboolean parse_context_begin (parse_context, name, err);
gboolean parse_context_end (parse_format, name, err);
basically, a Format encapsulates a DFA, and a ParseContext encapsulates
the current state.
- make stackstash ref counted
- Charge 'self' properly to processes that don't get any stack trace at all
(probably we get that for free with stackstash reorganisation)
- CVS head now has two radio buttons in the right pane, and
caller pane is gone. (This turned out to be a bad idea, because it
is often useful to click on ancestors to move up the tree).
* Don't build the GUI if gtk+ is not installed
* Find out why we sometimes get reports of time spent by [pid 0].
* - Run a.out generated normally with gcc.
- Run sysprof
- hit profile
- Kill a.out
- strip a.out
- run a.out
- hit start
- hit profile
At this point we should not get any symbols, but we do. There is some
sort of bad caching going on.
* support more than one reader of the samples properly
- Don't generate them if noone cares
* Correctness
- When the module is unloaded, kill all processes blocking in read
- or block unloading until all processes have exited
Unfortunately this is basically impossible to do with a /proc
file (no open() notification). So, for 1.0 this will have to be
a dont-do-that-then. For 1.2, we should do it with a sysfs and
kobject instead.
- When the module is unloaded, can we somehow *guarantee* that no
kernel thread is active? Doesn't look like it; however we can
get close by decreasing a ref count just before returning
from the module. (There may still be return instructions etc.
that will get run). This may not be an issue with the timer
based scanning we are using currently.
* Find out why we get hangs with rawhide kernels. This only happens with the
'trace "current"' code. See this mail:
http://mail.nl.linux.org/kernelnewbies/2005-08/msg00157.html
esp0 points to top of kernel stack
esp points to top of user stack
(Reported by Kjartan Maraas).
- When not profiling, sysprof shouldn't keep the file open.
- Make things faster
- Can I get it to profile itself?
- speedprof seems to report that lots of time is spent in
stack_stash_foreach() and also in generate_key()
- add an 'everything' object. It is really needed for a lot of things
- should be easy to do with stackstash reorganization.
* Handle time being set back in the RESET_DEAD_PERIOD code.
- total should probably be cached so that compute_total() doesn't
take 80% of the time to generate a profile.
- Fixing the oops in kernels < 2.6.11
- Probably just require 2.6.11 (necessary for timer interrupt
based anyway).
- Make the process waiting in poll() responsible for extracting
the backtrace. Give a copy of the entire stack rather than doing
the walk inside the kernel.
New model:
- Two arrays,
one of actual scanned stacks
one of tasks that need to be scanned
One wait queue,
wait for data
- in read() wait for stack data:
scan_tasks()
if (!stack_data)
return -EWOULDBLOCK;
in poll()
while (!stack data) {
wait_for_data();
scan_tasks();
}
return READABLE;
scan_tasks() is a function that converts waiting
tasks into data, and wakes them up.
- in timer interrupt:
if (someone waiting in poll() &&
current && current != that_someone &&
current is runnable)
{
stop current;
add current to queue;
wake wait_for_data;
}
This way, we will have a real userspace process
that can take the page faults.
- Different approach:
pollable file where a regular userspace process
can read a pid. Any pid returned is guaranteed to be
UNINTERRUPTIBLE. Userspace process is required to
start it again when it is done with it.
Also provide interface to read arbitrary memory of
that process.
ptrace() could in principle do all this, but
unfortunately it sucks to continuously
ptrace() processes.
- Yet another
Userspace process can register itself as "profiler"
and pass in a filedescriptor where all sorts of
information is sent.
- could tie lifetime of module to profiler
- could send "module going away" information
- Can we map filedescriptors to files in
a module?
* Make sure sysprof-text is not linked to gtk+
* Consider renaming profiler.[ch] to collector.[ch]
* Crash reported by Rudi Chiarito with n_addrs == 0.
* Find out what distributions it actually works on
(ask for sucess/failure-stories in 1.0 releases)
* Add note in README about Ubuntu and Debian -dbg packages and how to get
debug symbols for X there.
stackstash reorg:
- make loading and saving work again.
- make stashes loadable and savable.
- add a way to convert 1.0 files to stashes
- Get rid of remaining uses of stack_stash_foreach(), then
rename stack_stash_foreach_reversed() to
stack_stash_foreach()
- stackstash should just take traces of addresses without knowing
anything about what those addresses mean.
- stacktraces should then begin with a process
- stackstash should be extended so that the "create_descendant"
and "create_ancestor" code in profile.c can use it directly.
At that point, get rid of the profile tree, and rename
profile.c to analyze.c.
- the profile tree will then just be a stackstash where the
addresses are presentation strings instead.
- Doing a profile will then amount to converting the raw stash
to one where the addresses have been looked up and converted to
presentation strings.
-=-=
- profile should take traces of pointers to presentation
objects without knowing anything about these presentation
objects.
- For each stack node, compute a presentation object
(probably need to export opaque stacknode objects
with set/get_user_data)
- Send each stack trace to the profile module, along with
presentation objects. Maybe just a map from stack nodes
to presentation objects.
- Make the Profile class use the stash directly instead of
building its own copy.
- store a stash in the profile class
- make sure descendants and callers can be
built from it.
- get rid of other stuff in the profile
struct
* Before 1.0:
- Update version numbers in source
- Make tarball
- Check that tarball works
- cvs commit
- cvs tag sysprof-1-0
- Update website
- Announce on Freshmeat
- Announce on gnome-announce
- Announce on kernel list.
- Announce on Gnomefiles
- Announce on news.gnome.org
- Send to slashdot/developers
- Announce on devtools list (?)
- Announce on Advogato
link to archive
* The handling of the global variable in signal-handler.[ch] needs to be
atomic - right now it isn't. The issue is what happens if a handled signal
arrives while we are manipulating the list?
* (User space stack must probably be done in a thread - kernel
stack must probably be taken in the interrupt itself?
- Why this difference? The page tables should still be loaded. Is it
because pages_present() doesn't work? No, turning it off doesn't help.
- It looks like this works. Get:
struct pt_regs *user_regs =
(void *)current->thread.esp0 - sizeof (struct pt_regs);
then use pages_present as usual to trace with user_regs; There could be
rare lockups though.
* Non-GUI version that can save in a format the GUI can understand.
Could be used for profiling startup etc. Would preferably be able to
dump the data to a network socket. Should be able to react to eg.
SIGUSR1 by dumping the data.
Work done by Lorenzo:
http://www.colitti.com/lorenzo/software/gnome-startup/sysprof-text.diff
http://www.colitti.com/lorenzo/software/gnome-startup/sysprof.log
http://colitti.com/lorenzo/software/gnome-startup/
* consider caching [filename => bin_file]
* Check the kernel we are building against, if it is SMP or
less than 2.6.11, print a warning and suggest upgrading.
* Timer interrupt based
* Interface
- Consider expanding a few more levels of a new descendants tree
- Algorithm should be expand in proportion to the
"total" percentage. Basically consider 'total' the
likelyhood that the user is going to look at it.
- Maybe just; keep expanding the biggest total until
there is no more room or we run out of things to expand.
* Web page containing
- Screen shots
- Explanation of what it is
- Download
- Bug reporting
- Contact info
- Ask for sucess/failure reports
- hook up menu items view/start etc (or possibly get rid of them or
move them)
- Should do as suggested in the automake manual in the
chapter "when automake is not enough"
- add an "insert-module" target
- need to run depmod on install
- If the current profile has a name, display it in the title bar
- auto*?
- Find out if that PREFIX business in Makefile was really such
a great idea.
- Sould just install the kernel module if it running as root, pop up
a dialog if not. Note we must be able to start without module now,
since it is useful to just load profiles from disk.
- Is there a portable way of asking for the root password?
- Install a small suid program that only inserts the module?
(instant security hole ..)
- Need to make "make install" work (how do you know where to install
kernel modules?)
- in /lib/modules/`uname -r`/kernel/drivers/
- need to run depmod as root after that
- Then modprobe run as root should correctly find it.
- grep FIXME
- give profiles on the command line
- Hopefully the oops at the end of this file is gone now that
we use mmput/get_task_mm. For older kernels those symbols
are not exported though, so we will probably have to either
use the old way (directly accessing the mm's) or just not
support those kernels.
- Need an icon
- hook up about box
- Add busy cursors,
- when you hit "Profile"
- when you click something in the main list and we don't respond
within 50ms (or perhaps when we expect to not be able to do
so (can we know the size in advance?))
- kernel module should put process to sleep before sampling. Should get us
more accurate data
- Make sure samples label shows correct nunber after Open
- Move "samples" label to the toolbar, then get rid of statusbar.
- crashes when you ctrl-click the selected item in the top left pane
<ian__> ssp: looks like it doesn't handle the none-selected case
- loading and saving
- consider making ProfileObject more of an object.
- make an "everything" object
maybe not necessary -- there is a libc_ctors_something()
- make presentation strings nicer
four different kinds of symbols:
a) I know exactly what this is
b) I know in what library this is
c) I know only the process that did this
d) I know the name, but there is another similarly named one
(a) is easy, (b) should be <in ...> (c) should just become "???"
(d) not sure
- processes with a cmdline of "" should get a [pid = %d] instead.
- make an "n samples" label
Process stuff:
- make threads be reported together
(simply report pids with similar command lines together)
(note: it seems separating by pid is way too slow (uses too much memory),
so it has to be like this)
- stack stash should allow different pids to refer to the same root
(ie. there is no need to create a new tree for each pid)
The *leaves* should contain the pid, not the root. You could even imagine
a set of processes, each referring to a set of leaves.
- when we see a new pid, immediately capture its mappings
Road map:
- new object Process
- hashable by pointer
- contains list of maps
- process_from_pid (pid_t pid, gboolean join_threads)
- new processes are gets their maps immediately
- resulting pointer must be unref()ed, but it is possible it
just points to an existing process
- processes with identical cmdlines are taken together
- method lookup_symbol()
- method get_name()
- ref/unref
- StackStash stores map from process to leaves
- Profile is called with processes
It is possible that we simply need a better concept of Process:
If two pids have the same command line, consider them the same, period.
This should save considerable amounts of memory.
The assumptions:
"No pids are reused during a profiling run"
"Two processes with the same command line have the same mappings"
are somewhat dubious, but probably necessary.
(More complex kernel:
have the module report
- new pid arrived (along with mappings)
- mapping changed for pid
- stacktrace)
- make symbols in executable work
- the hashtables used in profile.c should not accept NULL as the key
- make callers work
- autoexpand descendant tree
- make double clicks work
- fix leaks
- Find out what happened here:
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: Unable to handle kernel NULL pointer dereference at virtual address 000001b8
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: printing eip:
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: c017342c
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: *pde = 00000000
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: Oops: 0000 [#1]
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: Modules linked in: sysprof_module(U) i2c_algo_bit md5 ipv6 parport_pc lp parport autofs4 sunrpc video button battery ac ohci1394 ieee1394 uhci_hcd ehci_hcd hw_random tpm_atmel tpm i2c_i801 i2c_core snd_intel8x0 snd_ac97_codec snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd soundcore snd_page_alloc e1000 floppy dm_snapshot dm_zero dm_mirror ext3 jbd dm_mod ata_piix libata sd_mod scsi_mod
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: CPU: 0
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: EIP: 0060:[<c017342c>] Not tainted VLI
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: EFLAGS: 00010287 (2.6.11-1.1225_FC4)
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: EIP is at grab_swap_token+0x35/0x21f
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: eax: 0bd48023 ebx: d831d028 ecx: 00000282 edx: 00000000
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: esi: c1b72934 edi: c1045820 ebp: c1b703f0 esp: c18dbdd8
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: ds: 007b es: 007b ss: 0068
Apr 11 15:42:08 great-sage-equal-to-heaven kernel: Process events/0 (pid: 3, threadinfo=c18db000 task=f7e62000)
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: Stack: 000011a8 00000000 000011a8 c1b703f0 c0151731 c016f58f 000011a8 c1b72934
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: 000011a8 c0166415 c1b72934 c1b72934 c0163768 ee7ccc38 f459fbf8 bf92e7b8
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: f6c6a934 c0103b92 bfdaba18 c1b703f0 00000001 c1b81bfc c1b72934 bfdaba18
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: Call Trace:
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0151731>] find_get_page+0x9/0x24
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c016f58f>] read_swap_cache_async+0x32/0x83Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0166415>] do_swap_page+0x262/0x600
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0163768>] pte_alloc_map+0xc6/0x1e6
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0103b92>] common_interrupt+0x1a/0x20
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c01673f0>] handle_mm_fault+0x1da/0x31d
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c016488e>] __follow_page+0xa2/0x10d
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0164a6f>] get_user_pages+0x145/0x6ee
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0161f66>] kmap_high+0x52/0x44e
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<c0103b92>] common_interrupt+0x1a/0x20
Apr 11 15:42:09 great-sage-equal-to-heaven kernel: [<f8cbb19d>] x_access_process_vm+0x111/0x1a5 [sysprof_module]
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<f8cbb24a>] read_user_space+0x19/0x1d [sysprof_module]
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<f8cbb293>] read_frame+0x35/0x51 [sysprof_module]
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<f8cbb33a>] generate_stack_trace+0x8b/0xb4
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<f8cbb3a2>] do_generate+0x3f/0xa0 [sysprof_module]
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c0138d7a>] worker_thread+0x1b0/0x450
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c0379ccd>] schedule+0x30d/0x780
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c011bdb6>] __wake_up_common+0x39/0x59
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<f8cbb363>] do_generate+0x0/0xa0 [sysprof_module]
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c011bd71>] default_wake_function+0x0/0xc
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c0138bca>] worker_thread+0x0/0x450
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c013f3cb>] kthread+0x87/0x8b
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c013f344>] kthread+0x0/0x8b
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: [<c0101275>] kernel_thread_helper+0x5/0xb
Apr 11 15:42:10 great-sage-equal-to-heaven kernel: Code: e0 8b 00 8b 50 74 8b 1d c4 55 3d c0 39
da 0f 84 9b 01 00 00 a1 60 fc 3c c0 39 05 30 ec 48 c0 78 05 83 c4 20 5b c3 a1 60 fc 3c c0 <3b> 82 b8 01 00 00 78 ee 81 3d ac 55 3d c0 3c 4b 24 1d 0f 85 78
|