From b8e5a31ef46b671ffe77d7b7121b777373818411 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <[email protected]>
Date: Mon, 15 Mar 2021 09:19:57 -0700
Subject: [PATCH 1/1] kvm for 4.18.12
Signed-off-by: Dongli Zhang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 490 ++++
arch/x86/include/asm/smp.h | 3 +
arch/x86/kernel/apic/apic.c | 7 +
arch/x86/kernel/kvm.c | 4 +
arch/x86/kernel/kvmclock.c | 77 +
arch/x86/kernel/smpboot.c | 18 +
arch/x86/kernel/tsc_sync.c | 8 +
arch/x86/kvm/i8254.c | 10 +
arch/x86/kvm/kvm_cache_regs.h | 5 +
arch/x86/kvm/lapic.c | 38 +
arch/x86/kvm/mmu.c | 3932 ++++++++++++++++++++++++++++++-
arch/x86/kvm/mmu.h | 60 +
arch/x86/kvm/mtrr.c | 23 +
arch/x86/kvm/page_track.c | 150 ++
arch/x86/kvm/paging_tmpl.h | 3 +
arch/x86/kvm/pmu_intel.c | 39 +
arch/x86/kvm/vmx.c | 562 +++++
arch/x86/kvm/x86.c | 186 ++
arch/x86/kvm/x86.h | 5 +
include/linux/cpumask.h | 36 +
include/linux/kvm_host.h | 80 +
kernel/cpu.c | 77 +
kernel/smp.c | 11 +
mm/memory.c | 4 +
mm/mmu_notifier.c | 11 +
mm/slab.c | 4 +
mm/vmscan.c | 41 +
virt/kvm/eventfd.c | 4 +
virt/kvm/irqchip.c | 4 +
virt/kvm/kvm_main.c | 411 +++-
virt/kvm/vfio.c | 27 +
31 files changed, 6326 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0722b774..eddaf82a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -102,12 +102,45 @@
/* KVM Hugepage definitions for x86 */
#define KVM_NR_PAGE_SIZES 3
+/*
+ * x = 1: KVM_HPAGE_GFN_SHIFT(1) = 0 4K (no hugepage)
+ * x = 2: KVM_HPAGE_GFN_SHIFT(2) = 9 2M
+ * x = 3: KVM_HPAGE_GFN_SHIFT(3) = 18 1G
+ */
#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+/*
+ * KVM_HPAGE_SHIFT(1) = 12 + 0 = 12
+ * KVM_HPAGE_SHIFT(2) = 12 + 9 = 21
+ * KVM_HPAGE_SHIFT(3) = 12 + 18 = 30
+ */
#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
+/*
+ * KVM_HPAGE_SIZE(1) = 1 << 12 = 4K
+ * KVM_HPAGE_SIZE(2) = 1 << 21 = 2M
+ * KVM_HPAGE_SIZE(3) = 1 << 30 = 1G
+ */
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+/*
+ * KVM_HPAGE_MASK(1) : masks off the offset within a 4K page
+ * KVM_HPAGE_MASK(2) : masks off the offset within a 2M page
+ * KVM_HPAGE_MASK(3) : masks off the offset within a 1G page
+ */
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+/*
+ * KVM_PAGES_PER_HPAGE(1) : how many 4K pages fit in 4K
+ * KVM_PAGES_PER_HPAGE(2) : how many 4K pages fit in 2M
+ * KVM_PAGES_PER_HPAGE(3) : how many 4K pages fit in 1G
+ */
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
+/*
+ * base_gfn is the starting gfn (in 4K units)
+ * gfn is the ending gfn (in 4K units)
+ * Computes how many hugepages (or normal pages) it takes to get from the start
+ * to the end, i.e. the index of gfn's page relative to base_gfn
+ * At level 1 the hugepage size is 4K
+ * At level 2 the hugepage size is 2M
+ * At level 3 the hugepage size is 1G
+ */
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
@@ -115,6 +148,10 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
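+/*
+ * Worked example (annotation only, not part of the patch): for a memslot with
+ * base_gfn 0x400 (guest physical 4M), looking up gfn 0x7ff at level 2 (2M)
+ * gives gfn_to_index() = (0x7ff >> 9) - (0x400 >> 9) = 3 - 2 = 1, i.e. the
+ * second 2M-granular rmap/lpage_info entry of that slot.
+ */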
+/*
+ * KVM_PERMILLE_MMU_PAGES is used only in:
+ * - arch/x86/kvm/mmu.c|8396| <<kvm_mmu_calculate_mmu_pages>> nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+ */
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_MMU_HASH_SHIFT 12
@@ -196,9 +233,21 @@ enum {
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
+/*
+ * PFERR_PRESENT_MASK is set in:
+ * - arch/x86/kvm/mmu.h|223| <<permission_fault>> u32 errcode = PFERR_PRESENT_MASK;
+ * - arch/x86/kvm/paging_tmpl.h|386| <<FNAME(walk_addr_generic)>> errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ * - arch/x86/kvm/vmx.c|7816| <<handle_ept_violation>> ? PFERR_PRESENT_MASK : 0;
+ */
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+/*
+ * PFERR_RSVD_MASK is set in:
+ * - arch/x86/kvm/paging_tmpl.h|386| <<FNAME(walk_addr_generic)>> errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ * - arch/x86/kvm/paging_tmpl.h|757| <<FNAME(page_fault)>> error_code &= ~PFERR_RSVD_MASK;
+ * - arch/x86/kvm/vmx.c|7878| <<handle_ept_misconfig>> return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+ */
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
@@ -248,9 +297,56 @@ struct kvm_mmu_memory_cache {
union kvm_mmu_page_role {
unsigned word;
struct {
+ /*
+ * The level in the shadow paging hierarchy that this shadow page belongs to.
+ * 1=4k sptes, 2=2M sptes, 3=1G sptes, etc.
+ *
+ */
+ /*
+ * Used in:
+ * - arch/x86/kvm/mmu.c|2003| <<kvm_mmu_page_get_gfn>> return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+ * - arch/x86/kvm/mmu.c|2124| <<account_shadowed>> if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|2147| <<unaccount_shadowed>> if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|2555| <<gfn_to_rmap>> return __gfn_to_rmap(gfn, sp->role.level, slot);
+ * - arch/x86/kvm/mmu.c|2794| <<__drop_large_spte>> WARN_ON(page_header(__pa(sptep))->role.level ==
+ * - arch/x86/kvm/mmu.c|3479| <<rmap_recycle>> kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ * - arch/x86/kvm/mmu.c|3992| <<kvm_sync_pages>> WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ * - arch/x86/kvm/mmu.c|4024| <<mmu_pages_next>> int level = sp->role.level;
+ * - arch/x86/kvm/mmu.c|4053| <<mmu_pages_first>> level = sp->role.level;
+ * - arch/x86/kvm/mmu.c|4462| <<mmu_page_zap_pte>> if (is_last_spte(pte, sp->role.level)) {
+ * - arch/x86/kvm/mmu.c|4558| <<mmu_zap_unsync_children>> if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|4789| <<mmu_need_write_protect>> WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ * - arch/x86/kvm/mmu.c|5013| <<direct_pte_prefetch_many>> mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ * - arch/x86/kvm/mmu.c|5056| <<direct_pte_prefetch>> if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|5371| <<fast_page_fault>> if (!is_last_spte(spte, sp->role.level))
+ * - arch/x86/kvm/mmu.c|5418| <<fast_page_fault>> if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|7045| <<mmu_pte_write_new_pte>> if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ * - arch/x86/kvm/mmu.c|7113| <<detect_write_flooding>> if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/mmu.c|7155| <<get_written_sptes>> level = sp->role.level;
+ * - arch/x86/kvm/mmu_audit.c|153| <<inspect_spte_has_rmap>> rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ * - arch/x86/kvm/mmu_audit.c|182| <<check_mappings_rmap>> if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/paging_tmpl.h|569| <<FNAME(pte_prefetch)>> if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ * - arch/x86/kvm/paging_tmpl.h|851| <<FNAME(get_level1_sp_gpa)>> WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ *
+ * Modified in only one place:
+ * - arch/x86/kvm/mmu.c|4160| <<kvm_mmu_get_page>> role.level = level;
+ */
unsigned level:4;
unsigned cr4_pae:1;
unsigned quadrant:2;
+ /*
+ * If set, leaf sptes reachable from this page are for a linear range.
+ * Examples include real mode translation, large guest pages backed by small
+ * host pages, and gpa->hpa translations when NPT or EPT is active.
+ * The linear range starts at (gfn << PAGE_SHIFT) and its size is determined
+ * by role.level (2MB for first level, 1GB for second level, 0.5TB for third
+ * level, 256TB for fourth level)
+ * If clear, this page corresponds to a guest page table denoted by the gfn
+ * field.
+ *
+ * Modified in:
+ * - arch/x86/kvm/mmu.c|2588| <<kvm_mmu_get_page>> role.direct = direct;
+ */
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
@@ -258,6 +354,11 @@ union kvm_mmu_page_role {
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
+ /*
+ * Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D
+ * bits before Haswell; shadow EPT page tables also cannot use A/D bits
+ * if the L1 hypervisor does not enable them.
+ */
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned :6;
@@ -273,10 +374,20 @@ union kvm_mmu_page_role {
};
struct kvm_rmap_head {
+ /*
+ * Note on the encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ */
unsigned long val;
};
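+/*
+ * Illustrative sketch (annotation only, not part of the patch): decoding
+ * rmap_head->val per the encoding described above; the real iteration lives in
+ * rmap_get_first()/rmap_get_next() in arch/x86/kvm/mmu.c:
+ *
+ *	if (!rmap_head->val)
+ *		;						// empty chain
+ *	else if (!(rmap_head->val & 1))
+ *		sptep = (u64 *)rmap_head->val;			// exactly one spte
+ *	else
+ *		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);	// several sptes
+ */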
struct kvm_mmu_page {
+ /*
+ * Added to vcpu->kvm->arch.active_mmu_pages in kvm_mmu_alloc_page()
+ */
struct list_head link;
struct hlist_node hash_link;
@@ -287,17 +398,88 @@ struct kvm_mmu_page {
gfn_t gfn;
union kvm_mmu_page_role role;
+ /* The actual 4K page holding the page table, allocated in kvm_mmu_alloc_page() */
u64 *spt;
/* hold the gfn of each spte inside spt */
+ /* Apparently not used for TDP; only shadow paging uses it */
gfn_t *gfns;
+ /*
+ * kvm_mmu_page->unsync is modified in:
+ * - arch/x86/kvm/mmu.c|3036| <<kvm_unsync_page>> sp->unsync = 1;
+ * - arch/x86/kvm/mmu.c|2303| <<kvm_unlink_unsync_page>> sp->unsync = 0;
+ *
+ * Used for last-level page-table pages: indicates whether this page's ptes are
+ * in sync with the guest (i.e. whether the guest has already flushed its TLB).
+ *
+ * See the comment in mmu_need_write_protect(); apparently only shadow page tables set this.
+ */
bool unsync;
+ /*
+ * Set or modified in:
+ * - arch/x86/kvm/mmu.c|3656| <<mmu_free_root_page>> --sp->root_count;
+ * - arch/x86/kvm/mmu.c|3721| <<mmu_alloc_direct_roots>> ++sp->root_count;
+ * - arch/x86/kvm/mmu.c|3737| <<mmu_alloc_direct_roots>> ++sp->root_count;
+ * - arch/x86/kvm/mmu.c|3781| <<mmu_alloc_shadow_roots>> ++sp->root_count;
+ * - arch/x86/kvm/mmu.c|3818| <<mmu_alloc_shadow_roots>> ++sp->root_count;
+ *
+ * Presumably related to the number of (v)cpus
+ */
int root_count; /* Currently serving as active root */
+ /*
+ * Incremented in:
+ * - arch/x86/kvm/mmu.c|2118| <<mark_unsync>> if (sp->unsync_children++)
+ *
+ * Decremented in:
+ * - arch/x86/kvm/mmu.c|2168| <<clear_unsync_child_bit>> --sp->unsync_children;
+ */
unsigned int unsync_children;
+ /*
+ * kvm_mmu_page->parent_ptes is used or modified in:
+ * - arch/x86/kvm/mmu.c|2045| <<mmu_page_add_parent_pte>> pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
+ * - arch/x86/kvm/mmu.c|2051| <<mmu_page_remove_parent_pte>> pte_list_remove(parent_pte, &sp->parent_ptes);
+ * - arch/x86/kvm/mmu.c|2104| <<kvm_mmu_mark_parents_unsync>> for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ * - arch/x86/kvm/mmu.c|2743| <<kvm_mmu_unlink_parents>> while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ *
+ * Reverse mapping (rmap): tracks the parent page-table entries that point at this page
+ */
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
/* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
+ /*
+ * Zapping all pages (page generation count)
+ *
+ * For large-memory guests, walking and zapping all pages is really slow
+ * (because there are a lot of pages), and also blocks memory accesses of
+ * all VCPUs because it needs to hold the MMU lock.
+ *
+ * To make this more scalable, KVM maintains a global generation number
+ * which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores
+ * the current global generation-number into sp->mmu_valid_gen when it
+ * is created. Pages with a mismatching generation number are "obsolete".
+ *
+ * When KVM needs to zap all shadow page sptes, it simply increases the global
+ * generation number and then reloads the root shadow pages on all vcpus. As the VCPUs
+ * create new shadow page tables, the old pages are not used because of the
+ * mismatching generation number.
+ *
+ * KVM then walks through all pages and zaps obsolete pages. While the zap
+ * operation needs to take the MMU lock, the lock can be released periodically
+ * so that the VCPUs can make progress.
+ */
+ /*
+ * kvm_mmu_page->mmu_valid_gen is modified in:
+ * - arch/x86/kvm/mmu.c|4438| <<kvm_mmu_get_page>> sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+ *
+ * kvm_arch->mmu_valid_gen is modified in:
+ * - arch/x86/kvm/mmu.c|8217| <<kvm_mmu_invalidate_zap_all_pages>> kvm->arch.mmu_valid_gen++;
+ */
unsigned long mmu_valid_gen;
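+ /*
+ * Illustrative sketch (annotation only, not part of the patch): the staleness
+ * check used while zapping, modeled on is_obsolete_sp() in arch/x86/kvm/mmu.c:
+ *
+ *	static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+ *	{
+ *		return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+ *	}
+ */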
+ /*
+ * kvm_mmu_page->unsync_child_bitmap is used in:
+ * - arch/x86/kvm/mmu.c|2116| <<mark_unsync>> if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ * - arch/x86/kvm/mmu.c|2170| <<clear_unsync_child_bit>> __clear_bit(idx, sp->unsync_child_bitmap);
+ * - arch/x86/kvm/mmu.c|2178| <<__mmu_unsync_walk>> for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ */
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
@@ -332,27 +514,132 @@ struct rsvd_bits_validate {
* current mmu mode.
*/
struct kvm_mmu {
+ /*
+ * set_cr3 is set in:
+ * - arch/x86/kvm/mmu.c|4563| <<init_kvm_tdp_mmu>> context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ * - arch/x86/kvm/mmu.c|4660| <<init_kvm_softmmu>> context->set_cr3 = kvm_x86_ops->set_cr3;
+ * - arch/x86/kvm/svm.c|2923| <<nested_svm_init_mmu_context>> vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
+ * - arch/x86/kvm/vmx.c|11331| <<nested_ept_init_mmu_context>> vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
+ */
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+ /*
+ * get_cr3 is set in:
+ * - arch/x86/kvm/mmu.c|4564| <<init_kvm_tdp_mmu>> context->get_cr3 = get_cr3;
+ * - arch/x86/kvm/mmu.c|4661| <<init_kvm_softmmu>> context->get_cr3 = get_cr3;
+ * - arch/x86/kvm/mmu.c|4670| <<init_kvm_nested_mmu>> g_context->get_cr3 = get_cr3;
+ * - arch/x86/kvm/svm.c|2924| <<nested_svm_init_mmu_context>> vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ * - arch/x86/kvm/vmx.c|11332| <<nested_ept_init_mmu_context>> vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
+ */
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+ /*
+ * get_pdptr is set in:
+ * - arch/x86/kvm/mmu.c|4565| <<init_kvm_tdp_mmu>> context->get_pdptr = kvm_pdptr_read;
+ * - arch/x86/kvm/mmu.c|4662| <<init_kvm_softmmu>> context->get_pdptr = kvm_pdptr_read;
+ * - arch/x86/kvm/mmu.c|4671| <<init_kvm_nested_mmu>> g_context->get_pdptr = kvm_pdptr_read;
+ * - arch/x86/kvm/svm.c|2925| <<nested_svm_init_mmu_context>> vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
+ */
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
+ /*
+ * page_fault is set in:
+ * - arch/x86/kvm/mmu.c|4013| <<nonpaging_init_context>> context->page_fault = nonpaging_page_fault;
+ * - arch/x86/kvm/mmu.c|4498| <<paging64_init_context_common>> context->page_fault = paging64_page_fault;
+ * - arch/x86/kvm/mmu.c|4528| <<paging32_init_context>> context->page_fault = paging32_page_fault;
+ * - arch/x86/kvm/mmu.c|4556| <<init_kvm_tdp_mmu>> context->page_fault = tdp_page_fault;
+ * - arch/x86/kvm/mmu.c|4637| <<kvm_init_shadow_ept_mmu>> context->page_fault = ept_page_fault;
+ */
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
bool prefault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
+ /*
+ * gva_to_gpa is set in:
+ * - arch/x86/kvm/mmu.c|4014| <<nonpaging_init_context>> context->gva_to_gpa = nonpaging_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4499| <<paging64_init_context_common>> context->gva_to_gpa = paging64_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4529| <<paging32_init_context>> context->gva_to_gpa = paging32_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4570| <<init_kvm_tdp_mmu>> context->gva_to_gpa = nonpaging_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4577| <<init_kvm_tdp_mmu>> context->gva_to_gpa = paging64_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4582| <<init_kvm_tdp_mmu>> context->gva_to_gpa = paging64_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4587| <<init_kvm_tdp_mmu>> context->gva_to_gpa = paging32_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4638| <<kvm_init_shadow_ept_mmu>> context->gva_to_gpa = ept_gva_to_gpa;
+ * - arch/x86/kvm/mmu.c|4685| <<init_kvm_nested_mmu>> g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ * - arch/x86/kvm/mmu.c|4691| <<init_kvm_nested_mmu>> g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ * - arch/x86/kvm/mmu.c|4696| <<init_kvm_nested_mmu>> g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ * - arch/x86/kvm/mmu.c|4701| <<init_kvm_nested_mmu>> g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ */
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
+ /*
+ * sync_page is set in:
+ * - arch/x86/kvm/mmu.c|4298| <<nonpaging_init_context>> context->sync_page = nonpaging_sync_page;
+ * - arch/x86/kvm/mmu.c|4783| <<paging64_init_context_common>> context->sync_page = paging64_sync_page;
+ * - arch/x86/kvm/mmu.c|4813| <<paging32_init_context>> context->sync_page = paging32_sync_page;
+ * - arch/x86/kvm/mmu.c|4842| <<init_kvm_tdp_mmu>> context->sync_page = nonpaging_sync_page;
+ * - arch/x86/kvm/mmu.c|4924| <<kvm_init_shadow_ept_mmu>> context->sync_page = ept_sync_page;
+ */
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte);
+ /*
+ * kvm_mmu->root_hpa is set in:
+ * - arch/x86/kvm/mmu.c|6172| <<mmu_free_root_page>> *root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|6204| <<kvm_mmu_free_roots>> mmu->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|6250| <<mmu_alloc_direct_roots>> vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ * - arch/x86/kvm/mmu.c|6268| <<mmu_alloc_direct_roots>> vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ * - arch/x86/kvm/mmu.c|6310| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.root_hpa = root;
+ * - arch/x86/kvm/mmu.c|6350| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ * - arch/x86/kvm/mmu.c|6374| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ * - arch/x86/kvm/mmu.c|7003| <<nonpaging_init_context>> context->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|7496| <<paging64_init_context_common>> context->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|7534| <<paging32_init_context>> context->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|7567| <<init_kvm_tdp_mmu>> context->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|7658| <<kvm_init_shadow_ept_mmu>> context->root_hpa = INVALID_PAGE;
+ * - arch/x86/kvm/mmu.c|8284| <<kvm_mmu_create>> vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ */
hpa_t root_hpa;
union kvm_mmu_page_role base_role;
+ /*
+ * Set in:
+ * - arch/x86/kvm/mmu.c|4383| <<nonpaging_init_context>> context->root_level = 0;
+ * - arch/x86/kvm/mmu.c|4855| <<paging64_init_context_common>> context->root_level = level;
+ * - arch/x86/kvm/mmu.c|4886| <<paging32_init_context>> context->root_level = PT32_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|4938| <<init_kvm_tdp_mmu>> context->root_level = 0;
+ * - arch/x86/kvm/mmu.c|4941| <<init_kvm_tdp_mmu>> context->root_level = is_la57_mode(vcpu) ?
+ * - arch/x86/kvm/mmu.c|4947| <<init_kvm_tdp_mmu>> context->root_level = PT32E_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|4952| <<init_kvm_tdp_mmu>> context->root_level = PT32_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|5013| <<kvm_init_shadow_ept_mmu>> context->root_level = PT64_ROOT_4LEVEL;
+ * - arch/x86/kvm/mmu.c|5055| <<init_kvm_nested_mmu>> g_context->root_level = 0;
+ * - arch/x86/kvm/mmu.c|5059| <<init_kvm_nested_mmu>> g_context->root_level = is_la57_mode(vcpu) ?
+ * - arch/x86/kvm/mmu.c|5065| <<init_kvm_nested_mmu>> g_context->root_level = PT32E_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|5070| <<init_kvm_nested_mmu>> g_context->root_level = PT32_ROOT_LEVEL;
+ *
+ * With TDP, if long mode is enabled this is either 4 or 5
+ */
u8 root_level;
+ /*
+ * Set in:
+ * - arch/x86/kvm/mmu.c|4379| <<nonpaging_init_context>> context->shadow_root_level = PT32E_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|4863| <<paging64_init_context_common>> context->shadow_root_level = level;
+ * - arch/x86/kvm/mmu.c|4893| <<paging32_init_context>> context->shadow_root_level = PT32E_ROOT_LEVEL;
+ * - arch/x86/kvm/mmu.c|4922| <<init_kvm_tdp_mmu>> context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
+ * - arch/x86/kvm/mmu.c|4999| <<kvm_init_shadow_ept_mmu>> context->shadow_root_level = PT64_ROOT_4LEVEL;
+ * - arch/x86/kvm/svm.c|2927| <<nested_svm_init_mmu_context>> vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
+ *
+ * For TDP this is either 4 or 5
+ */
u8 shadow_root_level;
u8 ept_ad;
+ /*
+ * direct_map is set in:
+ * - arch/x86/kvm/mmu.c|4003| <<nonpaging_init_context>> context->direct_map = true;
+ * - arch/x86/kvm/mmu.c|4487| <<paging64_init_context_common>> context->direct_map = false;
+ * - arch/x86/kvm/mmu.c|4517| <<paging32_init_context>> context->direct_map = false;
+ * - arch/x86/kvm/mmu.c|4544| <<init_kvm_tdp_mmu>> context->direct_map = true;
+ * - arch/x86/kvm/mmu.c|4626| <<kvm_init_shadow_ept_mmu>> context->direct_map = false;
+ */
bool direct_map;
/*
@@ -370,6 +657,25 @@ struct kvm_mmu {
*/
u32 pkru_mask;
+ /*
+ * kvm_mmu->pae_root is used in:
+ * - arch/x86/kvm/mmu.c|4855| <<shadow_walk_init>> = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ * - arch/x86/kvm/mmu.c|6297| <<kvm_mmu_free_roots>> if (mmu->pae_root[i] != 0)
+ * - arch/x86/kvm/mmu.c|6298| <<kvm_mmu_free_roots>> mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
+ * - arch/x86/kvm/mmu.c|6375| <<mmu_alloc_direct_roots>> hpa_t root = vcpu->arch.mmu.pae_root[i];
+ * - arch/x86/kvm/mmu.c|6388| <<mmu_alloc_direct_roots>> vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ * - arch/x86/kvm/mmu.c|6390| <<mmu_alloc_direct_roots>> vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ * - arch/x86/kvm/mmu.c|6446| <<mmu_alloc_shadow_roots>> hpa_t root = vcpu->arch.mmu.pae_root[i];
+ * - arch/x86/kvm/mmu.c|6452| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.pae_root[i] = 0;
+ * - arch/x86/kvm/mmu.c|6470| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ * - arch/x86/kvm/mmu.c|6472| <<mmu_alloc_shadow_roots>> vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ * - arch/x86/kvm/mmu.c|6491| <<mmu_alloc_shadow_roots>> lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+ * - arch/x86/kvm/mmu.c|6557| <<mmu_sync_roots>> hpa_t root = vcpu->arch.mmu.pae_root[i];
+ * - arch/x86/kvm/mmu.c|8419| <<free_mmu_pages>> free_page((unsigned long )vcpu->arch.mmu.pae_root);
+ * - arch/x86/kvm/mmu.c|8441| <<alloc_mmu_pages>> vcpu->arch.mmu.pae_root = page_address(page);
+ * - arch/x86/kvm/mmu.c|8443| <<alloc_mmu_pages>> vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ * - arch/x86/kvm/mmu_audit.c|74| <<mmu_spte_walk>> hpa_t root = vcpu->arch.mmu.pae_root[i];
+ */
u64 *pae_root;
u64 *lm_root;
@@ -542,6 +848,18 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
+ /*
+ * mmu_page_header_cache is used in:
+ * - arch/x86/kvm/mmu.c|1921| <<mmu_topup_memory_caches>> r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ * - arch/x86/kvm/mmu.c|1922| <<mmu_topup_memory_caches>> mmu_page_header_cache, 4);
+ * - arch/x86/kvm/mmu.c|1936| <<mmu_free_memory_caches>> mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ * - arch/x86/kvm/mmu.c|1937| <<mmu_free_memory_caches>> mmu_page_header_cache);
+ * - arch/x86/kvm/mmu.c|3686| <<kvm_mmu_free_page>> kmem_cache_free(mmu_page_header_cache, sp);
+ * - arch/x86/kvm/mmu.c|3766| <<kvm_mmu_alloc_page>> sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ * - arch/x86/kvm/mmu.c|8313| <<mmu_destroy_caches>> kmem_cache_destroy(mmu_page_header_cache);
+ * - arch/x86/kvm/mmu.c|8331| <<kvm_mmu_module_init>> mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ * - arch/x86/kvm/mmu.c|8334| <<kvm_mmu_module_init>> if (!mmu_page_header_cache)
+ */
struct kvm_mmu_memory_cache mmu_page_header_cache;
/*
@@ -650,9 +968,32 @@ struct kvm_vcpu_arch {
u64 *mce_banks;
/* Cache MMIO info */
+ /*
+ * Modified or used in:
+ * - arch/x86/kvm/x86.h|188| <<vcpu_cache_mmio_info>> vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
+ * - arch/x86/kvm/x86.h|207| <<vcpu_clear_mmio_info>> if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ * - arch/x86/kvm/x86.h|210| <<vcpu_clear_mmio_info>> vcpu->arch.mmio_gva = 0;
+ * - arch/x86/kvm/x86.h|215| <<vcpu_match_mmio_gva>> if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ * - arch/x86/kvm/x86.h|216| <<vcpu_match_mmio_gva>> vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ */
u64 mmio_gva;
+ /*
+ * Modified in vcpu_cache_mmio_info()
+ */
unsigned access;
+ /*
+ * Modified or used in:
+ * - arch/x86/kvm/x86.h|190| <<vcpu_cache_mmio_info>> vcpu->arch.mmio_gfn = gfn;
+ * - arch/x86/kvm/x86.h|224| <<vcpu_match_mmio_gpa>> if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ * - arch/x86/kvm/x86.h|225| <<vcpu_match_mmio_gpa>> vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ * - arch/x86/kvm/x86.c|4982| <<vcpu_mmio_gva_to_gpa>> *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ */
gfn_t mmio_gfn;
+ /*
+ * Modified or used in:
+ * - arch/x86/kvm/x86.h|191| <<vcpu_cache_mmio_info>> vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+ * - arch/x86/kvm/x86.h|196| <<vcpu_match_mmio_gen>> return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+ */
u64 mmio_gen;
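+ /*
+ * Illustrative sketch (annotation only, not part of the patch): how the cached
+ * mmio info is validated against the memslot generation, modeled on
+ * vcpu_match_mmio_gen() in arch/x86/kvm/x86.h (referenced above):
+ *
+ *	return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+ */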
struct kvm_pmu pmu;
@@ -720,12 +1061,34 @@ struct kvm_vcpu_arch {
};
struct kvm_lpage_info {
+ /*
+ * kvm_lpage_info->disallow_lpage is modified in:
+ * - arch/x86/kvm/mmu.c|2080| <<update_gfn_disallow_lpage_count>> linfo->disallow_lpage += count;
+ * - arch/x86/kvm/x86.c|9087| <<kvm_arch_create_memslot>> linfo[0].disallow_lpage = 1;
+ * - arch/x86/kvm/x86.c|9089| <<kvm_arch_create_memslot>> linfo[lpages - 1].disallow_lpage = 1;
+ * - arch/x86/kvm/x86.c|9101| <<kvm_arch_create_memslot>> linfo[j].disallow_lpage = 1;
+ */
int disallow_lpage;
};
struct kvm_arch_memory_slot {
+ /*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ *
+ * The second dimension of both rmap and lpage_info is allocated in kvm_arch_create_memslot().
+ * Its length is the number of pages of the corresponding level,
+ * so the larger the level (huge pages), the fewer the entries.
+ */
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ /*
+ * The second dimension is initialized in kvm_page_track_create_memslot();
+ * presumably one entry per page in the slot.
+ */
unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
};
@@ -778,16 +1141,104 @@ enum kvm_irqchip_mode {
};
struct kvm_arch {
+ /*
+ * Modified by:
+ * - arch/x86/kvm/mmu.c|1973| <<kvm_mod_used_mmu_pages>> kvm->arch.n_used_mmu_pages += nr;
+ *
+ * Used by:
+ * - arch/x86/kvm/mmu.c|2727| <<kvm_mmu_change_mmu_pages>> if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ * - arch/x86/kvm/mmu.c|2729| <<kvm_mmu_change_mmu_pages>> while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+ * - arch/x86/kvm/mmu.c|2734| <<kvm_mmu_change_mmu_pages>> goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ * - arch/x86/kvm/mmu.c|5729| <<mmu_shrink_scan>> if (!kvm->arch.n_used_mmu_pages &&
+ * - arch/x86/kvm/mmu.h|73| <<kvm_mmu_available_pages>> if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ * - arch/x86/kvm/mmu.h|75| <<kvm_mmu_available_pages>> kvm->arch.n_used_mmu_pages;
+ */
unsigned int n_used_mmu_pages;
+ /*
+ * Modified by:
+ * - arch/x86/kvm/x86.c|4035| <<kvm_vm_ioctl_set_nr_mmu_pages>> kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+ *
+ * Used in:
+ * - arch/x86/kvm/x86.c|9196| <<kvm_arch_commit_memory_region>> if (!kvm->arch.n_requested_mmu_pages)
+ */
unsigned int n_requested_mmu_pages;
+ /*
+ * n_max_mmu_pages is used in:
+ * - arch/x86/kvm/mmu.c|4913| <<kvm_mmu_change_mmu_pages>> kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+ * - arch/x86/kvm/mmu.h|81| <<kvm_mmu_available_pages>> if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ * - arch/x86/kvm/mmu.h|82| <<kvm_mmu_available_pages>> return kvm->arch.n_max_mmu_pages -
+ * - arch/x86/kvm/x86.c|4046| <<kvm_vm_ioctl_get_nr_mmu_pages>> return kvm->arch.n_max_mmu_pages;
+ */
unsigned int n_max_mmu_pages;
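+ /*
+ * Illustrative sketch (annotation only, not part of the patch): how the two
+ * counters above are combined, modeled on kvm_mmu_available_pages() in
+ * arch/x86/kvm/mmu.h (referenced in the lists above):
+ *
+ *	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ *		return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages;
+ *	return 0;
+ */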
+ /*
+ * kvm_arch->indirect_shadow_pages is used in:
+ * - arch/x86/kvm/mmu.c|2218| <<account_shadowed>> kvm->arch.indirect_shadow_pages++;
+ * - arch/x86/kvm/mmu.c|2243| <<unaccount_shadowed>> kvm->arch.indirect_shadow_pages--;
+ * - arch/x86/kvm/mmu.c|7681| <<kvm_mmu_pte_write>> if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ * - arch/x86/kvm/x86.c|5880| <<reexecute_instruction>> unsigned int indirect_shadow_pages;
+ * - arch/x86/kvm/x86.c|5883| <<reexecute_instruction>> indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ * - arch/x86/kvm/x86.c|5886| <<reexecute_instruction>> if (indirect_shadow_pages)
+ */
unsigned int indirect_shadow_pages;
+ /*
+ * Zapping all pages (page generation count)
+ *
+ * For large-memory guests, walking and zapping all pages is really slow
+ * (because there are a lot of pages), and also blocks memory accesses of
+ * all VCPUs because it needs to hold the MMU lock.
+ *
+ * To make this more scalable, KVM maintains a global generation number
+ * which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores
+ * the current global generation-number into sp->mmu_valid_gen when it
+ * is created. Pages with a mismatching generation number are "obsolete".
+ *
+ * When KVM needs to zap all shadow page sptes, it simply increases the global
+ * generation number and then reloads the root shadow pages on all vcpus. As the VCPUs
+ * create new shadow page tables, the old pages are not used because of the
+ * mismatching generation number.
+ *
+ * KVM then walks through all pages and zaps obsolete pages. While the zap
+ * operation needs to take the MMU lock, the lock can be released periodically
+ * so that the VCPUs can make progress.
+ */
+ /*
+ * kvm_mmu_page->mmu_valid_gen is modified in:
+ * - arch/x86/kvm/mmu.c|4438| <<kvm_mmu_get_page>> sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+ *
+ * kvm_arch->mmu_valid_gen is modified in:
+ * - arch/x86/kvm/mmu.c|8217| <<kvm_mmu_invalidate_zap_all_pages>> kvm->arch.mmu_valid_gen++;
+ */
unsigned long mmu_valid_gen;
+ /*
+ * kvm_arch->mmu_page_hash[KVM_NUM_MMU_PAGES] is used in:
+ * - arch/x86/kvm/mmu.c|4446| <<for_each_valid_sp>> &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ * - arch/x86/kvm/mmu.c|4864| <<kvm_mmu_get_page>> &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ *
+ * The hash gives fast lookup of the page-table pages corresponding to a gfn
+ */
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
+ /*
+ * kvm_arch->active_mmu_pages is used in:
+ * - arch/x86/kvm/mmu.c|2030| <<kvm_mmu_alloc_page>> list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ * - arch/x86/kvm/mmu.c|2665| <<kvm_mmu_prepare_zap_page>> list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ * - arch/x86/kvm/mmu.c|2709| <<prepare_zap_oldest_mmu_page>> if (list_empty(&kvm->arch.active_mmu_pages))
+ * - arch/x86/kvm/mmu.c|2712| <<prepare_zap_oldest_mmu_page>> sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ * - arch/x86/kvm/mmu.c|5606| <<kvm_zap_obsolete_pages>> &kvm->arch.active_mmu_pages, link) {
+ * - arch/x86/kvm/mmu_audit.c|92| <<walk_all_active_sps>> list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ * - arch/x86/kvm/x86.c|8822| <<kvm_arch_init_vm>> INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ */
struct list_head active_mmu_pages;
+ /*
+ * kvm_arch->zapped_obsolete_pages is used in:
+ * - arch/x86/kvm/mmu.c|6107| <<kvm_zap_obsolete_pages>> &kvm->arch.zapped_obsolete_pages);
+ * - arch/x86/kvm/mmu.c|6118| <<kvm_zap_obsolete_pages>> kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+ * - arch/x86/kvm/mmu.c|6159| <<kvm_has_zapped_obsolete_pages>> return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+ * - arch/x86/kvm/mmu.c|6214| <<mmu_shrink_scan>> &kvm->arch.zapped_obsolete_pages);
+ * - arch/x86/kvm/x86.c|8826| <<kvm_arch_init_vm>> INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ */
struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
@@ -860,6 +1311,11 @@ struct kvm_arch {
};
struct kvm_vm_stat {
+ /*
+ * kvm_vm_stat->mmu_shadow_zapped is used in:
+ * - arch/x86/kvm/x86.c|199| <<global>> { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ * - arch/x86/kvm/mmu.c|5138| <<kvm_mmu_prepare_zap_page>> ++kvm->stat.mmu_shadow_zapped;
+ */
ulong mmu_shadow_zapped;
ulong mmu_pte_write;
ulong mmu_pte_updated;
@@ -867,8 +1323,21 @@ struct kvm_vm_stat {
ulong mmu_flooded;
ulong mmu_recycled;
ulong mmu_cache_miss;
+ /*
+ * kvm_vm_stat->mmu_unsync is used in:
+ * - arch/x86/kvm/x86.c|206| <<global>> { "mmu_unsync", VM_STAT(mmu_unsync) },
+ * - arch/x86/kvm/mmu.c|4380| <<kvm_unlink_unsync_page>> --kvm->stat.mmu_unsync;
+ * - arch/x86/kvm/mmu.c|5371| <<kvm_unsync_page>> ++vcpu->kvm->stat.mmu_unsync;
+ */
ulong mmu_unsync;
ulong remote_tlb_flush;
+ /*
+ * kvm_vm_stat->lpages is used in:
+ * - arch/x86/kvm/x86.c|208| <<global>> { "largepages", VM_STAT(lpages) },
+ * - arch/x86/kvm/mmu.c|2996| <<__drop_large_spte>> --kvm->stat.lpages;
+ * - arch/x86/kvm/mmu.c|4839| <<mmu_page_zap_pte>> --kvm->stat.lpages;
+ * - arch/x86/kvm/mmu.c|5370| <<mmu_set_spte>> ++vcpu->kvm->stat.lpages;
+ */
ulong lpages;
ulong max_mmu_page_hash_collisions;
};
@@ -922,6 +1391,10 @@ struct kvm_lapic_irq {
bool msi_redir_hint;
};
+/*
+ * intel : vmx_x86_ops
+ * amd : svm_x86_ops
+ */
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -1114,6 +1587,10 @@ extern struct kvm_x86_ops *kvm_x86_ops;
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
+ /*
+ * intel: vmx_vm_alloc()
+ * amd : svm_vm_alloc()
+ */
return kvm_x86_ops->vm_alloc();
}
@@ -1318,8 +1795,16 @@ static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
return gpa;
}
+/*
+ * shadow_page may be the physical address of some spte (the caller passes the
+ * address of that pte, not its contents), typically via page_header(__pa(sptep)).
+ * Returns the kvm_mmu_page that describes the page-table page containing that address.
+ */
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
+ /*
+ * shadow_page may be the physical address of some spte (the address, not the contents).
+ * Shifting shadow_page right by PAGE_SHIFT gives the pfn of the page-table page
+ * that contains this pte.
+ */
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
return (struct kvm_mmu_page *)page_private(page);
@@ -1376,6 +1861,11 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
+/*
+ * Modified in:
+ * - arch/x86/kvm/kvm_cache_regs.h|90| <<enter_guest_mode>> vcpu->arch.hflags |= HF_GUEST_MASK;
+ * - arch/x86/kvm/kvm_cache_regs.h|95| <<leave_guest_mode>> vcpu->arch.hflags &= ~HF_GUEST_MASK;
+ */
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
#define HF_SMM_MASK (1 << 6)
#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 547c4fe5..0184db6c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -93,6 +93,9 @@ static inline void smp_cpus_done(unsigned int max_cpus)
static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
+ /*
+ * native_cpu_up()
+ */
return smp_ops.cpu_up(cpu, tidle);
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3b3a2d0a..c0f7ab83 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -998,6 +998,10 @@ void setup_secondary_APIC_clock(void)
/*
* The guts of the apic timer interrupt
*/
+/*
+ * called only by:
+ * - arch/x86/kernel/apic/apic.c|1054| <<smp_apic_timer_interrupt>> local_apic_timer_interrupt();
+ */
static void local_apic_timer_interrupt(void)
{
struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
@@ -1037,6 +1041,9 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
+/*
+ * arch/x86/entry/entry_64.S: used as the handler for LOCAL_TIMER_VECTOR (0xec)
+ */
__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b8..269c8e22 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -299,6 +299,10 @@ static void __init paravirt_ops_setup(void)
#endif
}
+/*
+ * called by:
+ * - arch/x86/kernel/kvm.c|364| <<kvm_guest_cpu_init>> kvm_register_steal_time();
+ */
static void kvm_register_steal_time(void)
{
int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 3b8e7c13..a3e88288 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,76 @@
#include <asm/reboot.h>
#include <asm/kvmclock.h>
+/*
+ * FROM SUSE!!!
+ * When using kvm-clock, it is not recommended to use NTP in the VM Guest, as
+ * well. Using NTP on the VM Host Server, however, is still recommended.
+ */
+
+/*
+ * Clocksource is a device that can give a timestamp whenever you need it. In
+ * other words, Clocksource is any ticking counter that allows you to get its
+ * value.
+ *
+ * Clockevent device is an alarm clock—you ask the device to signal a time in
+ * the future (e.g., "wake me up in 1ms") and when the alarm is triggered, you
+ * get the signal.
+ *
+ * sched_clock() function is similar to clocksource, but this particular one
+ * should be "cheap" to read (meaning that one can get its value fast), as
+ * sched_clock() is used for task-scheduling purposes and scheduling happens
+ * often. We're ready to sacrifice accuracy and other characteristics for
+ * speed.
+ *
+ * > CLOCK_REALTIME clock gives the time passed since January 1, 1970. This
+ * clock is affected by NTP adjustments and can jump forward and backward when
+ * a system administrator adjusts system time.
+ *
+ * > CLOCK_MONOTONIC clock gives the time since a fixed starting point, usually
+ * since you booted the system. This clock is affected by NTP, but it can't
+ * jump backward.
+ *
+ * > CLOCK_MONOTONIC_RAW clock gives the same time as CLOCK_MONOTONIC, but this
+ * clock is not affected by NTP adjustments.
+ *
+ * > CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE are faster but
+ * less-accurate variants of CLOCK_REALTIME and CLOCK_MONOTONIC.
+ *
+ *
+ *
+ * Hardware extensions for virtualizing TSC
+ *
+ * Since the early days of hardware-assisted virtualization, Intel was
+ * supplying an option to do TSC offsetting for virtual guests in hardware,
+ * which would mean that a guest's rdtsc reading will return a host's TSC value
+ * + offset. Unfortunately, this wasn't enough to support migration between
+ * different hosts because TSC frequency may differ, so pvclock and TSC page
+ * protocol were introduced. In late 2015, Intel introduced the TSC scaling
+ * feature (which was already present in AMD processors for several years) and,
+ * in theory, this is a game changer making pvclock and TSC page protocols
+ * redundant. However, an immediate switch to using plain TSC as a clocksource
+ * for virtualized guests seems impractical; one must be sure that all
+ * potential migration recipient hosts support the feature, but it is not yet
+ * widely available. Extensive testing also must be performed to make sure
+ * there are no drawbacks to switching from paravirtualized protocols.
+ */
+
+/*
+ * To get the current TSC reading, guests must do the following math:
+ *
+ * PerCPUTime = ((RDTSC() - tsc_timestamp) >> tsc_shift) * tsc_to_system_mul + system_time
+ *
+ *
+ *
+ * kvmclock or KVM pvclock lets guests read the host's wall clock time. It's
+ * really very simple: the guest sets aside a page of its RAM and asks the host
+ * to write time into that page (using an MSR). The host writes a structure
+ * containing the current time to this page - in theory the host updates this
+ * page constantly, but in reality that would be wasteful and the structure is
+ * only updated just before reentering the guest after some VM event.
+ * On the host side the clock is updated by kvm_guest_time_update(), which is only called from vcpu_enter_guest().
+ */
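+
+/*
+ * Illustrative sketch (annotation only, not part of the patch): a slightly more
+ * precise form of the formula above, using the pvclock_vcpu_time_info fields
+ * (version/retry handling omitted; the real reader is
+ * pvclock_clocksource_read(), which also uses a wider multiply):
+ *
+ *	u64 delta = rdtsc() - ti->tsc_timestamp;
+ *	if (ti->tsc_shift >= 0)
+ *		delta <<= ti->tsc_shift;
+ *	else
+ *		delta >>= -ti->tsc_shift;
+ *	ns = ti->system_time + ((delta * ti->tsc_to_system_mul) >> 32);
+ */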
+
static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -45,6 +115,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
+/*
+ * Roughly the counterpart of the time fields in Xen's vcpu_info; shared with the host
+ */
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock *wall_clock;
@@ -273,6 +346,10 @@ static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
memblock_free(addr, size);
}
+/*
+ * called only by:
+ * - arch/x86/kernel/setup.c|1207| <<setup_arch>> kvmclock_init();
+ */
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f02ecaf9..a962a95c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -960,6 +960,11 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
+/*
+ * called by:
+ * - arch/ia64/kernel/smpboot.c|751| <<__cpu_up>> ret = do_boot_cpu(sapicid, cpu, tidle);
+ * - arch/x86/kernel/smpboot.c|1117| <<native_cpu_up>> err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
+ */
static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
int *cpu0_nmi_registered)
{
@@ -1069,6 +1074,19 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
return boot_error;
}
+/*
+ * [0] native_cpu_up
+ * [0] bringup_cpu
+ * [0] cpuhp_invoke_callback
+ * [0] _cpu_up
+ * [0] do_cpu_up
+ * [0] smp_init
+ * [0] kernel_init_freeable
+ * [0] kernel_init
+ * [0] ret_from_fork
+ *
+ * struct smp_ops smp_ops.cpu_up = native_cpu_up()
+ */
int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index ec534f97..117e3dd7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -307,6 +307,10 @@ static inline unsigned int loop_timeout(int cpu)
* Source CPU calls into this - it waits for the freshly booted
* target CPU to arrive and then starts the measurement:
*/
+/*
+ * called by:
+ * - arch/x86/kernel/smpboot.c|1126| <<native_cpu_up>> check_tsc_sync_source(cpu);
+ */
void check_tsc_sync_source(int cpu)
{
int cpus = 2;
@@ -397,6 +401,10 @@ void check_tsc_sync_source(int cpu)
/*
* Freshly booted CPUs call into this:
*/
+/*
+ * called by:
+ * - arch/x86/kernel/smpboot.c|245| <<start_secondary>> check_tsc_sync_target();
+ */
void check_tsc_sync_target(void)
{
struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895..f5d8eb10 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -264,6 +264,12 @@ static void pit_do_work(struct kthread_work *work)
kvm_apic_nmi_wd_deliver(vcpu);
}
+/*
+ * used by:
+ * - arch/x86/kvm/i8254.c|680| <<kvm_create_pit>> pit_state->timer.function = pit_timer_fn;
+ *
+ * The legacy PIT timer function; these days everything uses the LAPIC timer instead
+ */
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
@@ -645,6 +651,10 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
+/*
+ * called only by:
+ * - arch/x86/kvm/x86.c|4390| <<kvm_arch_vm_ioctl>> kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
+ */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 9619dcc2..40810a18 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -102,6 +102,11 @@ static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
{
+ /*
+ * Modified in:
+ * - arch/x86/kvm/kvm_cache_regs.h|90| <<enter_guest_mode>> vcpu->arch.hflags |= HF_GUEST_MASK;
+ * - arch/x86/kvm/kvm_cache_regs.h|95| <<leave_guest_mode>> vcpu->arch.hflags &= ~HF_GUEST_MASK;
+ */
return vcpu->arch.hflags & HF_GUEST_MASK;
}