Archive for December, 2015

How to use ftrace and gcov to understanding kernel scheduling?

The following described the steps for using FTRACE analysis:

First we need to mount debugfs for analysis:
sudo mkdir /debug
sudo mount -t debugfs nodev /debug
echo 0 >/debug/tracing/tracing_on
echo '*rt_mutex*' '*sched*' > /debug/tracing/set_ftrace_filter

You can experiment with yourself on this. Try putting different kernel API (found from /proc/kallsyms) and encapsulate it within the “*”.

And read it back with “cat /debug/tracing/set_ftrace_filter”.

echo function_graph >/debug/tracing/current_tracer
echo 1 >/debug/tracing/tracing_on

ls -alR /btrfs
cp /bin/l* /btrfs
sleep 3

Here is where you can issues command that makes a lot of calls to task switching logic, eg, repeated “sleep 1;ls /proc/”.

Notice that the procfs “/proc” is used – because procfs is not reading from cache, and actual traversal of the kernel is needed to generate all the displayed data, and thus will trigger a lot of kernel code path execution.

echo 0 >/debug/tracing/tracing_on ===> to stop tracing.
cat /debug/tracing/trace ==> to view the tracing output.

whether you have “stop tracing” or not does not matter.   If you have not, the tracing output will be different each time you “cat trace”.

 3)   0.371 us    |  _cond_resched();
 3)   0.278 us    |  resched_curr();
 3)   0.132 us    |  _cond_resched();
 3)               |  schedule_timeout() {
 3)               |    schedule() {
 3)   0.175 us    |      rcu_sched_qs();
 ------------------------------------------
 3)  btrfs-t-873   =>    <idle>-0
 ------------------------------------------
 3) + 22.614 us   |    }
 3) + 24.023 us   |  } /* schedule_preempt_disabled */
 3)   3.136 us    |  tick_nohz_stop_sched_tick();
 3)   0.150 us    |  sched_idle_set_state();
 0)   0.284 us    |  sched_idle_set_state();
 0)               |  tick_nohz_restart_sched_tick() {
 0)   0.267 us    |    sched_avg_update();
 0)   6.895 us    |  }
 0)   0.282 us    |  sched_ttwu_pending();
 0)               |  schedule_preempt_disabled() {
 0)               |    schedule() {
 0)   0.222 us    |      rcu_sched_qs();

The number in front represent the CPU core number, and thus the indentation does not look very logical, as it is quite messed up.   Moreover, only a very small subset of API have been instrumented.

Let us instrument all the API which is available to us for instrumentation:

echo '*' > /sys/kernel/debug/tracing/set_ftrace_filter

And now filtering out to leave behind only those instructions executing on CPU core 0 my machine (i7 8-core) – also called the “boot cpu” as it is the first core that got started at system bootup time:

0)               |  call_cpuidle() {
 0)               |    cpuidle_enter() {
 0)               |      cpuidle_enter_state() {
 0)   0.058 us    |        sched_idle_set_state();
 0)   0.093 us    |        ktime_get();
 0)               |        intel_idle() {
 0)   0.455 us    |          leave_mm();
 0) # 5151.052 us |        }
 0)   0.223 us    |        ktime_get();
 0)   0.266 us    |        sched_idle_set_state();
 0)               |        smp_reschedule_interrupt() {
 0)               |          scheduler_ipi() {
 0)               |            irq_enter() {
 0)               |              rcu_irq_enter() {
 0)   0.226 us    |                rcu_sysidle_exit();
 0)   2.197 us    |              }
 0)               |              tick_irq_enter() {
 0)   0.058 us    |                tick_check_oneshot_broadcast_this_cpu();
 0)   0.195 us    |                ktime_get();
 0)               |                tick_nohz_stop_idle() {
 0)               |                  update_ts_time_stats() {
 0)   0.238 us    |                    nr_iowait_cpu();
 0)   2.658 us    |                  }
 0)   0.174 us    |                  touch_softlockup_watchdog();
 0)   7.120 us    |                }
 0)   0.114 us    |                tick_do_update_jiffies64();
 0)   0.186 us    |                touch_softlockup_watchdog();
 0) + 17.748 us   |              }
 0)               |              _local_bh_enable() {
 0)   0.172 us    |                __local_bh_enable();
 0)   2.071 us    |              }
 0) + 27.292 us   |            }
 0)   0.157 us    |            sched_ttwu_pending();
 0)               |            irq_exit() {
 0)               |              __do_softirq() {
 0)               |                run_rebalance_domains() {
 0)   0.122 us    |                  idle_cpu();
 0)   0.137 us    |                  _raw_spin_lock_irq();
 0)   0.181 us    |                  update_rq_clock();
 0)               |                  rebalance_domains() {
 0)               |                    update_blocked_averages() {
 0)   0.127 us    |                      _raw_spin_lock_irqsave();
 0)   0.319 us    |                      update_rq_clock();
 0)   0.166 us    |                      __compute_runnable_contrib();
 0)   0.137 us    |                      __compute_runnable_contrib();
 0)   0.107 us    |                      __compute_runnable_contrib();
 0)   0.041 us    |                      __compute_runnable_contrib();
 0)   0.086 us    |                      __compute_runnable_contrib();

I have not understand fully the internals of scheduling yet, and so will not making much comments here.

But safe to say in any scheduling logic, you need:

a.  CPU switching logic:   being able to dequeue one task from one CPU and enqueue it on another CPU.

b.  load balancing logic:   all CPU have its own task queue running independently of the other CPU, and all execution of one CPU is highly indedepent on another CPU as well.   So sometimes you will need to examine the queue length to rebalance the queues.
c.  IPI (Inter-processor interrupt): this is the hardware signal between CPU to interrupt other CPU get its attention as one CPU has something to talk to another over the interprocessor bus.   It may be a one-to-one cross-switching bus or a broadcast mechanism.
d.   spinlocks:   Every CPU have its own independent timer, interrupt tables and and so on, but when reading from memory, it is a common shared memory (SMP architecture).  And so every read/write operation on common memory have to be locked with a spinlock structure to avoid collision.
e.   Process accounting:   each execution of the process have to update some resources usage information for that process.
And so below are the snapshots of events happening on CPU 0:
24726  0)               |          schedule() {
 24727  0)               |            rcu_note_context_switch() {
 24728  0)   0.064 us    |              rcu_sched_qs();
 24729  0)   2.628 us    |            }
 24730  0)   0.219 us    |            _raw_spin_lock_irq();
 24731  0)               |            deactivate_task() {
 24732  0)   0.235 us    |              update_rq_clock();
 24733  0)               |              dequeue_task_fair() {
 24734  0)               |                dequeue_entity() {
 24735  0)               |                  update_curr() {
 24736  0)   0.203 us    |                    update_min_vruntime();
 24737  0)   0.307 us    |                    cpuacct_charge();
 24738  0)   6.199 us    |                  }
 24739  0)   0.187 us    |                  __compute_runnable_contrib();
 24740  0)   0.157 us    |                  clear_buddies();
 24741  0)   0.284 us    |                  account_entity_dequeue();
 24742  0)   0.165 us    |                  update_min_vruntime();
 24743  0)               |                  update_cfs_shares() {
 24744  0)               |                    update_curr() {
 24745  0)   0.122 us    |                      __calc_delta();
 24746  0)   0.332 us    |                      update_min_vruntime();
 24747  0)   5.724 us    |                    }
 24748  0)   0.336 us    |                    account_entity_dequeue();
 24749  0)   0.110 us    |                    account_entity_enqueue();
 24750  0) + 13.877 us   |                  }
 24751  0) + 37.606 us   |                }
 24752  0)               |                dequeue_entity() {
 24753  0)   0.202 us    |                  update_curr();
 24754  0)   0.089 us    |                  clear_buddies();
 24755  0)   0.092 us    |                  account_entity_dequeue();
 24756  0)   0.054 us    |                  update_min_vruntime();
 24757  0)   0.289 us    |                  update_cfs_shares();
 24758  0) + 14.732 us   |                }
 24759  0)   0.168 us    |                hrtick_update();
 24760  0) + 59.122 us   |              }
 24761  0) + 65.030 us   |            }
 24762  0)               |            pick_next_task_fair() {
 24763  0)   0.067 us    |              __msecs_to_jiffies();

And below shows an Xorg triggered to go into idling mode:

24775  0)   Xorg-2305    =>    <idle>-0
 24776  0)   0.119 us    |      __switch_to_xtra();
 24777  0)   0.394 us    |      finish_task_switch();
 24778  0) ! 220.623 us  |    } /* schedule */
 24779  0) ! 223.059 us  |  } /* schedule_preempt_disabled */
 24780  0)               |  tick_nohz_idle_enter() {
 24781  0)   0.072 us    |    set_cpu_sd_state_idle();
 24782  0)               |    __tick_nohz_idle_enter() {
 24783  0)   0.150 us    |      ktime_get();
 24784  0)               |      tick_nohz_stop_sched_tick() {
 24785  0)   0.151 us    |        rcu_needs_cpu();
 24786  0)               |        get_next_timer_interrupt() {
 24787  0)   0.160 us    |          _raw_spin_lock();
 24788  0)               |          hrtimer_get_next_event() {
 24789  0)   0.305 us    |            _raw_spin_lock_irqsave();
 24790  0)   0.259 us    |            _raw_spin_unlock_irqrestore();
 24791  0)   6.779 us    |          }
 24792  0) + 11.664 us   |        }
 24793  0) + 17.438 us   |      }
 24794  0) + 22.068 us   |    }
 24795  0) + 27.167 us   |  }

And here is the logic for picking the next task during idling mode:

4765   0)               |            pick_next_task_idle() {
 24766  0)               |              put_prev_task_fair() {
 24767  0)               |                put_prev_entity() {
 24768  0)   0.208 us    |                  check_cfs_rq_runtime();
 24769  0)   3.249 us    |                }
 24770  0)               |                put_prev_entity() {
 24771  0)   0.106 us    |                  check_cfs_rq_runtime();
 24772  0)   3.038 us    |                }

And here is the timer statistics updating, which ultimately even propagated to the KVM kernel modules which maintained timer information for its guest through the notifier call chain mechanism (which is not hardcoded into the kernel source code, and so reading the kernel source code directly you will not see the callout directly):

 1081  0)               |                update_wall_time() {
 1082  0)   0.184 us    |                  _raw_spin_lock_irqsave();
 1083  0)   0.139 us    |                  ntp_tick_length();
 1084  0)   0.142 us    |                  ntp_tick_length();
 1085  0)   0.149 us    |                  ntp_tick_length();
 1086  0)               |                  timekeeping_update() {
 1087  0)   0.142 us    |                    ntp_get_next_leap();
 1088  0)   0.232 us    |                    update_vsyscall();
 1089  0)               |                    raw_notifier_call_chain() {
 1090  0)               |                      notifier_call_chain() {
 1091  0)   0.385 us    |                        pvclock_gtod_notify [kvm]();
 1092  0)   1.865 us    |                      }
 1093  0)   3.014 us    |                    }
 1094  0)   0.155 us    |                    update_fast_timekeeper();
 1095  0)   0.152 us    |                    update_fast_timekeeper();
 1096  0)   8.926 us    |                  }
 1097  0)   0.220 us    |                  _raw_spin_unlock_irqrestore();
 1098  0) + 16.559 us   |                }
 1099  0) + 21.748 us   |              }
 1100  0)   0.133 us    |              touch_softlockup_watchdog();
And here is the APIC timer interrupt triggered processing –
 1113  0)               |          local_apic_timer_interrupt() {
 1114  0)               |            hrtimer_interrupt() {
 1115  0)   0.139 us    |              _raw_spin_lock();
 1116  0)   0.257 us    |              ktime_get_update_offsets_now();
 1117  0)               |              __hrtimer_run_queues() {
 1118  0)   0.860 us    |                __remove_hrtimer();
 1119  0)               |                tick_sched_timer() {
 1120  0)   0.194 us    |                  ktime_get();
 1121  0)               |                  tick_sched_do_timer() {
 1122  0)   0.162 us    |                    tick_do_update_jiffies64();
 1123  0)   1.462 us    |                  }
 1124  0)               |                  tick_sched_handle.isra.14() {
 1125  0)   0.136 us    |                    touch_softlockup_watchdog();
 1126  0)               |                    update_process_times() {
 1127  0)   0.328 us    |                      account_process_tick();
 1128  0)   0.150 us    |                      hrtimer_run_queues();
 1129  0)   0.251 us    |                      raise_softirq();
 1130  0)               |                      rcu_check_callbacks() {
 1131  0)   0.166 us    |                        rcu_sched_qs();
 1132  0)   0.199 us    |                        cpu_needs_another_gp();
 1133  0)               |                        invoke_rcu_core() {
 1134  0)   0.199 us    |                          raise_softirq();
 1135  0)   1.406 us    |                        }
 1136  0)   5.365 us    |                      }
 1137  0)               |                      scheduler_tick() {
 1138  0)   0.191 us    |                        _raw_spin_lock();
 1139  0)   0.336 us    |                        update_rq_clock();
 1140  0)   0.152 us    |                        task_tick_idle();
 1141  0)               |                        update_cpu_load_active() {
 1142  0)               |                          __update_cpu_load() {
 1143  0)   0.140 us    |                            sched_avg_update();
 1144  0)   1.587 us    |                          }
 1145  0)   2.719 us    |                        }
 1146  0)   0.178 us    |                        calc_global_load_tick();
 1147  0)               |                        trigger_load_balance() {
 1148  0)   0.189 us    |                          raise_softirq();
 1149  0)   1.386 us    |                        }
 1150  0) + 11.249 us   |                      }
 1151  0)   0.353 us    |                      run_posix_cpu_timers();
 1152  0) + 25.259 us   |                    }
 1153  0)   0.151 us    |                    profile_tick();
 1154  0) + 28.624 us   |                  }
 1155  0) + 33.276 us   |                }
 1156  0)   0.142 us    |                _raw_spin_lock();
 1157  0) + 37.941 us   |              }
 1158  0)   0.173 us    |              __hrtimer_get_next_event();
 1159  0)               |              tick_program_event() {
 1160  0)               |                clockevents_program_event() {
Now using gcov analysis:
cd /home/tthtlc/linux_latest
 gcov -o /debug/gcov/home/tthtlc/linux_latest/kernel/sched/ core.c

And if you view the “*.gcov” files generated in the present directory:

 -:   41:#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
 -:   42:extern void vtime_task_switch(struct task_struct *prev);
 -:   43:#else
 -:   44:extern void vtime_common_task_switch(struct task_struct *prev);
 1459453:   45:static inline void vtime_task_switch(struct task_struct *prev)
 -:   46:{
 1461467:   47:    if (vtime_accounting_enabled())
   #####:   48:        vtime_common_task_switch(prev);
 1461467:   49:}
       -:   50:#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
 -:   51:
 -:   52:extern void vtime_account_system(struct task_struct *tsk);
 -:   53:extern void vtime_account_idle(struct task_struct *tsk);
 -:   54:extern void vtime_account_user(struct task_struct *tsk);
 -:   55:
 -:   56:#ifdef __ARCH_HAS_VTIME_ACCOUNT
 -:   57:extern void vtime_account_irq_enter(struct
A lot of these are “####” meaning it is not executed, if not the number represent the times it has been executed like line 47-48 above.

And if you sort it you can get the highest frequently called lines inside core.c:

grep -v "\-:" core.c.gcov |grep -v "####"|sort -n
 
 1681581:  565:}
 1701534:  545:void wake_up_q(struct wake_q_head *head)
 1701534:  547:    struct wake_q_node *node = head->first;
 1788519: 1680:        for_each_domain(this_cpu, sd) {
 1871785: 3569:    struct rq *rq = cpu_rq(cpu);
 1871785: 3571:    if (rq->curr != rq->idle)
 2012652: 1972:    while (p->on_cpu)
 2181250: 1032:        for_each_class(class) {
 2232245: 1046:    if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 2303005: 3192:    if (!tsk->state || tsk_is_pi_blocked(tsk))
 2396808:  104:    if (rq->clock_skip_update & RQCF_ACT_SKIP)
 2396808:   98:void update_rq_clock(struct rq *rq)
 2910568: 1033:            if (class == rq->curr->sched_class)
 2911445: 1035:            if (class == p->sched_class) {
 2912432: 3052:    for_each_class(class) {
 2912432: 3053:        p = class->pick_next_task(rq, prev);
 2915415: 3054:        if (p) {
 3840916:  549:    while (node != WAKE_Q_TAIL) {
23406406: 4615:int __sched _cond_resched(void)
23406406: 4617:    if (should_resched(0)) {

This will not tell you much about the logic of scheduling, but perhaps only the bottleneck inside core.c.

Advertisements
%d bloggers like this: