CPU在内核中运行时并不是处处不可抢占的:内核中存在一些空隙,在这些时刻进行抢占是安全的。内核抢占补丁的基本原理,就是将SMP下可并行执行的代码段看成是能够进行内核抢占的区域。

Linux 2.4内核正好细化了多CPU下的内核线程同步机制,对不可并行的指令块用spinlock和rwlock作了细致的标识,因此该补丁的实现可谓水到渠成。具体的方法就是在进程的任务结构上增加一个preempt_count变量作为内核抢占锁,它随着spinlock和rwlock一起加锁和解锁。当preempt_count为0时表示能够进行内核调度。内核抢占调度的入口为preempt_schedule(),它将当前进程标记为TASK_PREEMPTED状态再调用schedule();在TASK_PREEMPTED状态下,schedule()不会将进程从运行队列中删除。

下面是内核抢占补丁的主要代码示意:

arch/i386/kernel/entry.S:

            /*
             * Return paths for exceptions and hardware interrupts, extended with
             * kernel-preemption checks under CONFIG_PREEMPT.
             *
             * preempt_count is the offset of the preemption counter inside
             * task_struct: the slot formerly used by flags is reused for it, and
             * flags was moved elsewhere in the structure.
             */
            preempt_count = 4

            ret_from_exception:                             # return from an exception
            #ifdef CONFIG_SMP
                    GET_CURRENT(%ebx)
                    movl processor(%ebx),%eax
                    shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
                    movl SYMBOL_NAME(irq_stat)(,%eax),%ecx          # softirq_active
                    testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx       # softirq_mask
            #else
                    movl SYMBOL_NAME(irq_stat),%ecx                 # softirq_active
                    testl SYMBOL_NAME(irq_stat)+4,%ecx              # softirq_mask
            #endif
                    jne handle_softirq
            #ifdef CONFIG_PREEMPT
                    cli
                    /*
                     * The exception entry path has no instruction that disables
                     * kernel scheduling, so bump the counter here to pair with
                     * the decl in ret_from_intr below.
                     */
                    incl preempt_count(%ebx)
            #endif

            ENTRY(ret_from_intr)                            # return from a hardware interrupt
                    GET_CURRENT(%ebx)
            #ifdef CONFIG_PREEMPT
                    cli
                    decl preempt_count(%ebx)                # restore the kernel-preemption count
            #endif
                    movl EFLAGS(%esp),%eax                  # mix EFLAGS and CS
                    movb CS(%esp),%al
                    testl $(VM_MASK | 3),%eax               # return to VM86 mode or non-supervisor?
                    jne ret_with_reschedule
            #ifdef CONFIG_PREEMPT
                    cmpl $0,preempt_count(%ebx)
                    jnz restore_all                         # non-zero count: kernel preemption is disabled
                    cmpl $0,need_resched(%ebx)
                    jz restore_all                          # nothing is asking for a reschedule
                    /* Skip preemption when the bh count plus the irq count is non-zero. */
                    movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx
                    addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx
                    jnz restore_all
                    incl preempt_count(%ebx)
                    sti
                    call SYMBOL_NAME(preempt_schedule)
                    /*
                     * Back here once this task runs again: go through
                     * ret_from_intr so the count bumped above is dropped
                     * before the final return.
                     */
                    jmp ret_from_intr
            #else
                    jmp restore_all
            #endif

                    ALIGN
            handle_softirq:
            #ifdef CONFIG_PREEMPT
                    /* Bump the count; the jump to ret_from_intr below drops it again. */
                    cli
                    GET_CURRENT(%ebx)
                    incl preempt_count(%ebx)
                    sti
            #endif
                    call SYMBOL_NAME(do_softirq)
                    jmp ret_from_intr

                    ALIGN
            reschedule:
                    call SYMBOL_NAME(schedule)              # test
                    jmp ret_from_sys_call

            include/asm/hw_irq.h:

            ...

            #ifdef CONFIG_PREEMPT
            /*
             * Asm fragment appended to SAVE_ALL: expands GET_CURRENT and then
             * increments the word at offset 4 from %ebx. The entry.S excerpt
             * defines offset 4 as preempt_count.
             * NOTE(review): assumes GET_CURRENT leaves the current task pointer
             * in %ebx -- confirm against its definition.
             */
            #define BUMP_CONTEX_SWITCH_LOCK \
                    GET_CURRENT \
                    "incl 4(%ebx)\n\t"
            #else
            #define BUMP_CONTEX_SWITCH_LOCK
            #endif

            /*
             * SAVE_ALL - register save sequence at hardware-interrupt entry.
             *
             * Pushes the segment and general registers, loads __KERNEL_DS into
             * %ds and %es via %edx, and finally expands BUMP_CONTEX_SWITCH_LOCK
             * so that kernel preemption is disabled on interrupt entry when
             * CONFIG_PREEMPT is set.
             */
            #define SAVE_ALL \
                    "cld\n\t" \
                    "pushl %es\n\t" \
                    "pushl %ds\n\t" \
                    "pushl %eax\n\t" \
                    "pushl %ebp\n\t" \
                    "pushl %edi\n\t" \
                    "pushl %esi\n\t" \
                    "pushl %edx\n\t" \
                    "pushl %ecx\n\t" \
                    "pushl %ebx\n\t" \
                    "movl $" STR(__KERNEL_DS) ",%edx\n\t" \
                    "movl %edx,%ds\n\t" \
                    "movl %edx,%es\n\t" \
                    BUMP_CONTEX_SWITCH_LOCK

            include/linux/spinlock.h:

            #ifdef CONFIG_PREEMPT

            /* The preemption counter of the current task. */
            #define switch_lock_count() current->preempt_count

            /* Non-zero while the current task's preemption count is non-zero. */
            #define in_ctx_sw_off() (switch_lock_count().counter)

            /* Address of the current task's preemption counter, for the atomic_* helpers. */
            #define atomic_ptr_in_ctx_sw_off() (&switch_lock_count())

            /* Disable kernel preemption: increment the current task's preemption count. */
            #define ctx_sw_off() \
            do { \
                    atomic_inc(atomic_ptr_in_ctx_sw_off()); \
            } while (0)

            /* Re-enable kernel preemption: decrement the count without rescheduling. */
            #define ctx_sw_on_no_preempt() \
            do { \
                    atomic_dec(atomic_ptr_in_ctx_sw_off()); \
            } while (0)

            /*
             * Re-enable kernel preemption and carry out a pending one: when the
             * count drops to zero and need_resched is set, call preempt_schedule().
             */
            #define ctx_sw_on() \
            do { \
                    if (atomic_dec_and_test(atomic_ptr_in_ctx_sw_off()) && \
                        current->need_resched) \
                            preempt_schedule(); \
            } while (0)

            /* Preemption is disabled before the spinlock is taken. */
            #define spin_lock(lock) \
            do { \
                    ctx_sw_off(); \
                    _raw_spin_lock(lock); \
            } while (0)

            /*
             * Disable preemption, then try the lock. Evaluates to 1 on success;
             * on failure preemption is re-enabled and the result is 0.
             */
            #define spin_trylock(lock) ({ctx_sw_off(); _raw_spin_trylock(lock) ? \
                                    1 : ({ctx_sw_on(); 0;});})

            /* Leaving the spinlock re-enables preemption and performs a pending one. */
            #define spin_unlock(lock) \
            do { \
                    _raw_spin_unlock(lock); \
                    ctx_sw_on(); \
            } while (0)

            /* The rwlock wrappers pair ctx_sw_off()/ctx_sw_on() with the raw operations in the same way. */
            #define read_lock(lock) ({ctx_sw_off(); _raw_read_lock(lock);})

            #define read_unlock(lock) ({_raw_read_unlock(lock); ctx_sw_on();})

            #define write_lock(lock) ({ctx_sw_off(); _raw_write_lock(lock);})

            #define write_unlock(lock) ({_raw_write_unlock(lock); ctx_sw_on();})

            #define write_trylock(lock) ({ctx_sw_off(); _raw_write_trylock(lock) ? \
                                    1 : ({ctx_sw_on(); 0;});})

            ...

            include/asm/softirq.h:

            /*
             * Bottom-half disable/enable for one CPU, extended so that kernel
             * preemption is disabled for as long as bottom halves are disabled.
             * cpu_bh_disable() calls ctx_sw_off() before raising the count;
             * cpu_bh_enable() calls ctx_sw_on() after lowering it.
             */
            #define cpu_bh_disable(cpu) \
                    do { ctx_sw_off(); local_bh_count(cpu)++; barrier(); } while (0)

            #define cpu_bh_enable(cpu) \
                    do { barrier(); local_bh_count(cpu)--; ctx_sw_on(); } while (0)

            kernel/schedule.c:

            #ifdef CONFIG_PREEMPT
            /*
             * preempt_schedule() - entry point for a kernel preemption.
             *
             * For as long as the current task has need_resched set: bump the
             * preemption count, OR TASK_PREEMPTED into the task state, call
             * schedule(), then clear the flag again and drop the count without
             * triggering a further preemption check.
             */
            asmlinkage void preempt_schedule(void)
            {
                if (!current->need_resched)
                    return;

                do {
                    ctx_sw_off();
                    current->state |= TASK_PREEMPTED;
                    schedule();
                    current->state &= ~TASK_PREEMPTED;
                    ctx_sw_on_no_preempt();
                } while (current->need_resched);
            }
            #endif

            /*
             * schedule() - choose the next task for this CPU and switch to it.
             *
             * Walks runqueue_head under runqueue_lock, scoring every task that
             * passes can_schedule() with goodness() and keeping the highest
             * score. The idle task is the default choice. When the best score is
             * zero, every task's counter is recalculated and selection repeats.
             *
             * With CONFIG_PREEMPT the preemption count is raised on entry and
             * lowered (without rescheduling) before the normal return. A task
             * whose state carries TASK_PREEMPTED is left on the run queue.
             *
             * Calling this from interrupt context prints a message and hits BUG().
             */
            asmlinkage void schedule(void)
            {
                struct schedule_data * sched_data;
                struct task_struct *prev, *next, *p;
                struct list_head *tmp;
                int this_cpu, c;

            #ifdef CONFIG_PREEMPT
                /* No kernel preemption while the scheduler itself is running. */
                ctx_sw_off();
            #endif

                if (!current->active_mm) BUG();
            need_resched_back:
                prev = current;
                this_cpu = prev->processor;

                if (in_interrupt())
                    goto scheduling_in_interrupt;

                release_kernel_lock(prev, this_cpu);

                /* Do "administrative" work here while we don't hold any locks */
                if (softirq_active(this_cpu) & softirq_mask(this_cpu))
                    goto handle_softirq;
            handle_softirq_back:

                /*
                 * 'sched_data' is protected by the fact that we can run
                 * only one process per CPU.
                 */
                sched_data = & aligned_data[this_cpu].schedule_data;

                spin_lock_irq(&runqueue_lock);

                /* move an exhausted RR process to be last.. */
                if (prev->policy == SCHED_RR)
                    goto move_rr_last;
            move_rr_back:

                switch (prev->state) {
                    case TASK_INTERRUPTIBLE:
                        if (signal_pending(prev)) {
                            prev->state = TASK_RUNNING;
                            break;
                        }
                        /* fall through */
                    default:
            #ifdef CONFIG_PREEMPT
                        /* Kernel-preemption reschedule: keep the task on the run queue. */
                        if (prev->state & TASK_PREEMPTED)
                            break;
            #endif
                        del_from_runqueue(prev);
                        /* fall through */
            #ifdef CONFIG_PREEMPT
                    case TASK_PREEMPTED:
            #endif
                    case TASK_RUNNING:
                        ;   /* a label must be followed by a statement */
                }
                prev->need_resched = 0;

                /*
                 * this is the scheduler proper:
                 */

            repeat_schedule:
                /*
                 * Default process to select..
                 */
                next = idle_task(this_cpu);
                c = -1000;
                if (task_on_runqueue(prev))
                    goto still_running;

            still_running_back:
                list_for_each(tmp, &runqueue_head) {
                    p = list_entry(tmp, struct task_struct, run_list);
                    if (can_schedule(p, this_cpu)) {
                        int weight = goodness(p, this_cpu, prev->active_mm);
                        if (weight > c)
                            c = weight, next = p;
                    }
                }

                /* Do we need to re-calculate counters? */
                if (!c)
                    goto recalculate;
                /*
                 * from this point on nothing can prevent us from
                 * switching to the next task, save this fact in
                 * sched_data.
                 */
                sched_data->curr = next;
            #ifdef CONFIG_SMP
                next->has_cpu = 1;
                next->processor = this_cpu;
            #endif
                spin_unlock_irq(&runqueue_lock);

                if (prev == next)
                    goto same_process;

            #ifdef CONFIG_SMP
                /*
                 * maintain the per-process 'last schedule' value.
                 * (this has to be recalculated even if we reschedule to
                 * the same process) Currently this is only used on SMP,
                 * and it's approximate, so we do not have to maintain
                 * it while holding the runqueue spinlock.
                 */
                sched_data->last_schedule = get_cycles();

                /*
                 * We drop the scheduler lock early (it's a global spinlock),
                 * thus we have to lock the previous process from getting
                 * rescheduled during switch_to().
                 */

            #endif /* CONFIG_SMP */

                kstat.context_swtch++;
                /*
                 * there are 3 processes which are affected by a context switch:
                 *
                 * prev == .... ==> (last => next)
                 *
                 * It's the 'much more previous' 'prev' that is on next's stack,
                 * but prev is set to (the just run) 'last' process by switch_to().
                 * This might sound slightly confusing but makes tons of sense.
                 */
                prepare_to_switch();
                {
                    struct mm_struct *mm = next->mm;
                    struct mm_struct *oldmm = prev->active_mm;
                    if (!mm) {
                        if (next->active_mm) BUG();
                        next->active_mm = oldmm;
                        atomic_inc(&oldmm->mm_count);
                        enter_lazy_tlb(oldmm, next, this_cpu);
                    } else {
                        if (next->active_mm != mm) BUG();
                        switch_mm(oldmm, mm, next, this_cpu);
                    }

                    if (!prev->mm) {
                        prev->active_mm = NULL;
                        mmdrop(oldmm);
                    }
                }

                /*
                 * This just switches the register state and the
                 * stack.
                 */
                switch_to(prev, next, prev);
                __schedule_tail(prev);

            same_process:
                reacquire_kernel_lock(current);
                if (current->need_resched)
                    goto need_resched_back;

            #ifdef CONFIG_PREEMPT
                /* Pairs with the ctx_sw_off() at entry; must not recurse into preemption. */
                ctx_sw_on_no_preempt();
            #endif
                return;

            recalculate:
                {
                    struct task_struct *p;
                    spin_unlock_irq(&runqueue_lock);
                    read_lock(&tasklist_lock);
                    /* New counter = half of the old counter plus the nice-derived ticks. */
                    for_each_task(p)
                        p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
                    read_unlock(&tasklist_lock);
                    spin_lock_irq(&runqueue_lock);
                }
                goto repeat_schedule;

            still_running:
                c = goodness(prev, this_cpu, prev->active_mm);
                next = prev;
                goto still_running_back;

            handle_softirq:
                do_softirq();
                goto handle_softirq_back;

            move_rr_last:
                if (!prev->counter) {
                    prev->counter = NICE_TO_TICKS(prev->nice);
                    move_last_runqueue(prev);
                }
                goto move_rr_back;

            scheduling_in_interrupt:
                printk("Scheduling in interrupt\n");
                BUG();
                return;
            }

            /*
             * schedule_tail() - finish a context switch on behalf of 'prev'.
             *
             * Runs __schedule_tail(prev). With CONFIG_PREEMPT it then calls
             * ctx_sw_on(), which lowers the preemption count and, per the
             * spinlock.h excerpt, may call preempt_schedule().
             */
            void schedule_tail(struct task_struct *prev)
            {
                __schedule_tail(prev);
            #ifdef CONFIG_PREEMPT
                ctx_sw_on();
            #endif
            }

文章整理:西部数码--专业提供域名注册虚拟主机服务
http://www.west263.com
以上信息与文章正文是不可分割的一部分,如果您要转载本文章,请保留以上信息,谢谢!