Reading the Linux kernel source: flashcache, Facebook's hard-disk acceleration tool
Module initialization is the natural entry point. flashcache_init() sets up the job pools, the kcached workqueue, the device-mapper target, and the sysctl/procfs plumbing:

int __init
flashcache_init(void)
{
	int r;

	r = flashcache_jobs_init();
	if (r)
		return r;
	atomic_set(&nr_cache_jobs, 0);
	atomic_set(&nr_pending_jobs, 0);
	/* INIT_WORK lost its data argument in 2.6.20 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
	INIT_WORK(&_kcached_wq, do_work, NULL);
#else
	INIT_WORK(&_kcached_wq, do_work);
#endif
	/* clear the 33-bucket IO-size histogram exported via procfs */
	for (r = 0 ; r < 33 ; r++)
		size_hist[r] = 0;
	r = dm_register_target(&flashcache_target);
	if (r < 0) {
		DMERR("cache: register failed %d", r);
	}
#ifdef CONFIG_PROC_FS
	/* register_sysctl_table dropped its insert_at_head argument in 2.6.27 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table, 1);
#else
	flashcache_table_header =
		register_sysctl_table(flashcache_root_table);
#endif
	{
		struct proc_dir_entry *entry;

		entry = create_proc_entry("flashcache_stats", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_stats_operations;
		entry = create_proc_entry("flashcache_errors", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_errors_operations;
		entry = create_proc_entry("flashcache_iosize_hist", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_iosize_hist_operations;
		entry = create_proc_entry("flashcache_pidlists", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_pidlists_operations;
		entry = create_proc_entry("flashcache_version", 0, NULL);
		if (entry)
			entry->proc_fops = &flashcache_version_operations;
	}
#endif
	/*
	 * The original source passes sizeof(struct flashcache_control_s *)
	 * here -- pointer size, not structure size -- and never checks the
	 * allocation for failure. The sizeof is corrected below.
	 */
	flashcache_control = (struct flashcache_control_s *)
		kmalloc(sizeof(struct flashcache_control_s), GFP_KERNEL);
	flashcache_control->synch_flags = 0;
	register_reboot_notifier(&flashcache_notifier);
	return r;
}
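Everything flashcache_init() sets up must be torn down on module unload. A plausible sketch of the matching exit path, assuming the helper and variable names used above (dm_unregister_target still returned an int in the kernels this code targets); the real flashcache_exit may differ in details:

void __exit
flashcache_exit(void)
{
	/* mirror image of flashcache_init: unregister, then free */
	int r = dm_unregister_target(&flashcache_target);

	if (r < 0)
		DMERR("cache: unregister failed %d", r);
	unregister_reboot_notifier(&flashcache_notifier);
	flashcache_jobs_exit();	/* assumed counterpart of flashcache_jobs_init */
#ifdef CONFIG_PROC_FS
	unregister_sysctl_table(flashcache_table_header);
	remove_proc_entry("flashcache_stats", NULL);
	remove_proc_entry("flashcache_errors", NULL);
	remove_proc_entry("flashcache_iosize_hist", NULL);
	remove_proc_entry("flashcache_pidlists", NULL);
	remove_proc_entry("flashcache_version", NULL);
#endif
	kfree(flashcache_control);
}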
flashcache_jobs_init(), called first thing above, creates the slab caches and the mempools layered on them for the two job types:

static int
flashcache_jobs_init(void)
{
	/* kmem_cache_create lost its destructor argument in 2.6.27 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_job_cache = kmem_cache_create("kcached-jobs",
				       sizeof(struct kcached_job),
				       __alignof__(struct kcached_job),
				       0, NULL, NULL);
#else
	_job_cache = kmem_cache_create("kcached-jobs",
				       sizeof(struct kcached_job),
				       __alignof__(struct kcached_job),
				       0, NULL);
#endif
	if (!_job_cache)
		return -ENOMEM;

	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
				   mempool_free_slab, _job_cache);
	if (!_job_pool) {
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL, NULL);
#else
	_pending_job_cache = kmem_cache_create("pending-jobs",
					       sizeof(struct pending_job),
					       __alignof__(struct pending_job),
					       0, NULL);
#endif
	if (!_pending_job_cache) {
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	_pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
					   mempool_free_slab, _pending_job_cache);
	if (!_pending_job_pool) {
		kmem_cache_destroy(_pending_job_cache);
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	return 0;
}
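Note the layering: each mempool sits on top of a slab cache, so steady-state allocations come from the slab while the pool reserves MIN_JOBS objects to guarantee forward progress under memory pressure. The teardown counterpart must destroy the pools before the caches backing them -- a hedged sketch (the real flashcache_jobs_exit also asserts that no jobs are still outstanding):

static void
flashcache_jobs_exit(void)
{
	/* pools first, then the slab caches that back them */
	mempool_destroy(_pending_job_pool);
	kmem_cache_destroy(_pending_job_cache);
	mempool_destroy(_job_pool);
	kmem_cache_destroy(_job_cache);
}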
/* kcached/pending job states */
#define READCACHE	1
#define WRITECACHE	2
#define READDISK	3
#define WRITEDISK	4
#define READFILL	5	/* Read Cache Miss Fill */
#define INVALIDATE	6
#define WRITEDISK_SYNC	7
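These action codes steer every job through the rest of this article. The sketch below is purely illustrative -- a hypothetical dispatcher collapsing routing logic that the real code spreads across its IO-completion callbacks -- but it captures which actions feed the metadata writer examined next:

/* Hypothetical illustration only, not flashcache code */
static void
job_next_stage(struct kcached_job *job)
{
	switch (job->action) {
	case READCACHE:		/* cache-hit read from SSD done */
	case READDISK:		/* miss: disk read done */
	case READFILL:		/* miss fill: block copied to SSD */
	case INVALIDATE:	/* block dropped from the cache */
		/* no synchronous metadata write on these paths */
		break;
	case WRITECACHE:	/* write landed on SSD: block turns DIRTY */
	case WRITEDISK:		/* cleaning write reached disk: block turns clean */
	case WRITEDISK_SYNC:	/* same, on the sync/shutdown path */
		flashcache_md_write(job);	/* metadata must record the change */
		break;
	}
}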
The callback below is the dm_kcopyd_notify_fn handed to dm-kcopyd when a dirty block is written back to disk. For reference, the kcopyd entry point it is registered through (2.6.27-era interface; older kernels spell it kcopyd_copy):

int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		   unsigned int num_dests, struct dm_io_region *dests,
		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context);
static void
flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;
	struct cache_c *dmc = job->dmc;
	int index = job->index;
	unsigned long flags;

	VERIFY(!in_interrupt());
	DPRINTK("kcopyd_callback: Index %d", index);
	VERIFY(job->bio == NULL);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));
	if (unlikely(sysctl_flashcache_error_inject & KCOPYD_CALLBACK_ERROR)) {
		read_err = -EIO;
		sysctl_flashcache_error_inject &= ~KCOPYD_CALLBACK_ERROR;
	}
	if (likely(read_err == 0 && write_err == 0)) {
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write(job);
	} else {
		/* Disk write failed. We can not purge this block from flash */
		DMERR("flashcache: Disk writeback failed ! read error %d write error %d block %lu",
		      -read_err, -write_err, job->disk.sector);
		VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
		VERIFY(dmc->clean_inprog > 0);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		/* Set the error in the job and let do_pending() handle the error */
		if (read_err) {
			dmc->ssd_read_errors++;
			job->error = read_err;
		} else {
			dmc->disk_write_errors++;
			job->error = write_err;
		}
		flashcache_do_pending(job);
		flashcache_clean_set(dmc, index / dmc->assoc); /* Kick off more cleanings */
		dmc->cleanings++;
	}
}
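For context, the writeback whose completion lands in this callback is started with dm_kcopyd_copy(). A hedged sketch modeled on flashcache's dirty-block writeback -- the global kcopyd client name and the INDEX_TO_CACHE_ADDR macro are read from the same source tree but not verified here:

	struct dm_io_region src, dest;

	src.bdev = dmc->cache_dev->bdev;		/* from the SSD... */
	src.sector = INDEX_TO_CACHE_ADDR(dmc, index);
	src.count = dmc->block_size;
	dest.bdev = dmc->disk_dev->bdev;		/* ...back to the disk */
	dest.sector = dmc->cache[index].dbn;
	dest.count = dmc->block_size;
	job->bio = NULL;				/* the callback VERIFYs this */
	dm_kcopyd_copy(flashcache_kcp_client, &src, 1, &dest, 0,
		       flashcache_kcopyd_callback, job);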
/*
 * Kick off a cache metadata update (called from the workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that
 * logic is in md_write_kickoff, which switches out the entire pending_jobs
 * list and does all of those updates).
 */
void
flashcache_md_write(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	unsigned long flags;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/* If a write is in progress for this metadata sector, queue this update up */
	if (md_sector_head->nr_in_prog != 0) {
		struct kcached_job **nodepp;

		/* An MD update is already in progress, queue this one up for later */
		nodepp = &md_sector_head->pending_jobs;
		while (*nodepp != NULL)
			nodepp = &((*nodepp)->next);
		job->next = NULL;
		*nodepp = job;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	} else {
		md_sector_head->nr_in_prog = 1;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write_kickoff(job);
	}
}
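flashcache_md_write_kickoff below leans on a handful of layout macros. A hedged reconstruction from the flashcache headers (field widths and the checksum member vary between versions): each on-SSD metadata sector packs 512 / sizeof(struct flash_cacheblock) block descriptors, and a cache index maps to a (sector, offset) pair:

struct flash_cacheblock {
	sector_t	dbn;		/* disk block this slot caches */
#ifdef FLASHCACHE_DO_CHECKSUMS
	u_int64_t	checksum;
#endif
	u_int32_t	cache_state;	/* VALID | INVALID | DIRTY */
};

#define MD_BLOCKS_PER_SECTOR		(512 / sizeof(struct flash_cacheblock))
#define INDEX_TO_MD_SECTOR(INDEX)	((INDEX) / MD_BLOCKS_PER_SECTOR)
#define INDEX_TO_MD_SECTOR_OFFSET(INDEX) ((INDEX) % MD_BLOCKS_PER_SECTOR)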
static void
flashcache_md_write_kickoff(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct flash_cacheblock *md_sector;
	int md_sector_ix;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	struct io_region where;
#else
	struct dm_io_region where;
#endif
	int i;
	struct cache_md_sector_head *md_sector_head;
	struct kcached_job *orig_job = job;
	unsigned long flags;

	if (flashcache_alloc_md_sector(job)) {
		DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu",
		      job->action, job->disk.sector);
		flashcache_md_write_callback(-EIO, job);
		return;
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/*
	 * Transfer whatever is on the pending queue to the md_io_inprog queue.
	 */
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	md_sector_head->md_io_inprog = md_sector_head->pending_jobs;
	md_sector_head->pending_jobs = NULL;
	md_sector = job->md_sector;
	md_sector_ix = INDEX_TO_MD_SECTOR(job->index) * MD_BLOCKS_PER_SECTOR;
	/* First copy out the entire sector */
	for (i = 0 ;
	     i < MD_BLOCKS_PER_SECTOR && md_sector_ix < dmc->size ;
	     i++, md_sector_ix++) {
		md_sector[i].dbn = dmc->cache[md_sector_ix].dbn;
#ifdef FLASHCACHE_DO_CHECKSUMS
		md_sector[i].checksum = dmc->cache[md_sector_ix].checksum;
#endif
		md_sector[i].cache_state =
			dmc->cache[md_sector_ix].cache_state & (VALID | INVALID | DIRTY);
	}
	/* Then set/clear the DIRTY bit for the "current" index */
	if (job->action == WRITECACHE) {
		/* DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
			(VALID | DIRTY);
	} else { /* job->action == WRITEDISK* */
		/* un-DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
	}

	/* Fold every queued update for this sector into the same flash write */
	for (job = md_sector_head->md_io_inprog ;
	     job != NULL ;
	     job = job->next) {
		if (job->action == WRITECACHE) {
			/* DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state =
				(VALID | DIRTY);
		} else { /* job->action == WRITEDISK* */
			/* un-DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
		}
	}
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	/* metadata sectors start at sector 1, right after the on-SSD superblock */
	where.bdev = dmc->cache_dev->bdev;
	where.count = 1;
	where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);
	dmc->ssd_writes++;
	dm_io_async_bvec(1, &where, WRITE,
			 &orig_job->md_io_bvec,
			 flashcache_md_write_callback, orig_job);
	flashcache_unplug_device(dmc->cache_dev->bdev);
}
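The md_io_bvec handed to dm_io_async_bvec above is prepared when the metadata sector buffer is allocated. A hedged sketch of flashcache_alloc_md_sector, assuming it backs the 512-byte metadata sector with one freshly allocated page:

static int
flashcache_alloc_md_sector(struct kcached_job *job)
{
	struct page *page = alloc_page(GFP_NOIO);

	if (page == NULL)
		return -ENOMEM;
	job->md_sector = (struct flash_cacheblock *)page_address(page);
	job->md_io_bvec.bv_page = page;
	job->md_io_bvec.bv_len = 512;	/* exactly one metadata sector */
	job->md_io_bvec.bv_offset = 0;
	return 0;
}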
flashcache wraps dm_io behind a small compatibility helper, so the rest of the code can submit an async IO over a bio_vec with one call:

static int dm_io_async_bvec(unsigned int num_regions,
			    struct dm_io_region *where, int rw,
			    struct bio_vec *bvec, io_notify_fn fn,
			    void *context)
The completion callback can run in interrupt context, so it only records the error and defers the real work to the kcached workqueue:

void
flashcache_md_write_callback(unsigned long error, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;

	job->error = error;
	push_md_complete(job);
	schedule_work(&_kcached_wq);
}
static void
process_jobs(struct list_head *jobs,
	     void (*fn) (struct kcached_job *))
{
	struct kcached_job *job;

	while ((job = pop(jobs)))
		(void)fn(job);
}
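push_md_complete() presumably appends the job to a global _md_complete_jobs list under a job lock, and the workqueue handler then drains the lists through process_jobs(). A hedged sketch -- list, lock, and handler names follow the flashcache source but are not verified:

static struct kcached_job *
pop(struct list_head *jobs)
{
	struct kcached_job *job = NULL;
	unsigned long flags;

	spin_lock_irqsave(&_job_lock, flags);
	if (!list_empty(jobs)) {
		job = list_entry(jobs->next, struct kcached_job, list);
		list_del(&job->list);
	}
	spin_unlock_irqrestore(&_job_lock, flags);
	return job;
}

static void
do_work(struct work_struct *unused)
{
	process_jobs(&_md_complete_jobs, flashcache_md_write_done);
	process_jobs(&_pending_jobs, flashcache_do_pending);
	process_jobs(&_io_jobs, flashcache_do_io);
}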
flashcache_md_write_done() runs from the workqueue and completes every job that was folded into the one metadata sector write:

void
flashcache_md_write_done(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	int index;
	unsigned long flags;
	struct kcached_job *job_list;
	int error = job->error;
	struct kcached_job *next;
	struct cacheblock *cacheblk;

	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
	       job->action == WRITEDISK_SYNC);
	flashcache_free_md_sector(job);
	job->md_sector = NULL;
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	/* Chain the finished job onto the batch that went out in the same write */
	job_list = job;
	job->next = md_sector_head->md_io_inprog;
	md_sector_head->md_io_inprog = NULL;
	for (job = job_list ; job != NULL ; job = next) {
		next = job->next;
		job->error = error;
		index = job->index;
		cacheblk = &dmc->cache[index];
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		if (job->action == WRITECACHE) {
			if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITECACHE_MD_ERROR;
			}
			if (likely(job->error == 0)) {
				if ((cacheblk->cache_state & DIRTY) == 0) {
					dmc->cache_sets[index / dmc->assoc].nr_dirty++;
					dmc->nr_dirty++;
				}
				dmc->md_write_dirty++;
				cacheblk->cache_state |= DIRTY;
			} else
				dmc->ssd_write_errors++;
			flashcache_bio_endio(job->bio, job->error);
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: WRITE: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
			}
		} else {
			int action = job->action;

			if (unlikely(sysctl_flashcache_error_inject & WRITEDISK_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITEDISK_MD_ERROR;
			}
			/*
			 * If we have an error on a WRITEDISK*, no choice but to preserve the
			 * dirty block in cache. Fail any IOs for this block that occurred while
			 * the block was being cleaned.
			 */
			if (likely(job->error == 0)) {
				dmc->md_write_clean++;
				cacheblk->cache_state &= ~DIRTY;
				VERIFY(dmc->cache_sets[index / dmc->assoc].nr_dirty > 0);
				VERIFY(dmc->nr_dirty > 0);
				dmc->cache_sets[index / dmc->assoc].nr_dirty--;
				dmc->nr_dirty--;
			} else
				dmc->ssd_write_errors++;
			VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
			VERIFY(dmc->clean_inprog > 0);
			dmc->cache_sets[index / dmc->assoc].clean_inprog--;
			dmc->clean_inprog--;
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: CLEAN: Cache metadata write failed ! error %d block %lu",
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			}
			dmc->cleanings++;
			if (action == WRITEDISK_SYNC)
				flashcache_update_sync_progress(dmc);
		}
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	if (md_sector_head->pending_jobs != NULL) {
		/* Peel off the first job from the pending queue and kick that off */
		job = md_sector_head->pending_jobs;
		md_sector_head->pending_jobs = job->next;
		job->next = NULL;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
		       job->action == WRITEDISK_SYNC);
		flashcache_md_write_kickoff(job);
	} else {
		md_sector_head->nr_in_prog = 0;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	}
}
/*
 * We have one of these for *every* cache metadata sector, to keep track
 * of metadata IOs in progress for blocks covered by this sector. Only
 * one metadata IO per sector can be in progress at any given point in
 * time.
 */
struct cache_md_sector_head {
	u_int32_t nr_in_prog;
	struct kcached_job *pending_jobs, *md_io_inprog;
};
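A hedged sketch of how the bufhead array might be sized and initialized at cache-create time -- one head per metadata sector, rounding up; the md_sectors_buf field is taken from the usage above, while the vmalloc and the surrounding code are assumptions:

	int nr_md_sectors = INDEX_TO_MD_SECTOR(dmc->size) + 1;
	int i;

	dmc->md_sectors_buf = vmalloc(nr_md_sectors *
				      sizeof(struct cache_md_sector_head));
	for (i = 0 ; i < nr_md_sectors ; i++) {
		dmc->md_sectors_buf[i].nr_in_prog = 0;
		dmc->md_sectors_buf[i].pending_jobs = NULL;
		dmc->md_sectors_buf[i].md_io_inprog = NULL;
	}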