PCIe 配置空间读写内核实现
1 PCI及PCI-E配置空间介绍
PCI-E是用来互联如计算和通信平台应用中外围设备的第三代高性能I/O总线。PCI-E采用了与PCI相同的使用模型和读写(load-store)通信模型,支持各种常见的事务,如存储器读/写、IO读/写和配置读/写事务。其存储器、IO和配置地址空间与PCI的地址空间相同。PCI Express与PCI系统是软件向后兼容的。
PCI-E的配置空间大小为4096字节,如图1所示。其中前256字节是与PCI兼容的配置寄存器,该区域可以用以下两种机制访问:
· PCI配置访问机制。
· PCI Express增强型配置机制。
图1 PCI-E配置空间
Memory-mapped I/O (MMIO)与Port I/O
MMIO和port I/O(也称为port-mapped I/O或PMIO)是两种CPU与外设之间进行I/O操作的方式。
Port I/O是通过特殊的CPU指令来进行I/O操作,在x86架构上,可以通过指令in和out在特定的端口上进行I/O读写。I/O设备拥有与内存不同的地址空间,其实现方式是在CPU上增加额外的I/O引脚,或者使用一条专用于I/O端口的总线。
MMIO即内存映射I/O,它是PCI规范的一部分,I/O设备被放置在内存空间而不是I/O空间。从处理器角度看,内存映射I/O后系统设备访问起来和内存一样。这样访问AGP/PCI-E显卡上的帧缓存、BIOS、PCI设备时,就可以使用与读写内存相同的汇编指令完成,简化了程序设计的难度和接口的复杂性。
对软件人员来说,MMIO比Port I/O更方便使用。
2 PCI-E配置空间读写在内核的实现
在用户空间,可以通过lspci和setpci两个命令来查看/修改PCI及PCI-E配置空间。用户命令执行的结果,最终是由内核来决定的。那么我们关心一个问题:内核是如何真正去读取和修改配置空间的?
2.1 内核API接口
Linux内核提供了以下PCI/PCI-E配置空间访问接口,在驱动编写过程中,我们可以直接使用下面这些函数:
· pci_{read,write}_config_byte()
· pci_{read,write}_config_word()
· pci_{read,write}_config_dword()
函数的定义在文件include/linux/pci.h中。
00513:static inline int pci_read_config_byte(structpci_dev *dev,int where,
00513: u8*val)
00514: {
00515: return pci_bus_read_config_byte(dev->bus, dev->devfn, where,val);
00516: }
00517:static inline int pci_read_config_word(structpci_dev *dev,int where,
00517: u16*val)
00518: {
00519: return pci_bus_read_config_word(dev->bus, dev->devfn, where,val);
00520: }
00521: staticinline int pci_read_config_dword(structpci_dev *dev,
00521: int where,u32*val)
00522: {
00523: return pci_bus_read_config_dword(dev->bus, dev->devfn, where,val);
00524: }
00525:static inline int pci_write_config_byte(structpci_dev *dev,int where,
00525: u8val)
00526: {
00527: return pci_bus_write_config_byte(dev->bus, dev->devfn, where,val);
00528: }
00529:static inline int pci_write_config_word(structpci_dev *dev,
00529: int where,u16 val)
00530: {
00531: return pci_bus_write_config_word(dev->bus, dev->devfn, where,val);
00532: }
00533: staticinline int pci_write_config_dword(structpci_dev *dev,
00533: int where,u32 val)
00534: {
00535: return pci_bus_write_config_dword(dev->bus, dev->devfn, where,val);
00536: }
2.2 内核API实现
在PCI/PCI-E配置空间读写API接口中,我们看到它们是对pci_bus_{read,write}_config_{byte, word, dword}的封装。这些函数在drivers/pci/access.c中以宏的方式定义。
00024: #definePCI_OP_READ(size,type,len)\
00025: intpci_bus_read_config_##size\
00026: (struct pci_bus *bus, unsignedint devfn, int pos , type*value) \
00027: { \
00028: int res ; \
00029: unsigned long flags; \
00030: u32 data = 0; \
00031: if (PCI_##size##_BAD)return PCIBIOS_BAD_REGISTER_NUMBER; \
00032: spin_lock_irqsave(&pci_lock,flags); \
00033: res =bus ->ops- >read(bus, devfn,pos ,len,&data ); \
00034: *value = (type)data; \
00035: spin_unlock_irqrestore(&pci_lock,flags); \
00036: return res ; \
00037: }
00038:
00039: #definePCI_OP_WRITE(size,type,len)\
00040: intpci_bus_write_config_##size\
00041: (struct pci_bus *bus, unsignedint devfn, int pos , typevalue) \
00042: { \
00043: int res ; \
00044: unsigned long flags; \
00045: if (PCI_##size##_BAD)return PCIBIOS_BAD_REGISTER_NUMBER; \
00046: spin_lock_irqsave(&pci_lock,flags); \
00047: res =bus ->ops- >write(bus, devfn,pos ,len,value); \
00048: spin_unlock_irqrestore(&pci_lock,flags); \
00049: return res ; \
00050: }
00059: EXPORT_SYMBOL(pci_bus_read_config_byte);
00060: EXPORT_SYMBOL(pci_bus_read_config_word);
00061: EXPORT_SYMBOL(pci_bus_read_config_dword);
00062: EXPORT_SYMBOL(pci_bus_write_config_byte);
00063: EXPORT_SYMBOL(pci_bus_write_config_word);
00064: EXPORT_SYMBOL(pci_bus_write_config_dword);
pci_bus_{read,write}_config_{byte, word, dword}()等函数,调用的是bus->ops->write、bus->ops->read方法。显然,现在的bus总线是PCI/PCI-E,我们就关注内核定义PCI/PCI-E总线的读写操作方法。
注:Linux内核没有专门将PCI-E列为一种总线,而是将PCI-E合并到PCI总线中。
2.3 PCI总线读写方法
PCI总线读写方法为pci_root_ops,对应的读写函数分别为pci_read()、pci_write()。实现在文件arch/i386/pci/common.c中。
00036: staticintpci_read(structpci_bus *bus,unsigned intdevfn,int where,int size,u32
00036: *value)
00037: {
00038: return raw_pci_ops- >read(pci_domain_nr(bus),bus->number,
00039: devfn,where,size,value);
00040: }
00041:
00042: staticintpci_write(structpci_bus *bus,unsigned intdevfn,int where,int size,
00042: u32 value)
00043: {
00044: return raw_pci_ops- >write(pci_domain_nr(bus),bus->number,
00045: devfn,where,size,value);
00046: }
00047:
00048: structpci_ops pci_root_ops= {
00049: .read = pci_read,
00050: .write = pci_write,
00051: };
pci_read()、pci_write()依赖于raw_pci_ops全局变量。
2.3.1 raw_pci_ops全局变量的设置
内核在启动时,会执行pci_access_init()函数,在文件arch/i386/pci/init.c中。该函数中,确定了raw_pci_ops值。
00005: /* arch_initcall has too randomordering, so call theinitializers
00006: in the right sequence from here. */
00007: static__init int pci_access_init(void)
00008: {
00009: #ifdefCONFIG_PCI_MMCONFIG
00010: pci_mmcfg_init();
00011: #endif
00012: dmi_check_pciprobe();
00013:
00014: if (raw_pci_ops)
00015: return 0;
00016:
00017: #ifdefCONFIG_PCI_BIOS
00018: pci_pcbios_init();
00019: #endif
00020: / *
00021: * don't check for raw_pci_ops here because we want pcbios as last
00022: * fallback, yet it'sneeded to run first to set pcibios_last_bus
00023: * in case legacy PCI probingis used. otherwise detecting peer busses
00024: * fails.
00025: */
00026: #ifdefCONFIG_PCI_DIRECT
00027: pci_direct_init();
00028: #endif
00029: return0;
00030: }? end pci_access_init ?
00031: arch_initcall(pci_access_init);
对于PCI配置空间,通过Port I/O方式即可实现完全访问。但要访问全部的PCI-E配置空间,则需要MMIO方式,而MMIO方式访问需要Linux内核支持。在编译内核时,选中以下选项即可。
Bus options (PCI etc.) --->
--- PCI support
[*] Support mmconfig PCI config space access
即需要选中“Support mmconfig PCI config space access”。若没有选中该项,则用户通过lspci或setpci命令,访问不到PCI-E的扩展配置空间(偏移256~4095字节)。
为了访问PCI-E扩展配置空间,pci_access_init()函数会调用pci_mmcfg_init()。于是将raw_pci_ops的值设为pci_mmcfg,代码都在文件arch/i386/pci/mmconfig.c中。
00152: void__init pci_mmcfg_init(void)
00153: {
... ...
00173: raw_pci_ops = &pci_mmcfg;
... ...
00176: }? end pci_mmcfg_init ?
00147: staticstruct pci_raw_opspci_mmcfg ={
00148: .read = pci_mmcfg_read,
00149: .write = pci_mmcfg_write,
00150: };
00151:
若内核中没有选中“Support mmconfig PCI config space access”,则raw_pci_ops方法为:pci_direct_conf1或pci_direct_conf2。通常情况下,使用pci_direct_conf1。代码在文件arch/i386/pci/direct.c中。
00257: void__init pci_direct_init(void)
00258: {
00259: struct resource *region,*region2;
......
00267: if (pci_check_type1()){
00268: printk(KERN_INFO"PCI: Using configuration type 1\n");
00269: raw_pci_ops = &pci_direct_conf1;
00270: return;
00271: }
... ...
00284: if (pci_check_type2()){
00285: printk(KERN_INFO"PCI: Using configuration type 2\n");
00286: raw_pci_ops = &pci_direct_conf2;
00287: return;
00288: }
00293: }? end pci_direct_init ?
00079: structpci_raw_ops pci_direct_conf1= {
00080: .read = pci_conf1_read,
00081: .write = pci_conf1_write,
00082: };
00171: #undefPCI_CONF2_ADDRESS
00172:
00173: staticstruct pci_raw_opspci_direct_conf2 ={
00174: .read = pci_conf2_read,
00175: .write = pci_conf2_write,
00176: };
2.3.2 Port I/O方式访问配置空间
Port I/O方式也称为直接方式访问。
PCI规范规定,直接操作port读取PCI配置信息时,通过CONFIG_ADDRESS和CONFIG_DATA两个寄存器进行。CONFIG_ADDRESS的端口地址为0xCF8,CONFIG_DATA的端口地址为0xCFC,两个寄存器都为32bit。这两个地址就是x86架构中对应的I/O端口号。图2为CONFIG_ADDRESS寄存器格式。
图2 CONFIG_ADDRESS寄存器格式
bit31是使能对PCI Bus CONFIG_DATA的访问;
bit 30~24为保留,为只读,访问时返回值为0;
bit 23~16是Bus号;
bit 15~11是设备号;
bit 10~8是功能号;
bit 7~2是配置空间中的寄存器,单位为DWORD。
bit 1~0为只读,读取时返回值为0。
这样直接访问PCI配置空间时,分为两步:
第一步是向CONFIG_ADDRESS寄存器(端口0xCF8)写入要读/写的位置;
第二步是从CONFIG_DATA寄存器(端口0xCFC)读/写所需要数据。
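Linux内核对PCI配置空间直接访问的实现函数分别为pci_conf1_read()/pci_conf1_write()和pci_conf2_read()/pci_conf2_write(),分别对应PCI规范定义的配置访问机制#1(Configuration Mechanism #1,即内核打印信息中的“configuration type 1”)和配置访问机制#2(“configuration type 2”,已废弃)。上面介绍的CONFIG_ADDRESS/CONFIG_DATA方式就是机制#1,通常情况下内核使用的也是机制#1,因此这里我们只关注pci_conf1_read()/pci_conf1_write()。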
函数pci_conf1_read()和pci_conf1_write()在文件arch/i386/pci/direct.c中。
00017: int pci_conf1_read(unsignedint seg,unsigned intbus,
00018: unsigned int devfn,int reg,int len,u32 *value)
00019: {
00020: unsigned long flags;
00021:
00022: if ((bus> 255)|| (devfn> 255)|| (reg> 255)){
00023: *value= - 1;
00024: return -EINVAL;
00025: }
00026:
00027: spin_lock_irqsave(&pci_config_lock, flags);
00028:
00029: outl(PCI_CONF1_ADDRESS(bus,devfn,reg),0xCF8);
00030:
00031: switch (len){
00032: case 1:
00033: *value= inb(0xCFC+ (reg& 3));
00034: break;
00035: case 2:
00036: *value= inw(0xCFC+ (reg& 2));
00037: break;
00038: case 4:
00039: *value= inl(0xCFC);
00040: break;
00041: }
00042:
00043: spin_unlock_irqrestore(&pci_config_lock, flags);
00044:
00045: return0;
00046: }? end pci_conf1_read ?
00047:
00048: int pci_conf1_write(unsignedint seg,unsigned intbus,
00049: unsigned int devfn,int reg,int len,u32 value)
00050: {
00051: unsigned long flags;
00052:
00053: if ((bus> 255)|| (devfn> 255)|| (reg> 255))
00054: return -EINVAL;
00056: spin_lock_irqsave(&pci_config_lock, flags);
00057:
00058: outl(PCI_CONF1_ADDRESS(bus,devfn,reg),0xCF8);
00059:
00060: switch (len){
00061: case 1:
00062: outb((u8)value,0xCFC + (reg & 3));
00063: break;
00064: case 2:
00065: outw((u16)value,0xCFC + (reg & 2));
00066: break;
00067: case 4:
00068: outl((u32)value,0xCFC);
00069: break;
00070: }
00071:
00072: spin_unlock_irqrestore(&pci_config_lock, flags);
00073:
00074: return0;
00075: }? end pci_conf1_write ?
00076:
2.3.3 MMIO方式访问配置空间
Port I/O方式只能访问PCI配置空间,而不能访问PCI-E扩展配置空间(偏移256~4095字节),此时只能通过MMIO方式。Linux内核中的MMIO实现读/写分别对应函数pci_mmcfg_read()和pci_mmcfg_write()。函数在文件arch/i386/pci/mmconfig.c中。
00071: staticintpci_mmcfg_read(unsignedint seg,unsigned intbus,
00072: unsigned int devfn,int reg,int len,u32 *value)
00073: {
00074: unsigned long flags;
00075: u32 base;
00076:
00077: if ((bus> 255)|| (devfn> 255)|| (reg> 4095)){
00078: err: *value =- 1;
00079: return -EINVAL;
00080: }
00081:
00082: if (reg< 256)
00083: return pci_conf1_read(seg,bus,devfn,reg,len,value);
00084:
00085: base = get_base_addr(seg,bus,devfn);
00086: if (!base)
00087: goto ↑err;
00088:
00089: spin_lock_irqsave(&pci_config_lock, flags);
00090:
00091: pci_exp_set_dev_base(base,bus,devfn);
00092:
00093: switch (len){
00094: case 1:
00095: *value= mmio_config_readb(mmcfg_virt_addr+ reg);
00096: break;
00097: case 2:
00098: *value= mmio_config_readw(mmcfg_virt_addr+ reg);
00099: break;
00100: case 4:
00101: *value= mmio_config_readl(mmcfg_virt_addr+ reg);
00102: break;
00103: }
00104:
00105: spin_unlock_irqrestore(&pci_config_lock, flags);
00106:
00107: return0;
00108: }? end pci_mmcfg_read ?
00110: staticint pci_mmcfg_write(unsignedint seg,unsigned intbus,
00111: unsigned int devfn,int reg,int len,u32 value)
00112: {
00113: unsigned long flags;
00114: u32 base;
00115:
00116: if ((bus> 255)|| (devfn> 255)|| (reg> 4095))
00117: return -EINVAL;
00118:
00119: if (reg< 256)
00120: return pci_conf1_write(seg,bus,devfn,reg,len,value);
00121:
00122: base = get_base_addr(seg,bus,devfn);
00123: if (!base)
00124: return -EINVAL;
00125:
00126: spin_lock_irqsave(&pci_config_lock, flags);
00127:
00128: pci_exp_set_dev_base(base,bus,devfn);
00129:
00130: switch (len){
00131: case 1:
00132: mmio_config_writeb(mmcfg_virt_addr+ reg,value);
00133: break;
00134: case 2:
00135: mmio_config_writew(mmcfg_virt_addr+ reg,value);
00136: break;
00137: case 4:
00138: mmio_config_writel(mmcfg_virt_addr+ reg,value);
00139: break;
00140: }
00141:
00142: spin_unlock_irqrestore(&pci_config_lock, flags);
00143:
00144: return0;
00145: }? end pci_mmcfg_write ?
若访问的配置空间在前面256字节范围内,则直接使用Port I/O方式(调用pci_conf1_read()/pci_conf1_write())。若访问PCI-E扩展配置空间,则首先通过get_base_addr()函数获取设备对应的内存空间物理地址,然后通过pci_exp_set_dev_base()函数将物理地址映射到一个线性地址,最后通过mmio_config_{read, write}{b, w, l}执行真正的读写。
1. get_base_addr()
00028/ *
00029: *Functions for accessing PCI configuration space with MMCONFIGaccesses
00030: */
00031: staticu32get_base_addr(unsignedint seg,int bus,unsigned devfn)
00032: {
00033: int cfg_num = - 1;
00034: struct acpi_table_mcfg_config*cfg;
00035:
00036: while(1){
00037: ++cfg_num;
00038: if (cfg_num>= pci_mmcfg_config_num) {
00039: break;
00040: }
00041: cfg = &pci_mmcfg_config[cfg_num];
00042: if (cfg->pci_segment_group_number ! = seg)
00043: continue;
00044: if ((cfg->start_bus_number <= bus)&&
00045: (cfg->end_bus_number >= bus))
00046: return cfg->base_address;
00047: }
00048:
00049: / * Handle more broken MCFG tableson Asus etc.
00050: They only contain a single entryfor bus 0- 0. Assume
00051: this applies to all busses. */
00052: cfg = &pci_mmcfg_config[0];
00053: if (pci_mmcfg_config_num== 1 &&
00054: cfg- >pci_segment_group_number ==0 &&
00055: (cfg->start_bus_number | cfg->end_bus_number) ==0)
00056: return cfg->base_address;
00057:
全局变量pci_mmcfg_config是所有PCI/PCI-E设备的MMIO映射表。该映射表是内核在初始化PCI总线时,根据BIOS POST的结果设置好的,这里内核直接读取其中已分配的值即可。
2. pci_exp_set_dev_base()
通过get_base_addr()获取到的地址是物理地址,为了能读取,还需通过函数pci_exp_set_dev_base(base, bus, devfn)将物理地址转换为逻辑地址。
00062:static inline void pci_exp_set_dev_base(unsignedint base,int bus,int
00062: devfn)
00063: {
00064: u32 dev_base = base|(bus<< 20) |(devfn<< 12);
00065: if (dev_base!= mmcfg_last_accessed_device) {
00066: mmcfg_last_accessed_device =dev_base;
00067: set_fixmap_nocache(FIX_PCIE_MCFG,dev_base);
00068: }
00069: }
文件include/asm-i386/fixmap.h。
00100: /*
00101: *Some hardwarewants to get fixmapped withoutcaching.
00102: */
00103: #defineset_fixmap_nocache(idx, phys)\
00104: __set_fixmap(idx,phys,PAGE_KERNEL_NOCACHE)
00105:
文件arch/i386/mm/pgtable.c。
00140:void __set_fixmap(enumfixed_addresses idx,unsigned longphys,pgprot_t
00140: flags)
00141: {
00142: unsigned long address= __fix_to_virt(idx);
00143:
00144: if (idx>= __end_of_fixed_addresses){
00145: BUG();
00146: return;
00147: }
00148: set_pte_pfn(address,phys >>PAGE_SHIFT, flags);
00149: }
00150:
00023: #definemmcfg_virt_addr ((void__iomem *)fix_to_virt(FIX_PCIE_MCFG))
2.4 用户接口在内核的实现
前面我们已经提到可以通过lspci和setpci命令来读写PCI/PCI-E配置空间。而这些命令的实现是基于内核提供的sysfs接口(/sys)或/proc接口。
内核为PCI/PCI-E总线提供的sysfs读写方法如下,定义在文件drivers/pci/pci-sysfs.c中。
00510: staticstruct bin_attributepci_config_attr ={
00511: .attr = {
00512: .name = "config",
00513: .mode = S_IRUGO| S_IWUSR,
00514: .owner = THIS_MODULE,
00515: },
00516: .size =256,
00517: .read = pci_read_config,
00518: .write = pci_write_config,
00519: };
00520:
00521: staticstruct bin_attributepcie_config_attr ={
00522: .attr = {
00523: .name = "config",
00524: .mode = S_IRUGO| S_IWUSR,
00525: .owner = THIS_MODULE,
00526: },
00527: .size =4096,
00528: .read = pci_read_config,
00529: .write = pci_write_config,
00530: };
00531:
pci_read_config()和pci_write_config()函数进而调用pci_user_{read,write}_config_{dword, word, byte}()。我们来看一下setpci命令执行时的内核栈信息(图3)和lspci命令执行时的内核栈信息(图4)。
由栈信息我们可以看出,setpci最终调用的是pci_conf1_write()函数,lspci最终调用的是pci_mmcfg_read()函数。也就是说,sysfs提供的读写接口,最终也是使用Port I/O和MMIO方式。
图3 pci_conf1_write()函数调用栈
图4 pci_mmcfg_read()函数调用栈