lxd容器之GPU发现和加载
lxd gpu设备发现:
// nvidiaGpuCards describes the per-card NVIDIA character device
// (/dev/nvidia[0-9]+) that is paired with a DRM card node.
type nvidiaGpuCards struct {
	path  string // device node path, e.g. /dev/nvidia0
	major int    // device major number
	minor int    // device minor number
	id    string // decimal string form of minor
}

// nvidiaGpuDevices describes an NVIDIA device node that is not one of the
// numbered per-card nodes, e.g. {/dev/nvidiactl, /dev/nvidia-uvm, ...}.
type nvidiaGpuDevices struct {
	path  string // device node path under /dev
	major int    // device major number
	minor int    // device minor number
}

// gpuDevice describes one DRM node such as /dev/dri/card0. If we detect that
// vendor == nvidia, then nvidia will contain the corresponding nvidia card,
// e.g. {/dev/dri/card1 --> /dev/nvidia1}.
type gpuDevice struct {
	vendorid  string // PCI vendor id without the "0x" prefix
	productid string // PCI device id without the "0x" prefix
	id        string // card id e.g. 0

	// If related devices have the same PCI address as the GPU we should
	// mount them all. Meaning if we detect /dev/dri/card0,
	// /dev/dri/controlD64, and /dev/dri/renderD128 with the same PCI
	// address, then they should all be made available in the container.
	pci string

	// nvidia is only filled in for NVIDIA card nodes whose /dev/nvidiaN
	// companion could be stat'ed; its path is empty otherwise.
	nvidia nvidiaGpuCards

	path  string // DRM node path, e.g. /dev/dri/card0
	major int    // major number read from the sysfs "dev" file
	minor int    // minor number read from the sysfs "dev" file
}

// isNvidiaGpu reports whether the device's PCI vendor id is NVIDIA's (10de),
// compared case-insensitively.
func (g *gpuDevice) isNvidiaGpu() bool {
	return strings.EqualFold(g.vendorid, "10de")
}

// cardIds remembers which card id was seen on which PCI address so that all
// DRM nodes on that address can be labelled with the same id.
type cardIds struct {
	id  string
	pci string
}

// Patterns are compiled once instead of on every directory entry.
var (
	// drmCardName matches DRM card nodes (card0, card1, ...).
	drmCardName = regexp.MustCompile("^card[0-9]+")
	// nvidiaAuxName matches /dev entries starting with "nvidia" that are not
	// the numbered per-card nodes.
	nvidiaAuxName = regexp.MustCompile(`^nvidia[^0-9]+`)
)

// devMajor extracts the major number from a Linux dev_t value. The layout
// splits the major across bits 8-19 and 44-63, so dividing by 256 is only
// correct for small device numbers.
func devMajor(dev uint64) int {
	major := (dev & 0x00000000000fff00) >> 8
	major |= (dev & 0xfffff00000000000) >> 32
	return int(major)
}

// devMinor extracts the minor number from a Linux dev_t value. The low 8 bits
// live in bits 0-7 and the remaining bits in 20-43.
func devMinor(dev uint64) int {
	minor := dev & 0x00000000000000ff
	minor |= (dev & 0x00000ffffff00000) >> 12
	return int(minor)
}

// parseMajMin parses the "major:minor" text found in a sysfs "dev" file.
// Surrounding whitespace is ignored. The boolean is false when the text is
// not exactly two integers separated by a colon.
func parseMajMin(s string) (int, int, bool) {
	fields := strings.Split(strings.TrimSpace(s), ":")
	if len(fields) != 2 {
		return 0, 0, false
	}

	major, err := strconv.Atoi(fields[0])
	if err != nil {
		return 0, 0, false
	}

	minor, err := strconv.Atoi(fields[1])
	if err != nil {
		return 0, 0, false
	}

	return major, minor, true
}

// statCharDevice stats path and returns the major and minor numbers of the
// character device found there. The boolean is false when the path cannot be
// stat'ed or is not a character device (for example a directory).
func statCharDevice(path string) (int, int, bool) {
	stat := syscall.Stat_t{}
	if err := syscall.Stat(path, &stat); err != nil {
		return 0, 0, false
	}

	if stat.Mode&syscall.S_IFMT != syscall.S_IFCHR {
		return 0, 0, false
	}

	dev := uint64(stat.Rdev)
	return devMajor(dev), devMinor(dev), true
}

// deviceLoadGpu scans /sys/bus/pci/devices for PCI devices that expose a
// "drm" subdirectory and returns one gpuDevice per DRM node found there,
// together with the NVIDIA helper device nodes found in /dev when at least
// one NVIDIA card was seen.
//
// It returns (nil, nil, nil) when the PCI sysfs directory does not exist.
// Any other error reading that directory, or reading /dev once an NVIDIA
// card has been detected, is returned to the caller. Individual devices
// whose sysfs files cannot be read or parsed are skipped.
func deviceLoadGpu() ([]gpuDevice, []nvidiaGpuDevices, error) {
	const pciDevicesPath = "/sys/bus/pci/devices"

	var gpus []gpuDevice
	var nvidiaDevices []nvidiaGpuDevices
	var cards []cardIds

	ents, err := ioutil.ReadDir(pciDevicesPath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil, nil
		}
		return nil, nil, err
	}

	isNvidia := false
	for _, ent := range ents {
		// The pci address == the name of the directory. So let's use
		// this cheap way of retrieving it.
		pciAddr := ent.Name()

		// Make sure that we are dealing with a GPU by looking whether
		// the "drm" subfolder exists. A device whose "drm" folder is
		// missing or unreadable contributes no nodes.
		drm := filepath.Join(pciDevicesPath, pciAddr, "drm")
		drmEnts, err := ioutil.ReadDir(drm)
		if err != nil {
			continue
		}

		// Retrieve vendor ID. Skip the device on any read error rather
		// than recording a GPU with a blank vendor.
		vendorRaw, err := ioutil.ReadFile(filepath.Join(pciDevicesPath, pciAddr, "vendor"))
		if err != nil {
			continue
		}

		// Retrieve device ID, with the same error policy.
		productRaw, err := ioutil.ReadFile(filepath.Join(pciDevicesPath, pciAddr, "device"))
		if err != nil {
			continue
		}

		// The IDs are identical for every DRM node of this PCI device, so
		// normalise them once: trim whitespace and drop a leading "0x".
		vendorID := strings.TrimPrefix(strings.TrimSpace(string(vendorRaw)), "0x")
		productID := strings.TrimPrefix(strings.TrimSpace(string(productRaw)), "0x")

		// Store all associated subdevices, e.g. controlD64, renderD128.
		// The name of the directory == the last part of the
		// /dev/dri/controlD64 path. So drmEnt.Name() will give us
		// controlD64.
		for _, drmEnt := range drmEnts {
			name := drmEnt.Name()

			majMinRaw, err := ioutil.ReadFile(filepath.Join(drm, name, "dev"))
			if err != nil {
				continue
			}

			major, minor, ok := parseMajMin(string(majMinRaw))
			if !ok {
				continue
			}

			tmpGpu := gpuDevice{
				pci:       pciAddr,
				vendorid:  vendorID,
				productid: productID,
				path:      filepath.Join("/dev/dri", name),
				major:     major,
				minor:     minor,
			}

			isCard := drmCardName.MatchString(name)
			if isCard {
				// If it is a card its minor number will be its id.
				tmpGpu.id = strconv.Itoa(minor)
				cards = append(cards, cardIds{id: tmpGpu.id, pci: tmpGpu.pci})
			}

			// Find matching /dev/nvidia* entry for /dev/dri/card*
			if isCard && tmpGpu.isNvidiaGpu() {
				isNvidia = true

				// NOTE(review): /dev/dri/cardN is paired with /dev/nvidiaN
				// by DRM minor; confirm this holds on hosts that mix GPU
				// vendors.
				nvidiaPath := "/dev/nvidia" + strconv.Itoa(tmpGpu.minor)
				nvMajor, nvMinor, ok := statCharDevice(nvidiaPath)
				if !ok {
					// Without its companion node the card is not reported.
					continue
				}

				tmpGpu.nvidia = nvidiaGpuCards{
					path:  nvidiaPath,
					major: nvMajor,
					minor: nvMinor,
					id:    strconv.Itoa(nvMinor),
				}
			}

			gpus = append(gpus, tmpGpu)
		}
	}

	// We detected a Nvidia card, so let's collect all other nvidia devices
	// that are not /dev/nvidia[0-9]+.
	if isNvidia {
		devEnts, err := ioutil.ReadDir("/dev")
		if err != nil {
			return nil, nil, err
		}

		for _, devEnt := range devEnts {
			if !nvidiaAuxName.MatchString(devEnt.Name()) {
				continue
			}

			nvidiaPath := filepath.Join("/dev", devEnt.Name())
			major, minor, ok := statCharDevice(nvidiaPath)
			if !ok {
				continue
			}

			nvidiaDevices = append(nvidiaDevices, nvidiaGpuDevices{
				path:  nvidiaPath,
				major: major,
				minor: minor,
			})
		}
	}

	// Since we'll give users the ability to specify an id we need to group
	// devices on the same PCI that belong to the same card by id. When
	// several cards share a PCI address the last one seen wins.
	cardIDByPci := make(map[string]string, len(cards))
	for _, card := range cards {
		cardIDByPci[card.pci] = card.id
	}

	for i := range gpus {
		if id, ok := cardIDByPci[gpus[i].pci]; ok {
			gpus[i].id = id
		}
	}

	return gpus, nvidiaDevices, nil
}
lxd gpu设备加载:由下面的代码可见,
最终是否加载取决于 REST 接口创建请求的 request body 中 config.devices.type 是否为 gpu,以及指定的属性(vendorid、pci、productid、id)是否与发现到的设备一致。那么客户端又是如何知道 vendorid、pci 等信息的?实际中一般需要建立 GPU 资源池,GPU 元数据由上层管理,并通过一定的调度规则来指定。而 GPU 资源的发现可以通过类似上面的函数完成,也可以通过 lspci 命令完成。
else if m["type"] == "gpu" { if gpus == nil {
// Excerpt from a larger routine (the enclosing function is not shown): this
// branch handles a device entry whose "type" is "gpu". The host GPU list is
// only loaded while gpus is still nil; a discovery error aborts with "".
gpus, nvidiaDevices, err = deviceLoadGpu()
if err != nil {
return "", err
}
} sawNvidia := false
for _, gpu := range gpus {
// Whether a discovered GPU is passed through depends on the device entry:
// each of vendorid, pci, productid and id that is set (non-empty) in m must
// equal the corresponding field of the discovered GPU, otherwise it is skipped.
if (m["vendorid"] != "" && gpu.vendorid != m["vendorid"]) ||
(m["pci"] != "" && gpu.pci != m["pci"]) ||
(m["productid"] != "" && gpu.productid != m["productid"]) ||
(m["id"] != "" && gpu.id != m["id"]) {
continue
// Expose the DRM node itself; the final true is createMustSucceed, so a
// creation failure is returned instead of being ignored.
} err := c.setupUnixDevice(k, m, gpu.major, gpu.minor, gpu.path, true)
if err != nil {
return "", err
// An empty nvidia.path means no /dev/nvidiaN companion was recorded for
// this node, so there is nothing more to expose for it.
} if gpu.nvidia.path == "" {
continue
// Expose the companion /dev/nvidiaN node as well.
} err = c.setupUnixDevice(k, m, gpu.nvidia.major, gpu.nvidia.minor, gpu.nvidia.path, true)
if err != nil {
return "", err
} sawNvidia = true
// Once at least one NVIDIA companion node was set up, also expose every
// entry of nvidiaDevices (the second value returned by deviceLoadGpu).
} if sawNvidia {
for _, gpu := range nvidiaDevices {
err := c.setupUnixDevice(k, m, gpu.major, gpu.minor, gpu.path, true)
if err != nil {
return "", err
}
}
}
}
// setupUnixDevice() creates the unix device and sets up the necessary low-level
// liblxc configuration items.
func (c *containerLXC) setupUnixDevice(devType string, dev types.Device, major int, minor int, path string, createMustSucceed bool) error {
if c.IsPrivileged() && !runningInUserns && cgDevicesController {
//设置设备访问白名单
err := lxcSetConfigItem(c.c, "lxc.cgroup.devices.allow", fmt.Sprintf("c %d:%d rwm", major, minor))
if err != nil {
return err
}
} temp := types.Device{}
if err := shared.DeepCopy(&dev, &temp); err != nil {
return err
} temp["major"] = fmt.Sprintf("%d", major)
temp["minor"] = fmt.Sprintf("%d", minor)
temp["path"] = path paths, err := c.createUnixDevice(temp)
if err != nil {
shared.LogDebug("failed to create device", log.Ctx{"err": err, "device": devType})
if createMustSucceed {
return err
}
return nil
}
devPath := paths[0]
tgtPath := paths[1]
//设置挂载对象
err = lxcSetConfigItem(c.c, "lxc.mount.entry", fmt.Sprintf("%s %s none bind,create=file", devPath, tgtPath))
if err != nil {
return err
}
return nil
}
// Unix devices handling
func (c *containerLXC) createUnixDevice(m types.Device) ([]string, error) {
var err error
var major, minor int // Our device paths
srcPath := m["path"]
tgtPath := strings.TrimPrefix(srcPath, "/")
devName := fmt.Sprintf("unix.%s", strings.Replace(tgtPath, "/", "-", -1))
devPath := filepath.Join(c.DevicesPath(), devName)//var/lib/lxd/devices/容器名称/xxxx // Extra checks for nesting
if runningInUserns {
for key, value := range m {
if shared.StringInSlice(key, []string{"major", "minor", "mode", "uid", "gid"}) && value != "" {
return nil, fmt.Errorf("The \"%s\" property may not be set when adding a device to a nested container", key)
}
}
} // Get the major/minor of the device we want to create
if m["major"] == "" && m["minor"] == "" {
// If no major and minor are set, use those from the device on the host
_, major, minor, err = deviceGetAttributes(srcPath)
if err != nil {
return nil, fmt.Errorf("Failed to get device attributes for %s: %s", m["path"], err)
}
} else if m["major"] == "" || m["minor"] == "" {
return nil, fmt.Errorf("Both major and minor must be supplied for device: %s", m["path"])
} else {
major, err = strconv.Atoi(m["major"])
if err != nil {
return nil, fmt.Errorf("Bad major %s in device %s", m["major"], m["path"])
} minor, err = strconv.Atoi(m["minor"])
if err != nil {
return nil, fmt.Errorf("Bad minor %s in device %s", m["minor"], m["path"])
}
} // Get the device mode
mode := os.FileMode(0660)
if m["mode"] != "" {
tmp, err := deviceModeOct(m["mode"])
if err != nil {
return nil, fmt.Errorf("Bad mode %s in device %s", m["mode"], m["path"])
}
mode = os.FileMode(tmp)
} if m["type"] == "unix-block" {
mode |= syscall.S_IFBLK
} else {
mode |= syscall.S_IFCHR
} // Get the device owner
uid := 0
gid := 0 if m["uid"] != "" {
uid, err = strconv.Atoi(m["uid"])
if err != nil {
return nil, fmt.Errorf("Invalid uid %s in device %s", m["uid"], m["path"])
}
} if m["gid"] != "" {
gid, err = strconv.Atoi(m["gid"])
if err != nil {
return nil, fmt.Errorf("Invalid gid %s in device %s", m["gid"], m["path"])
}
} // Create the devices directory if missing
if !shared.PathExists(c.DevicesPath()) {
os.Mkdir(c.DevicesPath(), 0711)
if err != nil {
return nil, fmt.Errorf("Failed to create devices path: %s", err)
}
} // Clean any existing entry
if shared.PathExists(devPath) {
if runningInUserns {
syscall.Unmount(devPath, syscall.MNT_DETACH)
} err = os.Remove(devPath)
if err != nil {
return nil, fmt.Errorf("Failed to remove existing entry: %s", err)
}
} // Create the new entry
if !runningInUserns {
if err := syscall.Mknod(devPath, uint32(mode), minor|(major<<8)); err != nil {
return nil, fmt.Errorf("Failed to create device %s for %s: %s", devPath, m["path"], err)
} if err := os.Chown(devPath, uid, gid); err != nil {
return nil, fmt.Errorf("Failed to chown device %s: %s", devPath, err)
} // Needed as mknod respects the umask
if err := os.Chmod(devPath, mode); err != nil {
return nil, fmt.Errorf("Failed to chmod device %s: %s", devPath, err)
} if c.idmapset != nil {
if err := c.idmapset.ShiftFile(devPath); err != nil {
// uidshift failing is weird, but not a big problem. Log and proceed
shared.LogDebugf("Failed to uidshift device %s: %s\n", m["path"], err)
}
}
} else {
f, err := os.Create(devPath)
if err != nil {
return nil, err
}
f.Close() err = deviceMountDisk(srcPath, devPath, false, false)
if err != nil {
return nil, err
}
} return []string{devPath, tgtPath}, nil
}
func deviceMountDisk(srcPath string, dstPath string, readonly bool, recursive bool) error {
var err error // Prepare the mount flags
flags := 0
if readonly {
flags |= syscall.MS_RDONLY
} // Detect the filesystem
fstype := "none"
if deviceIsBlockdev(srcPath) {
fstype, err = shared.BlockFsDetect(srcPath)
if err != nil {
return err
}
} else {
flags |= syscall.MS_BIND
if recursive {
flags |= syscall.MS_REC
}
} // Mount the filesystem
if err = syscall.Mount(srcPath, dstPath, fstype, uintptr(flags), ""); err != nil {
return fmt.Errorf("Unable to mount %s at %s: %s", srcPath, dstPath, err)
} flags = syscall.MS_REC | syscall.MS_SLAVE
if err = syscall.Mount("", dstPath, "", uintptr(flags), ""); err != nil {
return fmt.Errorf("unable to make mount %s private: %s", dstPath, err)
} return nil
}
lxd容器之GPU发现和加载的更多相关文章
- 使用Pytorch在多GPU下保存和加载训练模型参数遇到的问题
最近使用Pytorch在学习一个深度学习项目,在模型保存和加载过程中遇到了问题,最终通过在网卡查找资料得已解决,故以此记之,以备忘却. 首先,是在使用多GPU进行模型训练的过程中,在保存模型参数时,应 ...
- Java并发编程:并发容器之CopyOnWriteArrayList(转载)
Java并发编程:并发容器之CopyOnWriteArrayList(转载) 原文链接: http://ifeve.com/java-copy-on-write/ Copy-On-Write简称COW ...
- Java并发编程:并发容器之CopyOnWriteArrayList
转载: Java并发编程:并发容器之CopyOnWriteArrayList Copy-On-Write简称COW,是一种用于程序设计中的优化策略.其基本思路是,从一开始大家都在共享同一个内容,当某个 ...
- linux 编译,链接和加载
1. 序 最近在折腾各种.so,碰到了一些问题,一开始对于很多错误也没有头绪,茫然不知所措.索性化了一天多时间将<<程序员的自我修养—链接.装载与库>>中部分内容略读了一遍 ...
- 源码跟读,Spring是如何解析和加载xml中配置的beans
Spring版本基于: 跟踪代码源码基于: https://github.com/deng-cc/KeepLearning commit id:c009ce47bd19e1faf9e07f12086c ...
- ionic 在windows环境下更换logo和加载图片的问题
做用自己的电脑做ionic项目时,更换logo和加载图片时,无论使用哪种命令,发现都上传不了,并且报错 最后发现,需要将 icon和splash两个文件改为.ai格式才能上传成功. 这是最终生成后的文 ...
- 【转】Java并发编程:并发容器之CopyOnWriteArrayList
Copy-On-Write简称COW,是一种用于程序设计中的优化策略.其基本思路是,从一开始大家都在共享同一个内容,当某个人想要修改这个内容的时候,才会真正把内容Copy出去形成一个新的内容然后再改, ...
- Kettle实现数据抽取、转换、装入和加载数据-数据转移ETL工具
原文地址:http://www.xue51.com/soft/5341.html Kettle是来自国外的一款开源的ETL工具,纯java编写,可以在Window.Linux.Unix上运行,绿色无需 ...
- spring揭密学习笔记(3)-spring ioc容器:Spring的IoC容器之BeanFactory
1. Spring的IoC容器和IoC Service Provider的关系 Spring的IoC容器和IoC Service Provider所提供的服务之间存在一定的交集,二者的关系如图4-1所 ...
随机推荐
- iOS 编程小知识 之 本地化
1. 使用本地化多语言 有时候,在网上下载的Demo,有本地化的处理,默认的本地化都是英文,这时候,可以考虑这么处理: info.plist->Infomation Property List ...
- ReactiveCocoa学习笔记--用法
1.监测UI变量的变化 return 后把值传递下去. 1.1.输出 [self.usernameTextField.rac_textSignal subscribeNext:^(id x){ NSL ...
- 移动App Crash的测试用例设计
一些通用的触发移动App Crash的测试场景,如下: 1. 验证在有不同的屏幕分辨率, 操作系统 和运营商的多个设备上的App行为. 2. 用新发布的操作系统版本验证App的行为. 3. 验证在如隧 ...
- iOS 错误 之 Potential leak of an object stored into 'cs'
存储到 “cs”的对象存在潜在的泄露
- Angular - - $interpolate 和 $parse
$interpolate 将一个字符串编译成一个插值函数.HTML编译服务使用这个服务完成数据绑定. 使用:$interpolate(text,[mustHaveExpression],[truste ...
- Win10下CISCO VPN Client无法安装解决方案
Cisco vpn client 在Windows升级到Windows 10 之后无法正常安装使用,在这种情况下:1.先安装Dell SonicWALL Global VPN Client(GVCSe ...
- List与Linkedlist、Arrylist、Vector、Map应用
1.List与LinkedList List是数组链表 LinkedList是指针链表 选择List还是LinkedList要看你的使用特点. 数组链表访问快,复 ...
- 细说WPF数据绑定
简单的事例: <Slider Name="mySlider" Height="28" HorizontalAlignment="Left&q ...
- 常用的.net开源项目
Json.NET http://json.codeplex.com/ Json.Net 是一个读写Json效率比较高的.Net框架.Json.Net 使得在.Net环境下使用Json更加简单.通过Li ...
- Spring context:component-scan代替context:annotation-config
Spring context:component-scan代替context:annotation-config XML: <?xml version="1.0" encod ...