From 95fbe354dd8af31eee2133b479b3a72c52535640 Mon Sep 17 00:00:00 2001 From: SamYuan1990 Date: Fri, 22 Aug 2025 20:11:16 +0800 Subject: [PATCH] example with mkdocs Signed-off-by: SamYuan1990 --- data/adopters.yaml | 7 + docs/README.md | 4 - docs/{ => en}/FAQ/FAQ.md | 0 .../2024-12-18-support-blog-post/index.md | 0 .../en/blog}/2024-12-31-post/index.md | 0 .../en/changelog}/source/authors.json | 0 .../en/changelog}/source/v2.5.0.md | 0 .../en/changelog}/source/v2.5.1.md | 0 docs/{ => en}/contributor/adopters.md | 0 docs/{ => en}/contributor/cherry-picks.md | 0 docs/{ => en}/contributor/contribute-docs.md | 0 docs/{ => en}/contributor/contributers.md | 0 docs/{ => en}/contributor/contributing.md | 0 docs/{ => en}/contributor/github-workflow.md | 0 docs/{ => en}/contributor/goverance.md | 0 docs/{ => en}/contributor/ladder.md | 0 docs/{ => en}/contributor/lifted.md | 0 docs/{ => en}/core-concepts/architecture.md | 0 docs/{ => en}/core-concepts/introduction.md | 0 docs/{ => en}/developers/Dynamic-mig.md | 0 docs/{ => en}/developers/HAMi-core-design.md | 0 docs/{ => en}/developers/build.md | 0 .../{ => en}/developers/kunlunxin-topology.md | 0 docs/{ => en}/developers/mindmap.md | 0 docs/{ => en}/developers/protocol.md | 0 docs/{ => en}/developers/scheduling.md | 0 docs/{ => en}/get-started/deploy-with-helm.md | 0 .../{ => en}/installation/aws-installation.md | 0 .../installation/how-to-use-volcano-vgpu.md | 0 .../installation/offline-installation.md | 0 .../installation/online-installation.md | 0 docs/{ => en}/installation/prequisities.md | 0 docs/{ => en}/installation/uninstall.md | 0 docs/{ => en}/installation/upgrade.md | 0 .../installation/webui-installation.md | 0 .../key-features/device-resource-isolation.md | 0 docs/{ => en}/key-features/device-sharing.md | 0 .../resources/HAMI-VGPU-mind-map-English.png | Bin .../Karmada-logo-horizontal-color.png | Bin .../administrator/prometheus/grafana.png | Bin .../adoptions-ci123-architecture.png | Bin .../resources/adoptions-ci123-aries.png | Bin .../adoptions-ci123-automation-cluster-en.png | Bin .../adoptions-ci123-automation-cluster-zh.png | Bin ...options-ci123-capability-visualization.png | Bin .../adoptions-ci123-cluster-inspection.png | Bin .../adoptions-ci123-gpu-resources.png | Bin .../adoptions-ci123-msp-multicluster-1.png | Bin .../adoptions-ci123-msp-multicluster-2.png | Bin ...doptions-ci123-multicluster-capability.png | Bin .../resources/adoptions-ci123-override.png | Bin .../adoptions-ci123-sequence-status.png | Bin .../adoptions-ci123-unified-view-1.png | Bin .../adoptions-ci123-unified-view-2.png | Bin .../resources/adoptions-ci123-velero.png | Bin .../adoptions-vipkid-architecture-en.png | Bin .../adoptions-vipkid-architecture-zh.png | Bin docs/{ => en}/resources/architect.jpg | Bin docs/{ => en}/resources/architecture.drawio | 0 docs/{ => en}/resources/architecture.png | Bin .../resources/argocd-new-app-cluster.png | Bin .../resources/argocd-new-app-name.png | Bin .../resources/argocd-new-app-repo.png | Bin docs/{ => en}/resources/argocd-new-app.png | Bin .../resources/argocd-register-karmada.png | Bin .../resources/argocd-status-aggregated.png | Bin .../resources/argocd-status-overview.png | Bin .../argocd-status-resourcebinding.png | Bin docs/{ => en}/resources/argocd-sync-apps.png | Bin .../binding-controller-process.drawio | 0 .../resources/binding-controller-process.png | Bin .../ci123/adoptions-ci123-architecture.png | Bin .../ci123/adoptions-ci123-aries.png | Bin .../adoptions-ci123-automation-cluster-en.png | Bin 
.../adoptions-ci123-automation-cluster-zh.png | Bin ...options-ci123-capability-visualization.png | Bin .../adoptions-ci123-cluster-inspection.png | Bin .../ci123/adoptions-ci123-gpu-resources.png | Bin .../adoptions-ci123-msp-multicluster-1.png | Bin .../adoptions-ci123-msp-multicluster-2.png | Bin ...doptions-ci123-multicluster-capability.png | Bin .../ci123/adoptions-ci123-override.png | Bin .../ci123/adoptions-ci123-sequence-status.png | Bin .../ci123/adoptions-ci123-unified-view-1.png | Bin .../ci123/adoptions-ci123-unified-view-2.png | Bin .../ci123/adoptions-ci123-velero.png | Bin .../vipkid/adoptions-vipkid-architecture.png | Bin .../cluster-controller-process.drawio | 0 .../resources/cluster-controller-process.png | Bin docs/{ => en}/resources/cncf-logo.png | Bin .../resources/contributor/click-next.png | Bin .../resources/contributor/debug-docs.png | Bin .../resources/contributor/git_workflow.png | Bin docs/{ => en}/resources/demo-3in1.svg | 0 .../resources/developers/grafana_metrics.png | Bin .../resources/device_registration.png | Bin docs/{ => en}/resources/example.png | Bin .../execution-controller-process.drawio | 0 .../execution-controller-process.png | Bin .../general/Karmada-logo-horizontal-color.png | Bin .../resources/general/architecture.drawio | 0 .../resources/general/architecture.png | Bin .../general/binding-controller-process.drawio | 0 .../general/binding-controller-process.png | Bin .../general/cluster-controller-process.drawio | 0 .../general/cluster-controller-process.png | Bin docs/{ => en}/resources/general/cncf-logo.png | Bin docs/{ => en}/resources/general/demo-3in1.svg | 0 .../execution-controller-process.drawio | 0 .../general/execution-controller-process.png | Bin .../general/karmada-resource-relation.drawio | 0 .../general/karmada-resource-relation.png | Bin .../general/object-association-map.drawio | 0 .../general/object-association-map.png | Bin .../general/policy-controller-process.drawio | 0 .../general/policy-controller-process.png | Bin .../resources/general/sample-nginx.svg | 0 docs/{ => en}/resources/git_workflow.png | Bin .../resources/gpu-scheduler-policy-demo.png | Bin docs/{ => en}/resources/grafana.png | Bin docs/{ => en}/resources/hami-arch.png | Bin .../{ => en}/resources/hami-core-position.png | Bin .../resources/hami-dynamic-mig-procedure.png | Bin .../resources/hami-dynamic-mig-structure.png | Bin docs/{ => en}/resources/hard_limit.jpg | Bin .../generate_cert/csr_config/admin.conf | 0 .../etcd/apiserver-etcd-client.conf | 0 .../csr_config/etcd/healthcheck-client.conf | 0 .../generate_cert/csr_config/etcd/peer.conf | 0 .../generate_cert/csr_config/etcd/server.conf | 0 .../csr_config/front-proxy-client.conf | 0 .../generate_cert/csr_config/karmada.conf | 0 .../csr_config/kube-apiserver.conf | 0 .../csr_config/kube-controller-manager.conf | 0 .../generate_cert/generate_ca.sh | 0 .../generate_cert/generate_etcd.sh | 0 .../generate_cert/generate_leaf.sh | 0 .../install-binary/generate_cert/util.sh | 0 .../other_scripts/check_status.sh | 0 .../other_scripts/create_kubeconfig_file.sh | 0 .../istio-on-karmada-different-network.png | Bin docs/{ => en}/resources/istio-on-karmada.png | Bin .../karmada-resource-relation.drawio | 0 .../resources/karmada-resource-relation.png | Bin .../key-features/cluster-failover.png | Bin .../key-features/overall-relationship.png | Bin .../key-features/overall-rescheduling.png | Bin .../key-features/overall-scheduling.png | Bin .../key-features/service-governance.png | Bin 
.../resources/key-features/unified-access.png | Bin .../key-features/unified-operation.png | Bin .../key-features/unified-resourcequota.png | Bin .../resources/key-features/unified-search.png | Bin docs/{ => en}/resources/kunlunxin_filter.png | Bin docs/{ => en}/resources/kunlunxin_topo.jpg | Bin docs/{ => en}/resources/metax_binpack.jpg | Bin docs/{ => en}/resources/metax_spread.jpg | Bin docs/{ => en}/resources/metax_topo.jpg | Bin .../resources/node-shceduler-policy-demo.png | Bin .../resources/object-association-map.drawio | 0 .../resources/object-association-map.png | Bin .../policy-controller-process.drawio | 0 .../resources/policy-controller-process.png | Bin docs/{ => en}/resources/sample-nginx.svg | 0 docs/{ => en}/resources/sample_nvidia-smi.png | Bin .../resources/scheduler-policy-story.png | Bin docs/{ => en}/resources/task_dispatch.png | Bin .../cicd/argocd/argocd-new-app-cluster.png | Bin .../cicd/argocd/argocd-new-app-name.png | Bin .../cicd/argocd/argocd-new-app-repo.png | Bin .../userguide/cicd/argocd/argocd-new-app.png | Bin .../cicd/argocd/argocd-register-karmada.png | Bin .../cicd/argocd/argocd-status-aggregated.png | Bin .../cicd/argocd/argocd-status-overview.png | Bin .../argocd/argocd-status-resourcebinding.png | Bin .../cicd/argocd/argocd-sync-apps.png | Bin .../userguide/failover/failover-overview.png | Bin .../istio-on-karmada-different-network.png | Bin .../service/istio/istio-on-karmada.png | Bin .../troubleshooting/troubleshooting.md | 0 .../enable-awsneuron-managing.md | 0 .../examples/allocate-neuron-core.md | 0 .../examples/allocate-neuron-device.md | 0 .../Ascend-device/device-template.md | 0 .../Ascend-device/enable-ascend-sharing.md | 0 .../Ascend-device/examples/allocate-310p.md | 0 .../Ascend-device/examples/allocate-910b.md | 0 .../examples/allocate-exclusive.md | 0 .../enable-cambricon-mlu-sharing.md | 0 .../examples/allocate-core-and-memory.md | 0 .../examples/allocate-exclusive.md | 0 .../specify-device-core-usage.md | 0 .../specify-device-memory-usage.md | 0 .../specify-device-type-to-use.md | 0 docs/{ => en}/userguide/Device-supported.md | 0 .../enable-enflame-gcu-sharing.md | 0 .../Hygon-device/enable-hygon-dcu-sharing.md | 0 .../examples/allocate-core-and-memory.md | 0 .../examples/allocate-exclusive.md | 0 .../examples/specify-certain-cards.md | 0 .../Hygon-device/specify-device-core-usage.md | 0 .../specify-device-memory-usage.md | 0 .../specify-device-uuid-to-use.md | 0 .../enable-illuvatar-gpu-sharing.md | 0 ...ate-device-core-and-memory-to-container.md | 0 .../examples/allocate-exclusive.md | 0 .../enable-kunlunxin-schedule.md | 0 .../Metax-GPU/enable-metax-gpu-schedule.md | 0 .../Metax-GPU/examples/allocate-binpack.md | 0 .../Metax-GPU/examples/allocate-spread.md | 0 .../Metax-GPU/examples/default-use.md | 0 .../Metax-GPU/specify-binpack-task.md | 0 .../Metax-GPU/specify-spread-task.md | 0 .../Metax-sGPU/enable-metax-gpu-sharing.md | 0 .../Metax-sGPU/examples/allocate-exclusive.md | 0 .../examples/allocate-qos-policy.md | 0 .../Metax-sGPU/examples/default-use.md | 0 .../enable-mthreads-gpu-sharing.md | 0 .../examples/allocate-core-and-memory.md | 0 .../examples/allocate-exclusive.md | 0 .../specify-device-core-usage.md | 0 .../specify-device-memory-usage.md | 0 .../NVIDIA-device/dynamic-mig-support.md | 0 .../examples/allocate-device-core.md | 0 .../examples/allocate-device-memory.md | 0 .../examples/allocate-device-memory2.md | 0 .../examples/dynamic-mig-example.md | 0 .../examples/specify-card-type-to-use.md | 0 
.../examples/specify-certain-card.md | 0 .../examples/use-exclusive-card.md | 0 .../specify-device-core-usage.md | 0 .../specify-device-memory-usage.md | 0 .../specify-device-type-to-use.md | 0 .../specify-device-uuid-to-use.md | 0 docs/{ => en}/userguide/configure.md | 0 .../userguide/monitoring/device-allocation.md | 0 .../userguide/monitoring/globalview.md | 0 .../monitoring/real-time-device-usage.md | 0 .../userguide/monitoring/real-time-usage.md | 0 .../NVIDIA-GPU/examples/default_use.md | 0 .../NVIDIA-GPU/examples/use_exclusive_gpu.md | 0 .../NVIDIA-GPU/how-to-use-volcano-vgpu.md | 0 .../volcano-vgpu/NVIDIA-GPU/monitor.md | 0 docs/index.md | 37 + docs/logo.svg | 7030 +++++++++++++++++ docs/zh/FAQ/FAQ.md | 194 + .../2024-12-18-support-blog-post/index.md | 54 + docs/zh/blog/2024-12-31-post/index.md | 1799 +++++ docs/zh/contributor/adopters.md | 32 + docs/zh/contributor/cherry-picks.md | 86 + docs/zh/contributor/contribute-docs.md | 174 + docs/zh/contributor/contributers.md | 24 + docs/zh/contributor/contributing.md | 102 + docs/zh/contributor/github-workflow.md | 258 + docs/zh/contributor/goverance.md | 46 + docs/zh/contributor/ladder.md | 182 + docs/zh/contributor/lifted.md | 121 + docs/zh/core-concepts/architecture.md | 23 + docs/zh/core-concepts/introduction.md | 47 + docs/zh/developers/Dynamic-mig.md | 164 + docs/zh/developers/HAMi-core-design.md | 30 + docs/zh/developers/build.md | 95 + docs/zh/developers/kunlunxin-topology.md | 52 + docs/zh/developers/mindmap.md | 8 + docs/zh/developers/protocol.md | 37 + docs/zh/developers/scheduling.md | 169 + docs/zh/get-started/deploy-with-helm.md | 188 + docs/zh/index.md | 37 + docs/zh/installation/aws-installation.md | 54 + .../installation/how-to-use-volcano-vgpu.md | 126 + docs/zh/installation/offline-installation.md | 58 + docs/zh/installation/online-installation.md | 42 + docs/zh/installation/prequisities.md | 88 + docs/zh/installation/uninstall.md | 12 + docs/zh/installation/upgrade.md | 14 + docs/zh/installation/webui-installation.md | 107 + .../key-features/device-resource-isolation.md | 18 + docs/zh/key-features/device-sharing.md | 12 + docs/zh/releases.md | 67 + docs/zh/roadmap.md | 45 + docs/zh/troubleshooting/troubleshooting.md | 12 + .../enable-awsneuron-managing.md | 132 + .../examples/allocate-neuron-core.md | 26 + .../examples/allocate-neuron-device.md | 26 + .../Ascend-device/device-template.md | 66 + .../Ascend-device/enable-ascend-sharing.md | 109 + .../Ascend-device/examples/allocate-310p.md | 28 + .../Ascend-device/examples/allocate-910b.md | 24 + .../examples/allocate-exclusive.md | 21 + .../enable-cambricon-mlu-sharing.md | 80 + .../examples/allocate-core-and-memory.md | 37 + .../examples/allocate-exclusive.md | 35 + .../specify-device-core-usage.md | 18 + .../specify-device-memory-usage.md | 17 + .../specify-device-type-to-use.md | 16 + docs/zh/userguide/Device-supported.md | 16 + .../enable-enflame-gcu-sharing.md | 122 + .../Hygon-device/enable-hygon-dcu-sharing.md | 76 + .../examples/allocate-core-and-memory.md | 28 + .../examples/allocate-exclusive.md | 26 + .../examples/specify-certain-cards.md | 25 + .../Hygon-device/specify-device-core-usage.md | 16 + .../specify-device-memory-usage.md | 15 + .../specify-device-uuid-to-use.md | 18 + .../enable-illuvatar-gpu-sharing.md | 153 + ...ate-device-core-and-memory-to-container.md | 35 + .../examples/allocate-exclusive.md | 35 + .../Metax-GPU/enable-metax-gpu-schedule.md | 65 + .../Metax-GPU/examples/allocate-binpack.md | 24 + .../Metax-GPU/examples/allocate-spread.md | 
24 + .../Metax-GPU/examples/default-use.md | 22 + .../Metax-GPU/specify-binpack-task.md | 12 + .../Metax-GPU/specify-spread-task.md | 12 + .../Metax-sGPU/enable-metax-gpu-sharing.md | 47 + .../Metax-sGPU/examples/allocate-exclusive.md | 22 + .../examples/allocate-qos-policy.md | 32 + .../Metax-sGPU/examples/default-use.md | 26 + .../enable-mthreads-gpu-sharing.md | 69 + .../examples/allocate-core-and-memory.md | 28 + .../examples/allocate-exclusive.md | 26 + .../specify-device-core-usage.md | 15 + .../specify-device-memory-usage.md | 15 + .../NVIDIA-device/dynamic-mig-support.md | 181 + .../examples/allocate-device-core.md | 26 + .../examples/allocate-device-memory.md | 26 + .../examples/allocate-device-memory2.md | 26 + .../examples/dynamic-mig-example.md | 25 + .../examples/specify-card-type-to-use.md | 28 + .../examples/specify-certain-card.md | 25 + .../examples/use-exclusive-card.md | 23 + .../specify-device-core-usage.md | 17 + .../specify-device-memory-usage.md | 26 + .../specify-device-type-to-use.md | 23 + .../specify-device-uuid-to-use.md | 18 + docs/zh/userguide/configure.md | 72 + .../userguide/monitoring/device-allocation.md | 25 + docs/zh/userguide/monitoring/globalview.md | 0 .../monitoring/real-time-device-usage.md | 23 + .../userguide/monitoring/real-time-usage.md | 6 + docs/zh/userguide/support-devices.md | 0 .../NVIDIA-GPU/examples/default_use.md | 27 + .../NVIDIA-GPU/examples/use_exclusive_gpu.md | 26 + .../NVIDIA-GPU/how-to-use-volcano-vgpu.md | 142 + .../volcano-vgpu/NVIDIA-GPU/monitor.md | 23 + mkdocs.yml | 58 + overrides/partials/footer.html | 14 + pyproject.toml | 80 + templates/adopters.md | 21 + 348 files changed, 14251 insertions(+), 4 deletions(-) create mode 100644 data/adopters.yaml delete mode 100644 docs/README.md rename docs/{ => en}/FAQ/FAQ.md (100%) rename {blog => docs/en/blog}/2024-12-18-support-blog-post/index.md (100%) rename {blog => docs/en/blog}/2024-12-31-post/index.md (100%) rename {changelog => docs/en/changelog}/source/authors.json (100%) rename {changelog => docs/en/changelog}/source/v2.5.0.md (100%) rename {changelog => docs/en/changelog}/source/v2.5.1.md (100%) rename docs/{ => en}/contributor/adopters.md (100%) rename docs/{ => en}/contributor/cherry-picks.md (100%) rename docs/{ => en}/contributor/contribute-docs.md (100%) rename docs/{ => en}/contributor/contributers.md (100%) rename docs/{ => en}/contributor/contributing.md (100%) rename docs/{ => en}/contributor/github-workflow.md (100%) rename docs/{ => en}/contributor/goverance.md (100%) rename docs/{ => en}/contributor/ladder.md (100%) rename docs/{ => en}/contributor/lifted.md (100%) rename docs/{ => en}/core-concepts/architecture.md (100%) rename docs/{ => en}/core-concepts/introduction.md (100%) rename docs/{ => en}/developers/Dynamic-mig.md (100%) rename docs/{ => en}/developers/HAMi-core-design.md (100%) rename docs/{ => en}/developers/build.md (100%) rename docs/{ => en}/developers/kunlunxin-topology.md (100%) rename docs/{ => en}/developers/mindmap.md (100%) rename docs/{ => en}/developers/protocol.md (100%) rename docs/{ => en}/developers/scheduling.md (100%) rename docs/{ => en}/get-started/deploy-with-helm.md (100%) rename docs/{ => en}/installation/aws-installation.md (100%) rename docs/{ => en}/installation/how-to-use-volcano-vgpu.md (100%) rename docs/{ => en}/installation/offline-installation.md (100%) rename docs/{ => en}/installation/online-installation.md (100%) rename docs/{ => en}/installation/prequisities.md (100%) rename docs/{ => en}/installation/uninstall.md 
(100%) rename docs/{ => en}/installation/upgrade.md (100%) rename docs/{ => en}/installation/webui-installation.md (100%) rename docs/{ => en}/key-features/device-resource-isolation.md (100%) rename docs/{ => en}/key-features/device-sharing.md (100%) rename docs/{ => en}/resources/HAMI-VGPU-mind-map-English.png (100%) rename docs/{ => en}/resources/Karmada-logo-horizontal-color.png (100%) rename docs/{ => en}/resources/administrator/prometheus/grafana.png (100%) rename docs/{ => en}/resources/adoptions-ci123-architecture.png (100%) rename docs/{ => en}/resources/adoptions-ci123-aries.png (100%) rename docs/{ => en}/resources/adoptions-ci123-automation-cluster-en.png (100%) rename docs/{ => en}/resources/adoptions-ci123-automation-cluster-zh.png (100%) rename docs/{ => en}/resources/adoptions-ci123-capability-visualization.png (100%) rename docs/{ => en}/resources/adoptions-ci123-cluster-inspection.png (100%) rename docs/{ => en}/resources/adoptions-ci123-gpu-resources.png (100%) rename docs/{ => en}/resources/adoptions-ci123-msp-multicluster-1.png (100%) rename docs/{ => en}/resources/adoptions-ci123-msp-multicluster-2.png (100%) rename docs/{ => en}/resources/adoptions-ci123-multicluster-capability.png (100%) rename docs/{ => en}/resources/adoptions-ci123-override.png (100%) rename docs/{ => en}/resources/adoptions-ci123-sequence-status.png (100%) rename docs/{ => en}/resources/adoptions-ci123-unified-view-1.png (100%) rename docs/{ => en}/resources/adoptions-ci123-unified-view-2.png (100%) rename docs/{ => en}/resources/adoptions-ci123-velero.png (100%) rename docs/{ => en}/resources/adoptions-vipkid-architecture-en.png (100%) rename docs/{ => en}/resources/adoptions-vipkid-architecture-zh.png (100%) rename docs/{ => en}/resources/architect.jpg (100%) rename docs/{ => en}/resources/architecture.drawio (100%) rename docs/{ => en}/resources/architecture.png (100%) rename docs/{ => en}/resources/argocd-new-app-cluster.png (100%) rename docs/{ => en}/resources/argocd-new-app-name.png (100%) rename docs/{ => en}/resources/argocd-new-app-repo.png (100%) rename docs/{ => en}/resources/argocd-new-app.png (100%) rename docs/{ => en}/resources/argocd-register-karmada.png (100%) rename docs/{ => en}/resources/argocd-status-aggregated.png (100%) rename docs/{ => en}/resources/argocd-status-overview.png (100%) rename docs/{ => en}/resources/argocd-status-resourcebinding.png (100%) rename docs/{ => en}/resources/argocd-sync-apps.png (100%) rename docs/{ => en}/resources/binding-controller-process.drawio (100%) rename docs/{ => en}/resources/binding-controller-process.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-architecture.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-aries.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-automation-cluster-en.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-automation-cluster-zh.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-capability-visualization.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-cluster-inspection.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-gpu-resources.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-1.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-2.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-multicluster-capability.png 
(100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-override.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-sequence-status.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-unified-view-1.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-unified-view-2.png (100%) rename docs/{ => en}/resources/casestudies/ci123/adoptions-ci123-velero.png (100%) rename docs/{ => en}/resources/casestudies/vipkid/adoptions-vipkid-architecture.png (100%) rename docs/{ => en}/resources/cluster-controller-process.drawio (100%) rename docs/{ => en}/resources/cluster-controller-process.png (100%) rename docs/{ => en}/resources/cncf-logo.png (100%) rename docs/{ => en}/resources/contributor/click-next.png (100%) rename docs/{ => en}/resources/contributor/debug-docs.png (100%) rename docs/{ => en}/resources/contributor/git_workflow.png (100%) rename docs/{ => en}/resources/demo-3in1.svg (100%) rename docs/{ => en}/resources/developers/grafana_metrics.png (100%) rename docs/{ => en}/resources/device_registration.png (100%) rename docs/{ => en}/resources/example.png (100%) rename docs/{ => en}/resources/execution-controller-process.drawio (100%) rename docs/{ => en}/resources/execution-controller-process.png (100%) rename docs/{ => en}/resources/general/Karmada-logo-horizontal-color.png (100%) rename docs/{ => en}/resources/general/architecture.drawio (100%) rename docs/{ => en}/resources/general/architecture.png (100%) rename docs/{ => en}/resources/general/binding-controller-process.drawio (100%) rename docs/{ => en}/resources/general/binding-controller-process.png (100%) rename docs/{ => en}/resources/general/cluster-controller-process.drawio (100%) rename docs/{ => en}/resources/general/cluster-controller-process.png (100%) rename docs/{ => en}/resources/general/cncf-logo.png (100%) rename docs/{ => en}/resources/general/demo-3in1.svg (100%) rename docs/{ => en}/resources/general/execution-controller-process.drawio (100%) rename docs/{ => en}/resources/general/execution-controller-process.png (100%) rename docs/{ => en}/resources/general/karmada-resource-relation.drawio (100%) rename docs/{ => en}/resources/general/karmada-resource-relation.png (100%) rename docs/{ => en}/resources/general/object-association-map.drawio (100%) rename docs/{ => en}/resources/general/object-association-map.png (100%) rename docs/{ => en}/resources/general/policy-controller-process.drawio (100%) rename docs/{ => en}/resources/general/policy-controller-process.png (100%) rename docs/{ => en}/resources/general/sample-nginx.svg (100%) rename docs/{ => en}/resources/git_workflow.png (100%) rename docs/{ => en}/resources/gpu-scheduler-policy-demo.png (100%) rename docs/{ => en}/resources/grafana.png (100%) rename docs/{ => en}/resources/hami-arch.png (100%) rename docs/{ => en}/resources/hami-core-position.png (100%) rename docs/{ => en}/resources/hami-dynamic-mig-procedure.png (100%) rename docs/{ => en}/resources/hami-dynamic-mig-structure.png (100%) rename docs/{ => en}/resources/hard_limit.jpg (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/admin.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/etcd/apiserver-etcd-client.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/etcd/healthcheck-client.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/etcd/peer.conf 
(100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/etcd/server.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/front-proxy-client.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/karmada.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/kube-apiserver.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/csr_config/kube-controller-manager.conf (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/generate_ca.sh (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/generate_etcd.sh (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/generate_leaf.sh (100%) rename docs/{ => en}/resources/installation/install-binary/generate_cert/util.sh (100%) rename docs/{ => en}/resources/installation/install-binary/other_scripts/check_status.sh (100%) rename docs/{ => en}/resources/installation/install-binary/other_scripts/create_kubeconfig_file.sh (100%) rename docs/{ => en}/resources/istio-on-karmada-different-network.png (100%) rename docs/{ => en}/resources/istio-on-karmada.png (100%) rename docs/{ => en}/resources/karmada-resource-relation.drawio (100%) rename docs/{ => en}/resources/karmada-resource-relation.png (100%) rename docs/{ => en}/resources/key-features/cluster-failover.png (100%) rename docs/{ => en}/resources/key-features/overall-relationship.png (100%) rename docs/{ => en}/resources/key-features/overall-rescheduling.png (100%) rename docs/{ => en}/resources/key-features/overall-scheduling.png (100%) rename docs/{ => en}/resources/key-features/service-governance.png (100%) rename docs/{ => en}/resources/key-features/unified-access.png (100%) rename docs/{ => en}/resources/key-features/unified-operation.png (100%) rename docs/{ => en}/resources/key-features/unified-resourcequota.png (100%) rename docs/{ => en}/resources/key-features/unified-search.png (100%) rename docs/{ => en}/resources/kunlunxin_filter.png (100%) rename docs/{ => en}/resources/kunlunxin_topo.jpg (100%) rename docs/{ => en}/resources/metax_binpack.jpg (100%) rename docs/{ => en}/resources/metax_spread.jpg (100%) rename docs/{ => en}/resources/metax_topo.jpg (100%) rename docs/{ => en}/resources/node-shceduler-policy-demo.png (100%) rename docs/{ => en}/resources/object-association-map.drawio (100%) rename docs/{ => en}/resources/object-association-map.png (100%) rename docs/{ => en}/resources/policy-controller-process.drawio (100%) rename docs/{ => en}/resources/policy-controller-process.png (100%) rename docs/{ => en}/resources/sample-nginx.svg (100%) rename docs/{ => en}/resources/sample_nvidia-smi.png (100%) rename docs/{ => en}/resources/scheduler-policy-story.png (100%) rename docs/{ => en}/resources/task_dispatch.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-new-app-cluster.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-new-app-name.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-new-app-repo.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-new-app.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-register-karmada.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-status-aggregated.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-status-overview.png (100%) rename docs/{ => 
en}/resources/userguide/cicd/argocd/argocd-status-resourcebinding.png (100%) rename docs/{ => en}/resources/userguide/cicd/argocd/argocd-sync-apps.png (100%) rename docs/{ => en}/resources/userguide/failover/failover-overview.png (100%) rename docs/{ => en}/resources/userguide/service/istio/istio-on-karmada-different-network.png (100%) rename docs/{ => en}/resources/userguide/service/istio/istio-on-karmada.png (100%) rename docs/{ => en}/troubleshooting/troubleshooting.md (100%) rename docs/{ => en}/userguide/AWSNeuron-device/enable-awsneuron-managing.md (100%) rename docs/{ => en}/userguide/AWSNeuron-device/examples/allocate-neuron-core.md (100%) rename docs/{ => en}/userguide/AWSNeuron-device/examples/allocate-neuron-device.md (100%) rename docs/{ => en}/userguide/Ascend-device/device-template.md (100%) rename docs/{ => en}/userguide/Ascend-device/enable-ascend-sharing.md (100%) rename docs/{ => en}/userguide/Ascend-device/examples/allocate-310p.md (100%) rename docs/{ => en}/userguide/Ascend-device/examples/allocate-910b.md (100%) rename docs/{ => en}/userguide/Ascend-device/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md (100%) rename docs/{ => en}/userguide/Cambricon-device/examples/allocate-core-and-memory.md (100%) rename docs/{ => en}/userguide/Cambricon-device/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Cambricon-device/specify-device-core-usage.md (100%) rename docs/{ => en}/userguide/Cambricon-device/specify-device-memory-usage.md (100%) rename docs/{ => en}/userguide/Cambricon-device/specify-device-type-to-use.md (100%) rename docs/{ => en}/userguide/Device-supported.md (100%) rename docs/{ => en}/userguide/Enflame-device/enable-enflame-gcu-sharing.md (100%) rename docs/{ => en}/userguide/Hygon-device/enable-hygon-dcu-sharing.md (100%) rename docs/{ => en}/userguide/Hygon-device/examples/allocate-core-and-memory.md (100%) rename docs/{ => en}/userguide/Hygon-device/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Hygon-device/examples/specify-certain-cards.md (100%) rename docs/{ => en}/userguide/Hygon-device/specify-device-core-usage.md (100%) rename docs/{ => en}/userguide/Hygon-device/specify-device-memory-usage.md (100%) rename docs/{ => en}/userguide/Hygon-device/specify-device-uuid-to-use.md (100%) rename docs/{ => en}/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md (100%) rename docs/{ => en}/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md (100%) rename docs/{ => en}/userguide/Iluvatar-device/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Kunlunxin-device/enable-kunlunxin-schedule.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/examples/default-use.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/specify-binpack-task.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-GPU/specify-spread-task.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md (100%) rename docs/{ => 
en}/userguide/Metax-device/Metax-sGPU/examples/default-use.md (100%) rename docs/{ => en}/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md (100%) rename docs/{ => en}/userguide/Mthreads-device/examples/allocate-core-and-memory.md (100%) rename docs/{ => en}/userguide/Mthreads-device/examples/allocate-exclusive.md (100%) rename docs/{ => en}/userguide/Mthreads-device/specify-device-core-usage.md (100%) rename docs/{ => en}/userguide/Mthreads-device/specify-device-memory-usage.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/dynamic-mig-support.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/allocate-device-core.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/allocate-device-memory.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/allocate-device-memory2.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/dynamic-mig-example.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/specify-card-type-to-use.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/specify-certain-card.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/examples/use-exclusive-card.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/specify-device-core-usage.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/specify-device-memory-usage.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/specify-device-type-to-use.md (100%) rename docs/{ => en}/userguide/NVIDIA-device/specify-device-uuid-to-use.md (100%) rename docs/{ => en}/userguide/configure.md (100%) rename docs/{ => en}/userguide/monitoring/device-allocation.md (100%) rename docs/{ => en}/userguide/monitoring/globalview.md (100%) rename docs/{ => en}/userguide/monitoring/real-time-device-usage.md (100%) rename docs/{ => en}/userguide/monitoring/real-time-usage.md (100%) rename docs/{ => en}/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md (100%) rename docs/{ => en}/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md (100%) rename docs/{ => en}/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md (100%) rename docs/{ => en}/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md (100%) create mode 100644 docs/index.md create mode 100644 docs/logo.svg create mode 100644 docs/zh/FAQ/FAQ.md create mode 100644 docs/zh/blog/2024-12-18-support-blog-post/index.md create mode 100644 docs/zh/blog/2024-12-31-post/index.md create mode 100644 docs/zh/contributor/adopters.md create mode 100644 docs/zh/contributor/cherry-picks.md create mode 100644 docs/zh/contributor/contribute-docs.md create mode 100644 docs/zh/contributor/contributers.md create mode 100644 docs/zh/contributor/contributing.md create mode 100644 docs/zh/contributor/github-workflow.md create mode 100644 docs/zh/contributor/goverance.md create mode 100644 docs/zh/contributor/ladder.md create mode 100644 docs/zh/contributor/lifted.md create mode 100644 docs/zh/core-concepts/architecture.md create mode 100644 docs/zh/core-concepts/introduction.md create mode 100644 docs/zh/developers/Dynamic-mig.md create mode 100644 docs/zh/developers/HAMi-core-design.md create mode 100644 docs/zh/developers/build.md create mode 100644 docs/zh/developers/kunlunxin-topology.md create mode 100644 docs/zh/developers/mindmap.md create mode 100644 docs/zh/developers/protocol.md create mode 100644 docs/zh/developers/scheduling.md create mode 100644 docs/zh/get-started/deploy-with-helm.md create mode 100644 docs/zh/index.md create mode 100644 docs/zh/installation/aws-installation.md create mode 100644 
docs/zh/installation/how-to-use-volcano-vgpu.md create mode 100644 docs/zh/installation/offline-installation.md create mode 100644 docs/zh/installation/online-installation.md create mode 100644 docs/zh/installation/prequisities.md create mode 100644 docs/zh/installation/uninstall.md create mode 100644 docs/zh/installation/upgrade.md create mode 100644 docs/zh/installation/webui-installation.md create mode 100644 docs/zh/key-features/device-resource-isolation.md create mode 100644 docs/zh/key-features/device-sharing.md create mode 100644 docs/zh/releases.md create mode 100644 docs/zh/roadmap.md create mode 100644 docs/zh/troubleshooting/troubleshooting.md create mode 100644 docs/zh/userguide/AWSNeuron-device/enable-awsneuron-managing.md create mode 100644 docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-core.md create mode 100644 docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-device.md create mode 100644 docs/zh/userguide/Ascend-device/device-template.md create mode 100644 docs/zh/userguide/Ascend-device/enable-ascend-sharing.md create mode 100644 docs/zh/userguide/Ascend-device/examples/allocate-310p.md create mode 100644 docs/zh/userguide/Ascend-device/examples/allocate-910b.md create mode 100644 docs/zh/userguide/Ascend-device/examples/allocate-exclusive.md create mode 100644 docs/zh/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md create mode 100644 docs/zh/userguide/Cambricon-device/examples/allocate-core-and-memory.md create mode 100644 docs/zh/userguide/Cambricon-device/examples/allocate-exclusive.md create mode 100644 docs/zh/userguide/Cambricon-device/specify-device-core-usage.md create mode 100644 docs/zh/userguide/Cambricon-device/specify-device-memory-usage.md create mode 100644 docs/zh/userguide/Cambricon-device/specify-device-type-to-use.md create mode 100644 docs/zh/userguide/Device-supported.md create mode 100644 docs/zh/userguide/Enflame-device/enable-enflame-gcu-sharing.md create mode 100644 docs/zh/userguide/Hygon-device/enable-hygon-dcu-sharing.md create mode 100644 docs/zh/userguide/Hygon-device/examples/allocate-core-and-memory.md create mode 100644 docs/zh/userguide/Hygon-device/examples/allocate-exclusive.md create mode 100644 docs/zh/userguide/Hygon-device/examples/specify-certain-cards.md create mode 100644 docs/zh/userguide/Hygon-device/specify-device-core-usage.md create mode 100644 docs/zh/userguide/Hygon-device/specify-device-memory-usage.md create mode 100644 docs/zh/userguide/Hygon-device/specify-device-uuid-to-use.md create mode 100644 docs/zh/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md create mode 100644 docs/zh/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md create mode 100644 docs/zh/userguide/Iluvatar-device/examples/allocate-exclusive.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/examples/default-use.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/specify-binpack-task.md create mode 100644 docs/zh/userguide/Metax-device/Metax-GPU/specify-spread-task.md create mode 100644 docs/zh/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md create mode 100644 docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md create mode 100644 
docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md create mode 100644 docs/zh/userguide/Metax-device/Metax-sGPU/examples/default-use.md create mode 100644 docs/zh/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md create mode 100644 docs/zh/userguide/Mthreads-device/examples/allocate-core-and-memory.md create mode 100644 docs/zh/userguide/Mthreads-device/examples/allocate-exclusive.md create mode 100644 docs/zh/userguide/Mthreads-device/specify-device-core-usage.md create mode 100644 docs/zh/userguide/Mthreads-device/specify-device-memory-usage.md create mode 100644 docs/zh/userguide/NVIDIA-device/dynamic-mig-support.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/allocate-device-core.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory2.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/dynamic-mig-example.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/specify-card-type-to-use.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/specify-certain-card.md create mode 100644 docs/zh/userguide/NVIDIA-device/examples/use-exclusive-card.md create mode 100644 docs/zh/userguide/NVIDIA-device/specify-device-core-usage.md create mode 100644 docs/zh/userguide/NVIDIA-device/specify-device-memory-usage.md create mode 100644 docs/zh/userguide/NVIDIA-device/specify-device-type-to-use.md create mode 100644 docs/zh/userguide/NVIDIA-device/specify-device-uuid-to-use.md create mode 100644 docs/zh/userguide/configure.md create mode 100644 docs/zh/userguide/monitoring/device-allocation.md create mode 100644 docs/zh/userguide/monitoring/globalview.md create mode 100644 docs/zh/userguide/monitoring/real-time-device-usage.md create mode 100644 docs/zh/userguide/monitoring/real-time-usage.md create mode 100644 docs/zh/userguide/support-devices.md create mode 100644 docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md create mode 100644 docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md create mode 100644 docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md create mode 100644 docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md create mode 100644 mkdocs.yml create mode 100644 overrides/partials/footer.html create mode 100644 pyproject.toml create mode 100644 templates/adopters.md diff --git a/data/adopters.yaml b/data/adopters.yaml new file mode 100644 index 0000000..61a4d28 --- /dev/null +++ b/data/adopters.yaml @@ -0,0 +1,7 @@ +adopters: + project: Kepler + description: > + Check out everything about Kepler on our [website](https://sustainable-computing.io/). 
+ companies: + - name: Kepler + url: https://sustainable-computing.io/ diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 604d9f9..0000000 --- a/docs/README.md +++ /dev/null @@ -1,4 +0,0 @@ ---- -title: readme -slug: /readme ---- \ No newline at end of file diff --git a/docs/FAQ/FAQ.md b/docs/en/FAQ/FAQ.md similarity index 100% rename from docs/FAQ/FAQ.md rename to docs/en/FAQ/FAQ.md diff --git a/blog/2024-12-18-support-blog-post/index.md b/docs/en/blog/2024-12-18-support-blog-post/index.md similarity index 100% rename from blog/2024-12-18-support-blog-post/index.md rename to docs/en/blog/2024-12-18-support-blog-post/index.md diff --git a/blog/2024-12-31-post/index.md b/docs/en/blog/2024-12-31-post/index.md similarity index 100% rename from blog/2024-12-31-post/index.md rename to docs/en/blog/2024-12-31-post/index.md diff --git a/changelog/source/authors.json b/docs/en/changelog/source/authors.json similarity index 100% rename from changelog/source/authors.json rename to docs/en/changelog/source/authors.json diff --git a/changelog/source/v2.5.0.md b/docs/en/changelog/source/v2.5.0.md similarity index 100% rename from changelog/source/v2.5.0.md rename to docs/en/changelog/source/v2.5.0.md diff --git a/changelog/source/v2.5.1.md b/docs/en/changelog/source/v2.5.1.md similarity index 100% rename from changelog/source/v2.5.1.md rename to docs/en/changelog/source/v2.5.1.md diff --git a/docs/contributor/adopters.md b/docs/en/contributor/adopters.md similarity index 100% rename from docs/contributor/adopters.md rename to docs/en/contributor/adopters.md diff --git a/docs/contributor/cherry-picks.md b/docs/en/contributor/cherry-picks.md similarity index 100% rename from docs/contributor/cherry-picks.md rename to docs/en/contributor/cherry-picks.md diff --git a/docs/contributor/contribute-docs.md b/docs/en/contributor/contribute-docs.md similarity index 100% rename from docs/contributor/contribute-docs.md rename to docs/en/contributor/contribute-docs.md diff --git a/docs/contributor/contributers.md b/docs/en/contributor/contributers.md similarity index 100% rename from docs/contributor/contributers.md rename to docs/en/contributor/contributers.md diff --git a/docs/contributor/contributing.md b/docs/en/contributor/contributing.md similarity index 100% rename from docs/contributor/contributing.md rename to docs/en/contributor/contributing.md diff --git a/docs/contributor/github-workflow.md b/docs/en/contributor/github-workflow.md similarity index 100% rename from docs/contributor/github-workflow.md rename to docs/en/contributor/github-workflow.md diff --git a/docs/contributor/goverance.md b/docs/en/contributor/goverance.md similarity index 100% rename from docs/contributor/goverance.md rename to docs/en/contributor/goverance.md diff --git a/docs/contributor/ladder.md b/docs/en/contributor/ladder.md similarity index 100% rename from docs/contributor/ladder.md rename to docs/en/contributor/ladder.md diff --git a/docs/contributor/lifted.md b/docs/en/contributor/lifted.md similarity index 100% rename from docs/contributor/lifted.md rename to docs/en/contributor/lifted.md diff --git a/docs/core-concepts/architecture.md b/docs/en/core-concepts/architecture.md similarity index 100% rename from docs/core-concepts/architecture.md rename to docs/en/core-concepts/architecture.md diff --git a/docs/core-concepts/introduction.md b/docs/en/core-concepts/introduction.md similarity index 100% rename from docs/core-concepts/introduction.md rename to docs/en/core-concepts/introduction.md diff 
--git a/docs/developers/Dynamic-mig.md b/docs/en/developers/Dynamic-mig.md similarity index 100% rename from docs/developers/Dynamic-mig.md rename to docs/en/developers/Dynamic-mig.md diff --git a/docs/developers/HAMi-core-design.md b/docs/en/developers/HAMi-core-design.md similarity index 100% rename from docs/developers/HAMi-core-design.md rename to docs/en/developers/HAMi-core-design.md diff --git a/docs/developers/build.md b/docs/en/developers/build.md similarity index 100% rename from docs/developers/build.md rename to docs/en/developers/build.md diff --git a/docs/developers/kunlunxin-topology.md b/docs/en/developers/kunlunxin-topology.md similarity index 100% rename from docs/developers/kunlunxin-topology.md rename to docs/en/developers/kunlunxin-topology.md diff --git a/docs/developers/mindmap.md b/docs/en/developers/mindmap.md similarity index 100% rename from docs/developers/mindmap.md rename to docs/en/developers/mindmap.md diff --git a/docs/developers/protocol.md b/docs/en/developers/protocol.md similarity index 100% rename from docs/developers/protocol.md rename to docs/en/developers/protocol.md diff --git a/docs/developers/scheduling.md b/docs/en/developers/scheduling.md similarity index 100% rename from docs/developers/scheduling.md rename to docs/en/developers/scheduling.md diff --git a/docs/get-started/deploy-with-helm.md b/docs/en/get-started/deploy-with-helm.md similarity index 100% rename from docs/get-started/deploy-with-helm.md rename to docs/en/get-started/deploy-with-helm.md diff --git a/docs/installation/aws-installation.md b/docs/en/installation/aws-installation.md similarity index 100% rename from docs/installation/aws-installation.md rename to docs/en/installation/aws-installation.md diff --git a/docs/installation/how-to-use-volcano-vgpu.md b/docs/en/installation/how-to-use-volcano-vgpu.md similarity index 100% rename from docs/installation/how-to-use-volcano-vgpu.md rename to docs/en/installation/how-to-use-volcano-vgpu.md diff --git a/docs/installation/offline-installation.md b/docs/en/installation/offline-installation.md similarity index 100% rename from docs/installation/offline-installation.md rename to docs/en/installation/offline-installation.md diff --git a/docs/installation/online-installation.md b/docs/en/installation/online-installation.md similarity index 100% rename from docs/installation/online-installation.md rename to docs/en/installation/online-installation.md diff --git a/docs/installation/prequisities.md b/docs/en/installation/prequisities.md similarity index 100% rename from docs/installation/prequisities.md rename to docs/en/installation/prequisities.md diff --git a/docs/installation/uninstall.md b/docs/en/installation/uninstall.md similarity index 100% rename from docs/installation/uninstall.md rename to docs/en/installation/uninstall.md diff --git a/docs/installation/upgrade.md b/docs/en/installation/upgrade.md similarity index 100% rename from docs/installation/upgrade.md rename to docs/en/installation/upgrade.md diff --git a/docs/installation/webui-installation.md b/docs/en/installation/webui-installation.md similarity index 100% rename from docs/installation/webui-installation.md rename to docs/en/installation/webui-installation.md diff --git a/docs/key-features/device-resource-isolation.md b/docs/en/key-features/device-resource-isolation.md similarity index 100% rename from docs/key-features/device-resource-isolation.md rename to docs/en/key-features/device-resource-isolation.md diff --git a/docs/key-features/device-sharing.md 
b/docs/en/key-features/device-sharing.md similarity index 100% rename from docs/key-features/device-sharing.md rename to docs/en/key-features/device-sharing.md diff --git a/docs/resources/HAMI-VGPU-mind-map-English.png b/docs/en/resources/HAMI-VGPU-mind-map-English.png similarity index 100% rename from docs/resources/HAMI-VGPU-mind-map-English.png rename to docs/en/resources/HAMI-VGPU-mind-map-English.png diff --git a/docs/resources/Karmada-logo-horizontal-color.png b/docs/en/resources/Karmada-logo-horizontal-color.png similarity index 100% rename from docs/resources/Karmada-logo-horizontal-color.png rename to docs/en/resources/Karmada-logo-horizontal-color.png diff --git a/docs/resources/administrator/prometheus/grafana.png b/docs/en/resources/administrator/prometheus/grafana.png similarity index 100% rename from docs/resources/administrator/prometheus/grafana.png rename to docs/en/resources/administrator/prometheus/grafana.png diff --git a/docs/resources/adoptions-ci123-architecture.png b/docs/en/resources/adoptions-ci123-architecture.png similarity index 100% rename from docs/resources/adoptions-ci123-architecture.png rename to docs/en/resources/adoptions-ci123-architecture.png diff --git a/docs/resources/adoptions-ci123-aries.png b/docs/en/resources/adoptions-ci123-aries.png similarity index 100% rename from docs/resources/adoptions-ci123-aries.png rename to docs/en/resources/adoptions-ci123-aries.png diff --git a/docs/resources/adoptions-ci123-automation-cluster-en.png b/docs/en/resources/adoptions-ci123-automation-cluster-en.png similarity index 100% rename from docs/resources/adoptions-ci123-automation-cluster-en.png rename to docs/en/resources/adoptions-ci123-automation-cluster-en.png diff --git a/docs/resources/adoptions-ci123-automation-cluster-zh.png b/docs/en/resources/adoptions-ci123-automation-cluster-zh.png similarity index 100% rename from docs/resources/adoptions-ci123-automation-cluster-zh.png rename to docs/en/resources/adoptions-ci123-automation-cluster-zh.png diff --git a/docs/resources/adoptions-ci123-capability-visualization.png b/docs/en/resources/adoptions-ci123-capability-visualization.png similarity index 100% rename from docs/resources/adoptions-ci123-capability-visualization.png rename to docs/en/resources/adoptions-ci123-capability-visualization.png diff --git a/docs/resources/adoptions-ci123-cluster-inspection.png b/docs/en/resources/adoptions-ci123-cluster-inspection.png similarity index 100% rename from docs/resources/adoptions-ci123-cluster-inspection.png rename to docs/en/resources/adoptions-ci123-cluster-inspection.png diff --git a/docs/resources/adoptions-ci123-gpu-resources.png b/docs/en/resources/adoptions-ci123-gpu-resources.png similarity index 100% rename from docs/resources/adoptions-ci123-gpu-resources.png rename to docs/en/resources/adoptions-ci123-gpu-resources.png diff --git a/docs/resources/adoptions-ci123-msp-multicluster-1.png b/docs/en/resources/adoptions-ci123-msp-multicluster-1.png similarity index 100% rename from docs/resources/adoptions-ci123-msp-multicluster-1.png rename to docs/en/resources/adoptions-ci123-msp-multicluster-1.png diff --git a/docs/resources/adoptions-ci123-msp-multicluster-2.png b/docs/en/resources/adoptions-ci123-msp-multicluster-2.png similarity index 100% rename from docs/resources/adoptions-ci123-msp-multicluster-2.png rename to docs/en/resources/adoptions-ci123-msp-multicluster-2.png diff --git a/docs/resources/adoptions-ci123-multicluster-capability.png 
b/docs/en/resources/adoptions-ci123-multicluster-capability.png similarity index 100% rename from docs/resources/adoptions-ci123-multicluster-capability.png rename to docs/en/resources/adoptions-ci123-multicluster-capability.png diff --git a/docs/resources/adoptions-ci123-override.png b/docs/en/resources/adoptions-ci123-override.png similarity index 100% rename from docs/resources/adoptions-ci123-override.png rename to docs/en/resources/adoptions-ci123-override.png diff --git a/docs/resources/adoptions-ci123-sequence-status.png b/docs/en/resources/adoptions-ci123-sequence-status.png similarity index 100% rename from docs/resources/adoptions-ci123-sequence-status.png rename to docs/en/resources/adoptions-ci123-sequence-status.png diff --git a/docs/resources/adoptions-ci123-unified-view-1.png b/docs/en/resources/adoptions-ci123-unified-view-1.png similarity index 100% rename from docs/resources/adoptions-ci123-unified-view-1.png rename to docs/en/resources/adoptions-ci123-unified-view-1.png diff --git a/docs/resources/adoptions-ci123-unified-view-2.png b/docs/en/resources/adoptions-ci123-unified-view-2.png similarity index 100% rename from docs/resources/adoptions-ci123-unified-view-2.png rename to docs/en/resources/adoptions-ci123-unified-view-2.png diff --git a/docs/resources/adoptions-ci123-velero.png b/docs/en/resources/adoptions-ci123-velero.png similarity index 100% rename from docs/resources/adoptions-ci123-velero.png rename to docs/en/resources/adoptions-ci123-velero.png diff --git a/docs/resources/adoptions-vipkid-architecture-en.png b/docs/en/resources/adoptions-vipkid-architecture-en.png similarity index 100% rename from docs/resources/adoptions-vipkid-architecture-en.png rename to docs/en/resources/adoptions-vipkid-architecture-en.png diff --git a/docs/resources/adoptions-vipkid-architecture-zh.png b/docs/en/resources/adoptions-vipkid-architecture-zh.png similarity index 100% rename from docs/resources/adoptions-vipkid-architecture-zh.png rename to docs/en/resources/adoptions-vipkid-architecture-zh.png diff --git a/docs/resources/architect.jpg b/docs/en/resources/architect.jpg similarity index 100% rename from docs/resources/architect.jpg rename to docs/en/resources/architect.jpg diff --git a/docs/resources/architecture.drawio b/docs/en/resources/architecture.drawio similarity index 100% rename from docs/resources/architecture.drawio rename to docs/en/resources/architecture.drawio diff --git a/docs/resources/architecture.png b/docs/en/resources/architecture.png similarity index 100% rename from docs/resources/architecture.png rename to docs/en/resources/architecture.png diff --git a/docs/resources/argocd-new-app-cluster.png b/docs/en/resources/argocd-new-app-cluster.png similarity index 100% rename from docs/resources/argocd-new-app-cluster.png rename to docs/en/resources/argocd-new-app-cluster.png diff --git a/docs/resources/argocd-new-app-name.png b/docs/en/resources/argocd-new-app-name.png similarity index 100% rename from docs/resources/argocd-new-app-name.png rename to docs/en/resources/argocd-new-app-name.png diff --git a/docs/resources/argocd-new-app-repo.png b/docs/en/resources/argocd-new-app-repo.png similarity index 100% rename from docs/resources/argocd-new-app-repo.png rename to docs/en/resources/argocd-new-app-repo.png diff --git a/docs/resources/argocd-new-app.png b/docs/en/resources/argocd-new-app.png similarity index 100% rename from docs/resources/argocd-new-app.png rename to docs/en/resources/argocd-new-app.png diff --git 
a/docs/resources/argocd-register-karmada.png b/docs/en/resources/argocd-register-karmada.png similarity index 100% rename from docs/resources/argocd-register-karmada.png rename to docs/en/resources/argocd-register-karmada.png diff --git a/docs/resources/argocd-status-aggregated.png b/docs/en/resources/argocd-status-aggregated.png similarity index 100% rename from docs/resources/argocd-status-aggregated.png rename to docs/en/resources/argocd-status-aggregated.png diff --git a/docs/resources/argocd-status-overview.png b/docs/en/resources/argocd-status-overview.png similarity index 100% rename from docs/resources/argocd-status-overview.png rename to docs/en/resources/argocd-status-overview.png diff --git a/docs/resources/argocd-status-resourcebinding.png b/docs/en/resources/argocd-status-resourcebinding.png similarity index 100% rename from docs/resources/argocd-status-resourcebinding.png rename to docs/en/resources/argocd-status-resourcebinding.png diff --git a/docs/resources/argocd-sync-apps.png b/docs/en/resources/argocd-sync-apps.png similarity index 100% rename from docs/resources/argocd-sync-apps.png rename to docs/en/resources/argocd-sync-apps.png diff --git a/docs/resources/binding-controller-process.drawio b/docs/en/resources/binding-controller-process.drawio similarity index 100% rename from docs/resources/binding-controller-process.drawio rename to docs/en/resources/binding-controller-process.drawio diff --git a/docs/resources/binding-controller-process.png b/docs/en/resources/binding-controller-process.png similarity index 100% rename from docs/resources/binding-controller-process.png rename to docs/en/resources/binding-controller-process.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-architecture.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-architecture.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-architecture.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-architecture.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-aries.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-aries.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-aries.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-aries.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-automation-cluster-en.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-automation-cluster-en.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-automation-cluster-en.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-automation-cluster-en.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-automation-cluster-zh.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-automation-cluster-zh.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-automation-cluster-zh.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-automation-cluster-zh.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-capability-visualization.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-capability-visualization.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-capability-visualization.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-capability-visualization.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-cluster-inspection.png 
b/docs/en/resources/casestudies/ci123/adoptions-ci123-cluster-inspection.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-cluster-inspection.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-cluster-inspection.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-gpu-resources.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-gpu-resources.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-gpu-resources.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-gpu-resources.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-1.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-1.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-1.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-1.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-2.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-2.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-2.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-msp-multicluster-2.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-multicluster-capability.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-multicluster-capability.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-multicluster-capability.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-multicluster-capability.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-override.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-override.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-override.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-override.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-sequence-status.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-sequence-status.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-sequence-status.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-sequence-status.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-unified-view-1.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-unified-view-1.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-unified-view-1.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-unified-view-1.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-unified-view-2.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-unified-view-2.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-unified-view-2.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-unified-view-2.png diff --git a/docs/resources/casestudies/ci123/adoptions-ci123-velero.png b/docs/en/resources/casestudies/ci123/adoptions-ci123-velero.png similarity index 100% rename from docs/resources/casestudies/ci123/adoptions-ci123-velero.png rename to docs/en/resources/casestudies/ci123/adoptions-ci123-velero.png diff --git a/docs/resources/casestudies/vipkid/adoptions-vipkid-architecture.png b/docs/en/resources/casestudies/vipkid/adoptions-vipkid-architecture.png similarity index 100% rename from 
docs/resources/casestudies/vipkid/adoptions-vipkid-architecture.png rename to docs/en/resources/casestudies/vipkid/adoptions-vipkid-architecture.png diff --git a/docs/resources/cluster-controller-process.drawio b/docs/en/resources/cluster-controller-process.drawio similarity index 100% rename from docs/resources/cluster-controller-process.drawio rename to docs/en/resources/cluster-controller-process.drawio diff --git a/docs/resources/cluster-controller-process.png b/docs/en/resources/cluster-controller-process.png similarity index 100% rename from docs/resources/cluster-controller-process.png rename to docs/en/resources/cluster-controller-process.png diff --git a/docs/resources/cncf-logo.png b/docs/en/resources/cncf-logo.png similarity index 100% rename from docs/resources/cncf-logo.png rename to docs/en/resources/cncf-logo.png diff --git a/docs/resources/contributor/click-next.png b/docs/en/resources/contributor/click-next.png similarity index 100% rename from docs/resources/contributor/click-next.png rename to docs/en/resources/contributor/click-next.png diff --git a/docs/resources/contributor/debug-docs.png b/docs/en/resources/contributor/debug-docs.png similarity index 100% rename from docs/resources/contributor/debug-docs.png rename to docs/en/resources/contributor/debug-docs.png diff --git a/docs/resources/contributor/git_workflow.png b/docs/en/resources/contributor/git_workflow.png similarity index 100% rename from docs/resources/contributor/git_workflow.png rename to docs/en/resources/contributor/git_workflow.png diff --git a/docs/resources/demo-3in1.svg b/docs/en/resources/demo-3in1.svg similarity index 100% rename from docs/resources/demo-3in1.svg rename to docs/en/resources/demo-3in1.svg diff --git a/docs/resources/developers/grafana_metrics.png b/docs/en/resources/developers/grafana_metrics.png similarity index 100% rename from docs/resources/developers/grafana_metrics.png rename to docs/en/resources/developers/grafana_metrics.png diff --git a/docs/resources/device_registration.png b/docs/en/resources/device_registration.png similarity index 100% rename from docs/resources/device_registration.png rename to docs/en/resources/device_registration.png diff --git a/docs/resources/example.png b/docs/en/resources/example.png similarity index 100% rename from docs/resources/example.png rename to docs/en/resources/example.png diff --git a/docs/resources/execution-controller-process.drawio b/docs/en/resources/execution-controller-process.drawio similarity index 100% rename from docs/resources/execution-controller-process.drawio rename to docs/en/resources/execution-controller-process.drawio diff --git a/docs/resources/execution-controller-process.png b/docs/en/resources/execution-controller-process.png similarity index 100% rename from docs/resources/execution-controller-process.png rename to docs/en/resources/execution-controller-process.png diff --git a/docs/resources/general/Karmada-logo-horizontal-color.png b/docs/en/resources/general/Karmada-logo-horizontal-color.png similarity index 100% rename from docs/resources/general/Karmada-logo-horizontal-color.png rename to docs/en/resources/general/Karmada-logo-horizontal-color.png diff --git a/docs/resources/general/architecture.drawio b/docs/en/resources/general/architecture.drawio similarity index 100% rename from docs/resources/general/architecture.drawio rename to docs/en/resources/general/architecture.drawio diff --git a/docs/resources/general/architecture.png b/docs/en/resources/general/architecture.png similarity index 100% rename 
from docs/resources/general/architecture.png rename to docs/en/resources/general/architecture.png diff --git a/docs/resources/general/binding-controller-process.drawio b/docs/en/resources/general/binding-controller-process.drawio similarity index 100% rename from docs/resources/general/binding-controller-process.drawio rename to docs/en/resources/general/binding-controller-process.drawio diff --git a/docs/resources/general/binding-controller-process.png b/docs/en/resources/general/binding-controller-process.png similarity index 100% rename from docs/resources/general/binding-controller-process.png rename to docs/en/resources/general/binding-controller-process.png diff --git a/docs/resources/general/cluster-controller-process.drawio b/docs/en/resources/general/cluster-controller-process.drawio similarity index 100% rename from docs/resources/general/cluster-controller-process.drawio rename to docs/en/resources/general/cluster-controller-process.drawio diff --git a/docs/resources/general/cluster-controller-process.png b/docs/en/resources/general/cluster-controller-process.png similarity index 100% rename from docs/resources/general/cluster-controller-process.png rename to docs/en/resources/general/cluster-controller-process.png diff --git a/docs/resources/general/cncf-logo.png b/docs/en/resources/general/cncf-logo.png similarity index 100% rename from docs/resources/general/cncf-logo.png rename to docs/en/resources/general/cncf-logo.png diff --git a/docs/resources/general/demo-3in1.svg b/docs/en/resources/general/demo-3in1.svg similarity index 100% rename from docs/resources/general/demo-3in1.svg rename to docs/en/resources/general/demo-3in1.svg diff --git a/docs/resources/general/execution-controller-process.drawio b/docs/en/resources/general/execution-controller-process.drawio similarity index 100% rename from docs/resources/general/execution-controller-process.drawio rename to docs/en/resources/general/execution-controller-process.drawio diff --git a/docs/resources/general/execution-controller-process.png b/docs/en/resources/general/execution-controller-process.png similarity index 100% rename from docs/resources/general/execution-controller-process.png rename to docs/en/resources/general/execution-controller-process.png diff --git a/docs/resources/general/karmada-resource-relation.drawio b/docs/en/resources/general/karmada-resource-relation.drawio similarity index 100% rename from docs/resources/general/karmada-resource-relation.drawio rename to docs/en/resources/general/karmada-resource-relation.drawio diff --git a/docs/resources/general/karmada-resource-relation.png b/docs/en/resources/general/karmada-resource-relation.png similarity index 100% rename from docs/resources/general/karmada-resource-relation.png rename to docs/en/resources/general/karmada-resource-relation.png diff --git a/docs/resources/general/object-association-map.drawio b/docs/en/resources/general/object-association-map.drawio similarity index 100% rename from docs/resources/general/object-association-map.drawio rename to docs/en/resources/general/object-association-map.drawio diff --git a/docs/resources/general/object-association-map.png b/docs/en/resources/general/object-association-map.png similarity index 100% rename from docs/resources/general/object-association-map.png rename to docs/en/resources/general/object-association-map.png diff --git a/docs/resources/general/policy-controller-process.drawio b/docs/en/resources/general/policy-controller-process.drawio similarity index 100% rename from 
docs/resources/general/policy-controller-process.drawio rename to docs/en/resources/general/policy-controller-process.drawio diff --git a/docs/resources/general/policy-controller-process.png b/docs/en/resources/general/policy-controller-process.png similarity index 100% rename from docs/resources/general/policy-controller-process.png rename to docs/en/resources/general/policy-controller-process.png diff --git a/docs/resources/general/sample-nginx.svg b/docs/en/resources/general/sample-nginx.svg similarity index 100% rename from docs/resources/general/sample-nginx.svg rename to docs/en/resources/general/sample-nginx.svg diff --git a/docs/resources/git_workflow.png b/docs/en/resources/git_workflow.png similarity index 100% rename from docs/resources/git_workflow.png rename to docs/en/resources/git_workflow.png diff --git a/docs/resources/gpu-scheduler-policy-demo.png b/docs/en/resources/gpu-scheduler-policy-demo.png similarity index 100% rename from docs/resources/gpu-scheduler-policy-demo.png rename to docs/en/resources/gpu-scheduler-policy-demo.png diff --git a/docs/resources/grafana.png b/docs/en/resources/grafana.png similarity index 100% rename from docs/resources/grafana.png rename to docs/en/resources/grafana.png diff --git a/docs/resources/hami-arch.png b/docs/en/resources/hami-arch.png similarity index 100% rename from docs/resources/hami-arch.png rename to docs/en/resources/hami-arch.png diff --git a/docs/resources/hami-core-position.png b/docs/en/resources/hami-core-position.png similarity index 100% rename from docs/resources/hami-core-position.png rename to docs/en/resources/hami-core-position.png diff --git a/docs/resources/hami-dynamic-mig-procedure.png b/docs/en/resources/hami-dynamic-mig-procedure.png similarity index 100% rename from docs/resources/hami-dynamic-mig-procedure.png rename to docs/en/resources/hami-dynamic-mig-procedure.png diff --git a/docs/resources/hami-dynamic-mig-structure.png b/docs/en/resources/hami-dynamic-mig-structure.png similarity index 100% rename from docs/resources/hami-dynamic-mig-structure.png rename to docs/en/resources/hami-dynamic-mig-structure.png diff --git a/docs/resources/hard_limit.jpg b/docs/en/resources/hard_limit.jpg similarity index 100% rename from docs/resources/hard_limit.jpg rename to docs/en/resources/hard_limit.jpg diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/admin.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/admin.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/admin.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/admin.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/etcd/apiserver-etcd-client.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/apiserver-etcd-client.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/etcd/apiserver-etcd-client.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/apiserver-etcd-client.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/etcd/healthcheck-client.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/healthcheck-client.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/etcd/healthcheck-client.conf rename to 
docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/healthcheck-client.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/etcd/peer.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/peer.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/etcd/peer.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/peer.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/etcd/server.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/server.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/etcd/server.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/etcd/server.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/front-proxy-client.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/front-proxy-client.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/front-proxy-client.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/front-proxy-client.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/karmada.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/karmada.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/karmada.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/karmada.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/kube-apiserver.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/kube-apiserver.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/kube-apiserver.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/kube-apiserver.conf diff --git a/docs/resources/installation/install-binary/generate_cert/csr_config/kube-controller-manager.conf b/docs/en/resources/installation/install-binary/generate_cert/csr_config/kube-controller-manager.conf similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/csr_config/kube-controller-manager.conf rename to docs/en/resources/installation/install-binary/generate_cert/csr_config/kube-controller-manager.conf diff --git a/docs/resources/installation/install-binary/generate_cert/generate_ca.sh b/docs/en/resources/installation/install-binary/generate_cert/generate_ca.sh similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/generate_ca.sh rename to docs/en/resources/installation/install-binary/generate_cert/generate_ca.sh diff --git a/docs/resources/installation/install-binary/generate_cert/generate_etcd.sh b/docs/en/resources/installation/install-binary/generate_cert/generate_etcd.sh similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/generate_etcd.sh rename to docs/en/resources/installation/install-binary/generate_cert/generate_etcd.sh diff --git a/docs/resources/installation/install-binary/generate_cert/generate_leaf.sh b/docs/en/resources/installation/install-binary/generate_cert/generate_leaf.sh similarity index 100% rename from 
docs/resources/installation/install-binary/generate_cert/generate_leaf.sh rename to docs/en/resources/installation/install-binary/generate_cert/generate_leaf.sh diff --git a/docs/resources/installation/install-binary/generate_cert/util.sh b/docs/en/resources/installation/install-binary/generate_cert/util.sh similarity index 100% rename from docs/resources/installation/install-binary/generate_cert/util.sh rename to docs/en/resources/installation/install-binary/generate_cert/util.sh diff --git a/docs/resources/installation/install-binary/other_scripts/check_status.sh b/docs/en/resources/installation/install-binary/other_scripts/check_status.sh similarity index 100% rename from docs/resources/installation/install-binary/other_scripts/check_status.sh rename to docs/en/resources/installation/install-binary/other_scripts/check_status.sh diff --git a/docs/resources/installation/install-binary/other_scripts/create_kubeconfig_file.sh b/docs/en/resources/installation/install-binary/other_scripts/create_kubeconfig_file.sh similarity index 100% rename from docs/resources/installation/install-binary/other_scripts/create_kubeconfig_file.sh rename to docs/en/resources/installation/install-binary/other_scripts/create_kubeconfig_file.sh diff --git a/docs/resources/istio-on-karmada-different-network.png b/docs/en/resources/istio-on-karmada-different-network.png similarity index 100% rename from docs/resources/istio-on-karmada-different-network.png rename to docs/en/resources/istio-on-karmada-different-network.png diff --git a/docs/resources/istio-on-karmada.png b/docs/en/resources/istio-on-karmada.png similarity index 100% rename from docs/resources/istio-on-karmada.png rename to docs/en/resources/istio-on-karmada.png diff --git a/docs/resources/karmada-resource-relation.drawio b/docs/en/resources/karmada-resource-relation.drawio similarity index 100% rename from docs/resources/karmada-resource-relation.drawio rename to docs/en/resources/karmada-resource-relation.drawio diff --git a/docs/resources/karmada-resource-relation.png b/docs/en/resources/karmada-resource-relation.png similarity index 100% rename from docs/resources/karmada-resource-relation.png rename to docs/en/resources/karmada-resource-relation.png diff --git a/docs/resources/key-features/cluster-failover.png b/docs/en/resources/key-features/cluster-failover.png similarity index 100% rename from docs/resources/key-features/cluster-failover.png rename to docs/en/resources/key-features/cluster-failover.png diff --git a/docs/resources/key-features/overall-relationship.png b/docs/en/resources/key-features/overall-relationship.png similarity index 100% rename from docs/resources/key-features/overall-relationship.png rename to docs/en/resources/key-features/overall-relationship.png diff --git a/docs/resources/key-features/overall-rescheduling.png b/docs/en/resources/key-features/overall-rescheduling.png similarity index 100% rename from docs/resources/key-features/overall-rescheduling.png rename to docs/en/resources/key-features/overall-rescheduling.png diff --git a/docs/resources/key-features/overall-scheduling.png b/docs/en/resources/key-features/overall-scheduling.png similarity index 100% rename from docs/resources/key-features/overall-scheduling.png rename to docs/en/resources/key-features/overall-scheduling.png diff --git a/docs/resources/key-features/service-governance.png b/docs/en/resources/key-features/service-governance.png similarity index 100% rename from docs/resources/key-features/service-governance.png rename to 
docs/en/resources/key-features/service-governance.png diff --git a/docs/resources/key-features/unified-access.png b/docs/en/resources/key-features/unified-access.png similarity index 100% rename from docs/resources/key-features/unified-access.png rename to docs/en/resources/key-features/unified-access.png diff --git a/docs/resources/key-features/unified-operation.png b/docs/en/resources/key-features/unified-operation.png similarity index 100% rename from docs/resources/key-features/unified-operation.png rename to docs/en/resources/key-features/unified-operation.png diff --git a/docs/resources/key-features/unified-resourcequota.png b/docs/en/resources/key-features/unified-resourcequota.png similarity index 100% rename from docs/resources/key-features/unified-resourcequota.png rename to docs/en/resources/key-features/unified-resourcequota.png diff --git a/docs/resources/key-features/unified-search.png b/docs/en/resources/key-features/unified-search.png similarity index 100% rename from docs/resources/key-features/unified-search.png rename to docs/en/resources/key-features/unified-search.png diff --git a/docs/resources/kunlunxin_filter.png b/docs/en/resources/kunlunxin_filter.png similarity index 100% rename from docs/resources/kunlunxin_filter.png rename to docs/en/resources/kunlunxin_filter.png diff --git a/docs/resources/kunlunxin_topo.jpg b/docs/en/resources/kunlunxin_topo.jpg similarity index 100% rename from docs/resources/kunlunxin_topo.jpg rename to docs/en/resources/kunlunxin_topo.jpg diff --git a/docs/resources/metax_binpack.jpg b/docs/en/resources/metax_binpack.jpg similarity index 100% rename from docs/resources/metax_binpack.jpg rename to docs/en/resources/metax_binpack.jpg diff --git a/docs/resources/metax_spread.jpg b/docs/en/resources/metax_spread.jpg similarity index 100% rename from docs/resources/metax_spread.jpg rename to docs/en/resources/metax_spread.jpg diff --git a/docs/resources/metax_topo.jpg b/docs/en/resources/metax_topo.jpg similarity index 100% rename from docs/resources/metax_topo.jpg rename to docs/en/resources/metax_topo.jpg diff --git a/docs/resources/node-shceduler-policy-demo.png b/docs/en/resources/node-shceduler-policy-demo.png similarity index 100% rename from docs/resources/node-shceduler-policy-demo.png rename to docs/en/resources/node-shceduler-policy-demo.png diff --git a/docs/resources/object-association-map.drawio b/docs/en/resources/object-association-map.drawio similarity index 100% rename from docs/resources/object-association-map.drawio rename to docs/en/resources/object-association-map.drawio diff --git a/docs/resources/object-association-map.png b/docs/en/resources/object-association-map.png similarity index 100% rename from docs/resources/object-association-map.png rename to docs/en/resources/object-association-map.png diff --git a/docs/resources/policy-controller-process.drawio b/docs/en/resources/policy-controller-process.drawio similarity index 100% rename from docs/resources/policy-controller-process.drawio rename to docs/en/resources/policy-controller-process.drawio diff --git a/docs/resources/policy-controller-process.png b/docs/en/resources/policy-controller-process.png similarity index 100% rename from docs/resources/policy-controller-process.png rename to docs/en/resources/policy-controller-process.png diff --git a/docs/resources/sample-nginx.svg b/docs/en/resources/sample-nginx.svg similarity index 100% rename from docs/resources/sample-nginx.svg rename to docs/en/resources/sample-nginx.svg diff --git 
a/docs/resources/sample_nvidia-smi.png b/docs/en/resources/sample_nvidia-smi.png similarity index 100% rename from docs/resources/sample_nvidia-smi.png rename to docs/en/resources/sample_nvidia-smi.png diff --git a/docs/resources/scheduler-policy-story.png b/docs/en/resources/scheduler-policy-story.png similarity index 100% rename from docs/resources/scheduler-policy-story.png rename to docs/en/resources/scheduler-policy-story.png diff --git a/docs/resources/task_dispatch.png b/docs/en/resources/task_dispatch.png similarity index 100% rename from docs/resources/task_dispatch.png rename to docs/en/resources/task_dispatch.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-new-app-cluster.png b/docs/en/resources/userguide/cicd/argocd/argocd-new-app-cluster.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-new-app-cluster.png rename to docs/en/resources/userguide/cicd/argocd/argocd-new-app-cluster.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-new-app-name.png b/docs/en/resources/userguide/cicd/argocd/argocd-new-app-name.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-new-app-name.png rename to docs/en/resources/userguide/cicd/argocd/argocd-new-app-name.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-new-app-repo.png b/docs/en/resources/userguide/cicd/argocd/argocd-new-app-repo.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-new-app-repo.png rename to docs/en/resources/userguide/cicd/argocd/argocd-new-app-repo.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-new-app.png b/docs/en/resources/userguide/cicd/argocd/argocd-new-app.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-new-app.png rename to docs/en/resources/userguide/cicd/argocd/argocd-new-app.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-register-karmada.png b/docs/en/resources/userguide/cicd/argocd/argocd-register-karmada.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-register-karmada.png rename to docs/en/resources/userguide/cicd/argocd/argocd-register-karmada.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-status-aggregated.png b/docs/en/resources/userguide/cicd/argocd/argocd-status-aggregated.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-status-aggregated.png rename to docs/en/resources/userguide/cicd/argocd/argocd-status-aggregated.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-status-overview.png b/docs/en/resources/userguide/cicd/argocd/argocd-status-overview.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-status-overview.png rename to docs/en/resources/userguide/cicd/argocd/argocd-status-overview.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-status-resourcebinding.png b/docs/en/resources/userguide/cicd/argocd/argocd-status-resourcebinding.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-status-resourcebinding.png rename to docs/en/resources/userguide/cicd/argocd/argocd-status-resourcebinding.png diff --git a/docs/resources/userguide/cicd/argocd/argocd-sync-apps.png b/docs/en/resources/userguide/cicd/argocd/argocd-sync-apps.png similarity index 100% rename from docs/resources/userguide/cicd/argocd/argocd-sync-apps.png rename to docs/en/resources/userguide/cicd/argocd/argocd-sync-apps.png diff --git a/docs/resources/userguide/failover/failover-overview.png 
b/docs/en/resources/userguide/failover/failover-overview.png similarity index 100% rename from docs/resources/userguide/failover/failover-overview.png rename to docs/en/resources/userguide/failover/failover-overview.png diff --git a/docs/resources/userguide/service/istio/istio-on-karmada-different-network.png b/docs/en/resources/userguide/service/istio/istio-on-karmada-different-network.png similarity index 100% rename from docs/resources/userguide/service/istio/istio-on-karmada-different-network.png rename to docs/en/resources/userguide/service/istio/istio-on-karmada-different-network.png diff --git a/docs/resources/userguide/service/istio/istio-on-karmada.png b/docs/en/resources/userguide/service/istio/istio-on-karmada.png similarity index 100% rename from docs/resources/userguide/service/istio/istio-on-karmada.png rename to docs/en/resources/userguide/service/istio/istio-on-karmada.png diff --git a/docs/troubleshooting/troubleshooting.md b/docs/en/troubleshooting/troubleshooting.md similarity index 100% rename from docs/troubleshooting/troubleshooting.md rename to docs/en/troubleshooting/troubleshooting.md diff --git a/docs/userguide/AWSNeuron-device/enable-awsneuron-managing.md b/docs/en/userguide/AWSNeuron-device/enable-awsneuron-managing.md similarity index 100% rename from docs/userguide/AWSNeuron-device/enable-awsneuron-managing.md rename to docs/en/userguide/AWSNeuron-device/enable-awsneuron-managing.md diff --git a/docs/userguide/AWSNeuron-device/examples/allocate-neuron-core.md b/docs/en/userguide/AWSNeuron-device/examples/allocate-neuron-core.md similarity index 100% rename from docs/userguide/AWSNeuron-device/examples/allocate-neuron-core.md rename to docs/en/userguide/AWSNeuron-device/examples/allocate-neuron-core.md diff --git a/docs/userguide/AWSNeuron-device/examples/allocate-neuron-device.md b/docs/en/userguide/AWSNeuron-device/examples/allocate-neuron-device.md similarity index 100% rename from docs/userguide/AWSNeuron-device/examples/allocate-neuron-device.md rename to docs/en/userguide/AWSNeuron-device/examples/allocate-neuron-device.md diff --git a/docs/userguide/Ascend-device/device-template.md b/docs/en/userguide/Ascend-device/device-template.md similarity index 100% rename from docs/userguide/Ascend-device/device-template.md rename to docs/en/userguide/Ascend-device/device-template.md diff --git a/docs/userguide/Ascend-device/enable-ascend-sharing.md b/docs/en/userguide/Ascend-device/enable-ascend-sharing.md similarity index 100% rename from docs/userguide/Ascend-device/enable-ascend-sharing.md rename to docs/en/userguide/Ascend-device/enable-ascend-sharing.md diff --git a/docs/userguide/Ascend-device/examples/allocate-310p.md b/docs/en/userguide/Ascend-device/examples/allocate-310p.md similarity index 100% rename from docs/userguide/Ascend-device/examples/allocate-310p.md rename to docs/en/userguide/Ascend-device/examples/allocate-310p.md diff --git a/docs/userguide/Ascend-device/examples/allocate-910b.md b/docs/en/userguide/Ascend-device/examples/allocate-910b.md similarity index 100% rename from docs/userguide/Ascend-device/examples/allocate-910b.md rename to docs/en/userguide/Ascend-device/examples/allocate-910b.md diff --git a/docs/userguide/Ascend-device/examples/allocate-exclusive.md b/docs/en/userguide/Ascend-device/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Ascend-device/examples/allocate-exclusive.md rename to docs/en/userguide/Ascend-device/examples/allocate-exclusive.md diff --git 
a/docs/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md b/docs/en/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md similarity index 100% rename from docs/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md rename to docs/en/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md diff --git a/docs/userguide/Cambricon-device/examples/allocate-core-and-memory.md b/docs/en/userguide/Cambricon-device/examples/allocate-core-and-memory.md similarity index 100% rename from docs/userguide/Cambricon-device/examples/allocate-core-and-memory.md rename to docs/en/userguide/Cambricon-device/examples/allocate-core-and-memory.md diff --git a/docs/userguide/Cambricon-device/examples/allocate-exclusive.md b/docs/en/userguide/Cambricon-device/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Cambricon-device/examples/allocate-exclusive.md rename to docs/en/userguide/Cambricon-device/examples/allocate-exclusive.md diff --git a/docs/userguide/Cambricon-device/specify-device-core-usage.md b/docs/en/userguide/Cambricon-device/specify-device-core-usage.md similarity index 100% rename from docs/userguide/Cambricon-device/specify-device-core-usage.md rename to docs/en/userguide/Cambricon-device/specify-device-core-usage.md diff --git a/docs/userguide/Cambricon-device/specify-device-memory-usage.md b/docs/en/userguide/Cambricon-device/specify-device-memory-usage.md similarity index 100% rename from docs/userguide/Cambricon-device/specify-device-memory-usage.md rename to docs/en/userguide/Cambricon-device/specify-device-memory-usage.md diff --git a/docs/userguide/Cambricon-device/specify-device-type-to-use.md b/docs/en/userguide/Cambricon-device/specify-device-type-to-use.md similarity index 100% rename from docs/userguide/Cambricon-device/specify-device-type-to-use.md rename to docs/en/userguide/Cambricon-device/specify-device-type-to-use.md diff --git a/docs/userguide/Device-supported.md b/docs/en/userguide/Device-supported.md similarity index 100% rename from docs/userguide/Device-supported.md rename to docs/en/userguide/Device-supported.md diff --git a/docs/userguide/Enflame-device/enable-enflame-gcu-sharing.md b/docs/en/userguide/Enflame-device/enable-enflame-gcu-sharing.md similarity index 100% rename from docs/userguide/Enflame-device/enable-enflame-gcu-sharing.md rename to docs/en/userguide/Enflame-device/enable-enflame-gcu-sharing.md diff --git a/docs/userguide/Hygon-device/enable-hygon-dcu-sharing.md b/docs/en/userguide/Hygon-device/enable-hygon-dcu-sharing.md similarity index 100% rename from docs/userguide/Hygon-device/enable-hygon-dcu-sharing.md rename to docs/en/userguide/Hygon-device/enable-hygon-dcu-sharing.md diff --git a/docs/userguide/Hygon-device/examples/allocate-core-and-memory.md b/docs/en/userguide/Hygon-device/examples/allocate-core-and-memory.md similarity index 100% rename from docs/userguide/Hygon-device/examples/allocate-core-and-memory.md rename to docs/en/userguide/Hygon-device/examples/allocate-core-and-memory.md diff --git a/docs/userguide/Hygon-device/examples/allocate-exclusive.md b/docs/en/userguide/Hygon-device/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Hygon-device/examples/allocate-exclusive.md rename to docs/en/userguide/Hygon-device/examples/allocate-exclusive.md diff --git a/docs/userguide/Hygon-device/examples/specify-certain-cards.md b/docs/en/userguide/Hygon-device/examples/specify-certain-cards.md similarity index 100% rename from 
docs/userguide/Hygon-device/examples/specify-certain-cards.md rename to docs/en/userguide/Hygon-device/examples/specify-certain-cards.md diff --git a/docs/userguide/Hygon-device/specify-device-core-usage.md b/docs/en/userguide/Hygon-device/specify-device-core-usage.md similarity index 100% rename from docs/userguide/Hygon-device/specify-device-core-usage.md rename to docs/en/userguide/Hygon-device/specify-device-core-usage.md diff --git a/docs/userguide/Hygon-device/specify-device-memory-usage.md b/docs/en/userguide/Hygon-device/specify-device-memory-usage.md similarity index 100% rename from docs/userguide/Hygon-device/specify-device-memory-usage.md rename to docs/en/userguide/Hygon-device/specify-device-memory-usage.md diff --git a/docs/userguide/Hygon-device/specify-device-uuid-to-use.md b/docs/en/userguide/Hygon-device/specify-device-uuid-to-use.md similarity index 100% rename from docs/userguide/Hygon-device/specify-device-uuid-to-use.md rename to docs/en/userguide/Hygon-device/specify-device-uuid-to-use.md diff --git a/docs/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md b/docs/en/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md similarity index 100% rename from docs/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md rename to docs/en/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md diff --git a/docs/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md b/docs/en/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md similarity index 100% rename from docs/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md rename to docs/en/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md diff --git a/docs/userguide/Iluvatar-device/examples/allocate-exclusive.md b/docs/en/userguide/Iluvatar-device/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Iluvatar-device/examples/allocate-exclusive.md rename to docs/en/userguide/Iluvatar-device/examples/allocate-exclusive.md diff --git a/docs/userguide/Kunlunxin-device/enable-kunlunxin-schedule.md b/docs/en/userguide/Kunlunxin-device/enable-kunlunxin-schedule.md similarity index 100% rename from docs/userguide/Kunlunxin-device/enable-kunlunxin-schedule.md rename to docs/en/userguide/Kunlunxin-device/enable-kunlunxin-schedule.md diff --git a/docs/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md b/docs/en/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md similarity index 100% rename from docs/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md rename to docs/en/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md diff --git a/docs/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md b/docs/en/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md similarity index 100% rename from docs/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md rename to docs/en/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md diff --git a/docs/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md b/docs/en/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md similarity index 100% rename from docs/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md rename to docs/en/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md diff --git a/docs/userguide/Metax-device/Metax-GPU/examples/default-use.md b/docs/en/userguide/Metax-device/Metax-GPU/examples/default-use.md similarity index 100% rename from 
docs/userguide/Metax-device/Metax-GPU/examples/default-use.md rename to docs/en/userguide/Metax-device/Metax-GPU/examples/default-use.md diff --git a/docs/userguide/Metax-device/Metax-GPU/specify-binpack-task.md b/docs/en/userguide/Metax-device/Metax-GPU/specify-binpack-task.md similarity index 100% rename from docs/userguide/Metax-device/Metax-GPU/specify-binpack-task.md rename to docs/en/userguide/Metax-device/Metax-GPU/specify-binpack-task.md diff --git a/docs/userguide/Metax-device/Metax-GPU/specify-spread-task.md b/docs/en/userguide/Metax-device/Metax-GPU/specify-spread-task.md similarity index 100% rename from docs/userguide/Metax-device/Metax-GPU/specify-spread-task.md rename to docs/en/userguide/Metax-device/Metax-GPU/specify-spread-task.md diff --git a/docs/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md b/docs/en/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md similarity index 100% rename from docs/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md rename to docs/en/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md diff --git a/docs/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md b/docs/en/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md rename to docs/en/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md diff --git a/docs/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md b/docs/en/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md similarity index 100% rename from docs/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md rename to docs/en/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md diff --git a/docs/userguide/Metax-device/Metax-sGPU/examples/default-use.md b/docs/en/userguide/Metax-device/Metax-sGPU/examples/default-use.md similarity index 100% rename from docs/userguide/Metax-device/Metax-sGPU/examples/default-use.md rename to docs/en/userguide/Metax-device/Metax-sGPU/examples/default-use.md diff --git a/docs/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md b/docs/en/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md similarity index 100% rename from docs/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md rename to docs/en/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md diff --git a/docs/userguide/Mthreads-device/examples/allocate-core-and-memory.md b/docs/en/userguide/Mthreads-device/examples/allocate-core-and-memory.md similarity index 100% rename from docs/userguide/Mthreads-device/examples/allocate-core-and-memory.md rename to docs/en/userguide/Mthreads-device/examples/allocate-core-and-memory.md diff --git a/docs/userguide/Mthreads-device/examples/allocate-exclusive.md b/docs/en/userguide/Mthreads-device/examples/allocate-exclusive.md similarity index 100% rename from docs/userguide/Mthreads-device/examples/allocate-exclusive.md rename to docs/en/userguide/Mthreads-device/examples/allocate-exclusive.md diff --git a/docs/userguide/Mthreads-device/specify-device-core-usage.md b/docs/en/userguide/Mthreads-device/specify-device-core-usage.md similarity index 100% rename from docs/userguide/Mthreads-device/specify-device-core-usage.md rename to docs/en/userguide/Mthreads-device/specify-device-core-usage.md diff --git a/docs/userguide/Mthreads-device/specify-device-memory-usage.md b/docs/en/userguide/Mthreads-device/specify-device-memory-usage.md similarity index 100% rename from 
docs/userguide/Mthreads-device/specify-device-memory-usage.md rename to docs/en/userguide/Mthreads-device/specify-device-memory-usage.md diff --git a/docs/userguide/NVIDIA-device/dynamic-mig-support.md b/docs/en/userguide/NVIDIA-device/dynamic-mig-support.md similarity index 100% rename from docs/userguide/NVIDIA-device/dynamic-mig-support.md rename to docs/en/userguide/NVIDIA-device/dynamic-mig-support.md diff --git a/docs/userguide/NVIDIA-device/examples/allocate-device-core.md b/docs/en/userguide/NVIDIA-device/examples/allocate-device-core.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/allocate-device-core.md rename to docs/en/userguide/NVIDIA-device/examples/allocate-device-core.md diff --git a/docs/userguide/NVIDIA-device/examples/allocate-device-memory.md b/docs/en/userguide/NVIDIA-device/examples/allocate-device-memory.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/allocate-device-memory.md rename to docs/en/userguide/NVIDIA-device/examples/allocate-device-memory.md diff --git a/docs/userguide/NVIDIA-device/examples/allocate-device-memory2.md b/docs/en/userguide/NVIDIA-device/examples/allocate-device-memory2.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/allocate-device-memory2.md rename to docs/en/userguide/NVIDIA-device/examples/allocate-device-memory2.md diff --git a/docs/userguide/NVIDIA-device/examples/dynamic-mig-example.md b/docs/en/userguide/NVIDIA-device/examples/dynamic-mig-example.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/dynamic-mig-example.md rename to docs/en/userguide/NVIDIA-device/examples/dynamic-mig-example.md diff --git a/docs/userguide/NVIDIA-device/examples/specify-card-type-to-use.md b/docs/en/userguide/NVIDIA-device/examples/specify-card-type-to-use.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/specify-card-type-to-use.md rename to docs/en/userguide/NVIDIA-device/examples/specify-card-type-to-use.md diff --git a/docs/userguide/NVIDIA-device/examples/specify-certain-card.md b/docs/en/userguide/NVIDIA-device/examples/specify-certain-card.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/specify-certain-card.md rename to docs/en/userguide/NVIDIA-device/examples/specify-certain-card.md diff --git a/docs/userguide/NVIDIA-device/examples/use-exclusive-card.md b/docs/en/userguide/NVIDIA-device/examples/use-exclusive-card.md similarity index 100% rename from docs/userguide/NVIDIA-device/examples/use-exclusive-card.md rename to docs/en/userguide/NVIDIA-device/examples/use-exclusive-card.md diff --git a/docs/userguide/NVIDIA-device/specify-device-core-usage.md b/docs/en/userguide/NVIDIA-device/specify-device-core-usage.md similarity index 100% rename from docs/userguide/NVIDIA-device/specify-device-core-usage.md rename to docs/en/userguide/NVIDIA-device/specify-device-core-usage.md diff --git a/docs/userguide/NVIDIA-device/specify-device-memory-usage.md b/docs/en/userguide/NVIDIA-device/specify-device-memory-usage.md similarity index 100% rename from docs/userguide/NVIDIA-device/specify-device-memory-usage.md rename to docs/en/userguide/NVIDIA-device/specify-device-memory-usage.md diff --git a/docs/userguide/NVIDIA-device/specify-device-type-to-use.md b/docs/en/userguide/NVIDIA-device/specify-device-type-to-use.md similarity index 100% rename from docs/userguide/NVIDIA-device/specify-device-type-to-use.md rename to docs/en/userguide/NVIDIA-device/specify-device-type-to-use.md diff --git 
a/docs/userguide/NVIDIA-device/specify-device-uuid-to-use.md b/docs/en/userguide/NVIDIA-device/specify-device-uuid-to-use.md similarity index 100% rename from docs/userguide/NVIDIA-device/specify-device-uuid-to-use.md rename to docs/en/userguide/NVIDIA-device/specify-device-uuid-to-use.md diff --git a/docs/userguide/configure.md b/docs/en/userguide/configure.md similarity index 100% rename from docs/userguide/configure.md rename to docs/en/userguide/configure.md diff --git a/docs/userguide/monitoring/device-allocation.md b/docs/en/userguide/monitoring/device-allocation.md similarity index 100% rename from docs/userguide/monitoring/device-allocation.md rename to docs/en/userguide/monitoring/device-allocation.md diff --git a/docs/userguide/monitoring/globalview.md b/docs/en/userguide/monitoring/globalview.md similarity index 100% rename from docs/userguide/monitoring/globalview.md rename to docs/en/userguide/monitoring/globalview.md diff --git a/docs/userguide/monitoring/real-time-device-usage.md b/docs/en/userguide/monitoring/real-time-device-usage.md similarity index 100% rename from docs/userguide/monitoring/real-time-device-usage.md rename to docs/en/userguide/monitoring/real-time-device-usage.md diff --git a/docs/userguide/monitoring/real-time-usage.md b/docs/en/userguide/monitoring/real-time-usage.md similarity index 100% rename from docs/userguide/monitoring/real-time-usage.md rename to docs/en/userguide/monitoring/real-time-usage.md diff --git a/docs/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md b/docs/en/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md similarity index 100% rename from docs/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md rename to docs/en/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md diff --git a/docs/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md b/docs/en/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md similarity index 100% rename from docs/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md rename to docs/en/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md diff --git a/docs/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md b/docs/en/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md similarity index 100% rename from docs/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md rename to docs/en/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md diff --git a/docs/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md b/docs/en/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md similarity index 100% rename from docs/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md rename to docs/en/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..dda7077 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +# HAMi +Open, Device Virtualization, VGPU, Heterogeneous AI Computing + +### What is HAMi + +HAMi (Heterogeneous AI Computing Virtualization Middleware), formerly known as k8s-vGPU-scheduler, is an 'all-in-one' chart designed to manage heterogeneous AI computing devices in a k8s cluster. It provides the ability to share heterogeneous AI devices and enforces resource isolation among tasks. + +HAMi is committed to improving the utilization rate of heterogeneous computing devices in Kubernetes clusters and providing a unified multiplexing interface for different types of heterogeneous devices.
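+As an illustration, the sketch below shows how a workload might request a slice of a shared GPU through HAMi. It assumes the default NVIDIA resource names (`nvidia.com/gpu`, `nvidia.com/gpumem`, `nvidia.com/gpucores`); the Pod name, image, and values are placeholders rather than a recommended configuration:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-share-demo                            # hypothetical name
+spec:
+  containers:
+  - name: cuda
+    image: nvidia/cuda:12.4.0-base-ubuntu22.04    # placeholder image
+    command: ["sleep", "infinity"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1         # number of vGPUs requested
+        nvidia.com/gpumem: 4096   # device memory limit, in MB
+        nvidia.com/gpucores: 30   # share of GPU cores, in percent
+```
+
+Several such Pods may land on the same physical GPU as long as their memory and core requests fit, which is what sharing heterogeneous AI devices means in practice.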
+ +### Why HAMi + +#### Kubernetes Native API Compatible + +Zero change upgrade: compatible with default behaviour from Kubernetes. + +#### Open and Neutral + +Jointly initiated by Internet, finance, manufacturing, cloud providers, etc. Target for open governance with CNCF + +#### Avoid Vendor Lock-in + +Integration with mainstream cloud providers; Not tied to proprietary vendor orchestration + +#### Resource Isolation + +Provides hard isolation of resources within containers, task in containers can't use resources that exceed their quota + +#### Supports a variety of heterogeneous computing devices + +Provides device-sharing on GPU, MLU, NPU from a variety of manufacturers + +#### Unified Management + +Unified monitoring system, Configurable scheduling policies(binpack,spread,etc...) + +### How It Works +tbd... \ No newline at end of file diff --git a/docs/logo.svg b/docs/logo.svg new file mode 100644 index 0000000..83b8b26 --- /dev/null +++ b/docs/logo.svg @@ -0,0 +1,7030 @@ + + + + diff --git a/docs/zh/FAQ/FAQ.md b/docs/zh/FAQ/FAQ.md new file mode 100644 index 0000000..6beaf87 --- /dev/null +++ b/docs/zh/FAQ/FAQ.md @@ -0,0 +1,194 @@ +--- +title: 常见问题 +--- + + +## 支持的设备厂商及具体型号 + +| **GPU 厂商** | **GPU 型号** | **粒度** | **多 GPU 支持** | +| --- | --- | --- | --- | +| NVIDIA | 几乎所有主流消费级和数据中心 GPU | 核心 1%,内存 1M | 支持。多 GPU 仍可通过虚拟化进行拆分和共享。 | +| 昇腾 | 910A、910B2、910B3、310P | 最小粒度取决于卡类型模板。参考[官方模板](https://www.hiascend.com/document/detail/zh/mindx-dl/50rc1/AVI/cpaug/cpaug_0005.html)。 | 支持,但当 `npu > 1` 时不支持拆分,整卡独占。 | +| 海光 | Z100、Z100L、K100-AI | 核心 1%,内存 1M | 支持,但当 `dcu > 1` 时不支持拆分,整卡独占。 | +| 寒武纪 | 370、590 | 核心 1%,内存 256M | 支持,但当 `mlu > 1` 时不支持拆分,整卡独占。 | +| 天数智芯 | 全部 | 核心 1%,内存 256M | 支持,但当 `gpu > 1` 时不支持拆分,整卡独占。 | +| 摩尔线程 | MTT S4000 | 核心为 1 个核心组,内存 512M | 支持,但当 `gpu > 1` 时不支持拆分,整卡独占。 | +| 魅特思 | MXC500 | 不支持拆分,只能整卡分配。 | 支持,但所有分配均为整卡。 | + +## 什么是 vGPU?为什么看到 10 个 vGPU 却无法在同一张卡上分配两个 vGPU? + +**简要说明** + +vGPU 通过逻辑划分方式提升 GPU 利用率,使多个任务共享同一块物理 GPU。设置 `deviceSplitCount: 10` 表示该 GPU 最多可同时服务 10 个任务,但并不允许一个任务使用该 GPU 上的多个 vGPU。 + +--- + +### vGPU 的概念 + +vGPU 是通过虚拟化在物理 GPU 上创建的逻辑实例,使多个任务可共享同一个物理 GPU。例如配置为 `deviceSplitCount: 10`,表示该物理 GPU 最多可被分配给 10 个任务。这种分配并不会增加物理资源,仅改变逻辑可见性。 + +**为什么无法在同一张卡上分配两个 vGPU?** + +1. **vGPU 的含义** + vGPU 是物理 GPU 的不同任务视图,并非物理资源的划分。当任务请求 `nvidia.com/gpu: 2`,它被理解为需要两张物理 GPU,而非同一张卡上的两个 vGPU。 + +2. **资源分配机制** + vGPU 的设计初衷是让多个任务共享一张 GPU,而不是让单个任务绑定多个 vGPU。`deviceSplitCount: 10` 表示最多有 10 个任务可以并发共享此 GPU,并不支持一个任务使用多个 vGPU。 + +3. **容器与节点视图一致性** + 容器中的 GPU UUID 与节点上的物理 GPU UUID 是一致的,即反映的是同一块 GPU。虽然可见多个 vGPU,但这些是逻辑视图而非独立资源。 + +4. **设计目的** + vGPU 的设计是为了 **让一张 GPU 可供多个任务共享**,而不是 **让一个任务使用多个 vGPU**。vGPU 超售的目标是提升资源利用率,而非扩展单个任务的计算能力。 + +## HAMi 的 `nvidia.com/priority` 字段仅支持两级,如何在资源紧张时实现多级用户自定义优先级的排队调度? + +**简要说明** + +HAMi 的两级优先级用于同一张卡内任务的运行时抢占。若需支持多级用户自定义的任务调度优先级,可将 HAMi 与 **Volcano** 集成,利用其队列调度功能实现多级任务分配与抢占。 + +--- + +HAMi 原生的 `nvidia.com/priority` 字段(0 为高优先级,1 为低/默认)是为 **单卡内运行时抢占场景** 设计的。例如一个低优先级训练任务正在运行,若此时有高优先级的推理任务到来,高优先级任务会暂停低优任务,占用资源,完成后低优任务再恢复。此机制仅适用于单设备上的资源抢占,并非用于调度系统中多个任务队列的优先级排序。 + +若需在资源不足、多个任务排队等待的场景中,按照用户提交的多级优先级进行调度,HAMi 本身不具备此能力。 + +但你仍然可以通过与调度器 **Volcano** 集成来实现: + +1. **Volcano 实现多级调度优先级**: + - Volcano 支持定义多个具有不同优先级的队列; + - 可根据队列优先级决定任务的资源分配顺序,并可对任务间进行抢占,支持 HAMi 管理的 vGPU 资源。 + +2. 
**HAMi 管理 GPU 共享与运行时优先级**: + - HAMi 可通过其 [volcano-vgpu-device-plugin](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) 与 Volcano 集成; + - Volcano 负责任务队列排序,HAMi 则负责实际运行时的 GPU 共享与抢占逻辑。 + +**总结**:HAMi 的优先级机制用于卡内任务的运行时抢占;若要实现多级任务调度优先级,应结合 **Volcano 与 HAMi** 使用。 + +## 与其他开源工具的集成情况 + +**已支持**: + +- **Volcano**:通过 [`volcano-vgpu-device-plugin`](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) 与 Volcano 集成,实现 GPU 资源调度与管理。 +- **Koordinator**:支持与 Koordinator 集成,实现端到端的 GPU 共享。通过在节点部署 HAMi-core 并在 Pod 中配置 label 和资源请求,Koordinator 能够利用 HAMi 的 GPU 隔离能力。 + + 配置说明参见:[Device Scheduling - GPU Share With HAMi](https://koordinator.sh/docs/user-manuals/device-scheduling-gpu-share-with-hami/) + +**暂不支持**: + +- **KubeVirt 与 Kata Containers**:由于它们依赖虚拟化进行资源隔离,而 HAMi 的 GPU 插件依赖直接挂载 GPU,无法兼容。若要支持需重构设备分配逻辑,但会增加性能开销,HAMi 当前优先支持高性能直挂场景。 + +## 为什么我的 Pod 输出中有 `[HAMI-core Warn(...)]` 日志?可以关闭吗? + +这是正常日志,可忽略。如需关闭,可在容器中设置环境变量 `LIBCUDA_LOG_LEVEL=0`。 + +## HAMi 支持多节点、多 GPU 分布式训练吗?支持跨节点和跨 GPU 吗? + +**简要说明** + +HAMi 支持多节点多 GPU 分布式训练,单个 Pod 可使用同节点多个 GPU,跨节点则通过多个 Pod 配合分布式框架实现。 + +--- + +### 多节点多 GPU 分布式训练 + +在 Kubernetes 中,HAMi 支持通过在不同节点运行多个 Pod,结合分布式框架(如 PyTorch、TensorFlow、Horovod),实现多节点多 GPU 协同训练。每个 Pod 使用本地 GPU,通过 NCCL、RDMA 等高性能网络通信。 + +### 跨节点与跨 GPU 场景 + +1. **跨节点**:多个 Pod 分布在不同节点上,节点间通过网络同步梯度和参数; +2. **跨 GPU**:单个 Pod 可使用所在节点内的多个 GPU。 + +**注意**:一个 Pod 无法跨节点。需采用多 Pod 分布式训练,由分布式框架协调。 + +## HAMi 插件、Volcano 插件、NVIDIA 官方插件三者的关系与兼容性 + +**简要说明** + +同一节点只能启用一个 GPU 插件,避免资源冲突。 + +--- + +### 插件关系说明 + +三种插件都用于 GPU 资源管理,但适用场景及资源汇报方式不同: + +- **HAMi 插件** + - 使用扩展资源名 `nvidia.com/gpu`; + - 支持 HAMi 的 GPU 管理能力(如 vGPU 拆分、自定义调度); + - 适用于复杂资源管理场景。 + +- **Volcano 插件** + - 使用扩展资源名 `volcano.sh/vgpu-number`; + - 为 Volcano 提供 vGPU 虚拟化资源; + - 适合分布式任务、细粒度调度场景。 + +- **NVIDIA 官方插件** + - 使用扩展资源名 `nvidia.com/gpu`; + - 提供基本 GPU 分配功能; + - 适合直接使用物理 GPU 的稳定场景。 + +### 是否可共存 + +- **HAMi 与 NVIDIA 插件**:不建议共存,会产生资源冲突; +- **HAMi 与 Volcano 插件**:理论上可共存,但推荐只启用一个; +- **NVIDIA 与 Volcano 插件**:理论上可共存,但不建议混合使用。 + +## 为什么 Node Capacity 中只有 `nvidia.com/gpu` 而没有 `nvidia.com/gpucores` 或 `nvidia.com/gpumem`? + +**简要说明** + +Kubernetes 的 Device Plugin 每次只能上报一种资源类型。HAMi 将核心数和内存信息以 Node 注解方式记录供调度器使用。 + +--- + +### Device Plugin 的设计限制 + +- Device Plugin 接口(如 Registration、ListAndWatch)仅允许每个插件实例上报一个资源; +- 这简化了资源管理,但限制了同时上报多个指标(如核心和内存)。 + +### HAMi 的实现 + +- HAMi 将 GPU 详细信息(如算力、内存、型号)存储为 **节点注解**,供调度器解析; +- 示例: + ```yaml + hami.io/node-nvidia-register: GPU-fc28df76-54d2-c387-e52e-5f0a9495968c,10,49140,100,NVIDIA-NVIDIA L40S,0,true:GPU-b97db201-0442-8531-56d4-367e0c7d6edd,10,49140,100,... + +### 后续问题说明 + +**为什么使用 `volcano-vgpu-device-plugin` 时 Node Capacity 中会出现 `volcano.sh/vgpu-number` 和 `volcano.sh/vgpu-memory`?** + +- `volcano-vgpu-device-plugin` 是通过 Kubernetes API **直接补丁方式**将 `volcano.sh/vgpu-number` 和 `volcano.sh/vgpu-memory` 写入 Node 的 `capacity` 和 `allocatable` 字段中,而不是通过标准的 Device Plugin 接口进行注册。 +- **注意**:通过这种方式注册的资源并不受 kubelet 的标准机制管理,**kubelet 无法自动更新或释放这些资源**。 + +--- + +## 为什么某些国产厂商不需要单独安装运行时? + +某些国产厂商(例如:**海光**、**寒武纪**)的 Device Plugin 插件已内置了设备发现与挂载的能力,因此不再需要额外的运行时组件。 +相比之下,**NVIDIA** 和 **昇腾** 等厂商的插件则依赖运行时来完成以下功能: + +- 环境变量和软件依赖配置; +- 设备节点挂载; +- 高级功能(如拓扑感知、NUMA、性能隔离等)支持。 + +--- + +**简要总结** + +当官方插件无法满足高级功能(如缺少必要信息)或引入配置复杂性时,**HAMi 会选择自研 Device Plugin 插件**,以确保调度器获取完整资源信息。 + +--- + +HAMi 的调度器需要从节点获取足够的 GPU 信息来完成资源调度和设备分配。主要通过以下三种方式: + +1. **Patch 节点注解(Annotations)**; +2. **通过标准 Device Plugin 接口上报资源给 kubelet**; +3. 
**直接修改节点的 `status.capacity` 与 `status.allocatable` 字段**。 + +--- + +**为什么 HAMi 要自研插件?举例如下:** + +- **昇腾插件问题**:官方插件需为每种卡类型部署不同插件,HAMi 将其抽象为统一模板,简化集成; +- **NVIDIA 插件问题**:无法支持如 GPU 核心/内存比例限制、GPU 资源超售、NUMA 感知等高级功能,HAMi 需定制插件实现这些调度优化功能。 \ No newline at end of file diff --git a/docs/zh/blog/2024-12-18-support-blog-post/index.md b/docs/zh/blog/2024-12-18-support-blog-post/index.md new file mode 100644 index 0000000..453278d --- /dev/null +++ b/docs/zh/blog/2024-12-18-support-blog-post/index.md @@ -0,0 +1,54 @@ +--- +title: 介绍 HAMi +--- + +## 什么是 HAMi? + +HAMi(异构 AI 计算虚拟化中间件),之前称为 k8s-vGPU-scheduler,是一种创新解决方案, +旨在管理 Kubernetes 集群内的异构 AI 计算设备。这个一站式中间件能够实现各种 AI 设备的共享, +同时确保不同任务之间的资源隔离。通过提高异构计算设备的利用率, +HAMi 提供了一个统一的复用接口,以满足不同设备类型的需求。 + + + +## 为什么选择 HAMi? + +### Kubernetes 本机 API 兼容性 + +HAMi 的突出特点之一是其与 Kubernetes 原生 API 的兼容性。这意味着用户可以在 +不修改现有配置的情况下升级到 HAMi,从而实现无缝过渡,同时保持 Kubernetes 的默认行为。 + +### 开放和中立 + +HAMi 是一个涉及来自各个领域利益相关者的协作倡议,包括互联网服务、金融、制造业和云服务提供商。 +目标是建立云原生计算基金会(CNCF)下的开放治理,确保 HAMi 对所有用户保持中立和可访问。 + +### 避免供应商锁定 + +使用 HAMi,用户可以与主流云服务提供商集成,而无需绑定到专有供应商的编排。 +这种灵活性允许组织选择他们偏好的云解决方案,同时利用 HAMi 的功能。 + +### 资源隔离 + +HAMi 在容器内提供强大的资源隔离。每个在容器中运行的任务都被限制在其分配的资源范围内, +防止任何任务超出其配额。这种严格的隔离增强了计算环境中的安全性和稳定性。 + +### 支持多种异构计算设备 + +HAMi 在支持各种异构计算设备方面表现出色。无论是来自不同制造商的 GPU、MLU 还是 NPU, +HAMi 都促进了设备共享,并在不同的硬件平台上最大化资源效率。 + +### 统一管理 + +为了简化运营,HAMi 提供了一套统一的监控系统,以及如箱装和扩散的可配置调度策略。 +这种全面的管理方法简化了对资源的监管,并提升了整体系统性能。 + +## 结语 + +总之,HAMi 代表了在 Kubernetes 环境中管理异构 AI 计算资源的重大进步。它与现有系统的兼容性、 +对开放治理的承诺以及强大的资源管理能力,使其成为寻求优化其 AI 计算基础设施的组织不可或缺的工具。 + +加入我们,一起踏上使用 HAMi 实现更高效和灵活的 AI 计算的旅程吧! + +引用: +[1] https://project-hami.io diff --git a/docs/zh/blog/2024-12-31-post/index.md b/docs/zh/blog/2024-12-31-post/index.md new file mode 100644 index 0000000..a358607 --- /dev/null +++ b/docs/zh/blog/2024-12-31-post/index.md @@ -0,0 +1,1799 @@ +--- +layout: post +title: HAMI 项目 GPU Pod 调度流程源码走读 +catalog: true +tag: [Kubernetes, GPU, AI] +author: elrond.wang +--- + +- [调度流程](#调度流程) +- [Pod 调度流程](#pod-调度流程) + - [常见问题排查](#常见问题排查) + - [Pod UnexpectedAdmissionError](#pod-unexpectedadmissionerror) + - [调度问题](#调度问题) + - [MutatingWebhook](#mutatingwebhook) + - [Webhook 配置](#webhook-配置) + - [Webhook Server 实现](#webhook-server-实现) + - [拓展 k8s scheduler](#拓展-k8s-scheduler) + - [KubeSchedulerConfiguration](#kubeschedulerconfiguration) + - [拓展调度器 HTTP Server 启动](#拓展调度器-http-server-启动) + - [filter 实现](#filter-实现) + - [获取节点资源信息](#获取节点资源信息) + - [Node 缓存](#node-缓存) + - [device](#device) + - [根据节点资源信息打分](#根据节点资源信息打分) + - [计算出节点的分数](#计算出节点的分数) + - [计算每个容器对应的设备的分数](#计算每个容器对应的设备的分数) + - [binding 实现](#binding-实现) + - [Node 将设备情况写入 node annotation](#node-将设备情况写入-node-annotation) + - [启动 device-plugin 服务](#启动-device-plugin-服务) + - [启动 plugin](#启动-plugin) + - [nvidia 插件的实现](#nvidia-插件的实现) +- [参考](#参考) + +使用 HAMi 的过程中经常会出现 Pod 被创建出来 Pending 的问题,犹以如下两个问题为著: + +- Pod UnexpectedAdmissionError +- Pod Pending + +介于此,展开这部分代码的粗略走读,旨在说明调度过程中各组件的交互,以及资源的计算方式,其他细节会有所遗漏。 + +## 调度流程 + +看代码之前可以先看下官方文档说明,大体上比较明确: + +![flowchart](https://github.com/Project-HAMi/HAMi/blob/master/docs/develop/imgs/flowchart.jpeg?raw=true) + +细节上可以分为三个阶段: + +- 准备阶段: 图上可以看出有一些依赖条件,例如要有 Mutating Webhook、device-plugin 等等。 + 所以这个阶段主要分析下依赖条件的准备,只有在服务首次启动时需要。 + + ![Pod 创建前的准备工作](https://github.com/elrondwong/elrond.wang/raw/master/img/posts/Hami-GPU-Pod-Scheduler/%E5%87%86%E5%A4%87%E5%B7%A5%E4%BD%9C.png) + +- Pod 调度阶段: 准备过程完成之后 Pod 进入处理流程,完成调度 +- Pod 启动阶段: Pod 如何与 Node 上的 GPU 进行交互等 + +本文会着重分析准备阶段,主要内容为调度分析。 + +## Pod 调度流程 + +- 用户发送创建 Pod 请求到 kube-apiserver +- 触发 Adminssion Webhook,更新 Pod 中 schedulerName +- kube-apiserver 根据 
schedulerName 将请求发送给调度器处理 +- 调度器处理 + - 收集 Node device 信息 -- 通过 node annotation 收集,数据来自 daemonSet `hami-device-plugin` 定时写入 + - 根据设备信息以及 Pod 的 limit 信息进行打分,选出最高分的 node + - 将 Pod 和 node 进行绑定完成绑定,进行 Pod 创建 + +### 常见问题排查 + +#### Pod UnexpectedAdmissionError + +Pod 创建状态显示 `UnexpectedAdmissionError` + +了解流程之后,可以知道这个错误代表 kube-apiserver 调用拓展调度器失败,可能有两个原因,其他情况具体排查需要看 kube-apiserver 日志。 + +- 通信异常: 从 kube-apiserver 到拓展调度器的 https 端口不通,有几种可能 + - dns 无法解析 + - 跨节点通信有问题 + - 拓展调度器的服务异常 +- TLS 验证错误: 一般会显示 `webhook x509: certificate signed by unknown authority`,helmchart 部署时有一个 `jobs.batch` `hami-vgpu.admission-pathch`,如果没有运行完成会出现这样的问题 + +#### 调度问题 + +容器一直在 pending 状态,使用 `kubectl describe` 命令可以看到具体原因,主要有以下几个: + +- `card Insufficient remaining memory` +- `calcScore:node not fit pod` + + 主要原因一般是确实资源不足,或者配置错误,配置错误是指 devicememoryscaling 配置未符合预期。 + 有两个地方可以配置,优先级为节点配置大于全局配置,容易发生问题的地方在于 name 需要和 kubectl get node 显示的 nodename 一致才能生效。 + +- 全局配置 `kubectl get cm hami-scheduler-device` + + ```yaml + deviceMemoryScaling: 3 + ``` + +- 节点配置 `kubectl get cm hami-device-plugin` + + ```json + { + "nodeconfig": [ + { + "name": "node1", + "devicememoryscaling": 3, + "devicesplitcount": 10, + "migstrategy": "none", + "filterdevices": { + "uuid": [], + "index": [] + } + } + ] + } + ``` + +### MutatingWebhook + +K8s 提供了 adminssionWebhook 资源, 以 k8s 资源操作为触发器,触发 hook,用途最广泛的为针对 +Pod 创建做拦截,对 Pod 做 YAML 注入,具体的例如增加 init 容器注入文件等等。 + +#### Webhook 配置 + +hami-webhook: + +```bash +kubectl get mutatingwebhookconfigurations.admissionregistration.k8s.io hami-webhook -o yaml +``` + +```yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + annotations: + meta.helm.sh/release-name: hami + meta.helm.sh/release-namespace: kube-system + creationTimestamp: "2024-12-10T03:50:37Z" + generation: 5 + labels: + app.kubernetes.io/managed-by: Helm + name: hami-webhook + resourceVersion: "2307810" + uid: 2cdcebe4-f561-429f-9480-701e65980687 +webhooks: +- admissionReviewVersions: + - v1beta1 + clientConfig: + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUJkakNDQVJ5Z0F3SUJBZ0lSQUxjd2FQMjUrMlphdGhTTlFMcG1qT0V3Q2dZSUtvWkl6ajBFQXdJd0R6RU4KTUFzR0ExVUVDaE1FYm1sc01UQWdGdzB5TkRFeU1EWXdOekV4TVRWYUdBOHlNVEkwTVRFeE1qQTNNVEV4TlZvdwpEekVOTUFzR0ExVUVDaE1FYm1sc01UQlpNQk1HQnlxR1NNNDlBZ0VHQ0NxR1NNNDlBd0VIQTBJQUJDUnlXUDdYCkRmT2N4NEVTMVRYaUs0dnFFU2wrcUFHYjI2YzNrOEdMWlZTL1lHaFpLZVVxaEgydVRhTFdWTW1hZVJFbkxqM0cKSStMVFRVTTR6SVhEUld5alZ6QlZNQTRHQTFVZER3RUIvd1FFQXdJQ0JEQVRCZ05WSFNVRUREQUtCZ2dyQmdFRgpCUWNEQVRBUEJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJTcVV4bWpGa29YUlpRK0xXVzBNM1pJCnMzck1wakFLQmdncWhrak9QUVFEQWdOSUFEQkZBaUJSY2VRL2tJVkR2VTV3Vjl0K3NRWm93TmFhTWhIMTV5K2sKT3VrR0FlRGVtQUloQUxDZzFrM0JQZUJBNG8reWY5emxvVjM2VEk2RHUzaGdMT1B3MXhaZkFvcDMKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + service: + name: hami-scheduler + namespace: kube-system + path: /webhook + port: 443 + failurePolicy: Ignore + matchPolicy: Equivalent + name: vgpu.hami.io + namespaceSelector: + matchExpressions: + - key: hami.io/webhook + operator: NotIn + values: + - ignore + objectSelector: + matchExpressions: + - key: hami.io/webhook + operator: NotIn + values: + - ignore + reinvocationPolicy: Never + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: None + timeoutSeconds: 10 +``` + +当 Pod 创建时,调用 `https://hami-scheduler.kube-system:443/webhook` 做 TLS 校验,CA 为 `caBundle` 配置。 +当命名空间有 `hami.io/webhook: ignore` 的标签时不触发。 + +#### Webhook Server 实现 + +需要实现一个 TLS 的 HTTP Server,且提供 `/webhook` 接口。 + 
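+在看 HAMi 的实际实现之前,可以先看一个概念性的最小示例(非 HAMi 源码,包引用、证书路径等均为假设),了解"提供 `/webhook` 接口的 TLS Server"大致是什么形态:
+
+```golang
+package main
+
+import (
+    "encoding/json"
+    "log"
+    "net/http"
+
+    admissionv1 "k8s.io/api/admission/v1"
+)
+
+// 最小示意:接收 AdmissionReview 请求并原样放行,不做任何修改。
+// 实际的 mutating webhook 还需要对 Pod 做修改并以 JSONPatch 形式返回。
+func webhookHandler(w http.ResponseWriter, r *http.Request) {
+    var review admissionv1.AdmissionReview
+    if err := json.NewDecoder(r.Body).Decode(&review); err != nil {
+        http.Error(w, err.Error(), http.StatusBadRequest)
+        return
+    }
+    if review.Request == nil {
+        http.Error(w, "empty admission request", http.StatusBadRequest)
+        return
+    }
+    // 复用请求中的 TypeMeta,填充 Response 后整体返回
+    review.Response = &admissionv1.AdmissionResponse{
+        UID:     review.Request.UID,
+        Allowed: true,
+    }
+    w.Header().Set("Content-Type", "application/json")
+    json.NewEncoder(w).Encode(review)
+}
+
+func main() {
+    http.HandleFunc("/webhook", webhookHandler)
+    // 证书路径仅为示例假设,实际证书由 helm chart 的 admission patch Job 生成并挂载
+    log.Fatal(http.ListenAndServeTLS(":443", "/tls/tls.crt", "/tls/tls.key", nil))
+}
+```
+
+HAMi 并没有从零实现这些细节,而是基于 controller-runtime 的 admission 包来处理请求,下面看实际源码。
+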
+cmd/scheduler/main.go:84 + +```golang +func start() { + ... + router.POST("/webhook", routes.WebHookRoute()) +``` + +`WebHookRoute` 需要实现 `sigs.k8s.io/controller-runtime@v0.16.3/pkg/webhook/admission/webhook.go:98` + +pkg/scheduler/webhook.go:52 + +```golang + pod := &corev1.Pod{} + err := h.decoder.Decode(req, pod) + if err != nil { + klog.Errorf("Failed to decode request: %v", err) + return admission.Errored(http.StatusBadRequest, err) + } + if len(pod.Spec.Containers) == 0 { + klog.Warningf(template+" - Denying admission as pod has no containers", req.Namespace, req.Name, req.UID) + return admission.Denied("pod has no containers") + } + klog.Infof(template, req.Namespace, req.Name, req.UID) + hasResource := false + for idx, ctr := range pod.Spec.Containers { + c := &pod.Spec.Containers[idx] + if ctr.SecurityContext != nil { + if ctr.SecurityContext.Privileged != nil && *ctr.SecurityContext.Privileged { + klog.Warningf(template+" - Denying admission as container %s is privileged", req.Namespace, req.Name, req.UID, c.Name) + continue + } + } + for _, val := range device.GetDevices() { + found, err := val.MutateAdmission(c, pod) + if err != nil { + klog.Errorf("validating pod failed:%s", err.Error()) + return admission.Errored(http.StatusInternalServerError, err) + } + hasResource = hasResource || found + } + } + + if !hasResource { + klog.Infof(template+" - Allowing admission for pod: no resource found", req.Namespace, req.Name, req.UID) + //return admission.Allowed("no resource found") + } else if len(config.SchedulerName) > 0 { + pod.Spec.SchedulerName = config.SchedulerName + if pod.Spec.NodeName != "" { + klog.Infof(template+" - Pod already has node assigned", req.Namespace, req.Name, req.UID) + return admission.Denied("pod has node assigned") + } + } + marshaledPod, err := json.Marshal(pod) + if err != nil { + klog.Errorf(template+" - Failed to marshal pod, error: %v", req.Namespace, req.Name, req.UID, err) + return admission.Errored(http.StatusInternalServerError, err) + } + return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod) +``` + +主要通过 Pod 中容器的 resource 来判断是否要不要走拓展调度器。 + +pkg/device/nvidia/device.go:246 + +```golang +func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) { + /*gpu related */ + priority, ok := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourcePriority)] + if ok { + ctr.Env = append(ctr.Env, corev1.EnvVar{ + Name: util.TaskPriority, + Value: fmt.Sprint(priority.Value()), + }) + } + + _, resourceNameOK := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceCountName)] + if resourceNameOK { + return resourceNameOK, nil + } + + _, resourceCoresOK := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceCoreName)] + _, resourceMemOK := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceMemoryName)] + _, resourceMemPercentageOK := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceMemoryPercentageName)] + + if resourceCoresOK || resourceMemOK || resourceMemPercentageOK { + if dev.config.DefaultGPUNum > 0 { + ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceCountName)] = *resource.NewQuantity(int64(dev.config.DefaultGPUNum), resource.BinarySI) + resourceNameOK = true + } + } + + if !resourceNameOK && dev.config.OverwriteEnv { + ctr.Env = append(ctr.Env, corev1.EnvVar{ + Name: "NVIDIA_VISIBLE_DEVICES", + Value: "none", + }) + } + return resourceNameOK, nil +} +``` + +主要比对 Pod 的 Resources Limit 中有没有包含 `device-config.yaml` 的配置,如果有走 hami 调度流程 + +`deivce-config` 
以英伟达显卡为例:
+
+```yaml
+nvidia:
+  resourceCountName: nvidia.com/gpu
+  resourceMemoryName: nvidia.com/gpumem
+  resourceMemoryPercentageName: nvidia.com/gpumem-percentage
+  resourceCoreName: nvidia.com/gpucores
+  resourcePriorityName: nvidia.com/priority
+  overwriteEnv: false
+  defaultMemory: 0
+  defaultCores: 0
+  defaultGPUNum: 1
+  deviceSplitCount: 10
+  deviceMemoryScaling: 3
+  deviceCoreScaling: 3
+```
+
+确定走 HAMi 调度流程之后,通过 Patch 修改 Pod 的 `schedulerName` 为 HAMi 调度器的名称。
+
+### 拓展 k8s scheduler
+
+拓展调度器可以通过 [KubeSchedulerConfiguration](https://kubernetes.io/docs/reference/config-api/kube-scheduler-config.v1/) 中声明的拓展点对默认调度器进行拓展。
+
+#### KubeSchedulerConfiguration
+
+```bash
+kubectl get cm hami-scheduler-newversion -o yaml
+```
+
+```yaml
+apiVersion: v1
+data:
+  config.yaml: |
+    apiVersion: kubescheduler.config.k8s.io/v1beta2
+    kind: KubeSchedulerConfiguration
+    leaderElection:
+      leaderElect: false
+    profiles:
+    - schedulerName: hami-scheduler
+    extenders:
+    - urlPrefix: "https://127.0.0.1:443"
+      filterVerb: filter
+      bindVerb: bind
+      nodeCacheCapable: true
+      weight: 1
+      httpTimeout: 30s
+      enableHTTPS: true
+      tlsConfig:
+        insecure: true
+      managedResources:
+      - name: nvidia.com/gpu
+        ignoredByScheduler: true
+      - name: nvidia.com/gpumem
+        ignoredByScheduler: true
+      - name: nvidia.com/gpucores
+        ignoredByScheduler: true
+      - name: nvidia.com/gpumem-percentage
+        ignoredByScheduler: true
+      - name: nvidia.com/priority
+        ignoredByScheduler: true
+      - name: cambricon.com/vmlu
+        ignoredByScheduler: true
+      - name: hygon.com/dcunum
+        ignoredByScheduler: true
+      - name: hygon.com/dcumem
+        ignoredByScheduler: true
+      - name: hygon.com/dcucores
+        ignoredByScheduler: true
+      - name: iluvatar.ai/vgpu
+        ignoredByScheduler: true
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: hami
+    meta.helm.sh/release-namespace: kube-system
+  creationTimestamp: "2024-12-10T03:50:36Z"
+  labels:
+    app.kubernetes.io/component: hami-scheduler
+    app.kubernetes.io/instance: hami
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: hami
+    app.kubernetes.io/version: 2.4.1
+    helm.sh/chart: hami-2.4.1
+  name: hami-scheduler-newversion
+  namespace: kube-system
+  resourceVersion: "2316275"
+  uid: 3a61a72c-0bab-432f-b4d7-5c1ae46ee14d
+```
+
+拓展调度器通过[拓展点](https://kubernetes.io/docs/reference/scheduling/config/#extension-points)进行拓展,这里拓展了 filter 和 bind。
+
+- filter: 找到最合适的 node
+- bind: 为 Pod 创建一个 binding 资源
+
+调度时会根据拓展点顺序来调用拓展调度器的实现,这里会先调用
+`https://127.0.0.1:443/filter`,再调用 `https://127.0.0.1:443/bind`。
+
+#### 拓展调度器 HTTP Server 启动
+
+`cmd/scheduler/main.go:70`
+
+```golang
+func start() {
+    device.InitDevices()
+    sher = scheduler.NewScheduler()
+    sher.Start()
+    defer sher.Stop()
+
+    // start monitor metrics
+    go sher.RegisterFromNodeAnnotations()
+    go initMetrics(config.MetricsBindAddress)
+
+    // start http server
+    router := httprouter.New()
+    router.POST("/filter", routes.PredicateRoute(sher))
+    router.POST("/bind", routes.Bind(sher))
+```
+
+#### filter 实现
+
+`pkg/scheduler/routes/route.go:41`
+
+```golang
+func PredicateRoute(s *scheduler.Scheduler) httprouter.Handle {
+    klog.Infoln("Into Predicate Route outer func")
+    return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
+        klog.Infoln("Into Predicate Route inner func")
+        checkBody(w, r)
+
+        var buf bytes.Buffer
+        body := io.TeeReader(r.Body, &buf)
+
+        var extenderArgs extenderv1.ExtenderArgs
+        var extenderFilterResult *extenderv1.ExtenderFilterResult
+
+        if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil { + 
klog.Errorln("decode error", err.Error()) + extenderFilterResult = &extenderv1.ExtenderFilterResult{ + Error: err.Error(), + } + } else { + extenderFilterResult, err = s.Filter(extenderArgs) + if err != nil { + klog.Errorf("pod %v filter error, %v", extenderArgs.Pod.Name, err) + extenderFilterResult = &extenderv1.ExtenderFilterResult{ + Error: err.Error(), + } + } + } + + if resultBody, err := json.Marshal(extenderFilterResult); err != nil { + klog.Errorf("Failed to marshal extenderFilterResult: %+v, %+v", + err, extenderFilterResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + w.Write([]byte(err.Error())) + } else { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write(resultBody) + } + } +} +``` + +`pkg/scheduler/scheduler.go:430` + +```golang +func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFilterResult, error) { + klog.InfoS("begin schedule filter", "pod", args.Pod.Name, "uuid", args.Pod.UID, "namespaces", args.Pod.Namespace) + nums := k8sutil.Resourcereqs(args.Pod) + total := 0 + for _, n := range nums { + for _, k := range n { + total += int(k.Nums) + } + } + if total == 0 { + klog.V(1).Infof("pod %v not find resource", args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, fmt.Errorf("does not request any resource")) + return &extenderv1.ExtenderFilterResult{ + NodeNames: args.NodeNames, + FailedNodes: nil, + Error: "", + }, nil + } + annos := args.Pod.Annotations + s.delPod(args.Pod) + nodeUsage, failedNodes, err := s.getNodesUsage(args.NodeNames, args.Pod) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } + if len(failedNodes) != 0 { + klog.V(5).InfoS("getNodesUsage failed nodes", "nodes", failedNodes) + } + nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod) + if err != nil { + err := fmt.Errorf("calcScore failed %v for pod %v", err, args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } + if len((*nodeScores).NodeList) == 0 { + klog.V(4).Infof("All node scores do not meet for pod %v", args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, fmt.Errorf("no available node, all node scores do not meet")) + return &extenderv1.ExtenderFilterResult{ + FailedNodes: failedNodes, + }, nil + } + klog.V(4).Infoln("nodeScores_len=", len((*nodeScores).NodeList)) + sort.Sort(nodeScores) + m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1] + klog.Infof("schedule %v/%v to %v %v", args.Pod.Namespace, args.Pod.Name, m.NodeID, m.Devices) + annotations := make(map[string]string) + annotations[util.AssignedNodeAnnotations] = m.NodeID + annotations[util.AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + for _, val := range device.GetDevices() { + val.PatchAnnotations(&annotations, m.Devices) + } + + //InRequestDevices := util.EncodePodDevices(util.InRequestDevices, m.devices) + //supportDevices := util.EncodePodDevices(util.SupportDevices, m.devices) + //maps.Copy(annotations, InRequestDevices) + //maps.Copy(annotations, supportDevices) + s.addPod(args.Pod, m.NodeID, m.Devices) + err = util.PatchPodAnnotations(args.Pod, annotations) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + s.delPod(args.Pod) + return nil, err + } + 
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringSucceed, []string{m.NodeID}, nil) + res := extenderv1.ExtenderFilterResult{NodeNames: &[]string{m.NodeID}} + return &res, nil +} +``` + +这里核心逻辑主要有两步, 获取节点资源、根据节点已分配资源与总资源计算分数并选出一个最高分。 + +##### 获取节点资源信息 + +`pkg/scheduler/scheduler.go:241` + +```golang +func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[string]*NodeUsage, map[string]string, error) { + overallnodeMap := make(map[string]*NodeUsage) + cachenodeMap := make(map[string]*NodeUsage) + failedNodes := make(map[string]string) + //for _, nodeID := range *nodes { + allNodes, err := s.ListNodes() + if err != nil { + return &overallnodeMap, failedNodes, err + } + + for _, node := range allNodes { + nodeInfo := &NodeUsage{} + userGPUPolicy := config.GPUSchedulerPolicy + if task != nil && task.Annotations != nil { + if value, ok := task.Annotations[policy.GPUSchedulerPolicyAnnotationKey]; ok { + userGPUPolicy = value + } + } + nodeInfo.Node = node.Node + nodeInfo.Devices = policy.DeviceUsageList{ + Policy: userGPUPolicy, + DeviceLists: make([]*policy.DeviceListsScore, 0), + } + for _, d := range node.Devices { + nodeInfo.Devices.DeviceLists = append(nodeInfo.Devices.DeviceLists, &policy.DeviceListsScore{ + Score: 0, + Device: &util.DeviceUsage{ + ID: d.ID, + Index: d.Index, + Used: 0, + Count: d.Count, + Usedmem: 0, + Totalmem: d.Devmem, + Totalcore: d.Devcore, + Usedcores: 0, + MigUsage: util.MigInUse{ + Index: 0, + UsageList: make(util.MIGS, 0), + }, + MigTemplate: d.MIGTemplate, + Mode: d.Mode, + Type: d.Type, + Numa: d.Numa, + Health: d.Health, + }, + }) + } + overallnodeMap[node.ID] = nodeInfo + } + + podsInfo := s.ListPodsInfo() + for _, p := range podsInfo { + node, ok := overallnodeMap[p.NodeID] + if !ok { + continue + } + for _, podsingleds := range p.Devices { + for _, ctrdevs := range podsingleds { + for _, udevice := range ctrdevs { + for _, d := range node.Devices.DeviceLists { + deviceID := udevice.UUID + if strings.Contains(deviceID, "[") { + deviceID = strings.Split(deviceID, "[")[0] + } + if d.Device.ID == deviceID { + d.Device.Used++ + d.Device.Usedmem += udevice.Usedmem + d.Device.Usedcores += udevice.Usedcores + if strings.Contains(udevice.UUID, "[") { + tmpIdx, Instance := util.ExtractMigTemplatesFromUUID(udevice.UUID) + if len(d.Device.MigUsage.UsageList) == 0 { + util.PlatternMIG(&d.Device.MigUsage, d.Device.MigTemplate, tmpIdx) + } + d.Device.MigUsage.UsageList[Instance].InUse = true + klog.V(3).Infoln("add mig usage", d.Device.MigUsage, "template=", d.Device.MigTemplate, "uuid=", d.Device.ID) + } + } + } + } + } + } + klog.V(5).Infof("usage: pod %v assigned %v %v", p.Name, p.NodeID, p.Devices) + } + s.overviewstatus = overallnodeMap + for _, nodeID := range *nodes { + node, err := s.GetNode(nodeID) + if err != nil { + // The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority. 
+ klog.V(5).InfoS("node unregistered", "node", nodeID, "error", err) + failedNodes[nodeID] = "node unregistered" + continue + } + cachenodeMap[node.ID] = overallnodeMap[node.ID] + } + s.cachedstatus = cachenodeMap + return &cachenodeMap, failedNodes, nil +} +``` + +获取 Node 总的资源与已分配的资源, 首先获取 Node 信息。 + +`pkg/scheduler/nodes.go:120` + +```golang +func (m *nodeManager) ListNodes() (map[string]*util.NodeInfo, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + return m.nodes, nil +} +``` + +这里用到了缓存,缓存节点信息,由 `addNode` 添加缓存。 + +###### Node 缓存 + +`pkg/scheduler/nodes.go:46` + +```golang +func (m *nodeManager) addNode(nodeID string, nodeInfo *util.NodeInfo) { + if nodeInfo == nil || len(nodeInfo.Devices) == 0 { + return + } + m.mutex.Lock() + defer m.mutex.Unlock() + _, ok := m.nodes[nodeID] + if ok { + if len(nodeInfo.Devices) > 0 { + tmp := make([]util.DeviceInfo, 0, len(nodeInfo.Devices)) + devices := device.GetDevices() + deviceType := "" + for _, val := range devices { + if strings.Contains(nodeInfo.Devices[0].Type, val.CommonWord()) { + deviceType = val.CommonWord() + } + } + for _, val := range m.nodes[nodeID].Devices { + if !strings.Contains(val.Type, deviceType) { + tmp = append(tmp, val) + } + } + m.nodes[nodeID].Devices = tmp + m.nodes[nodeID].Devices = append(m.nodes[nodeID].Devices, nodeInfo.Devices...) + } + } else { + m.nodes[nodeID] = nodeInfo + } +} +``` + +这里的主要逻辑在于 `device.GetDevices()` 获取设备信息 + +`pkg/device/devices.go:81` + +```golang +func GetDevices() map[string]Devices { + return devices +} +``` + +device 也是个缓存,后面再分析,首先看 Node 缓存是什么时候被调用的。 + +`pkg/scheduler/scheduler.go:155` + +```golang +func (s *Scheduler) RegisterFromNodeAnnotations() { + klog.V(5).Infoln("Scheduler into RegisterFromNodeAnnotations") + ticker := time.NewTicker(time.Second * 15) + for { + select { + case <-s.nodeNotify: + case <-ticker.C: + case <-s.stopCh: + return + } + labelSelector := labels.Everything() + if len(config.NodeLabelSelector) > 0 { + labelSelector = (labels.Set)(config.NodeLabelSelector).AsSelector() + } + rawNodes, err := s.nodeLister.List(labelSelector) + if err != nil { + klog.Errorln("nodes list failed", err.Error()) + continue + } + var nodeNames []string + for _, val := range rawNodes { + nodeNames = append(nodeNames, val.Name) + for devhandsk, devInstance := range device.GetDevices() { + health, needUpdate := devInstance.CheckHealth(devhandsk, val) + klog.V(5).InfoS("device check health", "node", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate) + if !health { + err := devInstance.NodeCleanUp(val.Name) + // If the device is not healthy, the device is removed from the node. + // At the same time, this node needs to be removed from the cache. 
+ if err != nil { + klog.Errorln("node cleanup failed", err.Error()) + } + info, ok := s.nodes[val.Name] + if ok { + klog.Infof("node %v device %s:%v leave, %v remaining devices:%v", val.Name, devhandsk, info.ID, err, s.nodes[val.Name].Devices) + s.rmNodeDevice(val.Name, info, devhandsk) + continue + } + } + if !needUpdate { + continue + } + _, ok := util.HandshakeAnnos[devhandsk] + if ok { + tmppat := make(map[string]string) + tmppat[util.HandshakeAnnos[devhandsk]] = "Requesting_" + time.Now().Format("2006.01.02 15:04:05") + klog.V(4).InfoS("New timestamp", util.HandshakeAnnos[devhandsk], tmppat[util.HandshakeAnnos[devhandsk]], "nodeName", val.Name) + n, err := util.GetNode(val.Name) + if err != nil { + klog.Errorln("get node failed", err.Error()) + continue + } + util.PatchNodeAnnotations(n, tmppat) + } + + nodeInfo := &util.NodeInfo{} + nodeInfo.ID = val.Name + nodeInfo.Node = val + nodedevices, err := devInstance.GetNodeDevices(*val) + if err != nil { + continue + } + nodeInfo.Devices = make([]util.DeviceInfo, 0) + for _, deviceinfo := range nodedevices { + nodeInfo.Devices = append(nodeInfo.Devices, *deviceinfo) + } + s.addNode(val.Name, nodeInfo) + if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 { + klog.Infof("node %v device %s come node info=%s,%v total=%v", val.Name, devhandsk, nodeInfo.ID, nodeInfo.Devices, s.nodes[val.Name].Devices) + } + } + } + _, _, err = s.getNodesUsage(&nodeNames, nil) + if err != nil { + klog.Errorln("get node usage failed", err.Error()) + } + } +} +``` + +启动了一个 15s 的定时任务,获取 Node 信息维护 Node 缓存。 + +这里的核心逻辑在于 `for devhandsk, devInstance := range device.GetDevices()` 获取所有的 device, +主要是一些根据不同的设备注册了不同的 handler,根据注册的 device 获取显卡的资源信息 `devInstance.GetNodeDevices`。 + +这里会通过注册的 device(此环境为 nvidia),调用到不同显卡的`GetNodeDevices`实现,device 后面再做具体说明。 + +`pkg/device/nvidia/device.go:209` + +```golang +ffunc (dev *NvidiaGPUDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) { + devEncoded, ok := n.Annotations[RegisterAnnos] + if !ok { + return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) + } + nodedevices, err := util.DecodeNodeDevices(devEncoded) + if err != nil { + klog.ErrorS(err, "failed to decode node devices", "node", n.Name, "device annotation", devEncoded) + return []*util.DeviceInfo{}, err + } + if len(nodedevices) == 0 { + klog.InfoS("no nvidia gpu device found", "node", n.Name, "device annotation", devEncoded) + return []*util.DeviceInfo{}, errors.New("no gpu found on node") + } + for _, val := range nodedevices { + if val.Mode == "mig" { + val.MIGTemplate = make([]util.Geometry, 0) + for _, migTemplates := range dev.config.MigGeometriesList { + found := false + for _, migDevices := range migTemplates.Models { + if strings.Contains(val.Type, migDevices) { + found = true + break + } + } + if found { + val.MIGTemplate = append(val.MIGTemplate, migTemplates.Geometries...) + break + } + } + } + } + devDecoded := util.EncodeNodeDevices(nodedevices) + klog.V(5).InfoS("nodes device information", "node", n.Name, "nodedevices", devDecoded) + return nodedevices, nil +} +``` + +看到这里基本逻辑是 scheduler 通过定时器去读取 node 的 annotation 信息并将其维护再 node 缓存中,以供调度时使用。 + +```yaml +apiVersion: v1 +kind: Node +metadata: + annotations: + ... 
+ hami.io/node-nvidia-register: 'GPU-7aebc545-cbd3-18a0-afce-76cae449702a,10,24576,300,NVIDIA-NVIDIA + GeForce RTX 3090,0,true: +``` + +又调用到了 device,这个我们待会儿再看,继续看谁调用的 `RegisterFromNodeAnnotations`。 + +`cmd/scheduler/main.go:70` + +```golang +func start() { + device.InitDevices() + sher = scheduler.NewScheduler() + sher.Start() + defer sher.Stop() + + // start monitor metrics + go sher.RegisterFromNodeAnnotations() + go initMetrics(config.MetricsBindAddress) +``` + +调度器启动的时候就会调用,这里逻辑明确了,继续看刚刚的 device。 + +###### device + +device 通过 `pkg/device/devices.go:85` 进行初始化。 + +```golang +func InitDevicesWithConfig(config *Config) { + devices = make(map[string]Devices) + DevicesToHandle = []string{} + devices[nvidia.NvidiaGPUDevice] = nvidia.InitNvidiaDevice(config.NvidiaConfig) + devices[cambricon.CambriconMLUDevice] = cambricon.InitMLUDevice(config.CambriconConfig) + devices[hygon.HygonDCUDevice] = hygon.InitDCUDevice(config.HygonConfig) + devices[iluvatar.IluvatarGPUDevice] = iluvatar.InitIluvatarDevice(config.IluvatarConfig) + devices[mthreads.MthreadsGPUDevice] = mthreads.InitMthreadsDevice(config.MthreadsConfig) + devices[metax.MetaxGPUDevice] = metax.InitMetaxDevice(config.MetaxConfig) + + DevicesToHandle = append(DevicesToHandle, nvidia.NvidiaGPUCommonWord) + DevicesToHandle = append(DevicesToHandle, cambricon.CambriconMLUCommonWord) + DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord) + DevicesToHandle = append(DevicesToHandle, iluvatar.IluvatarGPUCommonWord) + DevicesToHandle = append(DevicesToHandle, mthreads.MthreadsGPUCommonWord) + DevicesToHandle = append(DevicesToHandle, metax.MetaxGPUCommonWord) + for _, dev := range ascend.InitDevices(config.VNPUs) { + devices[dev.CommonWord()] = dev + DevicesToHandle = append(DevicesToHandle, dev.CommonWord()) + } +} +``` + +这里用的是 nvidia,所以主要看 `InitNvidiaDevice` 即可。 + +`pkg/device/devices.go:42` + +```golang +type Devices interface { + CommonWord() string + MutateAdmission(ctr *corev1.Container, pod *corev1.Pod) (bool, error) + CheckHealth(devType string, n *corev1.Node) (bool, bool) + NodeCleanUp(nn string) error + GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) + CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) + // CheckUUID is check current device id whether in GPUUseUUID or GPUNoUseUUID set, return true is check success. 
+ CheckUUID(annos map[string]string, d util.DeviceUsage) bool + LockNode(n *corev1.Node, p *corev1.Pod) error + ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error + GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest + PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string + CustomFilterRule(allocated *util.PodDevices, request util.ContainerDeviceRequest, toAllicate util.ContainerDevices, device *util.DeviceUsage) bool + ScoreNode(node *corev1.Node, podDevices util.PodSingleDevice, policy string) float32 + AddResourceUsage(n *util.DeviceUsage, ctr *util.ContainerDevice) error + // This should not be associated with a specific device object + //ParseConfig(fs *flag.FlagSet) +} +``` + +这里定义了一些接口,不同的设备进行不同的实现,在 scheduler 启动时进行初始化,以供运行中调用。 + +获取到各个节点的各个设备的资源情况之后开始进行打分。 + +##### 根据节点资源信息打分 + +`pkg/scheduler/scheduler.go:458` + +```golang + nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod) + if err != nil { + err := fmt.Errorf("calcScore failed %v for pod %v", err, args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } +``` + +`pkg/scheduler/score.go:198` + +```golang +func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, nums util.PodDeviceRequests, annos map[string]string, task *corev1.Pod) (*policy.NodeScoreList, error) { + userNodePolicy := config.NodeSchedulerPolicy + if annos != nil { + if value, ok := annos[policy.NodeSchedulerPolicyAnnotationKey]; ok { + userNodePolicy = value + } + } + res := policy.NodeScoreList{ + Policy: userNodePolicy, + NodeList: make([]*policy.NodeScore, 0), + } + + //func calcScore(nodes *map[string]*NodeUsage, errMap *map[string]string, nums util.PodDeviceRequests, annos map[string]string, task *corev1.Pod) (*NodeScoreList, error) { + // res := make(NodeScoreList, 0, len(*nodes)) + for nodeID, node := range *nodes { + viewStatus(*node) + score := policy.NodeScore{NodeID: nodeID, Node: node.Node, Devices: make(util.PodDevices), Score: 0} + score.ComputeDefaultScore(node.Devices) + + //This loop is for different container request + ctrfit := false + for ctrid, n := range nums { + sums := 0 + for _, k := range n { + sums += int(k.Nums) + } + + if sums == 0 { + for idx := range score.Devices { + for len(score.Devices[idx]) <= ctrid { + score.Devices[idx] = append(score.Devices[idx], util.ContainerDevices{}) + } + score.Devices[idx][ctrid] = append(score.Devices[idx][ctrid], util.ContainerDevice{}) + continue + } + } + klog.V(5).InfoS("fitInDevices", "pod", klog.KObj(task), "node", nodeID) + fit, _ := fitInDevices(node, n, annos, task, &score.Devices) + ctrfit = fit + if !fit { + klog.InfoS("calcScore:node not fit pod", "pod", klog.KObj(task), "node", nodeID) + break + } + } + + if ctrfit { + res.NodeList = append(res.NodeList, &score) + score.OverrideScore(node.Devices, userNodePolicy) + } + } + return &res, nil +} +``` + +这块逻辑主要分为遍历节点打分,遍历 Pod 的容器计算每个容器对应的设备的分数,返回所有可以承载 limits 所需资源的 node 返回。 + +##### 计算出节点的分数 + +`pkg/scheduler/policy/node_policy.go:68` + +```golang +func (ns *NodeScore) ComputeDefaultScore(devices DeviceUsageList) { + used, usedCore, usedMem := int32(0), int32(0), int32(0) + for _, device := range devices.DeviceLists { + used += device.Device.Used + usedCore += device.Device.Usedcores + usedMem += device.Device.Usedmem + } + klog.V(2).Infof("node %s used %d, usedCore %d, usedMem %d,", ns.NodeID, used, usedCore, usedMem) + + total, totalCore, totalMem := int32(0), int32(0), int32(0) + for _, 
deviceLists := range devices.DeviceLists { + total += deviceLists.Device.Count + totalCore += deviceLists.Device.Totalcore + totalMem += deviceLists.Device.Totalmem + } + useScore := float32(used) / float32(total) + coreScore := float32(usedCore) / float32(totalCore) + memScore := float32(usedMem) / float32(totalMem) + ns.Score = float32(Weight) * (useScore + coreScore + memScore) + klog.V(2).Infof("node %s computer default score is %f", ns.NodeID, ns.Score) +} +``` + +节点打分规则比较简单 + +##### 计算每个容器对应的设备的分数 + +`pkg/scheduler/score.go:149` + +```golang +func fitInDevices(node *NodeUsage, requests util.ContainerDeviceRequests, annos map[string]string, pod *corev1.Pod, devinput *util.PodDevices) (bool, float32) { + //devmap := make(map[string]util.ContainerDevices) + devs := util.ContainerDevices{} + total, totalCore, totalMem := int32(0), int32(0), int32(0) + free, freeCore, freeMem := int32(0), int32(0), int32(0) + sums := 0 + // computer all device score for one node + for index := range node.Devices.DeviceLists { + node.Devices.DeviceLists[index].ComputeScore(requests) + } + //This loop is for requests for different devices + for _, k := range requests { + sums += int(k.Nums) + if int(k.Nums) > len(node.Devices.DeviceLists) { + klog.InfoS("request devices nums cannot exceed the total number of devices on the node.", "pod", klog.KObj(pod), "request devices nums", k.Nums, "node device nums", len(node.Devices.DeviceLists)) + return false, 0 + } + sort.Sort(node.Devices) + fit, tmpDevs := fitInCertainDevice(node, k, annos, pod, devinput) + if fit { + for idx, val := range tmpDevs[k.Type] { + for nidx, v := range node.Devices.DeviceLists { + //bc node.Devices has been sorted, so we should find out the correct device + if v.Device.ID != val.UUID { + continue + } + total += v.Device.Count + totalCore += v.Device.Totalcore + totalMem += v.Device.Totalmem + free += v.Device.Count - v.Device.Used + freeCore += v.Device.Totalcore - v.Device.Usedcores + freeMem += v.Device.Totalmem - v.Device.Usedmem + err := device.GetDevices()[k.Type].AddResourceUsage(node.Devices.DeviceLists[nidx].Device, &tmpDevs[k.Type][idx]) + if err != nil { + klog.Errorf("AddResource failed:%s", err.Error()) + return false, 0 + } + klog.Infoln("After AddResourceUsage:", node.Devices.DeviceLists[nidx].Device) + } + } + devs = append(devs, tmpDevs[k.Type]...) 
+ } else { + return false, 0 + } + (*devinput)[k.Type] = append((*devinput)[k.Type], devs) + } + return true, 0 +} +``` + +主要逻辑为: + +- 给容器对应的每个设备打分、遍历不同的容器对应的 limit 资源,找到可以承载容器 limits 资源的设备 + +`pkg/scheduler/policy/gpu_policy.go:58` + +```golang +func (ds *DeviceListsScore) ComputeScore(requests util.ContainerDeviceRequests) { + request, core, mem := int32(0), int32(0), int32(0) + // Here we are required to use the same type device + for _, container := range requests { + request += container.Nums + core += container.Coresreq + if container.MemPercentagereq != 0 && container.MemPercentagereq != 101 { + mem += ds.Device.Totalmem * (container.MemPercentagereq / 100.0) + continue + } + mem += container.Memreq + } + klog.V(2).Infof("device %s user %d, userCore %d, userMem %d,", ds.Device.ID, ds.Device.Used, ds.Device.Usedcores, ds.Device.Usedmem) + + usedScore := float32(request+ds.Device.Used) / float32(ds.Device.Count) + coreScore := float32(core+ds.Device.Usedcores) / float32(ds.Device.Totalcore) + memScore := float32(mem+ds.Device.Usedmem) / float32(ds.Device.Totalmem) + ds.Score = float32(Weight) * (usedScore + coreScore + memScore) + klog.V(2).Infof("device %s computer score is %f", ds.Device.ID, ds.Score) +} +``` + +打分规则与节点类似。 + +`pkg/scheduler/score.go:65` + +```golang +func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *corev1.Pod, allocated *util.PodDevices) (bool, map[string]util.ContainerDevices) { + k := request + originReq := k.Nums + prevnuma := -1 + klog.InfoS("Allocating device for container request", "pod", klog.KObj(pod), "card request", k) + var tmpDevs map[string]util.ContainerDevices + tmpDevs = make(map[string]util.ContainerDevices) + for i := len(node.Devices.DeviceLists) - 1; i >= 0; i-- { + klog.InfoS("scoring pod", "pod", klog.KObj(pod), "Memreq", k.Memreq, "MemPercentagereq", k.MemPercentagereq, "Coresreq", k.Coresreq, "Nums", k.Nums, "device index", i, "device", node.Devices.DeviceLists[i].Device.ID) + found, numa := checkType(annos, *node.Devices.DeviceLists[i].Device, k) + if !found { + klog.InfoS("card type mismatch,continuing...", "pod", klog.KObj(pod), (node.Devices.DeviceLists[i].Device).Type, k.Type) + continue + } + if numa && prevnuma != node.Devices.DeviceLists[i].Device.Numa { + klog.InfoS("Numa not fit, resotoreing", "pod", klog.KObj(pod), "k.nums", k.Nums, "numa", numa, "prevnuma", prevnuma, "device numa", node.Devices.DeviceLists[i].Device.Numa) + k.Nums = originReq + prevnuma = node.Devices.DeviceLists[i].Device.Numa + tmpDevs = make(map[string]util.ContainerDevices) + } + if !checkUUID(annos, *node.Devices.DeviceLists[i].Device, k) { + klog.InfoS("card uuid mismatch,", "pod", klog.KObj(pod), "current device info is:", *node.Devices.DeviceLists[i].Device) + continue + } + + memreq := int32(0) + if node.Devices.DeviceLists[i].Device.Count <= node.Devices.DeviceLists[i].Device.Used { + continue + } + if k.Coresreq > 100 { + klog.ErrorS(nil, "core limit can't exceed 100", "pod", klog.KObj(pod)) + k.Coresreq = 100 + //return false, tmpDevs + } + if k.Memreq > 0 { + memreq = k.Memreq + } + if k.MemPercentagereq != 101 && k.Memreq == 0 { + //This incurs an issue + memreq = node.Devices.DeviceLists[i].Device.Totalmem * k.MemPercentagereq / 100 + } + if node.Devices.DeviceLists[i].Device.Totalmem-node.Devices.DeviceLists[i].Device.Usedmem < memreq { + klog.V(5).InfoS("card Insufficient remaining memory", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID, "device 
total memory", node.Devices.DeviceLists[i].Device.Totalmem, "device used memory", node.Devices.DeviceLists[i].Device.Usedmem, "request memory", memreq) + continue + } + if node.Devices.DeviceLists[i].Device.Totalcore-node.Devices.DeviceLists[i].Device.Usedcores < k.Coresreq { + klog.V(5).InfoS("card Insufficient remaining cores", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID, "device total core", node.Devices.DeviceLists[i].Device.Totalcore, "device used core", node.Devices.DeviceLists[i].Device.Usedcores, "request cores", k.Coresreq) + continue + } + // Coresreq=100 indicates it want this card exclusively + if node.Devices.DeviceLists[i].Device.Totalcore == 100 && k.Coresreq == 100 && node.Devices.DeviceLists[i].Device.Used > 0 { + klog.V(5).InfoS("the container wants exclusive access to an entire card, but the card is already in use", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID, "used", node.Devices.DeviceLists[i].Device.Used) + continue + } + // You can't allocate core=0 job to an already full GPU + if node.Devices.DeviceLists[i].Device.Totalcore != 0 && node.Devices.DeviceLists[i].Device.Usedcores == node.Devices.DeviceLists[i].Device.Totalcore && k.Coresreq == 0 { + klog.V(5).InfoS("can't allocate core=0 job to an already full GPU", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID) + continue + } + if !device.GetDevices()[k.Type].CustomFilterRule(allocated, request, tmpDevs[k.Type], node.Devices.DeviceLists[i].Device) { + continue + } + if k.Nums > 0 { + klog.InfoS("first fitted", "pod", klog.KObj(pod), "device", node.Devices.DeviceLists[i].Device.ID) + k.Nums-- + tmpDevs[k.Type] = append(tmpDevs[k.Type], util.ContainerDevice{ + Idx: int(node.Devices.DeviceLists[i].Device.Index), + UUID: node.Devices.DeviceLists[i].Device.ID, + Type: k.Type, + Usedmem: memreq, + Usedcores: k.Coresreq, + }) + } + if k.Nums == 0 { + klog.InfoS("device allocate success", "pod", klog.KObj(pod), "allocate device", tmpDevs) + return true, tmpDevs + } + if node.Devices.DeviceLists[i].Device.Mode == "mig" { + i++ + } + } + return false, tmpDevs +} +``` + +遍历设备,主要根据设备资源余量来判断是否够 container 分配,返回所有够分配的设备。 + +`pkg/scheduler/scheduler.go:458` + +```golang + nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod) + if err != nil { + err := fmt.Errorf("calcScore failed %v for pod %v", err, args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } + if len((*nodeScores).NodeList) == 0 { + klog.V(4).Infof("All node scores do not meet for pod %v", args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, fmt.Errorf("no available node, all node scores do not meet")) + return &extenderv1.ExtenderFilterResult{ + FailedNodes: failedNodes, + }, nil + } + klog.V(4).Infoln("nodeScores_len=", len((*nodeScores).NodeList)) + sort.Sort(nodeScores) + m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1] + klog.Infof("schedule %v/%v to %v %v", args.Pod.Namespace, args.Pod.Name, m.NodeID, m.Devices) + annotations := make(map[string]string) + annotations[util.AssignedNodeAnnotations] = m.NodeID + annotations[util.AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + for _, val := range device.GetDevices() { + val.PatchAnnotations(&annotations, m.Devices) + } + + //InRequestDevices := util.EncodePodDevices(util.InRequestDevices, m.devices) + //supportDevices := 
util.EncodePodDevices(util.SupportDevices, m.devices) + //maps.Copy(annotations, InRequestDevices) + //maps.Copy(annotations, supportDevices) + s.addPod(args.Pod, m.NodeID, m.Devices) + err = util.PatchPodAnnotations(args.Pod, annotations) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + s.delPod(args.Pod) + return nil, err + } + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringSucceed, []string{m.NodeID}, nil) + res := extenderv1.ExtenderFilterResult{NodeNames: &[]string{m.NodeID}} + return &res, nil +``` + +遍历完成之后选择分数最高的, 给 Pod 打标签。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + annotations: + hami.io/vgpu-node: node1 + hami.io/vgpu-time: "1733988480" + hami.io/vgpu-devices-allocated: GPU-7aebc545-cbd3-18a0-afce-76cae449702a,NVIDIA,20000,80:; + hami.io/vgpu-devices-to-allocate: ; +``` + +#### binding 实现 + +bind 逻辑比较简单,将 Pod 绑定到 Node。 + +`pkg/scheduler/routes/route.go:82` + +```golang +func Bind(s *scheduler.Scheduler) httprouter.Handle { + return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { + var buf bytes.Buffer + body := io.TeeReader(r.Body, &buf) + var extenderBindingArgs extenderv1.ExtenderBindingArgs + var extenderBindingResult *extenderv1.ExtenderBindingResult + + if err := json.NewDecoder(body).Decode(&extenderBindingArgs); err != nil { + klog.ErrorS(err, "Decode extender binding args") + extenderBindingResult = &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + } + } else { + extenderBindingResult, err = s.Bind(extenderBindingArgs) + } + + if response, err := json.Marshal(extenderBindingResult); err != nil { + klog.ErrorS(err, "Marshal binding result", "result", extenderBindingResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + errMsg := fmt.Sprintf("{'error':'%s'}", err.Error()) + w.Write([]byte(errMsg)) + } else { + klog.V(5).InfoS("Return bind response", "result", extenderBindingResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write(response) + } + } +} +``` + +路由处理: + +```golang +func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.ExtenderBindingResult, error) { + klog.InfoS("Bind", "pod", args.PodName, "namespace", args.PodNamespace, "podUID", args.PodUID, "node", args.Node) + var err error + var res *extenderv1.ExtenderBindingResult + binding := &corev1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: args.PodName, UID: args.PodUID}, + Target: corev1.ObjectReference{Kind: "Node", Name: args.Node}, + } + current, err := s.kubeClient.CoreV1().Pods(args.PodNamespace).Get(context.Background(), args.PodName, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Get pod failed") + } + + node, err := s.kubeClient.CoreV1().Nodes().Get(context.Background(), args.Node, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Failed to get node", "node", args.Node) + s.recordScheduleBindingResultEvent(current, EventReasonBindingFailed, []string{}, fmt.Errorf("failed to get node %v", args.Node)) + res = &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + } + return res, nil + } + + tmppatch := make(map[string]string) + for _, val := range device.GetDevices() { + err = val.LockNode(node, current) + if err != nil { + goto ReleaseNodeLocks + } + } + + tmppatch[util.DeviceBindPhase] = "allocating" + tmppatch[util.BindTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + err = util.PatchPodAnnotations(current, tmppatch) + if err != 
nil { + klog.ErrorS(err, "patch pod annotation failed") + } + if err = s.kubeClient.CoreV1().Pods(args.PodNamespace).Bind(context.Background(), binding, metav1.CreateOptions{}); err != nil { + klog.ErrorS(err, "Failed to bind pod", "pod", args.PodName, "namespace", args.PodNamespace, "podUID", args.PodUID, "node", args.Node) + } + if err == nil { + s.recordScheduleBindingResultEvent(current, EventReasonBindingSucceed, []string{args.Node}, nil) + res = &extenderv1.ExtenderBindingResult{ + Error: "", + } + klog.Infoln("After Binding Process") + return res, nil + } +ReleaseNodeLocks: + klog.InfoS("bind failed", "err", err.Error()) + for _, val := range device.GetDevices() { + val.ReleaseNodeLock(node, current) + } + s.recordScheduleBindingResultEvent(current, EventReasonBindingFailed, []string{}, err) + return &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + }, nil +} +``` + +### Node 将设备情况写入 node annotation + +scheduler 获取 node 的设备信息主要是通过读取 node 的 annotation,主要有如下几步: + +- 启动插件 + +```yaml +apiVersion: v1 +kind: Node +metadata: + annotations: + hami.io/node-handshake: Requesting_2024.12.24 03:31:30 + hami.io/node-handshake-dcu: Deleted_2024.12.06 07:43:49 + hami.io/node-nvidia-register: + "GPU-7aebc545-cbd3-18a0-afce-76cae449702a,10,73728,300,NVIDIA-NVIDIA + GeForce RTX 3090,0,true:" +``` + +#### 启动 device-plugin 服务 + +这里用到了 `github.com/urfave/cli/v2` 作为 command 启动服务,需要注意 -v 不是日志等级而是是否显示版本 + +`cmd/device-plugin/nvidia/main.go:40` + +```golang +func main() { + var configFile string + + c := cli.NewApp() + c.Name = "NVIDIA Device Plugin" + c.Usage = "NVIDIA device plugin for Kubernetes" + c.Version = info.GetVersionString() + c.Action = func(ctx *cli.Context) error { + return start(ctx, c.Flags) + } +``` + +#### 启动 plugin + +这里的 plugin 主要是针对不同厂家的设备需要实现不同的方法,这里定义了 pluigin 的控制器,例如 start、restart、exit 等,这里我们主要关注`plugins, restartPlugins, err := startPlugins(c, flags, restarting)` + +`cmd/device-plugin/nvidia/main.go:156` + +```golang +func start(c *cli.Context, flags []cli.Flag) error { + klog.Info("Starting FS watcher.") + util.NodeName = os.Getenv(util.NodeNameEnvName) + watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath) + if err != nil { + return fmt.Errorf("failed to create FS watcher: %v", err) + } + defer watcher.Close() + //device.InitDevices() + + /*Loading config files*/ + klog.Infof("Start working on node %s", util.NodeName) + klog.Info("Starting OS watcher.") + sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) + + var restarting bool + var restartTimeout <-chan time.Time + var plugins []plugin.Interface +restart: + // If we are restarting, stop plugins from previous run. + if restarting { + err := stopPlugins(plugins) + if err != nil { + return fmt.Errorf("error stopping plugins from previous run: %v", err) + } + } + + klog.Info("Starting Plugins.") + plugins, restartPlugins, err := startPlugins(c, flags, restarting) + if err != nil { + return fmt.Errorf("error starting plugins: %v", err) + } + + if restartPlugins { + klog.Info("Failed to start one or more plugins. Retrying in 30s...") + restartTimeout = time.After(30 * time.Second) + } + + restarting = true + + // Start an infinite loop, waiting for several indicators to either log + // some messages, trigger a restart of the plugins, or exit the program. 
+ for { + select { + // If the restart timeout has expired, then restart the plugins + case <-restartTimeout: + goto restart + + // Detect a kubelet restart by watching for a newly created + // 'kubeletdevicepluginv1beta1.KubeletSocket' file. When this occurs, restart this loop, + // restarting all of the plugins in the process. + case event := <-watcher.Events: + if event.Name == kubeletdevicepluginv1beta1.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { + klog.Infof("inotify: %s created, restarting.", kubeletdevicepluginv1beta1.KubeletSocket) + goto restart + } + + // Watch for any other fs errors and log them. + case err := <-watcher.Errors: + klog.Errorf("inotify: %s", err) + + // Watch for any signals from the OS. On SIGHUP, restart this loop, + // restarting all of the plugins in the process. On all other + // signals, exit the loop and exit the program. + case s := <-sigs: + switch s { + case syscall.SIGHUP: + klog.Info("Received SIGHUP, restarting.") + goto restart + default: + klog.Infof("Received signal \"%v\", shutting down.", s) + goto exit + } + } + } +exit: + err = stopPlugins(plugins) + if err != nil { + return fmt.Errorf("error stopping plugins: %v", err) + } + return nil +} +``` + +`cmd/device-plugin/nvidia/main.go:239` + +启动插件,主要方法 `p.Start()` + +```golang +func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.Interface, bool, error) { + // Load the configuration file + klog.Info("Loading configuration.") + config, err := loadConfig(c, flags) + if err != nil { + return nil, false, fmt.Errorf("unable to load config: %v", err) + } + disableResourceRenamingInConfig(config) + + /*Loading config files*/ + //fmt.Println("NodeName=", config.NodeName) + devConfig, err := generateDeviceConfigFromNvidia(config, c, flags) + if err != nil { + klog.Errorf("failed to load config file %s", err.Error()) + return nil, false, err + } + + // Update the configuration file with default resources. + klog.Info("Updating config with default resource matching patterns.") + err = rm.AddDefaultResourcesToConfig(&devConfig) + if err != nil { + return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) + } + + // Print the config to the output. + configJSON, err := json.MarshalIndent(devConfig, "", " ") + if err != nil { + return nil, false, fmt.Errorf("failed to marshal config to JSON: %v", err) + } + klog.Infof("\nRunning with config:\n%v", string(configJSON)) + + // Get the set of plugins. + klog.Info("Retrieving plugins.") + pluginManager, err := NewPluginManager(&devConfig) + if err != nil { + return nil, false, fmt.Errorf("error creating plugin manager: %v", err) + } + plugins, err := pluginManager.GetPlugins() + if err != nil { + return nil, false, fmt.Errorf("error getting plugins: %v", err) + } + + // Loop through all plugins, starting them if they have any devices + // to serve. If even one plugin fails to start properly, try + // starting them all again. + started := 0 + for _, p := range plugins { + // Just continue if there are no devices to serve for plugin p. + if len(p.Devices()) == 0 { + continue + } + + // Start the gRPC server for plugin p and connect it with the kubelet. + if err := p.Start(); err != nil { + klog.Error("Could not contact Kubelet. 
Did you enable the device plugin feature gate?") + klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + return plugins, true, nil + } + started++ + } + + if started == 0 { + klog.Info("No devices found. Waiting indefinitely.") + } + + return plugins, false, nil +} +``` + +其中 p(plugin) 需要实现几个方法来管理插件。 + +`pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go:37` + +```golang +type Interface interface { + Devices() rm.Devices + Start() error + Stop() error +} +``` + +同时如果需要 kubelet 能够识别 resource 中的类似 `nvidia.com/gpu: 1` 这样的拓展字段需要启动一个 GRPC +服务挂载 `/var/lib/kubelet/device-plugins/` 且实现如下方法。这块跟调度相关性不大,暂且不展开 +[device-plugins](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/)。 + +`k8s.io/kubelet@v0.28.3/pkg/apis/deviceplugin/v1beta1/api.pb.go:1419` + +```golang +type DevicePluginServer interface { + // GetDevicePluginOptions returns options to be communicated with Device + // Manager + GetDevicePluginOptions(context.Context, *Empty) (*DevicePluginOptions, error) + // ListAndWatch returns a stream of List of Devices + // Whenever a Device state change or a Device disappears, ListAndWatch + // returns the new list + ListAndWatch(*Empty, DevicePlugin_ListAndWatchServer) error + // GetPreferredAllocation returns a preferred set of devices to allocate + // from a list of available ones. The resulting preferred allocation is not + // guaranteed to be the allocation ultimately performed by the + // devicemanager. It is only designed to help the devicemanager make a more + // informed allocation decision when possible. + GetPreferredAllocation(context.Context, *PreferredAllocationRequest) (*PreferredAllocationResponse, error) + // Allocate is called during container creation so that the Device + // Plugin can run device specific operations and instruct Kubelet + // of the steps to make the Device available in the container + Allocate(context.Context, *AllocateRequest) (*AllocateResponse, error) + // PreStartContainer is called, if indicated by Device Plugin during registeration phase, + // before each container start. 
Device plugin can run device specific operations + // such as resetting the device before making devices available to the container + PreStartContainer(context.Context, *PreStartContainerRequest) (*PreStartContainerResponse, error) +} +``` + +#### nvidia 插件的实现 + +主要看`plugin.WatchAndRegister()` + +`pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go:196` + +```golang +func (plugin *NvidiaDevicePlugin) Start() error { + plugin.initialize() + + err := plugin.Serve() + if err != nil { + klog.Infof("Could not start device plugin for '%s': %s", plugin.rm.Resource(), err) + plugin.cleanup() + return err + } + klog.Infof("Starting to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) + + err = plugin.Register() + if err != nil { + klog.Infof("Could not register device plugin: %s", err) + plugin.Stop() + return err + } + klog.Infof("Registered device plugin for '%s' with Kubelet", plugin.rm.Resource()) + + if plugin.operatingMode == "mig" { + cmd := exec.Command("nvidia-mig-parted", "export") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + err := cmd.Run() + if err != nil { + klog.Fatalf("nvidia-mig-parted failed with %s\n", err) + } + outStr := stdout.Bytes() + yaml.Unmarshal(outStr, &plugin.migCurrent) + os.WriteFile("/tmp/migconfig.yaml", outStr, os.ModePerm) + if len(plugin.migCurrent.MigConfigs["current"]) == 1 && len(plugin.migCurrent.MigConfigs["current"][0].Devices) == 0 { + idx := 0 + plugin.migCurrent.MigConfigs["current"][0].Devices = make([]int32, 0) + for idx < GetDeviceNums() { + plugin.migCurrent.MigConfigs["current"][0].Devices = append(plugin.migCurrent.MigConfigs["current"][0].Devices, int32(idx)) + idx++ + } + } + klog.Infoln("Mig export", plugin.migCurrent) + } + go func() { + err := plugin.rm.CheckHealth(plugin.stop, plugin.health) + if err != nil { + klog.Infof("Failed to start health check: %v; continuing with health checks disabled", err) + } + }() + + go func() { + plugin.WatchAndRegister() + }() + + return nil +} +``` + +这里是个定时器,每 30s 收集一次该 node 的设备信息,并写入 node annotation。 + +```golang +func (plugin *NvidiaDevicePlugin) WatchAndRegister() { + klog.Info("Starting WatchAndRegister") + errorSleepInterval := time.Second * 5 + successSleepInterval := time.Second * 30 + for { + err := plugin.RegistrInAnnotation() + if err != nil { + klog.Errorf("Failed to register annotation: %v", err) + klog.Infof("Retrying in %v seconds...", errorSleepInterval) + time.Sleep(errorSleepInterval) + } else { + klog.Infof("Successfully registered annotation. 
Next check in %v seconds...", successSleepInterval) + time.Sleep(successSleepInterval) + } + } +} +``` + +```golang +func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error { + devices := plugin.getAPIDevices() + klog.InfoS("start working on the devices", "devices", devices) + annos := make(map[string]string) + node, err := util.GetNode(util.NodeName) + if err != nil { + klog.Errorln("get node error", err.Error()) + return err + } + encodeddevices := util.EncodeNodeDevices(*devices) + annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String() + annos[nvidia.RegisterAnnos] = encodeddevices + klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos)) + err = util.PatchNodeAnnotations(node, annos) + + if err != nil { + klog.Errorln("patch node error", err.Error()) + } + return err +} +``` + +具体数据收集逻辑。 + +`pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go:110` + +```golang +func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { + devs := plugin.Devices() + klog.V(5).InfoS("getAPIDevices", "devices", devs) + nvml.Init() + res := make([]*util.DeviceInfo, 0, len(devs)) + for UUID := range devs { + ndev, ret := nvml.DeviceGetHandleByUUID(UUID) + if ret != nvml.SUCCESS { + klog.Errorln("nvml new device by index error uuid=", UUID, "err=", ret) + panic(0) + } + idx, ret := ndev.GetIndex() + if ret != nvml.SUCCESS { + klog.Errorln("nvml get index error ret=", ret) + panic(0) + } + memoryTotal := 0 + memory, ret := ndev.GetMemoryInfo() + if ret == nvml.SUCCESS { + memoryTotal = int(memory.Total) + } else { + klog.Error("nvml get memory error ret=", ret) + panic(0) + } + Model, ret := ndev.GetName() + if ret != nvml.SUCCESS { + klog.Error("nvml get name error ret=", ret) + panic(0) + } + + registeredmem := int32(memoryTotal / 1024 / 1024) + if plugin.schedulerConfig.DeviceMemoryScaling != 1 { + registeredmem = int32(float64(registeredmem) * plugin.schedulerConfig.DeviceMemoryScaling) + } + klog.Infoln("MemoryScaling=", plugin.schedulerConfig.DeviceMemoryScaling, "registeredmem=", registeredmem) + health := true + for _, val := range devs { + if strings.Compare(val.ID, UUID) == 0 { + // when NVIDIA-Tesla P4, the device info is : ID:GPU-e290caca-2f0c-9582-acab-67a142b61ffa,Health:Healthy,Topology:nil, + // it is more reasonable to think of healthy as case-insensitive + if strings.EqualFold(val.Health, "healthy") { + health = true + } else { + health = false + } + break + } + } + numa, err := plugin.getNumaInformation(idx) + if err != nil { + klog.ErrorS(err, "failed to get numa information", "idx", idx) + } + res = append(res, &util.DeviceInfo{ + ID: UUID, + Index: uint(idx), + Count: int32(plugin.schedulerConfig.DeviceSplitCount), + Devmem: registeredmem, + Devcore: int32(plugin.schedulerConfig.DeviceCoreScaling * 100), + Type: fmt.Sprintf("%v-%v", "NVIDIA", Model), + Numa: numa, + Mode: plugin.operatingMode, + Health: health, + }) + klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa) + } + return &res +} +``` + +这里通过 nvidia 驱动获取设备信息,需要注意的是这里有配置 DeviceMemoryScaling,内存超分配置, +这里是通过命令行启动的 --config-file 参数指定的 schduler 配置和代码中固化的 +`config/config.json` 来取值的,其中 config/config.json 优先级大于 --config-file + +到这里,调度所需的所有东西就准备好了,Pod 可以顺利被分配到合适的节点上。 + +## 参考 + +- [kubernetes 官网](https://kubernetes.io/) +- [自定义 Kubernetes 调度器](https://www.qikqiak.com/post/custom-kube-scheduler/) +- [自定义资源支持:K8s Device Plugin 从原理到实现](https://www.lixueduan.com/posts/kubernetes/21-device-plugin/) diff --git 
a/docs/zh/contributor/adopters.md b/docs/zh/contributor/adopters.md new file mode 100644 index 0000000..6067059 --- /dev/null +++ b/docs/zh/contributor/adopters.md @@ -0,0 +1,32 @@ +--- +title: HAMi 采用者 +--- + +# HAMi 采用者 + +您和您的组织正在使用 HAMi?太棒了!我们很乐意听到您的使用反馈!💖 + +## 添加您的信息 + +[这里](https://github.com/Project-HAMi/website/blob/master/src/pages/adopters.mdx)列出了在生产环境中采用 HAMi 项目的组织。 + +您只需为您的公司添加一个条目,合并后它将自动添加到我们的网站中。 + +要添加您的组织,请按照以下步骤操作: + +1. Fork [HAMi-io/website](https://github.com/Project-HAMi/website) 仓库。 +2. 使用 `git clone https://github.com/<您的-GH-用户名>/website.git` 将其克隆到本地。 +3. (可选) 将您组织的 logo 添加到 `static/img/supporters` 目录。建议将 logo 文件命名为 `<公司名>.png`。 + 这些内容不会用于商业用途。 +4. 编辑 [adopters.mdx](https://github.com/Project-HAMi/website/blob/master/src/pages/adopters.mdx) 中的采用者列表。 + 您可以参考下面的示例表格格式。 + + | 公司名称 | 联系方式 | 环境 | 场景描述 | + | -------- | --------------------------------- | ---- | ------------------------------ | + | 我的公司 | [email](mailto:email@company.com) | 生产 | We use HAMi to manage our GPU. | + +5. 保存文件,然后执行 `git add -A` 并使用 `git commit -s -m "Add MY-ORG to adopters"` 提交。 +6. 使用 `git push origin main` 推送提交。 +7. 向 [HAMi-io/website](https://github.com/Project-HAMi/website) 开启一个拉取请求(Pull Request),预览构建将会出现。 + +非常感谢您成为我们社区的一员 - 我们非常感激! diff --git a/docs/zh/contributor/cherry-picks.md b/docs/zh/contributor/cherry-picks.md new file mode 100644 index 0000000..a336756 --- /dev/null +++ b/docs/zh/contributor/cherry-picks.md @@ -0,0 +1,86 @@ +--- +title: 如何 cherry-pick PRs +translated: true +--- + +本文档解释了如何在 `Project-HAMi/HAMi` 仓库的发布分支上管理 cherry pick。一个常见的用例是将 PR 从 master 分支回移到发布分支。 + +> 本文档摘自 [Kubernetes cherry-pick](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-release/cherry-picks.md)。 + +- [先决条件](#prerequisites) +- [哪些 PR 适合进行 Cherry Pick](#what-kind-of-prs-are-good-for-cherry-picks) +- [发起 Cherry Pick](#initiate-a-cherry-pick) +- [Cherry Pick 审核](#cherry-pick-review) +- [Cherry Pick 故障排除](#troubleshooting-cherry-picks) +- [不支持版本的 Cherry Pick](#cherry-picks-for-unsupported-releases) + +## 先决条件 + +- 一个已合并到 `master` 分支的拉取请求。 +- 发布分支已存在(例如:[`release-2.4`](https://github.com/Project-HAMi/HAMi/releases)) +- 正常配置的 git 和 GitHub shell 环境,用于推送到 GitHub 上的 HAMi `origin` fork,并对配置的远程 `upstream` 提交拉取请求,该 `upstream` 跟踪 `https://github.com/Project-HAMi/HAMi`,包括 `GITHUB_USER`。 +- 按照[安装说明](https://github.com/cli/cli#installation)安装 GitHub CLI (`gh`)。 +- 一个具有 "repo" 和 "read:org" 权限的 GitHub 个人访问令牌。权限是为 [gh auth login](https://cli.github.com/manual/gh_auth_login) 所需,与 cherry-pick 创建过程无关(创建分支和发起 PR)。 + +## 哪些 PR 适合进行 Cherry Pick + +与正常的 master 分支的合并量相比,发布分支的 PR 数量要少一个或两个数量级。这是因为发布分支的审查更为严格。重点在于关键的错误修复,例如: + +- 数据丢失 +- 内存损坏 +- 崩溃、挂起 +- 安全问题 + +仅影响 alpha 功能的功能性问题的错误修复(不是数据丢失或安全问题)不符合关键错误修复的标准。 + +如果您提议进行 cherry pick,但它不是一个明显的关键错误修复,请重新考虑。如果在反思后您仍希望继续,请通过补充您的 PR 来加强您的理由,例如: + +- 详细描述问题的 GitHub issue + +- 变更的范围 + +- 添加变更的风险 + +- 相关回归的风险 + +- 执行的测试,添加的测试用例 + +- 关键利益相关者的审阅者/批准者对变更为必要的回移的信心的证明 + +确保我们的整个社区积极参与项目的增强是至关重要的。如果某个已发布的功能未在特定提供商的平台上启用,这是一个需要在 `master` 分支中解决的社区失误,以便后续发布。这样的启用不会被回移到补丁发布分支。 + +## 发起 Cherry Pick + +- 运行 [cherry pick 脚本][cherry-pick-script] + + 此示例将 master 分支的 PR #1206 应用于远程分支 `upstream/release-1.0`: + + ```shell + hack/cherry_pick_pull.sh upstream/release-1.0 1206 + ``` + + - 请注意,cherry pick 脚本假定您有一个名为 `upstream` 的 git 远程指向 HAMi GitHub 组织。 + + - 您需要为每个想要进行 cherry pick 的补丁发布单独运行 cherry pick 脚本。cherry pick 应应用于所有适用修复的活动发布分支。 + + - 如果未设置 `GITHUB_TOKEN`,您将被要求输入 GitHub 密码:提供 GitHub [个人访问令牌](https://github.com/settings/tokens) 而不是实际的 GitHub 密码。如果您可以安全地将环境变量 `GITHUB_TOKEN` 
设置为您的个人访问令牌,则可以避免交互式提示。参考 [https://github.com/github/hub/issues/2655#issuecomment-735836048](https://github.com/github/hub/issues/2655#issuecomment-735836048) + +## Cherry Pick 审核 + +与其他 PR 一样,代码 OWNERS 会根据需要对 cherry pick PR 进行审核 (`/lgtm`) 和批准 (`/approve`)。 + +与正常的拉取请求相同,发布说明要求适用,除了发布说明部分将自动从发起 cherry pick 的 master 分支拉取请求中填充。 + +## Cherry Pick 故障排除 + +贡献者在发起 cherry pick 时可能会遇到以下一些困难。 + +- cherry pick PR 无法干净地应用于旧的发布分支。在这种情况下,您需要手动修复冲突。 + +- cherry pick PR 包含无法通过 CI 测试的代码。在这种情况下,您需要从您的 fork 中获取自动生成的分支,修改有问题的提交并强制推送到自动生成的分支。或者,您可以创建一个新的 PR,这样会更繁琐。 + +## 不支持版本的 Cherry Pick + +社区支持和补丁的版本需要讨论。 + +[cherry-pick-script]: https://github.com/Project-HAMi/HAMi/blob/master/hack/cherry_pick_pull.sh \ No newline at end of file diff --git a/docs/zh/contributor/contribute-docs.md b/docs/zh/contributor/contribute-docs.md new file mode 100644 index 0000000..cd9bb7f --- /dev/null +++ b/docs/zh/contributor/contribute-docs.md @@ -0,0 +1,174 @@ +--- +title: 如何贡献文档 +translated: true +--- + +从1.3版本开始,社区文档将在HAMi网站上提供。本文件解释了如何向`Project-HAMi/website`仓库贡献文档。 + +## 前提条件 + +- 文档和代码一样,也按版本分类和存储。1.3是我们归档的第一个版本。 +- 文档需要翻译成多种语言,以便来自不同地区的读者阅读。社区现在支持中文和英文。英文是文档的官方语言。 +- 我们的文档使用Markdown。如果您不熟悉Markdown,请参阅https://guides.github.com/features/mastering-markdown/或https://www.markdownguide.org/以获取更详细的信息。 +- 我们通过[Docusaurus 2](https://docusaurus.io/)获得了一些附加功能,这是一个现代静态网站生成器。 + +## 设置 + +您可以通过克隆我们的网站仓库来设置本地环境。 + +```shell +git clone https://github.com/Project-HAMi/website.git +cd website +``` + +我们的网站组织如下: + +``` +website +├── sidebars.json # 当前文档版本的侧边栏 +├── docs # 当前文档版本的文档目录 +│ ├── foo +│ │ └── bar.md # https://mysite.com/docs/next/foo/bar +│ └── hello.md # https://mysite.com/docs/next/hello +├── versions.json # 指示可用版本的文件 +├── versioned_docs +│ ├── version-1.1.0 +│ │ ├── foo +│ │ │ └── bar.md # https://mysite.com/docs/foo/bar +│ │ └── hello.md +│ └── version-1.0.0 +│ ├── foo +│ │ └── bar.md # https://mysite.com/docs/1.0.0/foo/bar +│ └── hello.md +├── versioned_sidebars +│ ├── version-1.1.0-sidebars.json +│ └── version-1.0.0-sidebars.json +├── docusaurus.config.js +└── package.json +``` + +`versions.json`文件是一个版本列表,从最新到最早。下表解释了版本化文件如何映射到其版本和生成的URL。 + +| 路径 | 版本 | URL | +| --------------------------------------- | -------------- | ----------------- | +| `versioned_docs/version-1.0.0/hello.md` | 1.0.0 | /docs/1.0.0/hello | +| `versioned_docs/version-1.1.0/hello.md` | 1.1.0 (最新) | /docs/hello | +| `docs/hello.md` | 当前 | /docs/next/hello | + +:::提示 + +`docs`目录中的文件属于`current`文档版本。 + +`current`文档版本标记为`Next`,托管在`/docs/next/*`下。 + +贡献者主要为当前版本贡献文档。 +::: + +## 撰写文档 + +### 在顶部开始一个标题 + +在Markdown文件的顶部指定有关文章的元数据是很重要的,这个部分称为**Front Matter**。 + +现在,让我们看一个快速示例,它应该解释**Front Matter**中最相关的条目: + +``` +--- +title: 带有标签的文档 +--- + +## 二级标题 +``` + +在两行---之间的顶部部分是Front Matter部分。在这里,我们定义了一些条目,告诉Docusaurus如何处理文章: + +- 标题相当于HTML文档中的`
<h1>
`或Markdown文章中的`# `。 +- 每个文档都有一个唯一的ID。默认情况下,文档ID是与根文档目录相关的文档名称(不带扩展名)。 + +### 链接到其他文档 + +您可以通过添加以下任何链接轻松路由到其他地方: + +- 指向外部站点的绝对URL,如`https://github.com`或`https://k8s.io` - 您可以使用任何Markdown标记来实现这一点,因此 + - `<https://github.com>`或 + - `[kubernetes](https://k8s.io)`都可以。 +- 链接到Markdown文件或生成的路径。您可以使用相对路径索引相应的文件。 +- 链接到图片或其他资源。如果您的文章包含图片或其他资源,您可以在`/docs/resources`中创建相应的目录,并将文章相关文件放在该目录中。现在我们将关于HAMi的公共图片存储在`/docs/resources/general`中。您可以使用以下方式链接图片: + - `![Git工作流](https://github.com/Project-HAMi/HAMi/raw/master/docs/develop/resources/contributor/git_workflow.png)` + +### 目录组织 + +Docusaurus 2使用侧边栏来管理文档。 + +创建侧边栏有助于: + +- 组织多个相关文档 +- 在每个文档上显示侧边栏 +- 提供分页导航,带有下一页/上一页按钮 + +对于我们的文档,您可以从[https://github.com/Project-HAMi/website/blob/main/sidebars.js](https://github.com/Project-HAMi/website/blob/main/sidebars.js)了解我们的文档是如何组织的。 + +```js +module.exports = { + docs: [ + { + type: "category", + label: "核心概念", + collapsed: false, + items: [ + "core-concepts/introduction", + "core-concepts/concepts", + "core-concepts/architecture", + ], + }, + { + type: "doc", + id: "key-features/features", + }, + { + type: "category", + label: "入门", + items: [ + "get-started/deploy-with-helm" + ], + }, +.... +``` + +目录中文档的顺序严格按照项目的顺序。 + +```yaml +type: "category", +label: "核心概念", +collapsed: false, +items: [ + "core-concepts/introduction", + "core-concepts/concepts", + "core-concepts/architecture", +], +``` + +如果您添加了文档,您必须将其添加到`sidebars.js`中以使其正确显示。如果您不确定您的文档位于何处,可以在PR中询问社区成员。 + +### 关于中文文档 + +关于文档的中文版有两种情况: + +- 您想将我们现有的英文文档翻译成中文。在这种情况下,您需要修改相应文件的内容,路径为[https://github.com/Project-HAMi/website/tree/main/i18n/zh/docusaurus-plugin-content-docs/current](https://github.com/Project-HAMi/website/tree/main/i18n/zh/docusaurus-plugin-content-docs/current)。该目录的组织与外层完全相同。`current.json`保存了文档目录的翻译。如果您想翻译目录名称,可以编辑它。 +- 您想贡献没有英文版的中文文档。欢迎任何类型的文章。在这种情况下,您可以先将文章和标题添加到主目录。文章内容可以先标记为TBD。然后将相应的中文内容添加到中文目录中。 + +## 调试文档 + +现在您已经完成了文档。在您向`Project-HAMi/website`发起PR后,如果通过CI,您可以在网站上预览您的文档。 + +点击红色标记的**Details**,您将进入网站的预览视图。 + +点击**Next**,您可以看到相应的更改。如果您有与中文版相关的更改,请点击旁边的语言下拉框切换到中文。 + +如果预览页面不是您期望的,请再次检查您的文档。 + +## 常见问题 + +### 版本控制 + +对于每个版本的新补充文档,我们将在每个版本的发布日期同步到最新版本,旧版本的文档将不再修改。对于文档中发现的勘误,我们将在每次发布时修复。 \ No newline at end of file diff --git a/docs/zh/contributor/contributers.md b/docs/zh/contributor/contributers.md new file mode 100644 index 0000000..47e1f65 --- /dev/null +++ b/docs/zh/contributor/contributers.md @@ -0,0 +1,24 @@ +--- +title: 贡献者名单 +--- + +- 请查阅[HAMi社区成员制度](https://github.com/Project-HAMi/community/blob/main/community-membership.md)了解如何成为贡献者。 +- 完整[维护者名单](https://github.com/Project-HAMi/community/blob/main/MAINTAINERS.md)请参见此处。 + +以下贡献者(按字母顺序排列)已签署或提交了HAMi代码库的提交记录: + +| 贡献者 | 邮箱 | +|-----------------|-----------| +| [archlitchi](https://github.com/archlitchi) | archlitchi@gmail.com| +| [atttx123](https://github.com/atttx123) | - | +| [chaunceyjiang](https://github.com/chaunceyjiang) | chaunceyjiang@gmail.com| +| [CoderTH](https://github.com/CoderTH) | - | +| [gsakun](https://github.com/gsakun) | - | +| [lengrongfu](https://github.com/lengrongfu) | - | +| [ouyangluwei](https://github.com/ouyangluwei163) | ouyangluwei@riseunion.io | +| peizhaoyou | peizhaoyou@4paradigm.com | +| [wawa0210](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | +| [whybeyoung](https://github.com/whybeyoung) | - | +| [yinyu](https://github.com/Nimbus318) | nimbus-nimo@proton.me | +| [yangshiqi](https://github.com/yangshiqi) | yangshiqi@riseunion.io | +| zhengbingxian | - | diff --git a/docs/zh/contributor/contributing.md b/docs/zh/contributor/contributing.md 
new file mode 100644 index 0000000..8f004de --- /dev/null +++ b/docs/zh/contributor/contributing.md @@ -0,0 +1,102 @@ +--- +title: 贡献指南 +translated: true +--- + +欢迎来到 HAMi! + +- [贡献](#contributing) +- [在您开始之前](#before-you-get-started) + - [行为准则](#code-of-conduct) + - [社区期望](#community-expectations) +- [入门](#getting-started) +- [您的第一次贡献](#your-first-contribution) + - [找到要处理的内容](#find-something-to-work-on) + - [找到一个好的入门主题](#find-a-good-first-topic) + - [处理一个问题](#work-on-an-issue) + - [提交一个问题](#file-an-issue) +- [贡献者工作流程](#contributor-workflow) + - [创建拉取请求](#creating-pull-requests) + - [代码审查](#code-review) + +# 在您开始之前 + +## 行为准则 + +请务必阅读并遵守我们的[行为准则](https://github.com/cncf/foundation/blob/main/code-of-conduct.md) + +## 社区期望 + +HAMi 是一个由社区驱动的项目,致力于促进一个健康、友好和富有成效的环境。 + +# 入门 + +- 在 GitHub 上 fork 这个仓库。 +- 在您的 fork 仓库中进行更改。 +- 提交一个 PR。 + +# 您的第一次贡献 + +我们将帮助您在不同领域进行贡献,如提交问题、开发功能、修复关键错误以及让您的工作得到审查和合并。 + +如果您对开发过程有疑问,请随时[提交一个问题](https://github.com/Project-HAMi/HAMi/issues/new/choose)。 + +## 找到要处理的内容 + +我们总是需要帮助,无论是修复文档、报告错误还是编写代码。 +查看您认为没有遵循最佳编码实践的地方,需要代码重构或缺少测试的地方。 +以下是您如何开始的步骤。 + +### 找到一个好的入门主题 + +在 HAMi 组织内有[多个仓库](https://github.com/Project-HAMi/)。 +每个仓库都有适合初学者的问题,提供一个好的入门问题。 +例如,[Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi) 有 +[需要帮助](https://github.com/Project-HAMi/HAMi/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22)和 +[好的入门问题](https://github.com/Project-HAMi/HAMi/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) +标签的问题,这些问题不需要对系统有深入的了解。 +我们可以帮助希望处理这些问题的新贡献者。 + +另一个好的贡献方式是找到文档改进的地方,比如缺失/损坏的链接。 +请参阅下面的[贡献](#contributing)以了解工作流程。 + +#### 处理一个问题 + +当您愿意承担一个问题时,只需在问题上回复。维护者会将其分配给您。 + +### 提交一个问题 + +虽然我们鼓励每个人贡献代码,但也欢迎有人报告问题。 +问题应在相应的 HAMi 子仓库下提交。 + +*示例:* HAMi 问题应提交到 [Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi/issues)。 + +请在提交问题时遵循提示的提交指南。 + +# 贡献者工作流程 + +请不要犹豫提出问题或发送拉取请求。 + +这是贡献者工作流程的大致概述: + +- 创建一个主题分支作为贡献的基础。通常是 master。 +- 进行逻辑单元的提交。 +- 将更改推送到个人 fork 的仓库的主题分支。 +- 提交一个拉取请求到 [Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi)。 + +## 创建拉取请求 + +拉取请求通常简称为 "PR"。 +HAMi 通常遵循标准的[github 拉取请求](https://help.github.com/articles/about-pull-requests/)流程。 +要提交建议的更改,请开发代码/修复并添加新的测试用例。 +之后,在提交拉取请求之前运行这些本地验证,以预测持续集成的通过或失败。 + +* 运行并通过 `make verify` + +## 代码审查 + +为了让您的 PR 更容易获得审查,请考虑审查者需要您: + +* 遵循[良好的编码指南](https://github.com/golang/go/wiki/CodeReviewComments)。 +* 撰写[良好的提交信息](https://chris.beams.io/posts/git-commit/)。 +* 将大的更改分解为一系列逻辑的小补丁,这些补丁单独进行易于理解的更改,并在整体上解决更广泛的问题。 \ No newline at end of file diff --git a/docs/zh/contributor/github-workflow.md b/docs/zh/contributor/github-workflow.md new file mode 100644 index 0000000..40f7291 --- /dev/null +++ b/docs/zh/contributor/github-workflow.md @@ -0,0 +1,258 @@ +--- +description: An overview of the GitHub workflow used by the HAMi project. It includes some tips and suggestions on things such as keeping your local environment in sync with upstream and commit hygiene. +title: GitHub 工作流 +translated: true +--- + +> 本文档摘自 [Kubernetes github-workflow](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md)。 + +![Git 工作流程](https://github.com/Project-HAMi/HAMi/raw/master/docs/develop/resources/contributor/git_workflow.png) + +## 在云端创建 Fork + +1. 访问 https://github.com/Project-HAMi/HAMi +2. 
点击 `Fork` 按钮(右上角)以建立基于云的 fork。 + +## 克隆 fork 到本地存储 + +根据 Go 的 [工作区说明][go-workspace],使用以下克隆步骤将 HAMi 的代码放置在你的 `GOPATH` 中。 + +[go-workspace]: https://golang.org/doc/code.html#Workspaces + +定义一个本地工作目录: + +```sh +# 如果你的 GOPATH 有多个路径,选择一个并在此处使用它而不是 $GOPATH。 +# 你必须严格遵循此模式, +# 既不能是 `$GOPATH/src/github.com/${your github profile name/` +# 也不能是其他任何模式。 +export working_dir="$(go env GOPATH)/src/github.com/Project-HAMi" +``` + +将 `user` 设置为与你的 GitHub 个人资料名称匹配: + +```sh +export user={your github profile name} +``` + +上图中提到了 `$working_dir` 和 `$user`。 + +创建你的克隆: + +```sh +mkdir -p $working_dir +cd $working_dir +git clone https://github.com/$user/HAMi.git +# 或者:git clone git@github.com:$user/HAMi.git + +cd $working_dir/HAMi +git remote add upstream https://github.com/Project-HAMi/HAMi +# 或者:git remote add upstream git@github.com:Project-HAMi/HAMi.git + +# 永远不要推送到 upstream master +git remote set-url --push upstream no_push + +# 确认你的远程仓库设置合理: +git remote -v +``` + +## 分支 + +更新你的本地 master: + +```sh +# 取决于你正在使用的哪个仓库, +# 默认分支可能被称为 'main' 而不是 'master'。 + +cd $working_dir/HAMi +git fetch upstream +git checkout master +git rebase upstream/master +``` + +从中创建分支: + +```sh +git checkout -b myfeature +``` + +然后在 `myfeature` 分支上编辑代码。 + +## 保持你的分支同步 + +```sh +# 取决于你正在使用的哪个仓库, +# 默认分支可能被称为 'main' 而不是 'master'。 + +# 在你的 myfeature 分支上 +git fetch upstream +git rebase upstream/master +``` + +请不要使用 `git pull` 代替上述的 `fetch` / `rebase`。`git pull` 会进行合并,这会留下合并提交。这会使提交历史变得混乱,并违反提交应该是单独可理解和有用的原则(见下文)。你也可以考虑通过 `git config branch.autoSetupRebase always` 更改 `.git/config` 文件以更改 `git pull` 的行为,或使用其他非合并选项如 `git pull --rebase`。 + +## 提交 + +提交你的更改。 + +```sh +git commit --signoff +``` + +可能你会返回并进行更多的编辑/构建/测试,然后在几个周期中 `commit --amend`。 + +## 推送 + +准备好进行审查时(或只是为了建立工作内容的异地备份),将你的分支推送到 `github.com` 上的 fork: + +```sh +git push -f ${your_remote_name} myfeature +``` + +## 创建拉取请求 + +1. 访问你的 fork `https://github.com/$user/HAMi` +2. 点击 `myfeature` 分支旁边的 `Compare & Pull Request` 按钮。 + +_如果你有上游写入权限_,请避免使用 GitHub UI 创建 PR,因为 GitHub 会在主仓库中创建 PR 分支,而不是在你的 fork 中。 + +### 获取代码审查 + +一旦你的拉取请求被打开,它将被分配给一个或多个审查者。那些审查者将进行彻底的代码审查,寻找正确性、错误、改进机会、文档和注释,以及风格。 + +在你的 fork 上的同一分支中提交对审查意见的更改。 + +非常小的 PR 很容易审查。非常大的 PR 则很难审查。 + +### 压缩提交 + +在审查之后,通过压缩你的提交来准备你的 PR 以进行合并。 + +在审查后留在你的分支上的所有提交都应该代表有意义的里程碑或工作单元。使用提交来增加开发和审查过程的清晰度。 + +在合并 PR 之前,压缩以下类型的提交: + +- 修复/审查反馈 +- 拼写错误 +- 合并和变基 +- 工作进行中 + +如果可以,尽量让 PR 中的每个提交都能独立编译并通过测试,但这不是必需的。特别是,`merge` 提交必须被移除,因为它们不会通过测试。 + +要压缩你的提交,请执行[交互式变基](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History): + +1. 检查你的 git 分支: + + ```bash + git status + ``` + + 输出类似于: + + ```text + On branch your-contribution + Your branch is up to date with 'origin/your-contribution'. + ``` + +2. 使用特定的提交哈希开始交互式变基,或从最后一次提交向后计数使用 `HEAD~<n>`,其中 `<n>` 表示要包含在变基中的提交数量。 + + ```bash + git rebase -i HEAD~3 + ``` + + 输出类似于: + + ```text + pick 2ebe926 原始提交 + pick 31f33e9 处理反馈 + pick b0315fe 第二个工作单元 + + # Rebase 7c34fc9..b0315ff onto 7c34fc9 (3 commands) + # + # Commands: + # p, pick <commit> = 使用提交 + # r, reword <commit> = 使用提交,但编辑提交消息 + # e, edit <commit> = 使用提交,但停止以进行修改 + # s, squash <commit> = 使用提交,但合并到前一个提交 + # f, fixup <commit> = 类似于 "squash",但丢弃此提交的日志消息 + ... + + ``` + +3. 使用命令行文本编辑器将 `pick` 改为 `squash`,然后保存更改并继续变基: + + ```text + pick 2ebe926 原始提交 + squash 31f33e9 处理反馈 + pick b0315fe 第二个工作单元 + ... + ``` + + 输出(保存更改后)类似于: + + ```text + [detached HEAD 61fdded] 第二个工作单元 + Date: Thu Mar 5 19:01:32 2020 +0100 + 2 files changed, 15 insertions(+), 1 deletion(-) + ... + + 成功变基并更新 refs/heads/master。 + ``` + +4. 
强制推送你的更改到你的远程分支: + + ```bash + git push --force + ``` + +对于大规模自动修正(例如自动文档格式化),使用一个或多个提交进行工具更改,并使用最终提交大规模应用修正。这使得审查更容易。 + +## 合并提交 + +一旦你收到审查和批准,并且你的提交已被压缩,你的 PR 就可以合并了。 + +在审查者和批准者都批准 PR 后,合并会自动进行。如果你没有压缩你的提交,他们可能会要求你在批准 PR 之前这样做。 + +## 撤销提交 + +如果你希望撤销提交,请使用以下说明。 + +_如果你有上游写入权限_,请避免使用 GitHub UI 中的 `Revert` 按钮创建 PR,因为 GitHub 会在主仓库中创建 PR 分支,而不是在你的 fork 中。 + +- 创建一个分支并与上游同步。 + + ```sh + # 取决于你正在使用的哪个仓库, + # 默认分支可能被称为 'main' 而不是 'master'。 + + # 创建一个分支 + git checkout -b myrevert + + # 与上游同步分支 + git fetch upstream + git rebase upstream/master + ``` + +- 如果你希望撤销的提交是: + + - **合并提交:** + + ```sh + # SHA 是你希望撤销的合并提交的哈希 + git revert -m 1 SHA + ``` + + - **单个提交:** + + ```sh + # SHA 是你希望撤销的单个提交的哈希 + git revert SHA + ``` + +- 这将创建一个新的提交以撤销更改。将此新提交推送到你的远程。 + + ```sh + git push ${your_remote_name} myrevert + ``` + +- [创建一个拉取请求](#7-create-a-pull-request) 使用此分支。 \ No newline at end of file diff --git a/docs/zh/contributor/goverance.md b/docs/zh/contributor/goverance.md new file mode 100644 index 0000000..3f8b07a --- /dev/null +++ b/docs/zh/contributor/goverance.md @@ -0,0 +1,46 @@ +--- +title: 社区治理 +translated: true +--- + +异构AI计算虚拟化中间件(HAMi),前称为k8s-vGPU-scheduler,是一款设计用于管理k8s集群中异构AI计算设备的“全合一”工具 + +- [HAMi项目治理](#hami项目治理) + - [价值观](#价值观) + - [成员资格](#成员资格) + - [会议](#会议) + - [行为准则](#行为准则) + - [修改本章程](#修改本章程) + +## 价值观 + +HAMi及其领导层秉持以下价值观: + +* 开放性:沟通和决策在公开场合进行,并可供未来参考。尽可能多的讨论和工作在公共论坛和开放的代码库中进行。 + +* 公平性:所有利益相关者都有机会提供反馈和提交贡献,这些贡献将根据其优点进行考虑。 + +* 社区优先于产品或公司:维持和发展我们的社区优先于发布代码或赞助商的组织目标。每位贡献者以个人身份参与项目。 + +* 包容性:我们通过不同的视角和技能集进行创新,这只能在一个欢迎和尊重的环境中实现。 + +* 参与性:项目中的责任通过参与获得,并且有一条清晰的路径从贡献者晋升到领导职位。 + +## 成员资格 + +目前,维护者是项目的管理机构。随着社区的发展,这可能会改变,例如通过采用选举产生的指导委员会。 + +## 会议 + +在时区允许的情况下,维护者应参与公共开发者会议,该会议在 +[Google Docs](https://docs.google.com/document/d/1YC6hco03_oXbF9IOUPJ29VWEddmITIKIfSmBX8JtGBw/edit)上进行。 + +维护者还将举行闭门会议,以讨论安全报告或行为准则违规问题。任何维护者在收到安全问题或行为准则报告时应安排此类会议。所有现任维护者必须被邀请参加此类闭门会议,除非有维护者被指控违反行为准则。 + +## 行为准则 + +社区成员的[行为准则](https://github.com/cncf/foundation/blob/main/code-of-conduct.md)违规将提交给CNCF行为准则委员会。如果CNCF行为准则委员会需要与项目合作解决问题,维护者将任命一名未涉事的贡献者与他们合作。 + +## 修改本章程 + +对本治理及其支持文件的更改可通过维护者的2/3投票批准。 \ No newline at end of file diff --git a/docs/zh/contributor/ladder.md b/docs/zh/contributor/ladder.md new file mode 100644 index 0000000..5ef8277 --- /dev/null +++ b/docs/zh/contributor/ladder.md @@ -0,0 +1,182 @@ +--- +title: 贡献者阶梯 +translated: true +--- + +本文档介绍了在项目中参与和提升的不同方式。您可以在贡献者角色中看到项目中的不同角色。 + +- [贡献者阶梯](#contributor-ladder) + - [贡献者阶梯](#contributor-ladder-1) + - [社区参与者](#community-participant) + - [贡献者](#contributor) + - [组织成员](#organization-member) + - [审阅者](#reviewer) + - [维护者](#maintainer) + - [活跃的维护者应该](#an-active-maintainer-should) + - [如何成为维护者](#how-to-be-a-maintainer) + - [移除维护者](#removing-maintainers) + - [不活跃](#inactivity) + - [非自愿移除或降级](#involuntary-removal-or-demotion) + +## 贡献者阶梯 + +您好!我们很高兴您想了解更多关于我们项目贡献者阶梯的信息!这个贡献者阶梯概述了项目中的不同贡献者角色,以及与之相关的责任和特权。社区成员通常从“阶梯”的第一级开始,并随着他们在项目中的参与度增加而逐步提升。我们的项目成员乐于帮助您在贡献者阶梯上进步。 + +以下每个贡献者角色都分为三种类型的列表。“责任”是指贡献者应履行的事项。“要求”是指一个人需要满足的资格条件,而“特权”是指该级别的贡献者有权享有的事项。 + +### 社区参与者 + +描述:社区参与者与项目及其社区互动,贡献他们的时间、想法等。社区参与者通常是那些不再匿名并开始积极参与项目讨论的用户。 + +* 责任: + * 必须遵循 [CNCF 行为准则](https://github.com/cncf/foundation/blob/main/code-of-conduct.md) +* 用户如何参与社区: + * 参与社区讨论 + * 帮助其他用户 + * 提交错误报告 + * 评论问题 + * 试用新版本 + * 参加社区活动 + +### 贡献者 + +描述:贡献者直接为项目做出贡献并为其增值。贡献不一定是代码。处于贡献者级别的人可能是新贡献者,或者他们可能只是偶尔贡献。 + +* 责任包括: + * 遵循 CNCF 行为准则 + * 遵循项目贡献指南 +* 要求(以下之一或多个): + * 报告并有时解决问题 + * 偶尔提交 PR + * 贡献文档 + * 参加会议并做笔记 + * 回答其他社区成员的问题 + * 提交问题和 PR 的反馈 + * 测试版本和补丁并提交评审 + * 组织或协助组织活动 + * 在公共场合推广项目 + * 帮助管理项目基础设施 + * 
[待办事项:其他小贡献] +* 特权: + * 受邀参加贡献者活动 + * 有资格成为组织成员 + +特别感谢[长长的名单](https://github.com/Project-HAMi/HAMi/blob/master/AUTHORS.md)中那些为项目做出贡献并帮助维护项目的人。没有你们的贡献,我们不会有今天的成就。谢谢!💖 + +只要您为 HAMi 做出贡献,您的名字将被添加到[这里](https://github.com/Project-HAMi/HAMi/blob/master/AUTHORS.md)。如果您没有找到您的名字,请联系我们添加。 + +### 组织成员 + +描述:组织成员是定期参与项目的既定贡献者。组织成员在项目存储库和选举中享有特权,因此他们被期望以整个项目的利益为重。 + +组织成员必须满足贡献者的责任和要求,并且: + +* 责任包括: + * 继续定期贡献,至少每年有 50 次 GitHub 贡献 +* 要求: + * 在其 GitHub 账户上启用[双因素认证] + * 必须对项目或社区有成功的贡献,包括以下至少一项: + * 5 个被接受的 PR, + * 审核了 5 个 PR, + * 解决并关闭了 3 个问题, + * 成为关键项目管理领域的负责人, + * 或其他等效的组合或贡献 + * 必须至少贡献 1 个月 + * 必须积极参与至少一个项目领域 + * 必须有两个赞助者,他们也是组织成员,其中至少一个不为同一雇主工作 + * **[在 HAMi-project/HAMi 仓库中打开一个问题][membership request]** + - 确保您的赞助者在问题中被 @提及 + - 完成问题清单上的每一项 + - 确保所列贡献代表您在项目中的工作。 + * 让您的赞助审阅者回复赞助确认:`+1` + * 一旦您的赞助者回复,您的请求将由 `HAMi GitHub 管理团队`处理。 + +* 特权: + * 可以被分配问题和评审 + * 可以向 CI/CD 自动化发出命令 + * 可以被添加到 [待办事项:Repo Host] 团队 + * 可以推荐其他贡献者成为组织成员 + +贡献者成为组织成员的过程如下: + +1. 联系维护者并获得至少两个维护者的同意 +2. 提交一个申请成为成员的问题 + +### 审阅者 + +描述:审阅者负责特定的代码、文档、测试或其他项目领域。他们与其他审阅者共同负责审阅这些领域的所有更改,并指示这些更改是否准备好合并。他们在项目中有贡献和审阅的记录。 + +审阅者负责一个“特定领域”。这可以是特定的代码目录、驱动程序、文档章节、测试任务、事件或其他明确定义的项目组件,通常小于整个存储库或子项目。最常见的是一个或多个 Git 存储库中的一个或一组目录。以下的“特定领域”指的是这个责任领域。 + +审阅者拥有组织成员的所有权利和责任,并且: + +* 责任包括: + * 遵循审阅指南 + * 审阅大多数针对其特定责任领域的拉取请求 + * 每年至少审阅 20 个 PR + * 帮助其他贡献者成为审阅者 +* 要求: + * 至少有 3 个月的贡献者经验 + * 是组织成员 + * 已审阅或协助审阅至少 10 个拉取请求 + * 已分析并解决其特定领域的测试失败 + * 展示了对特定领域的深入了解 + * 承诺对该特定领域负责 + * 支持新的和偶尔的贡献者,并帮助使有用的 PR 准备好提交 +* 额外特权: + * 拥有 GitHub 或 CI/CD 权限以批准特定目录中的拉取请求 + * 可以推荐和审阅其他贡献者成为审阅者 + +成为审阅者的过程是: + +1. 通过在适当的存储库中打开一个 PR 提名贡献者,将他们的 GitHub 用户名添加到一个或多个目录的 OWNERS 文件中。 +2. 至少两个已经是批准者的团队成员批准该 PR。 + +### 维护者 + +描述:维护者是非常成熟的贡献者,负责整个项目。因此,他们有能力批准针对项目任何领域的 PR,并被期望参与项目战略和优先事项的决策。 + +维护者必须满足审阅者的责任和要求,并且: + +当前维护者列表可以在 [MAINTAINERS](https://github.com/Project-HAMi/HAMi/blob/master/MAINTAINERS.md) 中找到。 + +### 活跃的维护者应该 + +* 积极参与审阅拉取请求和处理新问题。请注意,没有关于什么是“足够活跃”的硬性规定,这取决于当前维护者小组的判断。 + +* 积极参与关于项目设计和未来的讨论。 + +* 负责他们批准和合并的 PR 的适当分支的回溯。 + +* 尽力遵循所有代码、测试和设计惯例,这些惯例由活跃维护者之间的共识决定。 + +* 当他们不再计划积极参与项目时,优雅地退出他们的维护者角色。 + +### 如何成为维护者 + +新维护者由当前维护者小组通过共识添加。这可以通过 Slack 或电子邮件的私人讨论进行。大多数维护者应支持添加新成员,并且没有单个维护者反对添加新维护者。 + +在添加新维护者时,我们应提交一个 PR 到 [HAMi](https://github.com/Project-HAMi/HAMi) 并更新 [MAINTAINERS](https://github.com/Project-HAMi/HAMi/blob/master/MAINTAINERS.md)。一旦这个 PR 合并,您将成为 HAMi 的维护者。 + +### 移除维护者 + +由于其他责任,维护者的来去是正常的。如果没有持续参与的预期,可能会移除不活跃的维护者。如果前维护者恢复参与,他们应被快速考虑重新加入团队。 + +## 不活跃 + +对于贡献者来说,保持活跃以树立榜样并展示对项目的承诺是很重要的。不活跃对项目有害,因为它可能导致意外的延迟、贡献者流失以及对项目的信任丧失。 + +* 不活跃的衡量标准: + * 超过 3 个月没有贡献的时期 + * 超过 3 个月没有沟通的时期 +* 不活跃的后果包括: + * 非自愿移除或降级 + * 被要求转为名誉状态 + +## 非自愿移除或降级 + +当贡献者未能履行责任和要求时,会发生非自愿移除/降级。这可能包括重复的不活跃模式、长时间的不活跃、未能满足角色要求的时期和/或违反行为准则。这个过程很重要,因为它保护了社区及其交付物,同时也为新贡献者提供了介入的机会。 + +非自愿移除或降级通过当前维护者的多数投票处理。 + +[双因素认证]: https://help.github.com/articles/about-two-factor-authentication \ No newline at end of file diff --git a/docs/zh/contributor/lifted.md b/docs/zh/contributor/lifted.md new file mode 100644 index 0000000..1fe0ceb --- /dev/null +++ b/docs/zh/contributor/lifted.md @@ -0,0 +1,121 @@ +--- +title: 如何管理提升的代码 +translated: true +--- + +本文档解释了如何管理提升的代码。此任务的一个常见用户案例是开发人员从其他代码库中提升代码到 `pkg/util/lifted` 目录。 + +- [提升代码的步骤](#steps-of-lifting-code) +- [如何编写提升注释](#how-to-write-lifted-comments) +- [示例](#examples) + +## 提升代码的步骤 + +- 从另一个代码库中复制代码并将其保存到 `pkg/util/lifted` 下的一个 go 文件中。 +- 可选择性地更改提升的代码。 +- 为代码添加提升注释 [如指导](#how-to-write-lifted-comments)。 +- 运行 `hack/update-lifted.sh` 来更新提升文档 `pkg/util/lifted/doc.go`。 + +## 如何编写提升注释 + +提升注释应放置在提升代码之前(可以是函数、类型、变量或常量)。 +在提升注释和提升代码之间只允许有空行和注释。 + +提升注释由一行或多行注释组成,每行格式为 
`+lifted:KEY[=VALUE]`。对于某些键,值是可选的。 + +有效的键如下: + +- source: + + 键 `source` 是必需的。其值指示代码从何处提升。 + +- changed: + + 键 `changed` 是可选的。它指示代码是否已更改。 + 值是可选的(`true` 或 `false`,默认为 `true`)。 + 不添加此键或将其设置为 `false` 表示没有代码更改。 + +## 示例 + +### 提升函数 + +将函数 `IsQuotaHugePageResourceName` 提升到 `corehelpers.go`: + +```go +// +lifted:source=https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/apis/core/helper/helpers.go#L57-L61 + +// IsQuotaHugePageResourceName 返回 true 如果资源名称具有与配额相关的大页资源前缀。 +func IsQuotaHugePageResourceName(name corev1.ResourceName) bool { + return strings.HasPrefix(string(name), corev1.ResourceHugePagesPrefix) || strings.HasPrefix(string(name), corev1.ResourceRequestsHugePagesPrefix) +} +``` + +在 `doc.go` 中添加: + +```markdown +| 提升文件 | 源文件 | 常量/变量/类型/函数 | 更改 | +| ----------- | ----------- | ------------------- | ------- | +| corehelpers.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/apis/core/helper/helpers.go#L57-L61 | func IsQuotaHugePageResourceName | N | +``` + +### 更改提升函数 + +提升并更改函数 `GetNewReplicaSet` 到 `deployment.go` + +```go +// +lifted:source=https://github.com/kubernetes/kubernetes/blob/release-1.22/pkg/controller/deployment/util/deployment_util.go#L536-L544 +// +lifted:changed + +// GetNewReplicaSet 返回与给定部署意图匹配的副本集;从客户端接口获取 ReplicaSetList。 +// 如果新的副本集尚不存在,则返回 nil。 +func GetNewReplicaSet(deployment *appsv1.Deployment, f ReplicaSetListFunc) (*appsv1.ReplicaSet, error) { + rsList, err := ListReplicaSetsByDeployment(deployment, f) + if err != nil { + return nil, err + } + return FindNewReplicaSet(deployment, rsList), nil +} +``` + +在 `doc.go` 中添加: + +```markdown +| 提升文件 | 源文件 | 常量/变量/类型/函数 | 更改 | +| ----------- | ----------- | ------------------- | ------- | +| deployment.go | https://github.com/kubernetes/kubernetes/blob/release-1.22/pkg/controller/deployment/util/deployment_util.go#L536-L544 | func GetNewReplicaSet | Y | +``` + +### 提升常量 + +将常量 `isNegativeErrorMsg` 提升到 `corevalidation.go `: + +```go +// +lifted:source=https://github.com/kubernetes/kubernetes/blob/release-1.22/pkg/apis/core/validation/validation.go#L59 +const isNegativeErrorMsg string = apimachineryvalidation.IsNegativeErrorMsg +``` + +在 `doc.go` 中添加: + +```markdown +| 提升文件 | 源文件 | 常量/变量/类型/函数 | 更改 | +| ----------- | ----------- | ------------------- | ------- | +| corevalidation.go | https://github.com/kubernetes/kubernetes/blob/release-1.22/pkg/apis/core/validation/validation.go#L59 | const isNegativeErrorMsg | N | +``` + +### 提升类型 + +将类型 `Visitor` 提升到 `visitpod.go`: + +```go +// +lifted:source=https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/api/v1/pod/util.go#L82-L83 + +// Visitor 被调用时传入每个对象名称,并返回 true 如果访问应继续 +type Visitor func(name string) (shouldContinue bool) +``` + +在 `doc.go` 中添加: + +```markdown +| 提升文件 | 源文件 | 常量/变量/类型/函数 | 更改 | +| ----------- | ----------- | ------------------- | ------- | +| visitpod.go | https://github.com/kubernetes/kubernetes/blob/release-1.23/pkg/api/v1/pod/util.go#L82-L83 | type Visitor | N | \ No newline at end of file diff --git a/docs/zh/core-concepts/architecture.md b/docs/zh/core-concepts/architecture.md new file mode 100644 index 0000000..79d09d3 --- /dev/null +++ b/docs/zh/core-concepts/architecture.md @@ -0,0 +1,23 @@ +--- +title: 架构设计 +translated: true +--- + +HAMi 的整体架构如下所示: + +![Architecture](../resources/architect.jpg) + +HAMi 由以下组件组成: + +- HAMi MutatingWebhook +- HAMi scheduler-extender +- 设备插件 (HAMi-device-plugin) +- 容器内资源控制 (HAMi-Core) + +HAMi MutatingWebhook 检查该任务是否可以由 HAMi 处理,它扫描每个提交的 pod 的资源字段,如果这些 pod 所需的每个资源是 'cpu'、'memory' 或 
HAMi 资源,则会将该 pod 的 schedulerName 字段设置为 'HAMi-scheduler'。 + +HAMi 调度器负责将任务分配给适当的节点和设备。同时,调度器需要维护异构计算设备的全局视图以进行监控。 + +设备插件层从任务的注释字段获取调度结果,并将相应的设备映射到容器。 + +容器内资源控制负责监控容器内的资源使用情况,并提供硬隔离能力。 \ No newline at end of file diff --git a/docs/zh/core-concepts/introduction.md b/docs/zh/core-concepts/introduction.md new file mode 100644 index 0000000..2143d9a --- /dev/null +++ b/docs/zh/core-concepts/introduction.md @@ -0,0 +1,47 @@ +--- +title: 什么是HAMi? +translated: true +slug: / +--- + +## 目录 {#toc} + +- [HAMi:异构AI计算虚拟化中间件](#hami-heterogeneous-ai-computing-virtualization-middleware) +- [为什么选择HAMi](#why-hami) +- [后续步骤](#whats-next) + +## HAMi:异构AI计算虚拟化中间件 {#hami-heterogeneous-ai-computing-virtualization-middleware} + +异构AI计算虚拟化中间件(HAMi),前身为k8s-vGPU-scheduler,是一个专为管理k8s集群中异构AI计算设备而设计的"一体化"Helm Chart。它能够实现异构AI设备在多个任务间的共享能力。 + +HAMi是[云原生计算基金会(CNCF)](https://cncf.io/)的SandBox项目,同时被收录于[CNCF技术全景图-编排与调度类目](https://landscape.cncf.io/?item=orchestration-management--scheduling-orchestration--hami)及[CNAI技术全景图](https://landscape.cncf.io/?group=cnai&item=cnai--general-orchestration--hami)。 + +## 为什么选择HAMi {#why-hami} + +- **设备共享** + - 支持多种异构AI计算设备(如NVIDIA GPU/CUDA) + - 支持多设备容器的设备共享 + +- **设备内存控制** + - 容器内硬性内存限制 + - 支持动态设备内存分配 + - 支持按MB或百分比分配内存 + +- **设备规格指定** + - 支持指定特定类型的异构AI计算设备 + - 支持通过设备UUID指定具体设备 + +- **开箱即用** + - 对容器内任务透明无感 + - 通过helm一键安装/卸载,简洁环保 + +- **开放中立** + - 由互联网、金融、制造业、云服务商等多领域联合发起 + - 以CNCF开放治理为目标 + +## 后续步骤 {#whats-next} + +推荐继续了解: + +- 学习HAMi的[架构设计](./architecture.md) +- 开始[安装HAMi](../installation/prequisities.md) diff --git a/docs/zh/developers/Dynamic-mig.md b/docs/zh/developers/Dynamic-mig.md new file mode 100644 index 0000000..e3a0043 --- /dev/null +++ b/docs/zh/developers/Dynamic-mig.md @@ -0,0 +1,164 @@ +--- +title: NVIDIA GPU MPS 和 MIG 动态切片插件 +translated: true +--- + +- + +# NVIDIA GPU MPS 和 MIG 动态切片插件 + +## 特别感谢 + +没有 @sailorvii 的帮助,这个功能将无法实现。 + +## 介绍 + +NVIDIA GPU 内置的共享方法包括:时间片、MPS 和 MIG。时间片共享的上下文切换会浪费一些时间,所以我们选择了 MPS 和 MIG。GPU MIG 配置是可变的,用户可以在配置定义中获取 MIG 设备,但当前实现仅在用户需求之前定义了专用配置。这限制了 MIG 的使用。我们希望开发一个自动切片插件,并在用户需要时创建切片。 +对于调度方法,将支持节点级别的 binpack 和 spread。参考 binpack 插件,我们考虑了 CPU、内存、GPU 内存和其他用户定义的资源。 +HAMi 是通过使用 [hami-core](https://github.com/Project-HAMi/HAMi-core) 完成的,这是一个 cuda-hacking 库。但 mig 在全球范围内也被广泛使用。需要一个用于动态-mig 和 hami-core 的统一 API。 + +## 目标 + +- CPU、内存和 GPU 组合调度 +- GPU 动态切片:Hami-core 和 MIG +- 支持通过 GPU 内存、CPU 和内存的节点级别 binpack 和 spread +- 不同虚拟化技术的统一 vGPU 池 +- 任务可以选择使用 MIG、使用 HAMi-core 或同时使用两者。 + +### 配置映射 +- hami-scheduler-device-configMap +此 configmap 定义了插件配置,包括 resourceName、MIG 几何形状和节点级别配置。 + +```yaml +apiVersion: v1 +data: + device-config.yaml: | + nvidia: + resourceCountName: nvidia.com/gpu + resourceMemoryName: nvidia.com/gpumem + resourceCoreName: nvidia.com/gpucores + knownMigGeometries: + - models: [ "A30" ] + allowedGeometries: + - + - name: 1g.6gb + memory: 6144 + count: 4 + - + - name: 2g.12gb + memory: 12288 + count: 2 + - + - name: 4g.24gb + memory: 24576 + count: 1 + - models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB", "A100-SXM4-40GB" ] + allowedGeometries: + - + - name: 1g.5gb + memory: 5120 + count: 7 + - + - name: 2g.10gb + memory: 10240 + count: 3 + - name: 1g.5gb + memory: 5120 + count: 1 + - + - name: 3g.20gb + memory: 20480 + count: 2 + - + - name: 7g.40gb + memory: 40960 + count: 1 + - models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"] + allowedGeometries: + - + - name: 1g.10gb + memory: 10240 + count: 7 + - + - name: 2g.20gb + memory: 20480 + count: 3 + - name: 1g.10gb + memory: 10240 + count: 1 + - + - name: 3g.40gb + memory: 40960 + 
count: 2 + - + - name: 7g.79gb + memory: 80896 + count: 1 + nodeconfig: + - name: nodeA + operatingmode: hami-core + - name: nodeB + operatingmode: mig +``` + +## 结构 + +<img src="https://github.com/Project-HAMi/HAMi/blob/master/docs/develop/imgs/hami-dynamic-mig-structure.png?raw=true" width = "600" /> + +## 示例 + +动态 mig 与 hami 任务兼容,如下例所示: +只需设置 `nvidia.com/gpu` 和 `nvidia.com/gpumem`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container1 + image: ubuntu:20.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU + nvidia.com/gpumem: 8000 # 每个 vGPU 包含 8000m 设备内存(可选,整数) +``` + +任务可以通过设置 `annotations.nvidia.com/vgpu-mode` 为相应的值来决定仅使用 `mig` 或 `hami-core`,如下例所示: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 + annotations: + nvidia.com/vgpu-mode: "mig" +spec: + containers: + - name: ubuntu-container1 + image: ubuntu:20.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU + nvidia.com/gpumem: 8000 # 每个 vGPU 包含 8000m 设备内存(可选,整数) +``` + +## 流程 + +使用动态-mig 的 vGPU 任务的流程如下所示: + +<img src="https://github.com/Project-HAMi/HAMi/blob/master/docs/develop/imgs/hami-dynamic-mig-procedure.png?raw=true" width = "800" /> + +请注意,在提交任务后,deviceshare 插件将遍历 configMap `hami-scheduler-device` 中定义的模板,并找到第一个可用的模板来适配。您可以随时更改该 configMap 的内容,并重新启动 vc-scheduler 进行自定义。 + +如果您在空的 A100-PCIE-40GB 节点上提交示例,那么它将选择一个 GPU 并选择以下 MIG 模板: + +```yaml + 2g.10gb : 3 + 1g.5gb : 1 +``` + +然后启动具有 2g.10gb 实例 * 2 的容器。 \ No newline at end of file diff --git a/docs/zh/developers/HAMi-core-design.md b/docs/zh/developers/HAMi-core-design.md new file mode 100644 index 0000000..67fb967 --- /dev/null +++ b/docs/zh/developers/HAMi-core-design.md @@ -0,0 +1,30 @@ +--- +title: HAMi-core 设计 +--- + +HAMi-core是一个为 CUDA 环境设计的 hook 库,作为容器内的 GPU 资源控制器,已被 +[HAMi](https://github.com/HAMi-project/HAMi) 和 +[Volcano](https://github.com/volcano-sh/devices) 等项目采用。 + +![img](../resources/hami-arch.png) + +## 功能特性 + +HAMi-core 提供以下核心功能: + +1. 设备显存虚拟化 + + ![image](../resources/sample_nvidia-smi.png) + +2. 限制设备使用率 + + 通过自定义的时间片机制控制 GPU 使用情况。 + +3. 
实时监控设备使用率 + +## 设计原理 + +HAMi-core 通过劫持 CUDA 运行时库(`libcudart.so`)与 CUDA 驱动库(`libcuda.so`)之间的 +API 调用来实现其功能,如下图所示: + +![img](../resources/hami-core-position.png) diff --git a/docs/zh/developers/build.md b/docs/zh/developers/build.md new file mode 100644 index 0000000..ec31be8 --- /dev/null +++ b/docs/zh/developers/build.md @@ -0,0 +1,95 @@ +--- +title: 构建 HAMi +translated: true +--- + +## 制作二进制文件 + +### 前提条件 + +需要以下工具: + +- go v1.20+ +- make + +### 构建 + +```bash +make +``` + +如果一切成功构建,将打印以下输出 + +``` +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/scheduler ./cmd/scheduler +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/vGPUmonitor ./cmd/vGPUmonitor +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/nvidia-device-plugin ./cmd/device-plugin/nvidia +``` + +## 制作镜像 + +### 前提条件 + +需要以下工具: + +- docker +- make + +### 构建 + +```bash +make docker +``` + +如果一切成功构建,将打印以下输出 + +``` +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/scheduler ./cmd/scheduler +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/vGPUmonitor ./cmd/vGPUmonitor +go build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=v0.0.1' -o bin/nvidia-device-plugin ./cmd/device-plugin/nvidia +[+] Building 146.4s (28/28) +FINISHED docker:default + => [internal] load build definition from Dockerfile 0.0s + => => transferring dockerfile: 1.30kB 0.0s + => [internal] load metadata for docker.io/nvidia/cuda:12.2.0-base-ubuntu22.04 5.5s + => [internal] load metadata for docker.io/library/golang:1.21-bullseye 4.5s + => [internal] load metadata for docker.io/nvidia/cuda:12.2.0-devel-ubuntu20.04 0.0s + => [auth] nvidia/cuda:pull token for registry-1.docker.io 0.0s + => [auth] library/golang:pull token for registry-1.docker.io 0.0s + => [internal] load .dockerignore 0.0s + => => transferring context: 2B 0.0s + => [internal] load build context 1.3s + => => transferring context: 119.90MB 1.3s + => [stage-3 1/6] FROM docker.io/nvidia/cuda:12.2.0-base-ubuntu22.04@sha256:ecdf8549dd5f12609e365217a64dedde26ecda26da8f3ff3f82def6749f53051 0.0s + => CACHED [gobuild 1/4] FROM docker.io/library/golang:1.21-bullseye@sha256:311468bffa9fa4747a334b94e6ce3681b564126d653675a6adc46698b2b88d35 0.0s + => [nvbuild 1/9] FROM docker.io/nvidia/cuda:12.2.0-devel-ubuntu20.04 0.0s + => [gobuild 2/4] ADD . 
/k8s-vgpu 0.8s + => [nvbuild 2/9] COPY ./libvgpu /libvgpu 0.3s + => [nvbuild 3/9] WORKDIR /libvgpu 0.2s + => [nvbuild 4/9] RUN apt-get -y update && apt-get -y install wget 21.9s + => [gobuild 3/4] RUN apt-get update && apt-get -y install libhwloc-dev libdrm-dev 18.8s + => [gobuild 4/4] RUN cd /k8s-vgpu && make all 83.5s + => [nvbuild 5/9] RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz 99.8s + => CACHED [stage-3 2/6] COPY ./LICENSE /k8s-vgpu/LICENSE 0.0s + => [stage-3 3/6] COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin 0.5s + => [stage-3 4/6] COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh 0.2s + => [stage-3 5/6] COPY ./lib /k8s-vgpu/lib 0.2s + => [nvbuild 6/9] RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz 2.1s + => [nvbuild 7/9] RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3 1.3s + => [nvbuild 8/9] RUN apt-get -y install openssl libssl-dev 7.7s + => [nvbuild 9/9] RUN bash ./build.sh 4.0s + => [stage-3 6/6] COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/ 0.3s + => exporting to image 1.8s + => => exporting layers 1.8s + => => writing image sha256:fc0ce42b41f9a177921c9bfd239babfa06fc77cf9e4087e8f2d959d749e8039f 0.0s + => => naming to docker.io/projecthami/hami:master-103b2b677e018a40af6322a56c2e9d5d5c62cccf 0.0s +The push refers to repository [docker.io/projecthami/hami] +``` + +## 制作HAMi-Core + +建议在nvidia/cuda镜像中构建HAMi-Core: + +```bash +git clone https://github.com/Project-HAMi/HAMi-core.git +docker build . -f dockerfiles/Dockerfile.{arch} \ No newline at end of file diff --git a/docs/zh/developers/kunlunxin-topology.md b/docs/zh/developers/kunlunxin-topology.md new file mode 100644 index 0000000..13eae7f --- /dev/null +++ b/docs/zh/developers/kunlunxin-topology.md @@ -0,0 +1,52 @@ +--- +title: 昆仑芯拓扑感知调度 +--- + +## 背景 + +当单个P800服务器配置多块XPU时,若GPU连接或位于同一NUMA节点内(如下图所示),可获得最优性能表现。这种配置会在服务器内所有GPU之间形成特定拓扑关系。 + +![img](../resources/kunlunxin_topo.jpg) + +当用户作业申请特定数量的`kunlunxin.com/xpu`资源时,Kubernetes会将pod调度到合适节点以最小化资源碎片并保持高性能。选定节点后,XPU设备会根据以下规则进行细粒度资源分配: + +1. 仅允许1、2、4或8卡分配方案 +2. 1/2/4卡分配不得跨NUMA节点 +3. 
分配后应最小化资源碎片 + +## 过滤阶段 + +过滤阶段识别所有符合分配条件的节点。针对每个节点,系统会筛选最优XPU组合方案并缓存,供评分阶段使用。筛选流程如下图所示: + +![img](../resources/kunlunxin_filter.png) + +## 评分阶段 + +在评分阶段,所有通过过滤的节点会接受评估并打分以选择最优调度目标。我们引入**MTF**(最小填充分任务数)指标,用于量化节点在分配后容纳未来任务的能力。 + +下表展示了XPU占用情况与对应MTF值的示例: + +| XPU占用状态 | MTF | 说明 | +|----------------|-----|-------------| +| 11111111 | 0 | 完全占用,无法调度新任务 | +| 00000000 | 1 | 可被一个8-XPU任务完全占用 | +| 00000011 | 2 | 可调度一个4-XPU任务和一个2-XPU任务 | +| 00000001 | 3 | 可容纳一个4-XPU、一个2-XPU和一个1-XPU任务 | +| 00010001 | 4 | 可容纳两个2-XPU任务和两个1-XPU任务 | + +节点得分基于分配前后的**MTF差值**计算。差值越小表示适配度越高,得分也越高。具体评分逻辑如下: + +| MTF差值 | 得分 | 示例 | +|------------|-------|---------| +| -1 | 2000 | 00000111->00001111 | +| 0 | 1000 | 00000111->00110111 | +| 1 | 0 | 00001111->00011111 | +| 2 | -1000 | 00000000->00000001 | + +## 绑定阶段 + +在绑定阶段,分配结果会以注解形式注入pod。例如: + +``` +BAIDU_COM_DEVICE_IDX=0,1,2,3 +``` diff --git a/docs/zh/developers/mindmap.md b/docs/zh/developers/mindmap.md new file mode 100644 index 0000000..8b5ea4e --- /dev/null +++ b/docs/zh/developers/mindmap.md @@ -0,0 +1,8 @@ +--- +title: HAMi 路线图 +translated: true +--- + +## 思维导图 + +![image](https://github.com/Project-HAMi/HAMi/blob/master/docs/mind-map/HAMI-VGPU-mind-map-Chinese.png?raw=true) \ No newline at end of file diff --git a/docs/zh/developers/protocol.md b/docs/zh/developers/protocol.md new file mode 100644 index 0000000..b5e7a95 --- /dev/null +++ b/docs/zh/developers/protocol.md @@ -0,0 +1,37 @@ +--- +title: 协议设计 +translated: true +--- + +## 协议实现 + +### 设备注册 + +为了进行更准确的调度,HAMI 调度器需要在设备注册时感知设备的规格,包括 UUID、显存、计算能力、型号、numa 数量等。 + +然而,device-plugin 设备注册 API 并未提供相应的参数获取,因此 HAMi-device-plugin 在注册时将这些补充信息存储在节点的注释中,以供调度器读取,如下图所示: + +<img src="https://github.com/Project-HAMi/website/blob/master/versioned_docs/version-v1.3.0/resources/device_registration.png?raw=true" width="600px"/> + +这里需要使用两个注释,其中一个是时间戳,如果超过指定的阈值,则认为对应节点上的设备无效。另一个是设备注册信息。一个具有 2 个 32G-V100 GPU 的节点可以注册如下所示: + +``` +hami.io/node-handshake: Requesting_2024.05.14 07:07:33 +hami.io/node-nvidia-register: 'GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:' +``` + +### 调度决策 + +kube-scheduler 在 `bind` 过程中调用 device-plugin 挂载设备,但仅向 device-plugin 提供设备的 `UUID`。因此,在设备共享的场景中,device-plugin 无法获取任务请求的相应设备规格,如 `设备内存` 和 `计算核心`。 + +因此,有必要开发一个协议,使调度器层与 device-plugin 进行通信以传递任务调度信息。调度器通过将调度结果补丁到 pod 的注释中并在 device-plugin 中读取它来传递此信息,如下图所示: + +<img src="https://github.com/Project-HAMi/website/blob/master/versioned_docs/version-v1.3.0/resources/task_dispatch.png?raw=true" width="600px"/> + +在此过程中,需要设置 3 个注释,分别是 `时间戳`、`待分配设备` 和 `已分配设备`。调度器创建时,`待分配设备` 和 `已分配设备` 的内容相同,但 device-plugin 将根据 `待分配设备` 的内容确定当前设备分配情况,当分配成功时,相应设备将从注释中移除,因此当任务成功运行时,`待分配设备` 的内容将为空。 + +一个请求 3000M 设备内存的 GPU 任务的示例将生成如下的相应注释: +``` +hami.io/bind-time: 1716199325 +hami.io/vgpu-devices-allocated: GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,3000,0:; +hami.io/vgpu-devices-to-allocate: ; \ No newline at end of file diff --git a/docs/zh/developers/scheduling.md b/docs/zh/developers/scheduling.md new file mode 100644 index 0000000..7c21763 --- /dev/null +++ b/docs/zh/developers/scheduling.md @@ -0,0 +1,169 @@ +--- +title: 调度策略 +translated: true +--- + +## 摘要 + +当前在一个拥有许多 GPU 节点的集群中,节点在做调度决策时没有进行 `binpack` 或 `spread`,使用 vGPU 时 GPU 卡也没有进行 `binpack` 或 `spread`。 + +## 提案 + +我们在配置中添加 `node-scheduler-policy` 和 `gpu-scheduler-policy`,然后调度器可以使用此策略实现节点 `binpack` 或 `spread` 或 GPU `binpack` 或 `spread`。用户可以设置 Pod 注释来更改此默认策略,使用 `hami.io/node-scheduler-policy` 和 
`hami.io/gpu-scheduler-policy` 来覆盖调度器配置。 + +### 用户故事 + +这是一个 GPU 集群,拥有两个节点,以下故事以此集群为前提。 + +![scheduler-policy-story.png](https://github.com/Project-HAMi/HAMi/raw/master/docs/develop/imgs/scheduler-policy-story.png) + +#### 故事 1 + +节点 binpack,尽可能使用一个节点的 GPU 卡,例如: +- 集群资源: + - 节点1:GPU 拥有 4 个 GPU 设备 + - 节点2:GPU 拥有 4 个 GPU 设备 + +- 请求: + - pod1:使用 1 个 GPU + - pod2:使用 1 个 GPU + +- 调度结果: + - pod1:调度到节点1 + - pod2:调度到节点1 + +#### 故事 2 + +节点 spread,尽可能使用来自不同节点的 GPU 卡,例如: + +- 集群资源: + - 节点1:GPU 拥有 4 个 GPU 设备 + - 节点2:GPU 拥有 4 个 GPU 设备 + +- 请求: + - pod1:使用 1 个 GPU + - pod2:使用 1 个 GPU + +- 调度结果: + - pod1:调度到节点1 + - pod2:调度到节点2 + +#### 故事 3 + +GPU binpack,尽可能使用同一个 GPU 卡,例如: + +- 集群资源: + - 节点1:GPU 拥有 4 个 GPU 设备,它们是 GPU1, GPU2, GPU3, GPU4 + +- 请求: + - pod1:使用 1 个 GPU,gpucore 是 20%,gpumem-percentage 是 20% + - pod2:使用 1 个 GPU,gpucore 是 20%,gpumem-percentage 是 20% + +- 调度结果: + - pod1:调度到节点1,选择 GPU1 这个设备 + - pod2:调度到节点1,选择 GPU1 这个设备 + +#### 故事 4 + +GPU spread,尽可能使用不同的 GPU 卡,例如: + +- 集群资源: + - 节点1:GPU 拥有 4 个 GPU 设备,它们是 GPU1, GPU2, GPU3, GPU4 + +- 请求: + - pod1:使用 1 个 GPU,gpucore 是 20%,gpumem-percentage 是 20% + - pod2:使用 1 个 GPU,gpucore 是 20%,gpumem-percentage 是 20% + +- 调度结果: + - pod1:调度到节点1,选择 GPU1 这个设备 + - pod2:调度到节点1,选择 GPU2 这个设备 + +## 设计细节 + +### Node-scheduler-policy + +![node-shceduler-policy-demo.png](https://github.com/Project-HAMi/HAMi/raw/master/docs/develop/imgs/node-shceduler-policy-demo.png) + +#### Binpack + +Binpack 主要考虑节点资源使用情况。使用越满,得分越高。 + +``` +score: ((request + used) / allocatable) * 10 +``` + +1. 节点1的 Binpack 评分信息如下 + +``` +Node1 score: ((1+3)/4) * 10= 10 +``` + +2. 节点2的 Binpack 评分信息如下 + +``` +Node2 score: ((1+2)/4) * 10= 7.5 +``` + +因此,在 `Binpack` 策略中我们可以选择 `Node1`。 + +#### Spread + +Spread 主要考虑节点资源使用情况。使用越少,得分越高。 + +``` +score: ((request + used) / allocatable) * 10 +``` + +1. 节点1的 Spread 评分信息如下 +``` +Node1 score: ((1+3)/4) * 10= 10 +``` + +2. 节点2的 Spread 评分信息如下 +``` +Node2 score: ((1+2)/4) * 10= 7.5 +``` + +因此,在 `Spread` 策略中我们可以选择 `Node2`。 + +### GPU-scheduler-policy + +![gpu-scheduler-policy-demo.png](https://github.com/Project-HAMi/HAMi/raw/master/docs/develop/imgs/gpu-scheduler-policy-demo.png) + +#### Binpack + +Binpack 主要关注每张卡的计算能力和显存使用情况。使用越多,得分越高。 +``` +score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem)) * 10 +``` + +1. GPU1 的 Binpack 评分信息如下 +``` +GPU1 Score: ((20+10)/100 + (1000+2000)/8000)) * 10 = 6.75 +``` + +2. GPU2 的 Binpack 评分信息如下 +``` +GPU2 Score: ((20+70)/100 + (1000+6000)/8000)) * 10 = 17.75 +``` + +因此,在 `Binpack` 策略中我们可以选择 `GPU2`。 + +#### Spread + +Spread 主要关注每张卡的计算能力和显存使用情况。使用越少,得分越高。 +``` +score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem)) * 10 +``` + +1. GPU1 的 Spread 评分信息如下 +``` +GPU1 Score: ((20+10)/100 + (1000+2000)/8000)) * 10 = 6.75 +``` + +2. 
GPU2 的 Spread 评分信息如下 +``` +GPU2 Score: ((20+70)/100 + (1000+6000)/8000)) * 10 = 17.75 +``` + +因此,在 `Spread` 策略中我们可以选择 `GPU1`。 \ No newline at end of file diff --git a/docs/zh/get-started/deploy-with-helm.md b/docs/zh/get-started/deploy-with-helm.md new file mode 100644 index 0000000..618d7ba --- /dev/null +++ b/docs/zh/get-started/deploy-with-helm.md @@ -0,0 +1,188 @@ +--- +title: 使用 Helm 部署 HAMi +--- + +## 目录 {#toc} + +- [先决条件](#prerequisites) +- [安装步骤](#installation) +- [演示](#demo) + +本指南将涵盖: + +- 为每个 GPU 节点配置 NVIDIA 容器运行时 +- 使用 Helm 安装 HAMi +- 启动 vGPU 任务 +- 验证容器内设备资源是否受限 + +## 先决条件 {#prerequisites} + +- [Helm](https://helm.sh/zh/docs/) v3+ +- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) v1.16+ +- [CUDA](https://developer.nvidia.com/cuda-toolkit) v10.2+ +- [NVIDIA 驱动](https://www.nvidia.cn/drivers/unix/) v440+ + +## 安装步骤 {#installation} + +### 1. 配置 nvidia-container-toolkit {#configure-nvidia-container-toolkit} + +<summary> 配置 nvidia-container-toolkit </summary> + +在所有 GPU 节点执行以下操作。 + +本文档假设已预装 NVIDIA 驱动和 `nvidia-container-toolkit`,并已将 `nvidia-container-runtime` 配置为默认底层运行时。 + +参考:[nvidia-container-toolkit 安装指南](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) + +#### 基于 Debian 系统(使用 `Docker` 和 `containerd`)示例 {#example-for-debian-based-systems-with-docker-and-containerd} + +##### 安装 `nvidia-container-toolkit` {#install-the-nvidia-container-toolkit} + +```bash +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sudo tee /etc/apt/sources.list.d/libnvidia-container.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +``` + +##### 配置 `Docker` {#configure-docker} + +当使用 `Docker` 运行 `Kubernetes` 时,编辑配置文件(通常位于 `/etc/docker/daemon.json`),将 +`nvidia-container-runtime` 设为默认底层运行时: + +```json +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} +``` + +然后重启 `Docker`: + +```bash +sudo systemctl daemon-reload && systemctl restart docker +``` + +##### 配置 `containerd` {#configure-containerd} + +当使用 `containerd` 运行 `Kubernetes` 时,修改配置文件(通常位于 `/etc/containerd/config.toml`),将 +`nvidia-container-runtime` 设为默认底层运行时: + +```toml +version = 2 +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" +``` + +然后重启 `containerd`: + +```bash +sudo systemctl daemon-reload && systemctl restart containerd +``` + +#### 2. 标记节点 {#label-your-nodes} + +通过添加 "gpu=on" 标签将 GPU 节点标记为可调度 HAMi 任务。未标记的节点将无法被调度器管理。 + +```bash +kubectl label nodes {节点ID} gpu=on +``` + +#### 3. 
使用 Helm 部署 HAMi {#deploy-hami-using-helm} + +首先通过以下命令确认 Kubernetes 版本: + +```bash +kubectl version +``` + +然后添加 Helm 仓库: + +```bash +helm repo add hami-charts https://project-hami.github.io/HAMi/ +``` + +安装时需设置 Kubernetes 调度器镜像版本与集群版本匹配。例如集群版本为 1.16.8 时,使用以下命令部署: + +```bash +helm install hami hami-charts/hami \ + --set scheduler.kubeScheduler.imageTag=v1.16.8 \ + -n kube-system +``` + +若一切正常,可见 vgpu-device-plugin 和 vgpu-scheduler 的 Pod 均处于 Running 状态。 + +### 演示 {#demo} + +#### 1. 提交演示任务 {#submit-demo-task} + +容器现在可通过 `nvidia.com/gpu` 资源类型申请 NVIDIA vGPU: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 1 # 申请 1 个 vGPU + nvidia.com/gpumem: 10240 # 每个 vGPU 包含 10240m 设备内存(可选,整型) +``` + +#### 2. 验证容器内资源限制 {#verify-in-container-resouce-control} + +执行查询命令: + +```bash +kubectl exec -it gpu-pod nvidia-smi +``` + +预期输出: + +```text +[HAMI-core Msg(28:140561996502848:libvgpu.c:836)]: Initializing..... +Wed Apr 10 09:28:58 2024 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 Tesla V100-PCIE-32GB On | 00000000:3E:00.0 Off | 0 | +| N/A 29C P0 24W / 250W | 0MiB / 10240MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ +[HAMI-core Msg(28:140561996502848:multiprocess_memory_limit.c:434)]: Calling exit handler 28 +``` diff --git a/docs/zh/index.md b/docs/zh/index.md new file mode 100644 index 0000000..7e61666 --- /dev/null +++ b/docs/zh/index.md @@ -0,0 +1,37 @@ +# HAMi +需要翻译 + +### What is HAMi + +HAMi (Heterogeneous AI Computing Virtualization Middleware) formerly known as k8s-vGPU-scheduler, is an 'all-in-one' chart designed to manage Heterogeneous AI Computing Devices in a k8s cluster. It can provide the ability to share Heterogeneous AI devices and provide resource isolation among tasks. + +HAMi is committed to improving the utilization rate of heterogeneous computing devices in Kubernetes clusters and providing a unified multiplexing interface for different types of heterogeneous devices. + +### Why HAMi + +#### Kubernetes Native API Compatible + +Zero change upgrade: compatible with default behaviour from Kubernetes. + +#### Open and Neutral + +Jointly initiated by Internet, finance, manufacturing, cloud providers, etc. 
Target for open governance with CNCF + +#### Avoid Vendor Lock-in + +Integration with mainstream cloud providers; Not tied to proprietary vendor orchestration + +#### Resource Isolation + +Provides hard isolation of resources within containers, task in containers can't use resources that exceed their quota + +#### Supports a variety of heterogeneous computing devices + +Provides device-sharing on GPU, MLU, NPU from a variety of manufacturers + +#### Unified Management + +Unified monitoring system, Configurable scheduling policies(binpack,spread,etc...) + +### How It Works +tbd... \ No newline at end of file diff --git a/docs/zh/installation/aws-installation.md b/docs/zh/installation/aws-installation.md new file mode 100644 index 0000000..90f651d --- /dev/null +++ b/docs/zh/installation/aws-installation.md @@ -0,0 +1,54 @@ +--- +title: 在 AWS 上安装和使用 HAMi +translated: true +--- + +HAMi 已经在 AWS 市场上发布,你可以通过 helm 或 AWS add-on 快速安装。 + +## 准备工作 + +在安装前,请确保你已经: + +- 在市场上订阅了 HAMi +- 创建了一个 Kubernetes 集群 + +## 使用 Helm 安装 + +你可以使用以下命令获取 HAMi 的 Helm chart 并安装: + +```shell +export HELM_EXPERIMENTAL_OCI=1 +# The `username` and `password-stdin` correspond to your AWS login credentials. +aws ecr get-login-password --region us-east-1 | helm registry login --username AWS --password-stdin 709825985650.dkr.ecr.us-east-1.amazonaws.com +mkdir awsmp-chart && cd awsmp-chart +helm pull oci://709825985650.dkr.ecr.us-east-1.amazonaws.com/dynamia-ai/hami --version 2.6.1-community +tar xf $(pwd)/* && find $(pwd) -maxdepth 1 -type f -delete +helm install --generate-name --namespace <ENTER_NAMESPACE_HERE> ./* +``` + +您可以通过调整[配置](../userguide/configure.md)来自定义安装。 + +## 使用 AWS add-on 安装 + +在使用 AWS add-on 安装 HAMi 前,你需要安装cert-manager,你可以在 AWS 插件市场中找到该插件并通过控制台安装。 +你也可以参考[AWS 用户指南](https://docs.aws.amazon.com/eks/latest/userguide/lbc-manifest.html#lbc-cert)进行安装。 + +然后你就可以使用 AWS 插件市场中的 HAMi 插件进行安装。 + +## 验证您的安装 + +您可以使用以下命令验证您的安装: + +``` +kubectl get pods -n kube-system +``` + +如果 hami-device-plugin 和 hami-scheduler pods 都处于 Running 状态,则说明您的安装成功。 + +## 使用示例 + +### NVIDIA 设备 +[使用独占 GPU](https://project-hami.io/zh/docs/userguide/NVIDIA-device/examples/use-exclusive-card) +[为容器分配特定设备内存](https://project-hami.io/zh/docs/userguide/NVIDIA-device/examples/allocate-device-memory) +[为容器分配设备核心资源](https://project-hami.io/zh/docs/userguide/NVIDIA-device/examples/allocate-device-core) +[将任务分配给 mig 实例](https://project-hami.io/zh/docs/userguide/NVIDIA-device/examples/dynamic-mig-example) \ No newline at end of file diff --git a/docs/zh/installation/how-to-use-volcano-vgpu.md b/docs/zh/installation/how-to-use-volcano-vgpu.md new file mode 100644 index 0000000..13c8091 --- /dev/null +++ b/docs/zh/installation/how-to-use-volcano-vgpu.md @@ -0,0 +1,126 @@ +--- +title: 如何使用 Volcano vGPU 设备插件 +translated: true +--- + +# Kubernetes 的 Volcano vgpu 设备插件 + +**注意**: + +使用 volcano-vgpu 时,您*不需要*安装 HAMi,只需使用 +[Volcano vgpu 设备插件](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) 就足够了。它可以为由 volcano 管理的 NVIDIA 设备提供设备共享机制。 + +这基于 [Nvidia 设备插件](https://github.com/NVIDIA/k8s-device-plugin),使用 [HAMi-core](https://github.com/Project-HAMi/HAMi-core) 支持 GPU 卡的硬隔离。 + +Volcano vgpu 仅在 volcano > 1.9 中可用 + +## 快速开始 + +### 配置调度器 + +更新调度器配置: + +```shell script +kubectl edit cm -n volcano-system volcano-scheduler-configmap +``` + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - 
name: gang + - name: conformance + - plugins: + - name: drf + - name: deviceshare + arguments: + deviceshare.VGPUEnable: true # 启用 vgpu + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack +``` + +### 在 Kubernetes 中启用 GPU 支持 + +一旦您在*所有*希望使用的 GPU 节点上启用了此选项,您就可以通过部署以下 Daemonset 在集群中启用 GPU 支持: + +``` +$ kubectl create -f https://raw.githubusercontent.com/Project-HAMi/volcano-vgpu-device-plugin/main/volcano-vgpu-device-plugin.yml +``` + +### 验证环境是否准备好 + +检查节点状态,如果 `volcano.sh/vgpu-number` 包含在可分配资源中,则表示正常。 + +```shell script +$ kubectl get node {node name} -oyaml +... +status: + addresses: + - address: 172.17.0.3 + type: InternalIP + - address: volcano-control-plane + type: Hostname + allocatable: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-number: "10" # vGPU 资源 + capacity: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-memory: "89424" + volcano.sh/gpu-number: "10" # vGPU 资源 +``` + +### 运行 VGPU 作业 + +可以通过在 resource.limit 中设置 "volcano.sh/vgpu-number"、"volcano.sh/vgpu-cores" 和 "volcano.sh/vgpu-memory" 来请求 VGPU。 + +```shell script +$ cat <<EOF | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: cuda-container + image: nvidia/cuda:9.0-devel + command: ["sleep"] + args: ["100000"] + resources: + limits: + volcano.sh/vgpu-number: 2 # 请求 2 个 gpu 卡 + volcano.sh/vgpu-memory: 3000 # (可选)每个 vGPU 使用 3G 设备内存 + volcano.sh/vgpu-cores: 50 # (可选)每个 vGPU 使用 50% 核心 +EOF +``` + +您可以在容器内使用 nvidia-smi 验证设备内存: + +> **警告:** *如果在使用设备插件和 NVIDIA 镜像时不请求 GPU,机器上的所有 GPU 都将暴露在您的容器内。 +> 容器使用的 vgpu 数量不能超过该节点上的 gpu 数量。* + +### 监控 + +volcano-scheduler-metrics 记录每个 GPU 的使用和限制,访问以下地址以获取这些指标。 + +``` +curl {volcano scheduler cluster ip}:8080/metrics \ No newline at end of file diff --git a/docs/zh/installation/offline-installation.md b/docs/zh/installation/offline-installation.md new file mode 100644 index 0000000..2d2d8be --- /dev/null +++ b/docs/zh/installation/offline-installation.md @@ -0,0 +1,58 @@ +--- +title: 离线安装 +translated: true +--- + +如果您的集群无法直接访问外部网络,您可以使用离线部署来安装 HAMi + +## 准备您的镜像 + +您需要将以下镜像保存到一个 tarball 文件中,并将其复制到集群中。 +镜像列表: +``` +projecthami/hami:{HAMi 版本} +docker.io/jettech/kube-webhook-certgen:v1.5.2 +liangjw/kube-webhook-certgen:v1.1.1 +registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{您的 kubernetes 版本} +``` + +加载这些镜像,将这些镜像标记为您的内部注册表,并将它们推送到您的注册表中 + +``` +docker load -i {HAMi_image}.tar +docker tag projecthami/hami:{HAMi 版本} {your_inner_registry}/hami:{HAMi 版本} +docker push {your_inner_registry}/hami:{HAMi 版本} +docker tag docker.io/jettech/kube-webhook-certgen:v1.5.2 {your inner_regisry}/kube-webhook-certgen:v1.5.2 +docker push {your inner_regisry}/kube-webhook-certgen:v1.5.2 +docker tag liangjw/kube-webhook-certgen:v1.1.1 {your_inner_registry}/kube-webhook-certgen:v1.1.1 +docker tag registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{您的 kubernetes 版本} {your_inner_registry}/kube-scheduler:{您的 kubernetes 版本} +docker push {your_inner_registry}/kube-scheduler:{您的 kubernetes 版本} +``` + +## 准备 HAMi chart + +从 [github](https://github.com/Project-HAMi/HAMi/tree/master/charts) 下载 charts 文件夹,将其放置在集群内的 $\{CHART_PATH\},然后编辑 $\{CHART_PATH\}/hami/values.yaml 中的以下字段。 + +``` +scheduler.kubeScheduler.image +scheduler.extender.image +scheduler.patch.image +scheduler.patch.imageNew +scheduler.devicePlugin.image 
+scheduler.devicePlugin.monitorimage +``` + +## 在您的 $\{CHART_PATH\} 文件夹中执行以下命令 + +``` +helm install hami hami --set scheduler.kubeScheduler.imageTag={您的 k8s 服务器版本} -n kube-system +``` + +7. 验证您的安装 + +执行以下命令 +``` +kubectl get pods -n kube-system +``` + +如果您可以看到 'device-plugin' 和 'scheduler' 都在运行,那么 HAMi 已成功安装。 \ No newline at end of file diff --git a/docs/zh/installation/online-installation.md b/docs/zh/installation/online-installation.md new file mode 100644 index 0000000..7758f7b --- /dev/null +++ b/docs/zh/installation/online-installation.md @@ -0,0 +1,42 @@ +--- +title: 通过 Helm 在线安装(推荐) +translated: true +--- + +最佳实践是使用 helm 部署 HAMi。 + +## 添加 HAMi 仓库 + +您可以使用以下命令添加 HAMi 图表仓库: + +``` +helm repo add hami-charts https://project-hami.github.io/HAMi/ +``` + +## 获取您的 Kubernetes 版本 + +安装时需要 Kubernetes 版本。您可以使用以下命令获取此信息: + +``` +kubectl version +``` + +## 安装 + +在安装过程中,将 Kubernetes 调度器镜像版本设置为与您的 Kubernetes 服务器版本匹配。例如,如果您的集群服务器版本是 1.16.8,请使用以下命令进行部署: + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag=v1.16.8 -n kube-system +``` + +您可以通过调整[配置](../userguide/configure.md)来自定义安装。 + +## 验证您的安装 + +您可以使用以下命令验证您的安装: + +``` +kubectl get pods -n kube-system +``` + +如果 hami-device-plugin 和 hami-scheduler pods 都处于 Running 状态,则说明您的安装成功。 \ No newline at end of file diff --git a/docs/zh/installation/prequisities.md b/docs/zh/installation/prequisities.md new file mode 100644 index 0000000..f5a6f53 --- /dev/null +++ b/docs/zh/installation/prequisities.md @@ -0,0 +1,88 @@ +--- +title: 前提条件 +translated: true +--- + +## 先决条件 + +- [Helm](https://helm.sh/zh/docs/) 版本 v3+ +- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) 版本 v1.16+ +- [CUDA](https://developer.nvidia.com/cuda-toolkit) 版本 v10.2+ +- [NvidiaDriver](https://www.nvidia.cn/drivers/unix/) v440+ + +## 准备您的 GPU 节点 + +在所有 GPU 节点上执行以下步骤。 + +本 README 假设已预安装 NVIDIA 驱动程序和 `nvidia-container-toolkit`。此外,还假设将 `nvidia-container-runtime` 配置为默认的低级运行时。 + +请参阅:[https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) + +### 适用于基于 Debian 系统的 `Docker` 和 `containerd` 示例 + +#### 安装 `nvidia-container-toolkit` + +```bash +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +``` + +#### 配置 `Docker` + +在使用 `Docker` 运行 `Kubernetes` 时,编辑配置文件,通常位于 `/etc/docker/daemon.json`,以设置 `nvidia-container-runtime` 作为默认的低级运行时: + +```json +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} +``` + +然后重启 `Docker`: + +``` +sudo systemctl daemon-reload && systemctl restart docker +``` + +#### 配置 `containerd` + +在使用 `containerd` 运行 `Kubernetes` 时,修改配置文件,通常位于 `/etc/containerd/config.toml`,以设置 +`nvidia-container-runtime` 作为默认的低级运行时: + +``` +version = 2 +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" +``` + +然后重启 `containerd`: + +``` +sudo systemctl daemon-reload && systemctl restart containerd +``` + +### 给节点打标签 + +通过添加标签 "gpu=on" 来为 HAMi 调度标记您的 GPU 节点。没有此标签,节点无法被我们的调度器管理。 + +``` +kubectl label nodes {nodeid} gpu=on \ No newline at end of file diff --git a/docs/zh/installation/uninstall.md b/docs/zh/installation/uninstall.md new file mode 100644 index 0000000..b35ab2e --- /dev/null +++ b/docs/zh/installation/uninstall.md @@ -0,0 +1,12 @@ +--- +title: 卸载 HAMi +translated: true +--- + +卸载 hami 的步骤很简单: + +``` +helm uninstall hami -n kube-system +``` + +> **注意:** *卸载不会终止正在运行的任务。* \ No newline at end of file diff --git a/docs/zh/installation/upgrade.md b/docs/zh/installation/upgrade.md new file mode 100644 index 0000000..43f4be2 --- /dev/null +++ b/docs/zh/installation/upgrade.md @@ -0,0 +1,14 @@ +--- +title: 升级 HAMi +translated: true +--- + +将HAMi升级到最新版本是一个简单的过程,更新仓库并重新启动图表: + +``` +helm uninstall hami -n kube-system +helm repo update +helm install hami hami-charts/hami -n kube-system +``` + +> **警告:** *如果在不清除已提交任务的情况下升级HAMi,可能会导致分段错误。* \ No newline at end of file diff --git a/docs/zh/installation/webui-installation.md b/docs/zh/installation/webui-installation.md new file mode 100644 index 0000000..e653686 --- /dev/null +++ b/docs/zh/installation/webui-installation.md @@ -0,0 +1,107 @@ +--- +title: WebUI 安装 +translated: true +--- + +# 使用 Helm Charts 部署 HAMi-WebUI + +本主题包含在 Kubernetes 上使用 Helm Charts 安装和运行 HAMi-WebUI 的说明。 + +WebUI 只能通过本地主机访问,因此您需要通过配置 `~/.kube/config` 将本地主机连接到集群。 + +[Helm](https://helm.sh/) 是一个用于管理 Kubernetes 应用程序的开源命令行工具。它是 [CNCF Landscape](https://www.cncf.io/projects/helm/) 中的一个毕业项目。 + +HAMi-WebUI 开源社区提供了在 Kubernetes 上运行的 Helm Charts。请注意,代码不提供任何担保。如果您遇到任何问题,可以在 [官方 GitHub 仓库](https://github.com/Project-HAMi/HAMi-WebUI/tree/main/charts/hami-webui)报告。 + +## 先决条件 + +要使用 Helm 安装 HAMi-WebUI,请确保满足以下要求: + +1. 本地主机上的 Kubectl + +2. [HAMi](https://github.com/Project-HAMi/HAMi?tab=readme-ov-file#quick-start) >= 2.4.0 + +3. Prometheus > 2.8.0 + +4. 
Helm > 3.0 + +## 使用 Helm 安装 HAMi-WebUI + +### 部署 HAMi-WebUI Helm charts + +要设置 HAMi-WebUI Helm 仓库,以便在您的机器上下载正确的 HAMi-WebUI Helm charts,请完成以下步骤: + +1. 使用以下命令语法添加 HAMi-WebUI 仓库: + + ```bash + helm repo add hami-webui https://project-hami.github.io/HAMi-WebUI + ``` + +2. 使用以下命令部署 HAMi-WebUI: + + ```bash + helm install my-hami-webui hami-webui/hami-webui --set externalPrometheus.enabled=true --set externalPrometheus.address="http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090" -n kube-system + ``` + + > _**重要**_:您需要将 'externalPrometheus.address' 的值替换为集群内的 Prometheus 地址 + + 您可以在安装期间根据配置[文档](https://github.com/Project-HAMi/HAMi-WebUI/blob/main/charts/hami-webui/README.md#values)在 [values.yaml](https://github.com/Project-HAMi/HAMi-WebUI/blob/main/charts/hami-webui/values.yaml) 中设置其他字段。 + +3. 运行以下命令以验证安装: + + ```bash + kubectl get pods -n kube-system | grep webui + ``` + + 如果安装成功,您应该看到 'hami-webui' 和 'hami-webui-dcgm-exporter' 都处于运行状态。 + +### 访问 HAMi-WebUI + +1. 在本地主机中配置 ~/.kube/config 以便能够连接到您的集群。 + +2. 运行以下命令以在本地主机上对 HAMi-WebUI 服务进行端口转发,端口为 `3000`。 + + ```bash + kubectl port-forward service/my-hami-webui 3000:3000 --namespace=kube-system + ``` + + 有关端口转发的更多信息,请参阅[使用端口转发访问集群中的应用程序](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/)。 + +3. 在浏览器中导航到 `localhost:3000`。 + + HAMi-WebUI 资源概览页面将出现。 + +## 故障排除 + +本节包括在通过 Helm 在 Kubernetes 上部署 HAMi-WebUI 时可能会发现有用的故障排除提示。 + +### 收集日志 + +在排查任何问题时,查看 HAMi-WebUI 服务器日志非常重要。 + +要检查 HAMi-WebUI 日志,请运行以下命令: + +```bash +kubectl logs --namespace=hami deploy/my-hami-webui -c hami-webui-fe-oss +kubectl logs --namespace=hami deploy/my-hami-webui -c hami-webui-be-oss +``` + +有关访问 Kubernetes 应用程序日志的更多信息,请参阅 [Pods](https://kubernetes.io/docs/reference/kubectl/cheatsheet/#interacting-with-running-pods) 和 [Deployments](https://kubernetes.io/docs/reference/kubectl/cheatsheet/#interacting-with-deployments-and-services)。 + +## 卸载 HAMi-WebUI 部署 + +要卸载 HAMi-WebUI 部署,请运行命令: + +`helm uninstall <RELEASE-NAME> <NAMESPACE-NAME>` + +```bash +helm uninstall my-hami-webui -n hami +``` + +这将删除给定命名空间 hami 中的所有对象。 + +如果您想删除命名空间 `hami`,请运行命令: + +```bash +kubectl delete namespace hami +``` diff --git a/docs/zh/key-features/device-resource-isolation.md b/docs/zh/key-features/device-resource-isolation.md new file mode 100644 index 0000000..caadef6 --- /dev/null +++ b/docs/zh/key-features/device-resource-isolation.md @@ -0,0 +1,18 @@ +--- +title: 设备资源隔离 +translated: true +--- + +一个用于设备隔离的简单演示: +一个具有以下资源的任务。 + +``` + resources: + limits: + nvidia.com/gpu: 1 # 请求1个vGPU + nvidia.com/gpumem: 3000 # 每个vGPU包含3000m设备内存 +``` + +将在容器内看到3G设备内存 + +![img](../resources/hard_limit.jpg) diff --git a/docs/zh/key-features/device-sharing.md b/docs/zh/key-features/device-sharing.md new file mode 100644 index 0000000..9e9393f --- /dev/null +++ b/docs/zh/key-features/device-sharing.md @@ -0,0 +1,12 @@ +--- +title: 设备共享 +translated: true +--- + +- 通过设置核心使用率(百分比),进行设备的部分分配 +- 通过设置显存(单位:MB),进行设备的部分分配 +- 对流式多处理器进行硬限制 +- 无需对现有程序进行任何修改 +- 支持动态MIG切片能力,样例 + +![img](https://github.com/Project-HAMi/HAMi/raw/master/imgs/example.png) diff --git a/docs/zh/releases.md b/docs/zh/releases.md new file mode 100644 index 0000000..d7becba --- /dev/null +++ b/docs/zh/releases.md @@ -0,0 +1,67 @@ +--- +title: Releases +translated: true +--- + +## 发布说明和资源 + +发布说明可在 GitHub 上查看:https://github.com/Project-HAMi/HAMi/releases + +## 发布管理 + +本节提供了有关发布时间表和发布分支维护的指南。 + +### 发布时间表 + +HAMi 使用语义版本控制模式。HAMi v2.4.0 于 2024 年 9 月发布。 +该项目遵循给定的版本号 MAJOR.MINOR.PATCH。 + +### MAJOR 
版本 + +主要版本包含大型功能、设计和架构更改,并可能包括不兼容的 API 更改。主要版本发布频率低,并在较长时间内保持稳定。 + +### MINOR 版本 + +次要版本包含以向后兼容方式引入的功能、增强和修复。由于 HAMi 是一个快速增长的项目,功能不断快速迭代,每隔几个月发布一个次要版本有助于平衡速度和稳定性。 + +* 大约每 3 个月 + +### PATCH 版本 + +补丁版本用于向后兼容的错误修复和不影响稳定性或兼容性的非常小的增强。通常只有关键修复会被选入补丁版本。通常在一个次要版本周期中至少会有一个补丁版本。 + +* 当需要关键修复时,或大约每月 + +### 版本控制 + +HAMi 使用 GitHub 标签来管理版本。新版本和候选版本使用通配符标签 `v<major>.<minor>.<patch>` 发布。 + +每当 PR 合并到主分支时,CI 将提取最新代码,生成镜像并上传到镜像库。通常可以在线使用最新标签下载 HAMi 组件的最新镜像。每当发布版本时,镜像也会发布,标签与上述发布的标签相同。 + +### 问题 + +非关键问题和功能默认总是添加到下一个次要版本里程碑。 + +没有解决方法的关键问题会被添加到下一个补丁版本。 + +### 分支和 PR + +发布分支和 PR 的管理如下: + +* 所有更改总是首先提交到 `master`。 +* 为每个主要或次要版本创建分支。 +* 分支名称将包含版本,例如 release-1.2。 +* 补丁版本从发布分支创建。 +* 对于需要包含在补丁版本中的关键修复,PR 应始终首先合并到 master,然后再挑选到发布分支。PR 需要确保有发布说明撰写,这些描述将在下一个补丁版本中反映。 + PR 的挑选过程通过脚本执行。使用方法请参见[此处](https://project-hami.io/docs/contributor/cherry-picks)。 +* 对于复杂的更改,特别是关键错误修复,可能需要为 master 和发布分支分别创建 PR。 +* 里程碑标记(例如 v1.4)将添加到 PR 中,这意味着 PR 中的更改是相应版本的内容之一。 +* 在 PR 审查期间,分配选择用于指示审阅者。 + +### 发布计划 + +次要版本将包含功能、增强和错误修复的混合。 + +主要功能遵循 HAMi 设计提案流程。您可以参考[此处](https://github.com/Project-HAMi/HAMi/tree/master/docs/proposals/resource-interpreter-webhook)作为提案示例。 + +在发布开始时,可能会有许多问题分配给发布里程碑。发布的优先级在每两周一次的社区会议中讨论。随着发布的进展,几个问题可能会被移到下一个里程碑。因此,如果一个问题很重要,重要的是在发布周期的早期倡导其优先级。 \ No newline at end of file diff --git a/docs/zh/roadmap.md b/docs/zh/roadmap.md new file mode 100644 index 0000000..a50c8db --- /dev/null +++ b/docs/zh/roadmap.md @@ -0,0 +1,45 @@ +--- +title: Karmada Roadmap +--- + +# Karmada Roadmap + +This document defines a high level roadmap for Karmada development and upcoming releases. +Community and contributor involvement is vital for successfully implementing all desired items for each release. +We hope that the items listed below will inspire further engagement from the community to keep karmada progressing and shipping exciting and valuable features. 
+ + +## 2022 H1 +- Multi-cluster HA scheduling policy + * spread by region + * spread by zone + * spread by provider +- Multi-cluster Ingress +- Multi-cluster HPA (Horizontal Pod Autoscaling) +- Federated resource quota +- API reference +- [Karmada website](https://karmada.io/) refactor +- Policy-based governance, risk, and compliance +- Multi-cluster DNS (cluster identity) +- Global search across clusters +- Scheduling re-balancing + +## 2022 H2 +- Karmada Dashboard - alpha release +- Karmada scalability baseline (performance report) +- Cluster addons +- Helm chart propagation +- Multi-cluster events +- Multi-cluster Operator specifications +- Multi-cluster Application +- Multi-cluster monitoring +- Multi-cluster logging +- Multi-cluster storage +- Multi-cluster RBAC +- Multi-cluster networking +- Data migration across clusters +- Multi-cluster workflow +- Integration with ecosystem +- Cluster lifecycle management +- Image registry across clouds +- Multi-cluster Service Mesh solutions diff --git a/docs/zh/troubleshooting/troubleshooting.md b/docs/zh/troubleshooting/troubleshooting.md new file mode 100644 index 0000000..74b5c03 --- /dev/null +++ b/docs/zh/troubleshooting/troubleshooting.md @@ -0,0 +1,12 @@ +--- +title: 排障手册 +translated: true +--- + +- 如果在使用 NVIDIA 镜像的设备插件时不请求 vGPU,机器上的所有 GPU 可能会在容器内暴露。 +- 目前,A100 MIG 仅支持 "none" 和 "mixed" 模式。 +- 目前无法调度带有 "nodeName" 字段的任务;请改用 "nodeSelector"。 +- 目前仅支持计算任务;不支持视频编解码处理。 +- 我们将 `device-plugin` 环境变量名称从 `NodeName` 更改为 `NODE_NAME`,如果您使用镜像版本 `v2.3.9`,可能会遇到 `device-plugin` 无法启动的情况,有两种方法可以解决: + - 手动执行 `kubectl edit daemonset` 修改 `device-plugin` 环境变量从 `NodeName` 为 `NODE_NAME`。 + - 使用 helm 升级到最新版本,`device-plugin` 镜像的最新版本是 `v2.3.10`,执行 `helm upgrade hami hami/hami -n kube-system`,它将自动修复。 \ No newline at end of file diff --git a/docs/zh/userguide/AWSNeuron-device/enable-awsneuron-managing.md b/docs/zh/userguide/AWSNeuron-device/enable-awsneuron-managing.md new file mode 100644 index 0000000..fcd4cd2 --- /dev/null +++ b/docs/zh/userguide/AWSNeuron-device/enable-awsneuron-managing.md @@ -0,0 +1,132 @@ +--- +title: 启用AWS-Neuron设备共享 +--- + +## 概述 + +AWS Neuron设备是AWS专为机器学习工作负载设计的硬件加速器,特别针对深度学习推理和训练场景进行了优化。这些设备属于AWS Inferentia和Trainium产品家族,可在AWS云上为AI应用提供高性能、高性价比且可扩展的解决方案。 + +HAMi现已集成[my-scheduler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/kubernetes-getting-started.html#deploy-neuron-scheduler-extension),提供以下核心功能: + +* **Neuron共享机制**:HAMi支持通过分配设备核心(aws.amazon.com/neuroncore)实现AWS Neuron设备共享,每个Neuron核心对应1/2个物理设备。 + +* **拓扑感知调度**:当容器需要分配多个aws-neuron设备时,HAMi将确保这些设备之间具有物理连接,从而最小化设备间通信开销。具体连接方式请参阅[不同实例类型的设备分配策略](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/kubernetes-getting-started.html#container-device-allocation-on-different-instance-types)。 + +## 前提条件 + +* 已部署Neuron-device-plugin +* 使用`Inf`或`Trn`类型的EC2实例 + +## 启用GCU共享支持 + +* 按照AWS官方文档在Neuron节点部署neuron-device-plugin:[Neuron设备插件部署指南](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/kubernetes-getting-started.html#neuron-device-plugin) + +* 部署HAMi核心组件 + +``` +helm install hami hami-charts/hami -n kube-system +``` + +## 设备分配粒度 + +HAMi将每个AWS Neuron设备划分为2个可分配单元,支持分配半个物理设备。 + +### Neuron资源分配规范 + +- 每个`aws.amazon.com/neuroncore`单元对应1/2个物理设备 +- 无需像其他设备那样显式分配`aws.amazon.com/neuron`,仅需分配`aws.amazon.com/neuroncore` +- 当`aws.amazon.com/neuroncore`≥2时,等效于设置`awa.amazon.com/neuron=1/2 * neuronCoreNumber` +- 当任务需要多个neuron设备时,拓扑感知调度将自动生效 + +## 运行Neuron任务 + +容器现在可以通过以下两种资源类型申请AWS Neuron设备: +`aws.amazon.com/neuron` 或 `aws.amazon.com/neuroncore` + +更多示例可参考examples目录 + 
+完整设备分配示例: +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: nuropod +spec: + restartPolicy: Never + containers: + - name: nuropod + command: ["sleep","infinity"] + image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04 + resources: + limits: + cpu: "4" + memory: 4Gi + aws.amazon.com/neuron: 1 + requests: + cpu: "1" + memory: 1Gi +``` + +分配半个Neuron设备示例: +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: nuropod +spec: + restartPolicy: Never + containers: + - name: nuropod + command: ["sleep","infinity"] + image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04 + resources: + limits: + cpu: "4" + memory: 4Gi + aws.amazon.com/neuroncore: 1 + requests: + cpu: "1" + memory: 1Gi +``` + +## 按设备UUID选择 + +可通过注解指定使用或排除特定GPU设备: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: poddemo + annotations: + # 指定使用的GPU设备(逗号分隔列表) + aws.amazon.com/use-gpuuuid: "node1-AWSNeuron-0,node1-AWSNeuron-1" + # 或排除特定GPU设备(逗号分隔列表) + aws.amazon.com/nouse-gpuuuid: "node1-AWSNeuron-2,node1-AWSNeuron-3" +spec: + # ... 其他pod配置 +``` + +> **注意:** 设备ID格式为`{节点名称}-AWSNeuron-{索引号}`。可通过节点注解查询可用设备ID。 + +### 查询设备UUID + +通过以下命令查询节点上的AWS Neuron设备UUID: + +```bash +kubectl get pod <pod名称> -o yaml | grep -A 10 "hami.io/<卡类型>-devices-allocated" +``` + +或通过节点注解查询: + +```bash +kubectl get node <节点名称> -o yaml | grep -A 10 "hami.io/node-register-<卡类型>" +``` + +在节点状态中查找包含设备信息的注解字段。 + +## 注意事项 + +1. AWS Neuron共享仅对申请单个设备(即`aws.amazon.com/neuroncore`=1)的容器生效。 + +2. 容器内执行`neuron-ls`显示的是设备总内存,这属于正常现象。实际运行任务时会对设备内存进行正确限制。 diff --git a/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-core.md b/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-core.md new file mode 100644 index 0000000..0ea745c --- /dev/null +++ b/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-core.md @@ -0,0 +1,26 @@ +--- +title: 分配AWS Neuron核心资源 +--- + +如需分配1/2个neuron设备,您可以通过分配neuroncore来实现,如下例所示: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: npod +spec: + restartPolicy: Never + containers: + - name: npod + command: ["sleep","infinity"] + image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04 + resources: + limits: + cpu: "4" + memory: 4Gi + aws.amazon.com/neuroncore: 1 + requests: + cpu: "1" + memory: 1Gi +``` diff --git a/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-device.md b/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-device.md new file mode 100644 index 0000000..4af17ca --- /dev/null +++ b/docs/zh/userguide/AWSNeuron-device/examples/allocate-neuron-device.md @@ -0,0 +1,26 @@ +--- +title: 分配AWS Neuron核心 +--- + +如需独占分配一个或多个aws neuron设备,可通过`aws.amazon.com/neuron`进行资源分配: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: npod +spec: + restartPolicy: Never + containers: + - name: npod + command: ["sleep","infinity"] + image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04 + resources: + limits: + cpu: "4" + memory: 4Gi + aws.amazon.com/neuron: 2 + requests: + cpu: "1" + memory: 1Gi +``` diff --git a/docs/zh/userguide/Ascend-device/device-template.md b/docs/zh/userguide/Ascend-device/device-template.md new file mode 100644 index 0000000..9261be9 --- /dev/null +++ b/docs/zh/userguide/Ascend-device/device-template.md @@ -0,0 +1,66 @@ +--- +title: Ascend 设备模板 +translated: true +--- + +```yaml +vnpus: +- chipName: 910B + commonWord: Ascend910A + resourceName: huawei.com/Ascend910A + resourceMemoryName: huawei.com/Ascend910A-memory 
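  # 补充注释(示意说明,字段语义以 HAMi 实际实现为准):
  # - memoryAllocatable / memoryCapacity:该芯片可分配显存与物理显存总量(单位 MiB)
  # - aiCore / aiCPU:该芯片的 AI Core 与 AI CPU 总数
  # - templates:可用的虚拟化切分模板;每个模板的 memory、aiCore(、aiCPU)为该切片占用的资源量,
  #   作业的显存请求会自动对齐到最接近的模板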
+ memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 30 + templates: + - name: vir02 + memory: 2184 + aiCore: 2 + - name: vir04 + memory: 4369 + aiCore: 4 + - name: vir08 + memory: 8738 + aiCore: 8 + - name: vir16 + memory: 17476 + aiCore: 16 +- chipName: 910B3 + commonWord: Ascend910B + resourceName: huawei.com/Ascend910B + resourceMemoryName: huawei.com/Ascend910B-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_16g + memory: 16384 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_32g + memory: 32768 + aiCore: 10 + aiCPU: 3 +- chipName: 310P3 + commonWord: Ascend310P + resourceName: huawei.com/Ascend310P + resourceMemoryName: huawei.com/Ascend310P-memory + memoryAllocatable: 21527 + memoryCapacity: 24576 + aiCore: 8 + aiCPU: 7 + templates: + - name: vir01 + memory: 3072 + aiCore: 1 + aiCPU: 1 + - name: vir02 + memory: 6144 + aiCore: 2 + aiCPU: 2 + - name: vir04 + memory: 12288 + aiCore: 4 + aiCPU: 4 +``` diff --git a/docs/zh/userguide/Ascend-device/enable-ascend-sharing.md b/docs/zh/userguide/Ascend-device/enable-ascend-sharing.md new file mode 100644 index 0000000..69248fc --- /dev/null +++ b/docs/zh/userguide/Ascend-device/enable-ascend-sharing.md @@ -0,0 +1,109 @@ +--- +title: 启用 Ascend 共享 +translated: true +--- + +基于虚拟化模板支持内存切片,自动使用可用的租赁模板。有关详细信息,请查看[设备模板](./device-template.md)。 + +## 先决条件 + +* Ascend 设备类型:910B, 910A, 310P +* 驱动版本 >= 24.1.rc1 +* Ascend docker 运行时 + +## 启用 Ascend-sharing 支持 + +* 由于与 HAMi 的依赖关系,您需要在 HAMi 安装期间设置以下参数: + + ``` + devices.ascend.enabled=true + ``` + + 有关更多详细信息,请参阅 values.yaml 中的 'devices' 部分: + + ```yaml + devices: + ascend: + enabled: true + image: "ascend-device-plugin:master" + imagePullPolicy: IfNotPresent + extraArgs: [] + nodeSelector: + ascend: "on" + tolerations: [] + resources: + - huawei.com/Ascend910A + - huawei.com/Ascend910A-memory + - huawei.com/Ascend910B + - huawei.com/Ascend910B-memory + - huawei.com/Ascend310P + - huawei.com/Ascend310P-memory + ``` + +* 使用以下命令标记 Ascend 节点: + + ```bash + kubectl label node {ascend-node} ascend=on + ``` + +* 安装 [Ascend docker 运行时](https://gitee.com/ascend/ascend-docker-runtime) + +* 从 HAMi 项目[下载 Ascend-vgpu-device-plugin 的 yaml](https://github.com/Project-HAMi/ascend-device-plugin/blob/master/build/ascendplugin-hami.yaml),并执行以下命令来部署: + + ```bash + wge https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/refs/heads/main/ascend-device-plugin.yaml + kubectl apply -f ascend-device-plugin.yaml + ``` + +## 运行 Ascend 作业 + +### Ascend 910B + +现在可以通过容器请求 Ascend 910B, +使用 `huawei.com/ascend910B` 和 `huawei.com/ascend910B-memory` 资源类型: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend910B: 1 # 请求 1 个 Ascend + huawei.com/Ascend910B-memory: 2000 # 请求 2000m 设备内存 +``` + +### Ascend 310P + +现在可以通过容器请求 Ascend 310P, +使用 `huawei.com/ascend310P` 和 `huawei.com/ascend310P-memory` 资源类型: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend310P: 1 # 请求 1 个 Ascend + huawei.com/Ascend310P-memory: 1024 # 请求 1024m 设备内存 +``` + +### 注意事项 + +1. 
目前,Ascend 910b 仅支持两种分片策略,分别是 1/4 和 1/2。Ascend 310p 支持 3 种分片策略:1/7、2/7、4/7。作业的内存请求将自动与最接近的分片策略对齐。在此示例中,任务将分配 16384M 设备内存。 + +2. 不支持在初始化容器中使用 Ascend-sharing。 + +3. `huawei.com/Ascend910B-memory` 仅在 `huawei.com/Ascend91B0=1` 时有效。 + `huawe.com/Ascend310P-memory` 仅在 `huawei.com/Ascend310P=1` 时有效。 diff --git a/docs/zh/userguide/Ascend-device/examples/allocate-310p.md b/docs/zh/userguide/Ascend-device/examples/allocate-310p.md new file mode 100644 index 0000000..730744d --- /dev/null +++ b/docs/zh/userguide/Ascend-device/examples/allocate-310p.md @@ -0,0 +1,28 @@ +--- +title: 为容器分配 Ascend-310p 切片 +translated: true +--- + +要分配一定大小的 GPU 设备内存,您只需在 `huawei.com/ascend310P` 之外分配 `huawei.com/ascend310P-memory`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: ascend310p-pod +spec: + tolerations: + - key: aaa + operator: Exists + effect: NoSchedule + containers: + - name: ubuntu-container + image: swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-pytorch:24.0.RC1-A2-1.11.0-ubuntu20.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend310P: 1 + huawei.com/Ascend310P-memory: 1024 +``` + +> **注意:** *Ascend910B 的计算资源也受到 `huawei.com/Ascend310P-memory` 的限制,等于分配的设备内存的百分比。* diff --git a/docs/zh/userguide/Ascend-device/examples/allocate-910b.md b/docs/zh/userguide/Ascend-device/examples/allocate-910b.md new file mode 100644 index 0000000..96afd32 --- /dev/null +++ b/docs/zh/userguide/Ascend-device/examples/allocate-910b.md @@ -0,0 +1,24 @@ +--- +title: 为容器分配 Ascend-910B 切片 +translated: true +--- + +要分配一定大小的 GPU 设备内存,您只需在 `huawei.com/ascend910` 之外分配 `huawei.com/ascend910-memory`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend910: 1 # 请求1个NPU + huawei.com/Ascend910-memory: 2000 # 请求2000m设备内存 +``` + +> **注意:** *Ascend910B 的计算资源也受 `huawei.com/Ascend910-memory` 限制,等于分配的设备内存的百分比。* diff --git a/docs/zh/userguide/Ascend-device/examples/allocate-exclusive.md b/docs/zh/userguide/Ascend-device/examples/allocate-exclusive.md new file mode 100644 index 0000000..dfb69bf --- /dev/null +++ b/docs/zh/userguide/Ascend-device/examples/allocate-exclusive.md @@ -0,0 +1,21 @@ +--- +title: 分配独占设备 +translated: true +--- + +要分配整个 Ascend 设备,您只需分配 `huawei.com/ascend910` 或 `huawei.com/310p`,无需其他字段。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend910B: 2 # 请求 2 个完整的 Ascend 910b 设备 +``` diff --git a/docs/zh/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md b/docs/zh/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md new file mode 100644 index 0000000..9a5cad0 --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/enable-cambricon-mlu-sharing.md @@ -0,0 +1,80 @@ +--- +title: 启用寒武纪 MLU 共享 +translated: true +--- + +## 简介 + +本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制分配的算力大小***: 你现在可以用百分比来分配MLU的算力,本组件会确保任务使用的算力不会超过分配数值 + +***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU + +## 节点需求 + +* neuware-mlu370-driver > 5.10 +* cntoolkit > 2.5.3 + +## 开启MLU复用 + 
+* 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/Project-HAMi/HAMi/blob/master/README_cn.md#kubernetes开启vgpu支持) + +* 使用以下指令,为MLU节点打上label +``` +kubectl label node {mlu-node} mlu=on +``` + +* 从您的设备提供商处获取cambricon-device-plugin,并配置以下两个参数: + +`mode=dynamic-smlu`, `min-dsmlu-unit=256` + +它们分别代表开启MLU复用功能,与设置最小可分配的内存单元为256M,您可以参考设备提供方的文档来获取更多的配置信息。 + +* 部署配置后的`cambricon-device-plugin` + +``` +kubectl apply -f cambricon-device-plugin-daemonset.yaml +``` + + +## 运行MLU任务 + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" + cambricon.com/mlu.smlu.vmemory: "20" + cambricon.com/mlu.smlu.vcore: "10" +``` + +## 注意事项 + +1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 + +2. 只有申请单MLU的任务可以指定显存`mlu.smlu.vmemory`和算力`mlu.smlu.vcore`的数值,若申请的MLU数量大于1,则所有申请的MLU都会被整卡分配 \ No newline at end of file diff --git a/docs/zh/userguide/Cambricon-device/examples/allocate-core-and-memory.md b/docs/zh/userguide/Cambricon-device/examples/allocate-core-and-memory.md new file mode 100644 index 0000000..61f45f9 --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/examples/allocate-core-and-memory.md @@ -0,0 +1,37 @@ +--- +title: 为容器分配设备核心和内存资源 +translated: true +--- + +## 为容器分配设备核心和内存 + +要分配设备核心资源的某一部分,您只需在容器中使用 `cambricon.com/vmlu` 指定所需的寒武纪 MLU 数量,并分配 `cambricon.com/mlu370.smlu.vmemory` 和 `cambricon.com/mlu370.smlu.vcore`。 + +``` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" + cambricon.com/mlu370.smlu.vmemory: "20" + cambricon.com/mlu370.smlu.vcore: "10" +``` diff --git a/docs/zh/userguide/Cambricon-device/examples/allocate-exclusive.md b/docs/zh/userguide/Cambricon-device/examples/allocate-exclusive.md new file mode 100644 index 0000000..b2fcd8b --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/examples/allocate-exclusive.md @@ -0,0 +1,35 @@ +--- +title: 分配独占设备 +translated: true +--- + +## 分配独占设备 + +要分配整个寒武纪设备,您只需分配 `cambricon.com/vmlu`,无需其他字段。 + +``` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" #分配整个 MLU +``` diff --git a/docs/zh/userguide/Cambricon-device/specify-device-core-usage.md b/docs/zh/userguide/Cambricon-device/specify-device-core-usage.md new file mode 100644 index 0000000..865097c --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/specify-device-core-usage.md @@ -0,0 +1,18 @@ +--- +title: 分配设备核心给容器 +translated: true +--- + +## 分配设备核心给容器 + +通过指定资源 `cambricon.com/mlu.smlu.vcore` 来分配设备核心资源的百分比。 +可选项,每个 `cambricon.com/mlu.smlu.vcore` 单位等于设备核心的 1%。 + +``` + resources: + limits: + cambricon.com/vmlu: 1 # 请求 1 个 MLU + cambricon.com/mlu.smlu.vcore: "10" # 每个 MLU 包含 10% 的设备核心 +``` + +> **注意:** *根据 cambricon-device-plugin 的参数,资源名称可以是 `cambricon.com/mlu370.smlu.vcore` 或其他类型* diff --git 
a/docs/zh/userguide/Cambricon-device/specify-device-memory-usage.md b/docs/zh/userguide/Cambricon-device/specify-device-memory-usage.md new file mode 100644 index 0000000..f2bf3be --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/specify-device-memory-usage.md @@ -0,0 +1,17 @@ +--- +title: 为容器分配设备内存 +translated: true +--- + +## 为容器分配设备内存 + +通过指定资源如 `cambricon.com/mlu.smlu.vmemory` 来分配设备内存的百分比大小。可选项,每个 `cambricon.com/mlu.smlu.vmemory` 单位等于设备内存的 1%。 + +``` + resources: + limits: + cambricon.com/vmlu: 1 # 请求 1 个 MLU + cambricon.com/mlu.smlu.vmemory: "20" # 每个 GPU 包含 20% 的设备内存 +``` + +> **注意:** *根据 cambricon-device-plugin 的参数,资源名称可以是 `cambricon.com/mlu370.smlu.vmemory` 或其他类型* diff --git a/docs/zh/userguide/Cambricon-device/specify-device-type-to-use.md b/docs/zh/userguide/Cambricon-device/specify-device-type-to-use.md new file mode 100644 index 0000000..32326a7 --- /dev/null +++ b/docs/zh/userguide/Cambricon-device/specify-device-type-to-use.md @@ -0,0 +1,16 @@ +--- +title: 分配到特定设备类型 +translated: true +--- + +## 分配到特定设备类型 + +您需要在 `cambricon-device-plugin` 中添加参数 `- --enable-device-type` 以支持设备类型规范。当设置此选项时,不同类型的 MLU 将生成不同的资源名称,例如 `cambricon.com/mlu370.smlu.vcore` 和 `cambricon.com/mlu370.smlu.vmemory`。 + +``` + resources: + limits: + cambricon.com/vmlu: 1 # 请求 1 个 MLU + cambricon.com/mlu370.smlu.vmemory: "20" # 每个 GPU 包含 20% 的设备内存 + cambricon.com/mlu370.smlu.vcore: "10" # 每个 GPU 包含 10% 的计算核心 +``` diff --git a/docs/zh/userguide/Device-supported.md b/docs/zh/userguide/Device-supported.md new file mode 100644 index 0000000..5cd8426 --- /dev/null +++ b/docs/zh/userguide/Device-supported.md @@ -0,0 +1,16 @@ +--- +title: 支持HAMi的设备 +translated: true +--- + +HAMi支持的设备视图如下表所示: + +| 生产商 | 制造商 | 类型 | 内存隔离 | 核心隔离 | 多卡支持 | +|-------------|------------|-------------|-----------|---------------|-------------------| +| GPU | NVIDIA | 全部 | ✅ | ✅ | ✅ | +| MLU | Cambricon | 370, 590 | ✅ | ✅ | ❌ | +| DCU | Hygon | Z100, Z100L | ✅ | ✅ | ❌ | +| Ascend | Huawei | 910B, 910B3, 310P | ✅ | ✅ | ❌ | +| GPU | iluvatar | 全部 | ✅ | ✅ | ❌ | +| GPU | Mthreads | MTT S4000 | ✅ | ✅ | ❌ | +| DPU | Teco | 检查中 | 进行中 | 进行中 | ❌ | \ No newline at end of file diff --git a/docs/zh/userguide/Enflame-device/enable-enflame-gcu-sharing.md b/docs/zh/userguide/Enflame-device/enable-enflame-gcu-sharing.md new file mode 100644 index 0000000..34a839e --- /dev/null +++ b/docs/zh/userguide/Enflame-device/enable-enflame-gcu-sharing.md @@ -0,0 +1,122 @@ +--- +title: 启用燧原 GPU 共享 +--- + + +## 简介 + +本组件支持复用燧原GCU设备(S60),并为此提供以下几种与vGPU类似的复用功能,包括: + +***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***百分比切片能力***: 你现在可以用百分比来申请一个GCU切片(例如20%),本组件会确保任务使用的显存和算力不会超过这个百分比对应的数值 + +***设备 UUID 选择***: 你可以通过注解指定使用或排除特定的GCU设备 + +***方便易用***: 部署本组件后,只需要部署厂家提供的gcushare-device-plugin即可使用 + + +## 节点需求 + +* Enflame gcushare-device-plugin >= 2.1.6 +* driver version >= 1.2.3.14 +* kubernetes >= 1.24 +* enflame-container-toolkit >=2.0.50 + +## 开启GCU复用 + +* 部署'gcushare-device-plugin',燧原的GCU共享需要配合厂家提供的'gcushare-device-plugin'一起使用,请联系设备提供方获取 + +> **注意:** *只需要安装gcushare-device-plugin,不要安装gcushare-scheduler-plugin.* + +* 在安装HAMi时配置参数'devices.enflame.enabled=true' + +``` +helm install hami hami-charts/hami --set devices.enflame.enabled=true -n kube-system +``` + +> **说明:** 默认资源名称如下: +> - `enflame.com/vgcu` 用于GCU数量,这里只能为1 +> - `enflame.com/vgcu-percentage` 用于生成共享GCU切片 +> +> 你可以通过修改`hami-scheduler-device`配置,来修改这些资源名称 + +## 设备粒度切分 + +HAMi 将每个燧原 GCU 划分为 100 个单元进行资源分配。当你请求一部分 GPU 时,实际上是在请求这些单元中的一定数量。 + +### 内存和核心分配 + +- 每个 `enflame.com/vgcu-percentage` 单位代表1%的算力和1%的显存 +- 如果不指定内存请求,系统将默认使用 
100% 的可用内存 +- 内存与核心的分配通过硬限制强制执行,确保任务不会超过其分配的内存与核心 + +## 运行GCU任务 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gcushare-pod-2 + namespace: kube-system +spec: + terminationGracePeriodSeconds: 0 + containers: + - name: pod-gcu-example1 + image: ubuntu:18.04 + imagePullPolicy: IfNotPresent + command: + - sleep + args: + - '100000' + resources: + limits: + enflame.com/vgcu: 1 + enflame.com/vgcu-percentage: 22 +``` +> **注意:** *查看更多的[用例](https://github.com/Project-HAMi/HAMi/tree/release-v2.6/examples/enflame/).* + +## 设备 UUID 选择 + +你可以通过 Pod 注解来指定要使用或排除特定的 GPU 设备: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: poddemo + annotations: + # Use specific GPU devices (comma-separated list) + enflame.com/use-gpuuuid: "node1-enflame-0,node1-enflame-1" + # Or exclude specific GPU devices (comma-separated list) + enflame.com/nouse-gpuuuid: "node1-enflame-2,node1-enflame-3" +spec: + # ... rest of pod spec +``` + +> **说明:** 设备 ID 格式为 `{节点名称}-enflame-{索引}`。你可以在节点状态中找到可用的设备 ID。 + +### 查找设备 UUID + +你可以使用以下命令查找节点上的燧原 GCU 设备 UUID: + +```bash +kubectl get pod <pod-name> -o yaml | grep -A 10 "hami.io/<card-type>-devices-allocated" +``` + +或者通过检查节点注解: + +```bash +kubectl get node <node-name> -o yaml | grep -A 10 "hami.io/node-register-<card-type>" +``` + +在节点注解中查找包含设备信息的注解。 + + +## 注意事项 + +1. 共享模式只对申请一张GPU的容器生效(enflame.com/vgcu=1)。 + +2. 目前暂时不支持一个容器中申请多个GCU设备。 + +3. 任务中使用`efsmi`可以看到全部的显存,但这并非异常,显存会在任务使用过程中被正确限制。 \ No newline at end of file diff --git a/docs/zh/userguide/Hygon-device/enable-hygon-dcu-sharing.md b/docs/zh/userguide/Hygon-device/enable-hygon-dcu-sharing.md new file mode 100644 index 0000000..e2c922d --- /dev/null +++ b/docs/zh/userguide/Hygon-device/enable-hygon-dcu-sharing.md @@ -0,0 +1,76 @@ +--- +title: 启用 Hygon DCU 共享 +translated: true +--- + +## 简介 + +本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 + +***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU + +## 节点需求 + +* dtk driver >= 24.04 +* hy-smi v1.6.0 + +## 开启DCU复用 + +* 部署[dcu-vgpu-device-plugin](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) + +## 运行DCU任务 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: alexnet-tf-gpu-pod-mem + labels: + purpose: demo-tf-amdgpu +spec: + containers: + - name: alexnet-tf-gpu-container + image: pytorch:resnet50 + workingDir: /root + command: ["sleep","infinity"] + resources: + limits: + hygon.com/dcunum: 1 # 请求一个 DCU + hygon.com/dcumem: 2000 # 每个 DCU 包含 2000M 设备内存 + hygon.com/dcucores: 60 # 每个 DCU 分配 15% 的设备核心 + +``` + +## 容器内开启虚拟DCU功能 + +使用vDCU首先需要激活虚拟环境 +``` +source /opt/hygondriver/env.sh +``` + +随后,使用hdmcli指令查看虚拟设备是否已经激活 +``` +hy-virtual -show-device-info +``` + +若输出如下,则代表虚拟设备已经成功激活 +``` +Device 0: + Actual Device: 0 + Compute units: 60 + Global memory: 2097152000 bytes +``` + +接下来正常启动DCU任务即可 + +## 注意事项 + +1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 + +2. 
每个容器最多只能使用一个虚拟DCU设备, 如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 \ No newline at end of file diff --git a/docs/zh/userguide/Hygon-device/examples/allocate-core-and-memory.md b/docs/zh/userguide/Hygon-device/examples/allocate-core-and-memory.md new file mode 100644 index 0000000..eedc59a --- /dev/null +++ b/docs/zh/userguide/Hygon-device/examples/allocate-core-and-memory.md @@ -0,0 +1,28 @@ +--- +title: 为容器分配设备核心和内存资源 +translated: true +--- + +## 为容器分配设备核心和内存 + +要分配设备核心资源的某一部分,您只需在容器中使用 `hygon.com/dcunum` 请求的海光 DCU 数量,并分配 `hygon.com/dcucores` 和 `hygon.com/dcumem`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: alexnet-tf-gpu-pod-mem + labels: + purpose: demo-tf-amdgpu +spec: + containers: + - name: alexnet-tf-gpu-container + image: image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310 + workingDir: /root + command: ["sleep","infinity"] + resources: + limits: + hygon.com/dcunum: 1 # 请求一个 DCU + hygon.com/dcumem: 2000 # 每个 DCU 需要 2000 MiB 设备内存 + hygon.com/dcucores: 15 # 每个 DCU 使用 15% 个设备核心 +``` diff --git a/docs/zh/userguide/Hygon-device/examples/allocate-exclusive.md b/docs/zh/userguide/Hygon-device/examples/allocate-exclusive.md new file mode 100644 index 0000000..f79ce31 --- /dev/null +++ b/docs/zh/userguide/Hygon-device/examples/allocate-exclusive.md @@ -0,0 +1,26 @@ +--- +title: 分配独占设备 +translated: true +--- + +## 分配独占设备 + +要分配整个海光 DCU 设备,您只需分配 `hygon.com/dcunum`,无需其他字段。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: alexnet-tf-gpu-pod-mem + labels: + purpose: demo-tf-amdgpu +spec: + containers: + - name: alexnet-tf-gpu-container + image: image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310 + workingDir: /root + command: ["sleep","infinity"] + resources: + limits: + hygon.com/dcunum: 1 # 请求一个 DCU +``` \ No newline at end of file diff --git a/docs/zh/userguide/Hygon-device/examples/specify-certain-cards.md b/docs/zh/userguide/Hygon-device/examples/specify-certain-cards.md new file mode 100644 index 0000000..afa0045 --- /dev/null +++ b/docs/zh/userguide/Hygon-device/examples/specify-certain-cards.md @@ -0,0 +1,25 @@ +--- +title: 将任务分配给特定的 DCU +translated: true +--- + +## 将任务分配给特定的 DCU + +要将任务分配给特定的 DCU,只需在注释字段中分配 `hygon.com/use-gpuuuid` + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + hygon.com/use-gpuuuid: "DCU-123,DCU-456" # 指定以逗号分隔的 DCU UUID +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + hygon.com/dcunum: 1 # 请求两个 DCU 卡 +``` \ No newline at end of file diff --git a/docs/zh/userguide/Hygon-device/specify-device-core-usage.md b/docs/zh/userguide/Hygon-device/specify-device-core-usage.md new file mode 100644 index 0000000..949b281 --- /dev/null +++ b/docs/zh/userguide/Hygon-device/specify-device-core-usage.md @@ -0,0 +1,16 @@ +--- +title: 分配设备核心给容器 +translated: true +--- + +## 分配设备核心给容器 + +通过指定资源 `hygon.com/dcucores` 来分配设备核心资源的百分比。 +可选项,每个 `hygon.com/dcucores` 单位等于设备核心的 1%。 + +```yaml + resources: + limits: + hygon.com/dcunum: 1 # 请求 1 个 DCU + hygon.com/dcucores: 15 # 每个 DCU 分配 15% 的设备核心 +``` diff --git a/docs/zh/userguide/Hygon-device/specify-device-memory-usage.md b/docs/zh/userguide/Hygon-device/specify-device-memory-usage.md new file mode 100644 index 0000000..60c8346 --- /dev/null +++ b/docs/zh/userguide/Hygon-device/specify-device-memory-usage.md @@ -0,0 +1,15 @@ +--- +title: 为容器分配设备内存 +translated: true +--- + +## 为容器分配设备内存 + +通过指定诸如 `hygon.com/dcumem` 
之类的资源来分配设备内存的百分比大小。可选项,每个 `hygon.com/dcumem` 单位等于 1M 设备内存。 + +```yaml + resources: + limits: + hygon.com/dcunum: 1 # 请求 1 个 DCU + hygon.com/dcumem: 2000 # 每个 DCU 包含 2000M 设备内存 +``` \ No newline at end of file diff --git a/docs/zh/userguide/Hygon-device/specify-device-uuid-to-use.md b/docs/zh/userguide/Hygon-device/specify-device-uuid-to-use.md new file mode 100644 index 0000000..1d4544a --- /dev/null +++ b/docs/zh/userguide/Hygon-device/specify-device-uuid-to-use.md @@ -0,0 +1,18 @@ +--- +title: 分配到特定设备 +translated: true +--- + +## 分配到特定设备类型 + +有时任务可能希望在某个特定的DCU上运行,可以在pod注释中填写`hygon.com/use-gpuuuid`字段。HAMi调度器将尝试匹配具有该UUID的设备。 + +例如,具有以下注释的任务将被分配到UUID为`DCU-123456`的设备上 + +```yaml +metadata: + annotations: + hygon.com/use-gpuuuid: "DCU-123456" +``` + +> **注意:** *每个DCU UUID在集群中是唯一的,因此分配某个UUID意味着将此任务分配到具有该DCU的特定节点上* \ No newline at end of file diff --git a/docs/zh/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md b/docs/zh/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md new file mode 100644 index 0000000..4fd0df6 --- /dev/null +++ b/docs/zh/userguide/Iluvatar-device/enable-illuvatar-gpu-sharing.md @@ -0,0 +1,153 @@ +--- +title: 启用天数智芯 GPU 共享 +--- + + +## 简介 + +本组件支持复用天数智芯GPU设备(MR-V100、BI-V150、BI-V100),并为此提供以下几种与vGPU类似的复用功能,包括: + +***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配GPU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制分配的算力核组比例***: 你现在可以用算力比例(例如60%)来分配GPU,本组件会确保任务使用的显存不会超过分配数值 + +***设备 UUID 选择***: 你可以通过注解指定使用或排除特定的 GPU 设备 + +***方便易用***: 部署本组件后,只需要部署厂家提供的gpu-manager即可使用 + + +## 节点需求 + +* Iluvatar gpu-manager (please consult your device provider) +* driver version > 3.1.0 + +## 开启GPU复用 + +* 部署'gpu-manager',天数智芯的GPU共享需要配合厂家提供的'gpu-manager'一起使用,请联系设备提供方获取 + +> **注意:** *只需要安装gpu-manager,不要安装gpu-admission.* + +* 部署'gpu-manager'之后,你需要确认显存和核组对应的资源名称(例如 'iluvatar.ai/vcuda-core', 'iluvatar.ai/vcuda-memory') + +* 在安装HAMi时配置'iluvatarResourceMem'和'iluvatarResourceCore'参数 + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set iluvatarResourceMem=iluvatar.ai/vcuda-memory --set iluvatarResourceCore=iluvatar.ai/vcuda-core -n kube-system +``` + +> **说明:** 默认资源名称如下: +> - `iluvatar.ai/vgpu` 用于 GPU 数量 +> - `iluvatar.ai/vcuda-memory` 用于内存分配 +> - `iluvatar.ai/vcuda-core` 用于核心分配 +> +> 你可以通过上述参数自定义这些名称。 + +## 设备粒度切分 + +HAMi 将每个天数智芯 GPU 划分为 100 个单元进行资源分配。当你请求一部分 GPU 时,实际上是在请求这些单元中的一定数量。 + +### 内存分配 + +- 每个 `iluvatar.ai/vcuda-memory` 单位代表 256MB 的设备内存 +- 如果不指定内存请求,系统将默认使用 100% 的可用内存 +- 内存分配通过硬限制强制执行,确保任务不会超过其分配的内存 + +### 核心分配 + +- 每个 `iluvatar.ai/vcuda-core` 单位代表 1% 的可用计算核心 +- 核心分配通过硬限制强制执行,确保任务不会超过其分配的核心 +- 当请求多个 GPU 时,系统会根据请求的 GPU 数量自动设置核心资源 + +## 运行GPU任务 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 +``` + +> **注意1:** *每一单位的vcuda-memory代表256M的显存.* + +> **注意2:** 
*查看更多的[用例](https://github.com/Project-HAMi/HAMi/tree/release-v2.6/examples/iluvatar/).* + +## 设备 UUID 选择 + +你可以通过 Pod 注解来指定要使用或排除特定的 GPU 设备: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: poddemo + annotations: + # 使用特定的 GPU 设备(逗号分隔的列表) + iluvatar.ai/use-gpuuuid: "node1-iluvatar-0,node1-iluvatar-1" + # 或者排除特定的 GPU 设备(逗号分隔的列表) + iluvatar.ai/nouse-gpuuuid: "node1-iluvatar-2,node1-iluvatar-3" +spec: + # ... 其余 Pod 配置 +``` + +> **说明:** 设备 ID 格式为 `{节点名称}-iluvatar-{索引}`。你可以在节点状态中找到可用的设备 ID。 + +### 查找设备 UUID + +你可以使用以下命令查找节点上的天数智芯 GPU 设备 UUID: + +```bash +kubectl get pod <pod-name> -o yaml | grep -A 10 "hami.io/<card-type>-devices-allocated" +``` + +或者通过检查节点注解: + +```bash +kubectl get node <node-name> -o yaml | grep -A 10 "hami.io/node-register-<card-type>" +``` + +在节点注解中查找包含设备信息的注解。 + + +## 注意事项 + +1. 你需要在容器中进行如下的设置才能正常的使用共享功能 +```sh + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc +``` + +2. 共享模式只对申请一张GPU的容器生效(iluvatar.ai/vgpu=1)。当请求多个 GPU 时,系统会根据请求的 GPU 数量自动设置核心资源。 + +3. `iluvatar.ai/vcuda-memory` 资源仅在 `iluvatar.ai/vgpu=1` 时有效。 + +4. 多设备请求(`iluvatar.ai/vgpu > 1`)不支持 vGPU 模式。 \ No newline at end of file diff --git a/docs/zh/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md b/docs/zh/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md new file mode 100644 index 0000000..0f93f88 --- /dev/null +++ b/docs/zh/userguide/Iluvatar-device/examples/allocate-device-core-and-memory-to-container.md @@ -0,0 +1,35 @@ +--- +title: Allocate device core and memory resource +--- + +## Allocate device core and memory to container + +To allocate a certain part of device core resource, you need only to assign the iluvatar.ai/vcuda-memory and iluvatar.ai/vcuda-core along with the number of cambricon MLUs you requested in the container using iluvatar.ai/vgpu + +``` +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 2 + limits: + iluvatar.ai/vgpu: 2 +``` \ No newline at end of file diff --git a/docs/zh/userguide/Iluvatar-device/examples/allocate-exclusive.md b/docs/zh/userguide/Iluvatar-device/examples/allocate-exclusive.md new file mode 100644 index 0000000..9797881 --- /dev/null +++ b/docs/zh/userguide/Iluvatar-device/examples/allocate-exclusive.md @@ -0,0 +1,35 @@ +--- +title: Allocate exclusive device +--- + +## Allocate exclusive device + +To allocate a whole cambricon device, you need to only assign `iluvatar.ai/vgpu` without other fields. You can allocate multiple GPUs for a container. 
+ +``` +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 2 + limits: + iluvatar.ai/vgpu: 2 +``` \ No newline at end of file diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md b/docs/zh/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md new file mode 100644 index 0000000..c546ae3 --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/enable-metax-gpu-schedule.md @@ -0,0 +1,65 @@ +--- +title: 启用沐曦 GPU 拓扑感知调度 +translated: true +--- + +**HAMi 现在通过在沐曦 GPU 之间实现拓扑感知来支持 metax.com/gpu**: + +当在单个服务器上配置多个 GPU 时,GPU 卡根据它们是否连接到同一个 PCIe 交换机或 MetaXLink 而存在远近关系。这在服务器上的所有卡之间形成了一个拓扑,如下图所示: + +![img](../../../resources/metax_topo.jpg) + +用户作业请求一定数量的 metax-tech.com/gpu 资源,Kubernetes 将 Pod 调度到适当的节点。gpu-device 进一步处理在资源节点上分配剩余资源的逻辑,遵循以下标准: + +1. MetaXLink 在两种情况下优先于 PCIe 交换机: + + - 当两个卡之间存在 MetaXLink 连接和 PCIe 交换机连接时,连接被视为 MetaXLink 连接。 + - 当 MetaXLink 和 PCIe 交换机都能满足作业请求时,优先使用 MetaXLink 互连资源。 + +2. 使用 `node-scheduler-policy=spread` 时,尽可能将 Metax 资源分配在同一个 Metaxlink 或 Paiswich 下,如下图所示: + + ![img](../../../resources/metax_spread.jpg) + +3. 使用 `node-scheduler-policy=binpack` 时,分配 GPU 资源,以尽量减少对 MetaxXLink 拓扑的破坏,如下图所示: + + ![img](../../../resources/metax_binpack.jpg) + +## 重要说明 + +1. 目前不支持设备共享。 + +2. 
这些功能已在 MXC500 上测试。 + +## 前提条件 + +* 沐曦 GPU 插件 >= 0.8.0 +* Kubernetes 版本 >= 1.23 + +## 启用拓扑感知调度 + +* 在 metax 节点上部署沐曦 GPU 插件(请咨询您的设备提供商以获取其软件包和文档) + +* 根据 README.md 部署 HAMi + +## 运行 Metax 作业 + +现在可以通过容器使用 `metax-tech.com/gpu` 资源类型请求沐曦 GPU: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 + annotations: hami.io/node-scheduler-policy: "spread" # 当此参数设置为 spread 时,调度器将尝试为此任务找到最佳拓扑。 +spec: + containers: + - name: ubuntu-container + image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/gpu: 1 # 请求 1 个 GPU +``` + +> **注意:** 您可以在 examples 文件夹中找到更多示例。 diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md b/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md new file mode 100644 index 0000000..314187e --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-binpack.md @@ -0,0 +1,24 @@ +--- +title: 使用 binpack 调度策略分配沐曦设备 +translated: true +--- + +要在最小化拓扑损失的情况下分配沐曦设备,您只需将 `metax-tech.com/gpu` 与注释 `hami.io/node-scheduler-policy: "binpack"` 一起分配。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 + annotations: + hami.io/node-scheduler-policy: "binpack" # 当此参数设置为 binpack 时,调度器将尝试最小化拓扑损失。 +spec: + containers: + - name: ubuntu-container + image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/gpu: 1 # 请求 1 个沐曦 GPU +``` diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md b/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md new file mode 100644 index 0000000..08dba4c --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/examples/allocate-spread.md @@ -0,0 +1,24 @@ +--- +title: 使用扩展调度策略分配沐曦设备 +translated: true +--- + +为了以最佳性能分配沐曦设备,您只需将 `metax-tech.com/gpu` 与注释 `hami.io/node-scheduler-policy: "spread"` 一起分配。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 + annotations: + hami.io/node-scheduler-policy: "spread" # 当此参数设置为 spread 时,调度器将尝试为此任务找到最佳拓扑。 +spec: + containers: + - name: ubuntu-container + image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/gpu: 4 # 请求 4 个沐曦 GPU +``` diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/examples/default-use.md b/docs/zh/userguide/Metax-device/Metax-GPU/examples/default-use.md new file mode 100644 index 0000000..e809e73 --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/examples/default-use.md @@ -0,0 +1,22 @@ +--- +title: 分配沐曦设备 +translated: true +--- + +要分配沐曦设备,您只需分配 `metax-tech.com/gpu`,无需其他字段。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container + image: cr.metax-tech.com/public-ai-release/c500/colossalai:2.24.0.5-py38-ubuntu20.04-amd64 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/gpu: 1 # 请求 1 个沐曦 GPU +``` diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/specify-binpack-task.md b/docs/zh/userguide/Metax-device/Metax-GPU/specify-binpack-task.md new file mode 100644 index 0000000..fdc3124 --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/specify-binpack-task.md @@ -0,0 +1,12 @@ +--- +title: Binpack 调度策略 +translated: true +--- + 
+为了在最小化拓扑损失的情况下分配 沐曦设备,您只需将 `metax-tech.com/gpu` 与注释 `hami.io/node-scheduler-policy: "binpack"` 一起分配。 + +```yaml +metadata: + annotations: + hami.io/node-scheduler-policy: "binpack" # 当此参数设置为 binpack 时,调度器将尝试最小化拓扑损失。 +``` diff --git a/docs/zh/userguide/Metax-device/Metax-GPU/specify-spread-task.md b/docs/zh/userguide/Metax-device/Metax-GPU/specify-spread-task.md new file mode 100644 index 0000000..ca9489c --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-GPU/specify-spread-task.md @@ -0,0 +1,12 @@ +--- +title: 扩展调度策略 +translated: true +--- + +为了分配性能最佳的 沐曦设备,您只需将 `metax-tech.com/gpu` 与注释 `hami.io/node-scheduler-policy: "spread"` 一起分配 + +```yaml +metadata: + annotations: + hami.io/node-scheduler-policy: "spread" # 当此参数设置为 spread 时,调度器将尝试为此任务找到最佳拓扑。 +``` diff --git a/docs/zh/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md b/docs/zh/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md new file mode 100644 index 0000000..575b9d2 --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-sGPU/enable-metax-gpu-sharing.md @@ -0,0 +1,47 @@ +--- +title: 启用沐曦 GPU 共享 +translated: true +--- + +**HAMi 目前支持复用沐曦 GPU 设备,提供与 vGPU 类似的复用功能**,包括: + +- **GPU 共享**: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +- **可限制分配的显存大小**: 你现在可以用显存值(例如 4G)来分配 GPU,本组件会确保任务使用的显存不会超过分配数值 + +- **可限制计算单元数量**: 你现在可以指定任务使用的算力比例(例如 60 即代表使用 60% 算力)来分配 GPU,本组件会确保任务使用的算力不会超过分配数值 + +### 需求 + +* Metax Driver >= 2.32.0 +* Metax GPU Operator >= 0.10.2 +* Kubernetes >= 1.23 + +### 开启复用沐曦设备 + +* 部署 Metax GPU Operator (请联系您的设备提供方获取) +* 根据 readme.md 部署 HAMi + +### 运行沐曦任务 + +一个典型的沐曦任务如下所示: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container + image: ubuntu:22.04 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/sgpu: 1 # 请求 1 个 GPU + metax-tech.com/vcore: 60 # 每个 GPU 使用 60% 的计算核 + metax-tech.com/vmemory: 4 # 每个 GPU 需要 4 GiB 设备显存 +``` + +> **注意:** 您可以在 [examples 文件夹](https://github.com/Project-HAMi/HAMi/tree/release-v2.6/examples/metax/sgpu)中找到更多示例。 diff --git a/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md new file mode 100644 index 0000000..3329a8e --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-exclusive.md @@ -0,0 +1,22 @@ +--- +title: 分配独占设备 +translated: true +--- + +要分配整个沐曦 GPU 设备,您只需为容器申请 `metax-tech.com/sgpu` 资源。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:22.04 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/sgpu: 1 # 请求 1 个独占 GPU +``` diff --git a/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md new file mode 100644 index 0000000..2386beb --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/allocate-qos-policy.md @@ -0,0 +1,32 @@ +--- +title: 分配特定 Qos Policy 的设备 +translated: true +--- + +用户可以通过 `metax-tech.com/sgpu-qos-policy` 为任务配置 Qos Policy 参数以指定 sGPU 使用的调度策略。具体的 sGPU 调度策略说明参见下表。 + +| 调度策略 | 描述 | +| --- | --- | +| `best-effort` | sGPU 不限制算力 | +| `fixed-share` | sGPU 有固定的算力配额,且无法超过固定配额使用 | +| `burst-share` | sGPU 有固定的算力配额,若 GPU 卡还有空闲算力,就可以被 sGPU 使用 | + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + metax-tech.com/sgpu-qos-policy: "best-effort" # 分配特定的 qos sgpu +spec: + containers: + - 
name: ubuntu-container + image: ubuntu:22.04 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/sgpu: 1 # 请求 1 个 GPU + metax-tech.com/vcore: 60 # 每个 GPU 使用 60% 的计算核 + metax-tech.com/vmemory: 4 # 每个 GPU 需要 4 GiB 设备显存 +``` diff --git a/docs/zh/userguide/Metax-device/Metax-sGPU/examples/default-use.md b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/default-use.md new file mode 100644 index 0000000..2e07c8c --- /dev/null +++ b/docs/zh/userguide/Metax-device/Metax-sGPU/examples/default-use.md @@ -0,0 +1,26 @@ +--- +title: 为容器分配设备核心和内存资源 +translated: true +--- + +要分配设备核心资源的一部分,您只需在容器中使用 `metax-tech.com/sgpu` 申请沐曦 GPU 数量的同时,申请 `metax-tech.com/vcore` 和 `metax-tech.com/vmemory`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container + image: ubuntu:22.04 + imagePullPolicy: IfNotPresent + command: ["sleep","infinity"] + resources: + limits: + metax-tech.com/sgpu: 1 # 请求 1 个 GPU + metax-tech.com/vcore: 60 # 每个 GPU 使用 60% 的计算核 + metax-tech.com/vmemory: 4 # 每个 GPU 需要 4 GiB 设备显存 +``` + +> **注意:** 当未申请 `metax-tech.com/vcore` 或 `metax-tech.com/vmemory` 资源时,则表示对应资源配额已满。 diff --git a/docs/zh/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md b/docs/zh/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md new file mode 100644 index 0000000..b6034d4 --- /dev/null +++ b/docs/zh/userguide/Mthreads-device/enable-mthreads-gpu-sharing.md @@ -0,0 +1,69 @@ +--- +title: 启用 Mthreads GPU 共享 +translated: true +--- + +## 简介 + +本组件支持复用摩尔线程GPU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值、 + +***可限制分配的算力核组比例***: 你现在可以用算力核组数量(例如8个)来分配GPU,本组件会确保任务使用的显存不会超过分配数值 + +## 注意事项 + +1. 暂时不支持多卡切片,多卡任务只能分配整卡 + +2. 一个pod只能使用一个GPU生成的切片,即使该pod中有多个容器 + +3. 支持独占模式,只指定`mthreads.com/vgpu`即为独占申请 + +4. 
本特性目前只支持MTT S4000设备 + +## 节点需求 + +* [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/) +* 驱动版本 >= 1.2.0 + +## 开启GPU复用 + +* 部署'gpu-manager',天数智芯的GPU共享需要配合厂家提供的'MT-CloudNative Toolkit'一起使用,请联系设备提供方获取 + +> **注意:** *(可选),部署完之后,卸载掉mt-mutating-webhook与mt-scheduler组件,因为这部分功能将由HAMi调度器提供* + +* 在安装HAMi时配置'devices.mthreads.enabled = true'参数 + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set device.mthreads.enabled=true -n kube-system +``` + +## 运行GPU任务 + +通过指定`mthreads.com/vgpu`, `mthreads.com/sgpu-memory` and `mthreads.com/sgpu-core`这3个参数,可以确定容器申请的切片个数,对应的显存和算力核组 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpushare-pod-default +spec: + restartPolicy: OnFailure + containers: + - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc + imagePullPolicy: IfNotPresent + name: gpushare-pod-1 + command: ["sleep"] + args: ["100000"] + resources: + limits: + mthreads.com/vgpu: 1 + mthreads.com/sgpu-memory: 32 + mthreads.com/sgpu-core: 8 +``` + +> **注意1:** *每一单位的sgpu-memory代表512M的显存.* + +> **注意2:** *查看更多的[用例](https://github.com/Project-HAMi/HAMi/tree/release-v2.6/examples/mthreads/).* \ No newline at end of file diff --git a/docs/zh/userguide/Mthreads-device/examples/allocate-core-and-memory.md b/docs/zh/userguide/Mthreads-device/examples/allocate-core-and-memory.md new file mode 100644 index 0000000..62ca500 --- /dev/null +++ b/docs/zh/userguide/Mthreads-device/examples/allocate-core-and-memory.md @@ -0,0 +1,28 @@ +--- +title: 为容器分配设备核心和内存资源 +translated: true +--- + +## 为容器分配设备核心和内存 + +要分配设备核心资源的一部分,您只需在容器中使用 `mthreads.com/vgpu` 请求的寒武纪 MLU 数量的同时,分配 `mthreads.com/sgpu-memory` 和 `mthreads.com/sgpu-core`。 + +``` +apiVersion: v1 +kind: Pod +metadata: + name: gpushare-pod-default +spec: + restartPolicy: OnFailure + containers: + - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc + imagePullPolicy: IfNotPresent + name: gpushare-pod-1 + command: ["sleep"] + args: ["100000"] + resources: + limits: + mthreads.com/vgpu: 1 + mthreads.com/sgpu-memory: 32 + mthreads.com/sgpu-core: 8 +``` diff --git a/docs/zh/userguide/Mthreads-device/examples/allocate-exclusive.md b/docs/zh/userguide/Mthreads-device/examples/allocate-exclusive.md new file mode 100644 index 0000000..94d15cd --- /dev/null +++ b/docs/zh/userguide/Mthreads-device/examples/allocate-exclusive.md @@ -0,0 +1,26 @@ +--- +title: 分配独占设备 +translated: true +--- + +## 分配独占设备 + +要分配整个寒武纪设备,您只需分配 `mthreads.com/vgpu` 而无需其他字段。您可以为一个容器分配多个 GPU。 + +``` +apiVersion: v1 +kind: Pod +metadata: + name: gpushare-pod-multi-cards +spec: + restartPolicy: OnFailure + containers: + - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc + imagePullPolicy: IfNotPresent + name: gpushare-pod-1 + command: ["sleep"] + args: ["100000"] + resources: + limits: + mthreads.com/vgpu: 2 +``` diff --git a/docs/zh/userguide/Mthreads-device/specify-device-core-usage.md b/docs/zh/userguide/Mthreads-device/specify-device-core-usage.md new file mode 100644 index 0000000..3b8111d --- /dev/null +++ b/docs/zh/userguide/Mthreads-device/specify-device-core-usage.md @@ -0,0 +1,15 @@ +--- +title: 分配设备核心给容器 +translated: true +--- + +## 为容器分配设备核心 + +通过指定资源 `mthreads.com/sgpu-core` 来分配部分设备核心资源。可选项,每个 `mthreads.com/smlu-core` 单位等于 1/16 的设备核心。 + +``` + resources: + limits: + mthreads.com/vgpu: 1 # 请求 1 个 GPU + mthreads.com/sgpu-core: "8" # 每个 GPU 包含 50% 的设备核心 +``` diff --git a/docs/zh/userguide/Mthreads-device/specify-device-memory-usage.md 
b/docs/zh/userguide/Mthreads-device/specify-device-memory-usage.md new file mode 100644 index 0000000..48ddc75 --- /dev/null +++ b/docs/zh/userguide/Mthreads-device/specify-device-memory-usage.md @@ -0,0 +1,15 @@ +--- +title: 为容器分配设备内存 +translated: true +--- + +## 为容器分配设备内存 + +通过指定诸如 `mthreads.com/sgpu-memory` 之类的资源来分配设备内存的百分比大小。可选项,每个 `mthreads.com/sgpu-memory` 单位等于 512M 的设备内存。 + +``` + resources: + limits: + mthreads.com/vgpu: 1 # 请求 1 个 MLU + mthreads.com/sgpu-memory: 32 # 每个 GPU 包含 16G 设备内存 +``` diff --git a/docs/zh/userguide/NVIDIA-device/dynamic-mig-support.md b/docs/zh/userguide/NVIDIA-device/dynamic-mig-support.md new file mode 100644 index 0000000..d1dbe5c --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/dynamic-mig-support.md @@ -0,0 +1,181 @@ +--- +title: 启用动态 MIG 功能 +translated: true +--- + +## 介绍 + +**我们现在支持通过使用 mig-parted 动态调整 mig-devices 来支持 dynamic-mig**,包括: + +***动态 MIG 实例管理***:用户无需在 GPU 节点上操作,使用 'nvidia-smi -i 0 -mig 1' 或其他命令来管理 MIG 实例,所有操作将由 HAMi-device-plugin 完成。 + +***动态 MIG 调整***:HAMi 管理的每个 MIG 设备将根据提交的任务在必要时动态调整其 MIG 模板。 + +***设备 MIG 观察***:HAMi 生成的每个 MIG 实例将在调度器监视器中显示,包括任务信息。用户可以清晰地查看 MIG 节点的概况。 + +***兼容 HAMi-core 节点***:HAMi 可以管理 `HAMi-core 节点` 和 `mig 节点` 的统一 GPU 池。如果没有通过 `nvidia.com/vgpu-mode` 注释手动指定,任务可以被调度到任一节点。 + +***与 HAMi-core 统一的 API***:无需进行任何工作即可使作业与 dynamic-mig 功能兼容。 + +## 前提条件 + +* NVIDIA Blackwell 和 Hopper™ 及 Ampere 设备 +* HAMi > v2.5.0 +* Nvidia-container-toolkit + +## 启用 Dynamic-mig 支持 + +* 使用 helm 安装 chart,参见[此处](https://github.com/Project-HAMi/HAMi#enabling-vgpu-support-in-kubernetes)的“在 Kubernetes 中启用 vGPU 支持”部分 + +* 在 device-plugin configMap 中将 `mode` 配置为 `mig` 以支持 MIG 节点 +``` +kubectl describe cm hami-device-plugin -n kube-system +``` + +```json +{ + "nodeconfig": [ + { + "name": "MIG-NODE-A", + "operatingmode": "mig", + "filterdevices": { + "uuid": [], + "index": [] + } + } + ] +} +``` + +* 重启以下 pod 以使更改生效: + * hami-scheduler + * 'MIG-NODE-A' 上的 hami-device-plugin + +## 自定义 mig 配置(可选) +HAMi 目前有一个 [内置的 mig 配置](https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/templates/scheduler/device-configmap.yaml) 用于 MIG。 + +您可以按照以下步骤自定义 mig 配置: + + ### 更改 charts/hami/templates/scheduler 中 'device-configmap.yaml' 的内容,如下所示 + + ```yaml + nvidia: + resourceCountName: {{ .Values.resourceName }} + resourceMemoryName: {{ .Values.resourceMem }} + resourceMemoryPercentageName: {{ .Values.resourceMemPercentage }} + resourceCoreName: {{ .Values.resourceCores }} + resourcePriorityName: {{ .Values.resourcePriority }} + overwriteEnv: false + defaultMemory: 0 + defaultCores: 0 + defaultGPUNum: 1 + deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }} + deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }} + deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }} + knownMigGeometries: + - models: [ "A30" ] + allowedGeometries: + - + - name: 1g.6gb + memory: 6144 + count: 4 + - + - name: 2g.12gb + memory: 12288 + count: 2 + - + - name: 4g.24gb + memory: 24576 + count: 1 + - models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB", "A100-SXM4-40GB" ] + allowedGeometries: + - + - name: 1g.5gb + memory: 5120 + count: 7 + - + - name: 2g.10gb + memory: 10240 + count: 3 + - name: 1g.5gb + memory: 5120 + count: 1 + - + - name: 3g.20gb + memory: 20480 + count: 2 + - + - name: 7g.40gb + memory: 40960 + count: 1 + - models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"] + allowedGeometries: + - + - name: 1g.10gb + memory: 10240 + count: 7 + - + - name: 2g.20gb + memory: 20480 + count: 3 + - name: 1g.10gb + memory: 10240 + count: 1 
+ - + - name: 3g.40gb + memory: 40960 + count: 2 + - + - name: 7g.79gb + memory: 80896 + count: 1 + ``` + + > **注意** Helm 安装和更新将基于此文件中的配置,覆盖 Helm 的内置配置 + + > **注意** 请注意 HAMi 将按照此 configMap 的顺序找到并使用适合任务的第一个 MIG 模板 + +## 运行 MIG 作业 + +MIG 实例现在可以通过容器请求,方式与使用 `hami-core` 相同,只需指定 `nvidia.com/gpu` 和 `nvidia.com/gpumem` 资源类型。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + nvidia.com/vgpu-mode: "mig" #(可选),如果未设置,此 pod 可以被分配到 MIG 实例或 hami-core 实例 +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 + nvidia.com/gpumem: 8000 +``` + +在上面的示例中,任务分配了两个 mig 实例,每个实例至少具有 8G 设备内存。 + +## 监控 MIG 实例 + +由 HAMi 管理的 MIG 实例将在调度器监视器中显示(调度器节点 ip:31993/metrics),如下所示: + +```bash +# HELP nodeGPUMigInstance GPU 共享模式。0 表示 hami-core,1 表示 mig,2 表示 mps +# TYPE nodeGPUMigInstance gauge +nodeGPUMigInstance{deviceidx="0",deviceuuid="GPU-936619fc-f6a1-74a8-0bc6-ecf6b3269313",migname="3g.20gb-0",nodeid="aio-node15",zone="vGPU"} 1 +nodeGPUMigInstance{deviceidx="0",deviceuuid="GPU-936619fc-f6a1-74a8-0bc6-ecf6b3269313",migname="3g.20gb-1",nodeid="aio-node15",zone="vGPU"} 0 +nodeGPUMigInstance{deviceidx="1",deviceuuid="GPU-30f90f49-43ab-0a78-bf5c-93ed41ef2da2",migname="3g.20gb-0",nodeid="aio-node15",zone="vGPU"} 1 +nodeGPUMigInstance{deviceidx="1",deviceuuid="GPU-30f90f49-43ab-0a78-bf5c-93ed41ef2da2",migname="3g.20gb-1",nodeid="aio-node15",zone="vGPU"} 1 +``` + +## 注意事项 + +1. 您无需在 MIG 节点上执行任何操作,所有操作均由 hami-device-plugin 中的 mig-parted 管理。 + +2. Ampere 架构之前的 Nvidia 设备无法使用 'mig' 模式 + +3. 您不会在节点上看到任何 mig 资源(即 `nvidia.com/mig-1g.10gb`),hami 对 'mig' 和 'hami-core' 节点使用统一的资源名称。 \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/examples/allocate-device-core.md b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-core.md new file mode 100644 index 0000000..f8814a6 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-core.md @@ -0,0 +1,26 @@ +--- +title: 为容器分配设备核心资源 +translated: true +--- + +## 将设备核心分配给容器 + +要分配设备核心资源的某一部分,您只需分配 `nvidia.com/gpucores`,无需其他资源字段。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求2个vGPU + nvidia.com/gpucores: 50 # 请求每个vGPU核心资源的50% +``` + +> **注意:** *HAMi 使用时间片实现 `nvidia.com/gpucores`,因此,当通过 nvidia-smi 命令查询核心利用率时,会有波动* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory.md b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory.md new file mode 100644 index 0000000..8055634 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory.md @@ -0,0 +1,26 @@ +--- +title: 为容器分配特定设备内存 +translated: true +--- + +## 为容器分配特定设备内存 + +要分配特定大小的 GPU 设备内存,您只需在 `nvidia.com/gpu` 之外分配 `nvidia.com/gpumem`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU + nvidia.com/gpumem: 3000 # 每个 vGPU 请求 3G 设备内存 +``` + +> **注意:** *`nvidia.com/gpumem` 不能与 `nvidia.com/gpumem-percentage` 一起使用* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory2.md b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory2.md new file mode 100644 index 0000000..f12aa67 --- /dev/null +++ 
b/docs/zh/userguide/NVIDIA-device/examples/allocate-device-memory2.md @@ -0,0 +1,26 @@ +--- +title: 按百分比分配设备内存给容器 +translated: true +--- + +## 按百分比分配设备内存给容器 + +要按百分比分配一定大小的 GPU 设备内存,您只需在 `nvidia.com/gpu` 之外分配 `nvidia.com/gpumem-percentage`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU + nvidia.com/gpumem-percentage: 50 # 每个 vGPU 请求 50% 的设备内存 +``` + +> **注意:** *`nvidia.com/gpumem` 不能与 `nvidia.com/gpumem-percentage` 一起使用* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/examples/dynamic-mig-example.md b/docs/zh/userguide/NVIDIA-device/examples/dynamic-mig-example.md new file mode 100644 index 0000000..2db4add --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/dynamic-mig-example.md @@ -0,0 +1,25 @@ +--- +title: 将任务分配给 mig 实例 +translated: true +--- + +## 此示例将为 A100-40GB-PCIE 设备分配 2g.10gb * 2 或为 A100-80GB-XSM 设备分配 1g.10gb * 2。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + nvidia.com/vgpu-mode: "mig" + hami.io/gpu-scheduler-policy: "binpack" #(可选) +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 + nvidia.com/gpumem: 8000 +``` diff --git a/docs/zh/userguide/NVIDIA-device/examples/specify-card-type-to-use.md b/docs/zh/userguide/NVIDIA-device/examples/specify-card-type-to-use.md new file mode 100644 index 0000000..b8e7745 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/specify-card-type-to-use.md @@ -0,0 +1,28 @@ +--- +title: 分配任务到特定类型 +translated: true +--- + +## 分配任务到特定类型 + +要将任务分配到特定的 GPU 类型,只需在注释字段中分配 `nvidia.com/use-gputype`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + nvidia.com/use-gputype: "A100,V100" + #在此示例中,我们希望在 A100 或 V100 上运行此作业 +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU +``` + +> **注意:** *您可以将此任务分配给多种 GPU 类型,使用逗号分隔。在此示例中,我们希望在 A100 或 V100 上运行此作业* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/examples/specify-certain-card.md b/docs/zh/userguide/NVIDIA-device/examples/specify-certain-card.md new file mode 100644 index 0000000..841c6d0 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/specify-certain-card.md @@ -0,0 +1,25 @@ +--- +title: 将任务分配给特定的 GPU +translated: true +--- + +## 将任务分配给特定的 GPU + +要将任务分配给特定的 GPU,只需在注释字段中分配 `nvidia.com/use-gpuuuid`。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + annotations: + nvidia.com/use-gpuuuid: "GPU-123456" +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU +``` diff --git a/docs/zh/userguide/NVIDIA-device/examples/use-exclusive-card.md b/docs/zh/userguide/NVIDIA-device/examples/use-exclusive-card.md new file mode 100644 index 0000000..dd0d112 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/examples/use-exclusive-card.md @@ -0,0 +1,23 @@ +--- +title: 使用独占 GPU +translated: true +--- + +## 将设备核心分配给容器 + +要以独占模式使用 GPU,这是 nvidia-k8s-device-plugin 的默认行为,您只需分配 `nvidia.com/gpu` 而无需其他资源字段。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", 
"sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求 2 个 vGPU +``` diff --git a/docs/zh/userguide/NVIDIA-device/specify-device-core-usage.md b/docs/zh/userguide/NVIDIA-device/specify-device-core-usage.md new file mode 100644 index 0000000..bb3615d --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/specify-device-core-usage.md @@ -0,0 +1,17 @@ +--- +title: 分配设备核心给容器 +translated: true +--- + +## 分配设备核心给容器 + +通过指定资源 `nvidia.com/gpucores` 来分配设备核心资源的百分比。可选项,每个单位的 `nvidia.com/gpucores` 等于设备核心的 1%。 + +```yaml + resources: + limits: + nvidia.com/gpu: 1 # 请求 1 个 GPU + nvidia.com/gpucores: 50 # 每个 GPU 分配 50% 的设备核心。 +``` + +> **注意:** *HAMi-core 使用时间片来限制设备核心的使用。因此,通过 nvidia-smi 查看 GPU 利用率时会有波动* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/specify-device-memory-usage.md b/docs/zh/userguide/NVIDIA-device/specify-device-memory-usage.md new file mode 100644 index 0000000..9319c80 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/specify-device-memory-usage.md @@ -0,0 +1,26 @@ +--- +title: 为容器分配设备内存 +translated: true +--- + +## 为容器分配设备内存 + +通过指定资源如 `nvidia.com/gpumem` 来分配一定大小的设备内存。可选项,每个 `nvidia.com/gpumem` 单位等于 1M。 + +```yaml + resources: + limits: + nvidia.com/gpu: 1 # 请求 1 个 GPU + nvidia.com/gpumem: 3000 # 每个 GPU 包含 3000m 设备内存 +``` + +通过指定资源 `nvidia.com/gpumem-percentage` 来分配设备内存的百分比。可选项,每个 `nvidia.com/gpumem-percentage` 单位等于设备内存的 1% 百分比。 + +```yaml + resources: + limits: + nvidia.com/gpu: 1 # 请求 1 个 GPU + nvidia.com/gpumem-percentage: 50 # 每个 GPU 包含 50% 设备内存 +``` + +> **注意:** *`nvidia.com/gpumem` 和 `nvidia.com/gpumem-percentage` 不能同时分配* \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/specify-device-type-to-use.md b/docs/zh/userguide/NVIDIA-device/specify-device-type-to-use.md new file mode 100644 index 0000000..8f7762f --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/specify-device-type-to-use.md @@ -0,0 +1,23 @@ +--- +title: 分配到特定设备类型 +translated: true +--- + +## 分配到特定设备类型 + +有时任务可能希望在某种类型的 GPU 上运行,可以在 pod 注释中填写 `nvidia.com/use-gputype` 字段。HAMi 调度器将检查 `nvidia-smi -L` 返回的设备类型是否包含注释的内容。 + +例如,具有以下注释的任务将被分配到 A100 或 V100 GPU + +```yaml +metadata: + annotations: + nvidia.com/use-gputype: "A100,V100" # 为此作业指定卡类型,使用逗号分隔,不会在未指定的卡上启动作业 +``` + +任务可以使用 `nvidia.com/nouse-gputype` 来避开某种类型的 GPU。在以下示例中,该作业不会被分配到 1080(包括 1080Ti)或 2080(包括 2080Ti)类型的卡。 + +```yaml +metadata: + annotations: + nvidia.com/nouse-gputype: "1080,2080" # 为此作业指定黑名单卡类型,使用逗号分隔,不会在指定的卡上启动作业 \ No newline at end of file diff --git a/docs/zh/userguide/NVIDIA-device/specify-device-uuid-to-use.md b/docs/zh/userguide/NVIDIA-device/specify-device-uuid-to-use.md new file mode 100644 index 0000000..876e680 --- /dev/null +++ b/docs/zh/userguide/NVIDIA-device/specify-device-uuid-to-use.md @@ -0,0 +1,18 @@ +--- +title: 分配到特定设备 +translated: true +--- + +## 分配到特定设备类型 + +有时任务可能希望在某个特定的GPU上运行,可以在pod注释中填写`nvidia.com/use-gpuuuid`字段。HAMi调度器将尝试匹配具有该UUID的设备。 + +例如,具有以下注释的任务将被分配到UUID为`GPU-123456`的设备上 + +```yaml +metadata: + annotations: + nvidia.com/use-gpuuuid: "GPU-123456" +``` + +> **注意:** *每个GPU UUID在集群中是唯一的,因此分配某个UUID意味着将此任务分配到具有该GPU的特定节点上* \ No newline at end of file diff --git a/docs/zh/userguide/configure.md b/docs/zh/userguide/configure.md new file mode 100644 index 0000000..7abcdcb --- /dev/null +++ b/docs/zh/userguide/configure.md @@ -0,0 +1,72 @@ +--- +title: 配置 +translated: true +--- + +# 全局配置 + +## 设备配置:ConfigMap + +:::note +以下列出的所有配置都在 hami-scheduler-device ConfigMap 中管理。 +::: + +您可以通过以下方法之一更新这些配置: + +1. 
直接编辑 ConfigMap：如果 HAMi 已成功安装，您可以使用 kubectl edit 命令手动更新 hami-scheduler-device ConfigMap。 + + ```bash + kubectl edit configmap hami-scheduler-device -n <namespace> + ``` + + 更改后，重启相关的 HAMi 组件以应用更新的配置。 + +2. 修改 Helm Chart：更新 [ConfigMap](https://raw.githubusercontent.com/Project-HAMi/HAMi/refs/heads/master/charts/hami/templates/scheduler/device-configmap.yaml) 中的相应值，然后重新应用 Helm Chart 以重新生成 ConfigMap。 + +| 参数 | 类型 | 描述 | 默认值 | +| --- | ---- | --- | ----- | +| `nvidia.deviceMemoryScaling` | 浮点数 | NVIDIA 设备显存缩放比例，允许大于 1（启用虚拟设备显存，实验性功能）。对于一块拥有 _M_ 显存的 NVIDIA GPU，若设置为 _S_，则由该 GPU 拆分出的 vGPU 在 Kubernetes 中将获得 `S * M` 的显存。 | `1` | +| `nvidia.deviceSplitCount` | 整数 | 单块 GPU 可分配的最大任务数。 | `10` | +| `nvidia.migstrategy` | 字符串 | 设置为 `"none"` 表示忽略 MIG 功能，设置为 `"mixed"` 表示以独立资源方式分配 MIG 设备。 | `"none"` | +| `nvidia.disablecorelimit` | 字符串 | 设置为 `"true"` 表示禁用核心限制，设置为 `"false"` 表示启用核心限制。 | `"false"` | +| `nvidia.defaultMem` | 整数 | 当前任务默认使用的设备显存（MB）。若为 `0`，则表示使用设备 100% 显存。 | `0` | +| `nvidia.defaultCores` | 整数 | 当前任务默认预留的 GPU 核心百分比。`0` 表示只要显存够就可用任何 GPU；`100` 表示独占整块 GPU。 | `0` | +| `nvidia.defaultGPUNum` | 整数 | 默认分配的 GPU 数量。若设为 `0`，则会被过滤。如果 Pod 的资源未显式设置 `nvidia.com/gpu`，则 webhook 会检查是否设置了 `nvidia.com/gpumem`、`nvidia.com/gpumem-percentage` 或 `nvidia.com/gpucores`，若设置了其中任一项，则自动添加默认值的 `nvidia.com/gpu`。 | `1` | +| `nvidia.resourceCountName` | 字符串 | vGPU 数量的资源名。 | `"nvidia.com/gpu"` | +| `nvidia.resourceMemoryName` | 字符串 | vGPU 显存大小的资源名。 | `"nvidia.com/gpumem"` | +| `nvidia.resourceMemoryPercentageName` | 字符串 | vGPU 显存比例的资源名。 | `"nvidia.com/gpumem-percentage"` | +| `nvidia.resourceCoreName` | 字符串 | vGPU 核心的资源名。 | `"nvidia.com/gpucores"` | +| `nvidia.resourcePriorityName` | 字符串 | vGPU 任务优先级的资源名。 | `"nvidia.com/priority"` | + +## Chart 配置：参数 + +您可以通过使用 `--set` 设置以下参数来自定义您的 vGPU 支持，例如 + +```bash +helm install hami hami-charts/hami --set devicePlugin.deviceMemoryScaling=5 ... 
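+# 示意用法（仅供参考，参数名取自下表，请按实际环境调整）：
+# 组合多个 --set 可一次覆盖多项默认值，例如将 GPU 卡内调度策略从默认的 spread 改为 binpack
+helm install hami hami-charts/hami --set scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy=binpack -n kube-system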
+``` + +| 参数 | 类型 | 描述 | 默认值 | +| --- | ---- | --- | ----- | +| `devicePlugin.service.schedulerPort` | 整数 | 调度器 webhook 服务的 NodePort 端口。 | `31998` | +| `scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy` | 字符串 | GPU 节点调度策略:`"binpack"` 表示尽可能将任务分配到同一个 GPU 节点;`"spread"` 表示尽可能将任务分配到不同的 GPU 节点。 | `"binpack"` | +| `scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy` | 字符串 | GPU 调度策略:`"binpack"` 表示尽可能将任务分配到同一个 GPU;`"spread"` 表示尽可能将任务分配到不同的 GPU。 | `"spread"` | + +## Pod 配置:注解 + +| 参数 | 类型 | 描述 | 示例 | +| --- | ---- | --- | ----- | +| `nvidia.com/use-gpuuuid` | 字符串 | 如果设置了此字段,则该 Pod 分配的设备 **必须** 是此字符串中定义的 GPU UUID 之一。 | `"GPU-AAA,GPU-BBB"` | +| `nvidia.com/nouse-gpuuuid` | 字符串 | 如果设置了此字段,则该 Pod 分配的设备 **不能** 是此字符串中定义的 GPU UUID。 | `"GPU-AAA,GPU-BBB"` | +| `nvidia.com/nouse-gputype` | 字符串 | 如果设置了此字段,则该 Pod 分配的设备 **不能** 是此字符串中定义的 GPU 类型。 | `"Tesla V100-PCIE-32GB, NVIDIA A10"` | +| `nvidia.com/use-gputype` | 字符串 | 如果设置了此字段,则该 Pod 分配的设备 **必须** 是此字符串中定义的 GPU 类型之一。 | `"Tesla V100-PCIE-32GB, NVIDIA A10"` | +| `hami.io/node-scheduler-policy` | 字符串 | GPU 节点调度策略:`"binpack"` 表示将 Pod 分配到已有负载的 GPU 节点上执行,`"spread"` 表示分配到不同的 GPU 节点上执行。 | `"binpack"` 或 `"spread"` | +| `hami.io/gpu-scheduler-policy` | 字符串 | GPU 卡调度策略:`"binpack"` 表示将 Pod 分配到同一块 GPU 卡上执行,`"spread"` 表示分配到不同的 GPU 卡上执行。 | `"binpack"` 或 `"spread"` | +| `nvidia.com/vgpu-mode` | 字符串 | 指定该 Pod 希望使用的 vGPU 实例类型。 | `"hami-core"` 或 `"mig"` | + +## 容器配置:环境变量 + +| 参数 | 类型 | 描述 | 默认值 | +| --- | ---- | --- | ----- | +| `GPU_CORE_UTILIZATION_POLICY` | 字符串 | 定义 GPU 核心使用策略:<ul><li>`"default"`:默认使用策略。</li><li>`"force"`:强制将核心使用率限制在 `"nvidia.com/gpucores"` 设定值以下。</li><li>`"disable"`:在任务运行期间忽略 `"nvidia.com/gpucores"` 设置的使用限制。</li></ul> | `"default"` | +| `CUDA_DISABLE_CONTROL` | 布尔值 | 若为 `"true"`,容器内将不会启用 HAMi-core,导致无资源隔离与限制(用于调试)。 | `false` | diff --git a/docs/zh/userguide/monitoring/device-allocation.md b/docs/zh/userguide/monitoring/device-allocation.md new file mode 100644 index 0000000..3c62461 --- /dev/null +++ b/docs/zh/userguide/monitoring/device-allocation.md @@ -0,0 +1,25 @@ +--- +title: 集群设备分配 +translated: true +--- + +## 集群设备分配端点 + +您可以通过访问 `{scheduler node ip}:31993/metrics` 获取集群设备分配和限制的概览,或者将其添加到 Prometheus 端点,如下命令所示: + +``` +curl {scheduler node ip}:31993/metrics +``` + +它包含以下指标: + +| 指标 | 描述 | 示例 | +|----------|-------------|---------| +| GPUDeviceCoreLimit | GPU 设备核心限制 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",nodeid="aio-node67",zone="vGPU"}` 100 | +| GPUDeviceMemoryLimit | GPU 设备内存限制 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",nodeid="aio-node67",zone="vGPU"}` 3.4359738368e+10 | +| GPUDeviceCoreAllocated | 分配给某个 GPU 的设备核心 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",nodeid="aio-node67",zone="vGPU"}` 45 | +| GPUDeviceMemoryAllocated | 分配给某个 GPU 的设备内存 | `{devicecores="0",deviceidx="0",deviceuuid="aio-node74-arm-Ascend310P-0",nodeid="aio-node74-arm",zone="vGPU"}` 3.221225472e+09 | +| GPUDeviceSharedNum | 共享此 GPU 的容器数量 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",nodeid="aio-node67",zone="vGPU"}` 1 | +| vGPUPodsDeviceAllocated | 从 pod 分配的 vGPU | `{containeridx="Ascend310P",deviceusedcore="0",deviceuuid="aio-node74-arm-Ascend310P-0",nodename="aio-node74-arm",podname="ascend310p-pod",podnamespace="default",zone="vGPU"}` 3.221225472e+09 | + +> **注意** 请注意,这只是关于设备分配的概览,并不是设备的实时使用指标。有关实时使用情况,请参见实时设备使用。 \ No newline at end of file diff --git a/docs/zh/userguide/monitoring/globalview.md b/docs/zh/userguide/monitoring/globalview.md new file mode 100644 
index 0000000..e69de29 diff --git a/docs/zh/userguide/monitoring/real-time-device-usage.md b/docs/zh/userguide/monitoring/real-time-device-usage.md new file mode 100644 index 0000000..cad2bb8 --- /dev/null +++ b/docs/zh/userguide/monitoring/real-time-device-usage.md @@ -0,0 +1,23 @@ +--- +title: 实时设备使用 +translated: true +--- + +## 实时设备使用端点 + +您可以通过访问 `{GPU 节点 IP}:31992/metrics` 获取实时设备内存和核心使用情况,或者将其添加到 Prometheus 端点,如下命令所示: + +``` +curl {GPU 节点 IP}:31992/metrics +``` + +它包含以下指标: + +| 指标 | 描述 | 示例 | +|----------|-------------|---------| +| Device_memory_desc_of_container | 容器设备内存实时使用情况 | `{context="0",ctrname="2-1-3-pod-1",data="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",module="0",offset="0",podname="2-1-3-pod-1",podnamespace="default",vdeviceid="0",zone="vGPU"}` 0 | +| Device_utilization_desc_of_container | 容器设备实时利用率 | `{ctrname="2-1-3-pod-1",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",podname="2-1-3-pod-1",podnamespace="default",vdeviceid="0",zone="vGPU"}` 0 | +| HostCoreUtilization | 主机上的 GPU 实时利用率 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",zone="vGPU"}` 0 | +| HostGPUMemoryUsage | 主机上的 GPU 实时设备内存使用情况 | `{deviceidx="0",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",zone="vGPU"}` 2.87244288e+08 | +| vGPU_device_memory_limit_in_bytes | 某个容器的设备限制 | `{ctrname="2-1-3-pod-1",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",podname="2-1-3-pod-1",podnamespace="default",vdeviceid="0",zone="vGPU"}` 2.62144e+09 | +| vGPU_device_memory_usage_in_bytes | 某个容器的设备使用情况 | `{ctrname="2-1-3-pod-1",deviceuuid="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",podname="2-1-3-pod-1",podnamespace="default",vdeviceid="0",zone="vGPU"}` 0 | \ No newline at end of file diff --git a/docs/zh/userguide/monitoring/real-time-usage.md b/docs/zh/userguide/monitoring/real-time-usage.md new file mode 100644 index 0000000..6bad08d --- /dev/null +++ b/docs/zh/userguide/monitoring/real-time-usage.md @@ -0,0 +1,6 @@ +--- +title: 实时使用情况 +translated: true +--- + +待改进。 \ No newline at end of file diff --git a/docs/zh/userguide/support-devices.md b/docs/zh/userguide/support-devices.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md new file mode 100644 index 0000000..2da22f5 --- /dev/null +++ b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/default_use.md @@ -0,0 +1,27 @@ +--- +title: 默认 vgpu 作业 +translated: true +--- + +## 职位描述 + +VGPU 可以通过在 resource.limit 中设置 "volcano.sh/vgpu-number"、"volcano.sh/vgpu-cores" 和 "volcano.sh/vgpu-memory" 来请求。 + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: test1 +spec: + restartPolicy: OnFailure + schedulerName: volcano + containers: + - image: ubuntu:20.04 + name: pod1-ctr + command: ["sleep"] + args: ["100000"] + resources: + limits: + volcano.sh/vgpu-memory: 1024 + volcano.sh/vgpu-number: 1 +``` diff --git a/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md new file mode 100644 index 0000000..51bd3ef --- /dev/null +++ b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/examples/use_exclusive_gpu.md @@ -0,0 +1,26 @@ +--- +title: 使用独占 GPU +translated: true +--- + +## 职位描述 + +要分配一个独占的GPU,您只需分配`volcano.sh/vgpu-number`,而无需其他`volcano.sh/xxx`字段,如下例所示: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: test1 +spec: + restartPolicy: OnFailure + schedulerName: volcano + containers: + - image: ubuntu:20.04 
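+      # 独占申请：resources.limits 中只声明 volcano.sh/vgpu-number，不再设置 volcano.sh/vgpu-memory 或 volcano.sh/vgpu-cores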
+ name: pod1-ctr + command: ["sleep"] + args: ["100000"] + resources: + limits: + volcano.sh/vgpu-number: 1 +``` diff --git a/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md new file mode 100644 index 0000000..3e59d28 --- /dev/null +++ b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/how-to-use-volcano-vgpu.md @@ -0,0 +1,142 @@ +--- +title: 如何使用 Volcano vGPU +translated: true +--- + +# Volcano vgpu 设备插件用于 Kubernetes + +**注意**: + +使用 volcano-vgpu 时,**不需要** 安装 HAMi,仅使用 +[Volcano vgpu device-plugin](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) 即可。它可以为由 volcano 管理的 NVIDIA 设备提供设备共享机制。 + +该插件基于 [Nvidia Device Plugin](https://github.com/NVIDIA/k8s-device-plugin),并使用 [HAMi-core](https://github.com/Project-HAMi/HAMi-core) 实现对 GPU 卡的硬隔离支持。 + +Volcano vgpu 仅在 volcano > 1.9 版本中可用。 + +## 快速开始 + +### 安装 Volcano + +```bash +helm repo add volcano-sh https://volcano-sh.github.io/helm-charts +helm install volcano volcano-sh/volcano -n volcano-system --create-namespace +``` + +### 配置调度器 + +更新调度器配置: + +```bash +kubectl edit cm -n volcano-system volcano-scheduler-configmap +``` + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + - name: conformance + - plugins: + - name: drf + - name: deviceshare + arguments: + deviceshare.VGPUEnable: true # 启用 vgpu + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack +``` + +### 启用 Kubernetes 的 GPU 支持 + +在你想要使用的 **所有** GPU 节点上启用此选项后, +可以通过部署以下 DaemonSet 来在集群中启用 GPU 支持: + +```bash +kubectl create -f https://raw.githubusercontent.com/Project-HAMi/volcano-vgpu-device-plugin/main/volcano-vgpu-device-plugin.yml +``` + +### 验证环境是否就绪 + +检查节点状态,如果 `volcano.sh/vgpu-number` 出现在 allocatable 资源中,即表示正常。 + +```bash +kubectl get node {node name} -oyaml +``` + +输出示例: + +```yaml +status: + addresses: + - address: 172.17.0.3 + type: InternalIP + - address: volcano-control-plane + type: Hostname + allocatable: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-number: "10" # vGPU 资源 + capacity: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-memory: "89424" + volcano.sh/gpu-number: "10" # vGPU 资源 +``` + +### 运行 VGPU 作业 + +可以通过在 `resources.limits` 中设置 `volcano.sh/vgpu-number`、`volcano.sh/vgpu-cores` 和 `volcano.sh/vgpu-memory` 来请求 VGPU: + +```bash +cat <<EOF | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: cuda-container + image: nvidia/cuda:9.0-devel + command: ["sleep"] + args: ["100000"] + resources: + limits: + volcano.sh/vgpu-number: 2 # 请求 2 张 GPU 卡 + volcano.sh/vgpu-memory: 3000 # (可选)每个 vGPU 使用 3G 显存 + volcano.sh/vgpu-cores: 50 # (可选)每个 vGPU 使用 50% 核心 +EOF +``` + +你可以在容器内使用 `nvidia-smi` 验证设备显存使用情况: + +> **⚠️ 警告:** +> 如果你在使用 device plugin 配合 NVIDIA 镜像时未显式请求 GPU, +> 那么该节点上所有 GPU 都会暴露在你的容器中。 +> 容器中使用的 vGPU 数量不能超过该节点上的 GPU 总数。 + +### 监控 + +`volcano-scheduler-metrics` 会记录每次 GPU 使用和限制情况, +你可以通过访问以下地址来获取这些指标: + +```bash +curl {volcano scheduler cluster ip}:8080/metrics +``` diff --git a/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md new file mode 100644 index 0000000..a5b11cd --- 
/dev/null +++ b/docs/zh/userguide/volcano-vgpu/NVIDIA-GPU/monitor.md @@ -0,0 +1,23 @@ +--- +title: 监控 Volcano vGPU +translated: true +--- + +### 监控 + +volcano-scheduler-metrics 记录每个 GPU 的使用情况和限制,访问以下地址获取这些指标。 + +``` +curl {volcano scheduler cluster ip}:8080/metrics +``` + +它包含以下指标: + +| 指标 | 描述 | 示例 | +|----------|-------------|---------| +| volcano_vgpu_device_allocated_cores | 此卡中分配的 GPU 计算核心的百分比 | `{NodeName="aio-node67",devID="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec"}` 0 | +| volcano_vgpu_device_allocated_memory | 此卡中分配的 Vgpu 内存 | `{NodeName="aio-node67",devID="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec"}` 32768| +| volcano_vgpu_device_core_allocation_for_a_vertain_pod| 为某个 pod 分配的 vgpu 设备核心 | `{NodeName="aio-node67",devID="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",podName="resnet101-deployment-7b487d974d-jjc8p"}` 0| +| volcano_vgpu_device_memory_allocation_for_a_certain_pod | 为某个 pod 分配的 vgpu 设备内存 | `{NodeName="aio-node67",devID="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec",podName="resnet101-deployment-7b487d974d-jjc8p"}` 16384 | +| volcano_vgpu_device_memory_limit | 此卡中设备内存的总数 | `{NodeName="m5-cloudinfra-online01",devID="GPU-a88b5d0e-eb85-924b-b3cd-c6cad732f745"}` 32768 | +| volcano_vgpu_device_shared_number | 共享此卡的 vgpu 任务数量 | `{NodeName="aio-node67",devID="GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec"}` 2| \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..f81a50c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,58 @@ +site_name: HAMi +theme: + palette: + primary: teal + name: material + logo: logo.svg + language: en + custom_dir: overrides + features: + - navigation.footer + - toc.integrate + + highlightjs: true + hljs_languages: + - yaml + - rust + +docs_dir: ./docs +nav: + - blog: + - blog/2024-12-18-support-blog-post/index.md + - blog/2024-12-31-post/index.md + - Core Concepts: + - What is HAMI: core-concepts/introduction.md + - Architecture: core-concepts/architecture.md + - Changelog: + - changelog/source/v2.5.0.md + - changelog/source/v2.5.1.md + +markdown_extensions: + - admonition + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.details + - pymdownx.superfences + - pymdownx.inlinehilite + - toc: + permalink: true # 在标题旁添加链接图标 + baselevel: 1 # 从哪个级别开始生成目录 + separator: "-" # 锚点分隔符 + +extra: + version: v0.4 + +plugins: + - search: + lang: + - en + - i18n: + docs_structure: folder + languages: + - locale: en + default: true + name: English + build: true + - locale: zh + name: Chinese + build: true \ No newline at end of file diff --git a/overrides/partials/footer.html b/overrides/partials/footer.html new file mode 100644 index 0000000..0c2464f --- /dev/null +++ b/overrides/partials/footer.html @@ -0,0 +1,14 @@ +<html> + <body> + <p style="text-align:center;font-size:15px;"> + Copyright Contributors to the HAMi's project. + </p> + <p style="text-align:center;font-size:20px;color:#4169E1;"> + The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see <a href="https://www.linuxfoundation.org/trademark-usage/" target="_blank"> Trademark Usage.</a> + </p> + <p style="text-align:center;font-size:15px;"> + Footer part can be edit. 
+ </p> + </body> + +</html> diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..89a458b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,80 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "HAMi-docs" +description = "Documentation for HAMi" +version = "0.4.0" +readme = "README.md" +license = { file = "LICENSE" } +authors = [{ name = "HAMi Contributors" }] +maintainers = [{ name = "HAMi Contributors" }] +keywords = ["kubernetes", "gpu", "vgpu", "scheduling", "documentation"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Documentation", + "Topic :: System :: Monitoring", +] +requires-python = ">=3.8" +dependencies = [ + "mkdocs>=1.6", + "mkdocs-material>=9.5", + "mkdocs-static-i18n>=1.2", +] + +[dependency-groups] +# Development tools (additional dependencies beyond the main ones) +dev = [ + # Development utilities could go here + # e.g., "pre-commit", "ruff", etc. +] +ci = [ + # CI/CD specific tools could go here + # Currently handled by GitHub Actions with super-linter +] + +[project.urls] +Homepage = "https://project-hami.io" +Documentation = "https://project-hami.io" +Repository = "https://github.com/Project-HAMi/HAMi" +"Bug Tracker" = "https://github.com/Project-HAMi/HAMi/issues" + +[tool.hatch.version] +path = "mkdocs.yml" +pattern = "version: (?P<version>[^\\s]+)" + +[tool.hatch.build.targets.wheel] +# For a documentation project, include the essential files +only-include = [ + "docs/", + "templates/", + "data/", + "overrides/", + "mkdocs.yml", + "README.md", + "LICENSE", +] + +[tool.hatch.envs.default] +dependencies = [ + "mkdocs>=1.6", + "mkdocs-material>=9.5", + "mkdocs-static-i18n>=1.2", +] + +[tool.hatch.envs.default.scripts] +serve = "mkdocs serve" +serve-remote = "mkdocs serve -a 0.0.0.0:8000" +build = "mkdocs build" +generate-adopters = "gomplate -d adopters=./data/adopters.yaml -f templates/adopters.md -o docs/project/adopters.md" diff --git a/templates/adopters.md b/templates/adopters.md new file mode 100644 index 0000000..4b53c62 --- /dev/null +++ b/templates/adopters.md @@ -0,0 +1,21 @@ +--- +title: HAMi Adopters +type: adopters +description: > + On this page you can see a selection of organizations that self-identified as using HAMi. +--- + +## HAMi Adopters + +The organizations below are all using HAMi. + +To join this list, please follow these instructions. + +{{- range (datasource "adopters").adopters.companies }} +{{ if has . "logo" }} +![{{.name}}](../fig/{{ .logo }}) +{{ else }} +![{{.name}}](../fig/logos/default.svg) +{{ end }} +[{{.name}}]({{.url}}) +{{ end }}
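The gomplate template above assumes `data/adopters.yaml` exposes an `adopters.companies` list whose entries carry `name`, `url`, and an optional `logo`. A minimal sketch of that assumed shape follows; the company entries are placeholders, not real adopters:

```yaml
# Hypothetical data shape for data/adopters.yaml; field names come from the template above.
adopters:
  companies:
    - name: Example Cloud          # required: display name
      url: https://example.com     # required: link target
      logo: logos/example.svg      # optional: path under the fig/ directory
    - name: Another Org            # no logo set, so the template falls back to logos/default.svg
      url: https://another.example.org
```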