diff --git a/docs/assets/javascripts/extra.js b/docs/assets/javascripts/extra.js index a1401b2d8..30bdcd5fd 100644 --- a/docs/assets/javascripts/extra.js +++ b/docs/assets/javascripts/extra.js @@ -155,4 +155,11 @@ window.addEventListener("DOMContentLoaded", function() { } }); }) + + document.querySelectorAll('a[href^="http"]').forEach(link => { + if (!link.href.includes(location.hostname)) { + link.setAttribute('target', '_blank'); + link.setAttribute('rel', 'noopener noreferrer'); + } + }); })() diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index bf4ea92aa..99655a1fe 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -1350,7 +1350,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { visibility: visible; }*/ - .twemoji.external { + /* .twemoji.external { position: relative; top: 2.5px; height: 18.5px; @@ -1364,7 +1364,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { position: relative; top: 1.5px; margin-right: -7px; - } + } */ /*.md-tabs__item:nth-child(6) .md-tabs__link:before { position: relative; @@ -1585,7 +1585,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { .md-typeset.md-banner__inner a { color: var(--md-default-bg-color); - border-bottom: 1.5px dotted; + /* border-bottom: 1.5px dotted; */ font-weight: 600; } @@ -1801,3 +1801,37 @@ img.border { font-size: 12px !important;; padding: 30px !important; } + +/* External link indicator */ +a[href^="http"]:not(:where( + /* skip if marked with external-skip */ + .external-skip, + /* exclude http:// dstack links */ + [href^="http://dstack.ai"], + /* exclude https://dstack.ai links */ + [href^="https://dstack.ai"], + /* exclude md-content__button links */ + .md-content__button, +)):after { + content: ''; + display: inline-block; + width: 18.5px; + height: 18.5px; + margin-left: 0.15em; + vertical-align: -0.2em; + background-color: currentColor; + mask-image: url('data:image/svg+xml,'); + mask-size: 100%; + mask-repeat: no-repeat; + mask-position: center; + -webkit-mask-image: url('data:image/svg+xml,'); + -webkit-mask-size: 100%; + -webkit-mask-repeat: no-repeat; + -webkit-mask-position: center; + text-decoration: none; +} + +/* Exclude links inside .md-social */ +.md-social a[href^="http"]:after { + display: none; +} diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css index 24253b322..6efd82de9 100644 --- a/docs/assets/stylesheets/landing.css +++ b/docs/assets/stylesheets/landing.css @@ -327,7 +327,7 @@ margin-right: -7px; } -.md-button-secondary.external:after { +/* .md-button-secondary.external:after { content: url('data:image/svg+xml,'); line-height: 14px; margin-left: 5px; @@ -343,7 +343,7 @@ position: relative; top: 2.5px; margin-right: -7px; -} +} */ .md-header__buttons .md-button-secondary, .md-typeset .md-button-secondary, @@ -702,13 +702,13 @@ line-height: 32px; } -.tx-landing__highlights_grid h3.external:after { +/* .tx-landing__highlights_grid h3.external:after { content: url('data:image/svg+xml,'); margin-left: 2px; position: relative; top: 3px; margin-right: -7px; -} +} */ .tx-landing__highlights_grid p { font-size: 16px; diff --git a/docs/blog/archive/ambassador-program.md b/docs/blog/archive/ambassador-program.md index e9241f516..778f2cb37 100644 --- a/docs/blog/archive/ambassador-program.md +++ b/docs/blog/archive/ambassador-program.md @@ -58,8 +58,8 @@ yourself and your experience. We’ll reach out with a starter kit and next step Get involved -Have questions? 
Reach out via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}! +Have questions? Reach out via [Discord](https://discord.gg/u8SmfwPpMd)! > 💜 In the meantime, we’re thrilled to -> welcome [Park Chansung :material-arrow-top-right-thin:{ .external }](https://x.com/algo_diver){:target="_blank"}, the +> welcome [Park Chansung](https://x.com/algo_diver), the > first `dstack` ambassador. diff --git a/docs/blog/archive/efa.md b/docs/blog/archive/efa.md index 4fe919daf..6841cd976 100644 --- a/docs/blog/archive/efa.md +++ b/docs/blog/archive/efa.md @@ -10,7 +10,7 @@ categories: # Efficient distributed training with AWS EFA -[Amazon Elastic Fabric Adapter (EFA) :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"} is a high-performance network interface designed for AWS EC2 instances, enabling +[Amazon Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/) is a high-performance network interface designed for AWS EC2 instances, enabling ultra-low latency and high-throughput communication between nodes. This makes it an ideal solution for scaling distributed training workloads across multiple GPUs and instances. @@ -39,7 +39,7 @@ network interfaces, you’ll need to disable public IPs. Note, the `dstack` server in this case should have access to the private subnet of the VPC. You’ll also need to specify an AMI that includes the GDRCopy drivers. For example, you can use the -[AWS Deep Learning Base GPU AMI :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/){:target="_blank"}. +[AWS Deep Learning Base GPU AMI](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/). Here’s an example backend configuration: @@ -164,10 +164,10 @@ $ dstack apply -f examples/misc/efa/task.dstack.yml -R EFA. > Have questions? You're welcome to join -> our [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} or talk -> directly to [our team :material-arrow-top-right-thin:{ .external }](https://calendly.com/dstackai/discovery-call){:target="_blank"}. +> our [Discord](https://discord.gg/u8SmfwPpMd) or talk +> directly to [our team](https://calendly.com/dstackai/discovery-call). !!! info "What's next?" 1. Check [fleets](../../docs/concepts/fleets.md), [tasks](../../docs/concepts/tasks.md), and [volumes](../../docs/concepts/volumes.md) 2. Also see [dev environments](../../docs/concepts/dev-environments.md) and [services](../../docs/concepts/services.md) - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/amd-mi300x-inference-benchmark.md b/docs/blog/posts/amd-mi300x-inference-benchmark.md index 13c3af99f..bc747ee78 100644 --- a/docs/blog/posts/amd-mi300x-inference-benchmark.md +++ b/docs/blog/posts/amd-mi300x-inference-benchmark.md @@ -12,7 +12,7 @@ categories: At `dstack`, we've been adding support for AMD GPUs with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets), so we saw this as a great chance to test our integration by benchmarking AMD GPUs. Our friends at -[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}, who build top-tier +[Hot Aisle](https://hotaisle.xyz/), who build top-tier bare metal compute for AMD GPUs, kindly provided the hardware for the benchmark. 
@@ -106,7 +106,7 @@ Here is the spec of the bare metal machine we got: ??? info "TGI" The `ghcr.io/huggingface/text-generation-inference:sha-11d7af7-rocm` Docker image was used. -For conducting the tests, we've been using the [`benchmark_serving` :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py){:target="_blank"} provided by vLLM. +For conducting the tests, we've been using the [`benchmark_serving`](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py) provided by vLLM. ## Observations @@ -175,7 +175,7 @@ to vLLM. -This difference may be related to how vLLM [pre-allocates GPU cache :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/models/performance.html){:target="_blank"}. +This difference may be related to how vLLM [pre-allocates GPU cache](https://docs.vllm.ai/en/latest/models/performance.html). ## Conclusion @@ -203,7 +203,7 @@ like the H100 and H200, as well as possibly Google TPU. ### Source code The source code used for this benchmark can be found in our -[GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/amd/inference){:target="_blank"}. +[GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/inference). If you have questions, feedback, or want to help improve the benchmark, please reach out to our team. @@ -211,7 +211,7 @@ If you have questions, feedback, or want to help improve the benchmark, please r ### Hot Aisle -[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} +[Hot Aisle](https://hotaisle.xyz/) is the primary sponsor of this benchmark, and we are sincerely grateful for their hardware and support. If you'd like to use top-tier bare metal compute with AMD GPUs, we recommend going @@ -219,6 +219,6 @@ with Hot Aisle. Once you gain access to a cluster, it can be easily accessed via ### RunPod If you’d like to use on-demand compute with AMD GPUs at affordable prices, you can configure `dstack` to -use [RunPod :material-arrow-top-right-thin:{ .external }](https://runpod.io/){:target="_blank"}. In +use [RunPod](https://runpod.io/). In this case, `dstack` will be able to provision fleets automatically when you run dev environments, tasks, and services. diff --git a/docs/blog/posts/amd-on-runpod.md b/docs/blog/posts/amd-on-runpod.md index 1e32a27e7..c1ff25015 100644 --- a/docs/blog/posts/amd-on-runpod.md +++ b/docs/blog/posts/amd-on-runpod.md @@ -33,14 +33,14 @@ One of the main advantages of the `MI300X` is its VRAM. For example, with the `H version of Llama 3.1 405B into a single node with 8 GPUs—you'd have to use FP8 instead. However, with the `MI300X`, you can fit FP16 into a single node with 8 GPUs, and for FP8, you'd only need 4 GPUs. -With the [latest update :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/releases/0.18.11rc1){:target="_blank"}, +With the [latest update](https://github.com/dstackai/dstack/releases/0.18.11rc1), you can now specify an AMD GPU under `resources`. Below are a few examples. ## Configuration === "Service" Here's an example of a [service](../../docs/concepts/services.md) that deploys - Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}. + Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd).
@@ -72,7 +72,7 @@ you can now specify an AMD GPU under `resources`. Below are a few examples. === "Dev environment" Here's an example of a [dev environment](../../docs/concepts/dev-environments.md) using - [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}'s + [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd)'s Docker image: ```yaml @@ -111,11 +111,11 @@ cloud resources and run the configuration. ## What's next? 1. The examples above demonstrate the use of -[TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}. +[TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd). AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon. 2. RunPod is the first cloud provider where dstack supports AMD. More cloud providers will be supported soon as well. -3. Want to give RunPod and `dstack` a try? Make sure you've signed up for [RunPod :material-arrow-top-right-thin:{ .external }](https://www.runpod.io/){:target="_blank"}, +3. Want to give RunPod and `dstack` a try? Make sure you've signed up for [RunPod](https://www.runpod.io/), then [set up](../../docs/reference/server/config.yml.md#runpod) the `dstack server`. -> Have questioned or feedback? Join our [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} +> Have questioned or feedback? Join our [Discord](https://discord.gg/u8SmfwPpMd) server. diff --git a/docs/blog/posts/amd-on-tensorwave.md b/docs/blog/posts/amd-on-tensorwave.md index 5a062f454..51552207e 100644 --- a/docs/blog/posts/amd-on-tensorwave.md +++ b/docs/blog/posts/amd-on-tensorwave.md @@ -14,7 +14,7 @@ Since last month, when we introduced support for private clouds and data centers to orchestrate AI containers with any AI cloud vendor, whether they provide on-demand compute or reserved clusters. In this tutorial, we’ll walk you through how `dstack` can be used with -[TensorWave :material-arrow-top-right-thin:{ .external }](https://tensorwave.com/){:target="_blank"} using +[TensorWave](https://tensorwave.com/) using [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). @@ -237,4 +237,4 @@ Want to see how it works? Check out the video below: !!! info "What's next?" 1. See [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets) 2. Read about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md) - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/benchmark-amd-containers-and-partitions.md b/docs/blog/posts/benchmark-amd-containers-and-partitions.md index cf1d8baaa..8b945aaba 100644 --- a/docs/blog/posts/benchmark-amd-containers-and-partitions.md +++ b/docs/blog/posts/benchmark-amd-containers-and-partitions.md @@ -16,7 +16,7 @@ Our new benchmark explores two important areas for optimizing AI workloads on AM -This benchmark was supported by [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}, +This benchmark was supported by [Hot Aisle](https://hotaisle.xyz/), a provider of AMD GPU bare-metal and VM infrastructure. 
## Benchmark 1: Bare-metal vs containers @@ -56,11 +56,11 @@ Our experiments consistently demonstrate that running multi-node AI workloads in ## Benchmark 2: Partition performance isolated vs mesh -The AMD GPU can be [partitioned :material-arrow-top-right-thin:{ .external }](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html){:target="_blank"} into smaller, independent units (e.g., NPS4 mode splits one GPU into four partitions). This promises better memory bandwidth utilization. Does this theoretical gain translate to better performance in practice? +The AMD GPU can be [partitioned](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html) into smaller, independent units (e.g., NPS4 mode splits one GPU into four partitions). This promises better memory bandwidth utilization. Does this theoretical gain translate to better performance in practice? ### Finding 1: Higher performance for isolated partitions -First, we sought to reproduce and extend findings from the [official ROCm blog :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html){:target="_blank"}. We benchmarked the memory bandwidth of a single partition (in CPX/NPS4 mode) against a full, unpartitioned GPU (in SPX/NPS1 mode). +First, we sought to reproduce and extend findings from the [official ROCm blog](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html). We benchmarked the memory bandwidth of a single partition (in CPX/NPS4 mode) against a full, unpartitioned GPU (in SPX/NPS1 mode). @@ -100,7 +100,7 @@ GPU partitioning is only practical if used dynamically—for instance, to run mu #### Limitations 1. **Reproducibility**: AMD’s original blog post on partitioning lacked detailed setup information, so we had to reconstruct the benchmarks independently. -2. **Network tuning**: These benchmarks were run on a default, out-of-the-box network configuration. Our results for RCCL (~339 GB/s) and RDMA (~726 Gbps) are slightly below the peak figures [reported by Dell :material-arrow-top-right-thin:{ .external }](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/){:target="_blank"}. This suggests that further performance could be unlocked with expert tuning of network topology, MTU size, and NCCL environment variables. +2. **Network tuning**: These benchmarks were run on a default, out-of-the-box network configuration. Our results for RCCL (~339 GB/s) and RDMA (~726 Gbps) are slightly below the peak figures [reported by Dell](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/). This suggests that further performance could be unlocked with expert tuning of network topology, MTU size, and NCCL environment variables. ## Benchmark setup @@ -352,7 +352,7 @@ The `SIZE` value is `1M`, `2M`, .., `8G`. **vLLM data parallel** -1. Build nginx container (see [vLLM-nginx :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/stable/deployment/nginx.html#build-nginx-container){:target="_blank"}). +1. Build nginx container (see [vLLM-nginx](https://docs.vllm.ai/en/stable/deployment/nginx.html#build-nginx-container)). 2. 
Create `nginx.conf` @@ -471,13 +471,13 @@ HIP_VISIBLE_DEVICES=0 python3 toy_inference_benchmark.py \ ## Source code -All source code and findings are available in [our GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/amd/baremetal_container_partition){:target="_blank"}. +All source code and findings are available in [our GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/baremetal_container_partition). ## References -* [AMD Instinct MI300X GPU partitioning overview :material-arrow-top-right-thin:{ .external }](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html){:target="_blank"} -* [Deep dive into partition modes by AMD :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html){:target="_blank"}. -* [RCCL and PerfTest for cluster validation by Dell :material-arrow-top-right-thin:{ .external }](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/){:target="_blank"}. +* [AMD Instinct MI300X GPU partitioning overview](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html) +* [Deep dive into partition modes by AMD](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html). +* [RCCL and PerfTest for cluster validation by Dell](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/). ## What's next? @@ -487,5 +487,5 @@ Benchmark the performance impact of VMs vs bare-metal for inference and training #### Hot Aisle -Big thanks to [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} for providing the compute power behind these benchmarks. +Big thanks to [Hot Aisle](https://hotaisle.xyz/) for providing the compute power behind these benchmarks. If you’re looking for fast AMD GPU bare-metal or VM instances, they’re definitely worth checking out. diff --git a/docs/blog/posts/benchmark-amd-vms.md b/docs/blog/posts/benchmark-amd-vms.md index 099fee9be..b8d9105d0 100644 --- a/docs/blog/posts/benchmark-amd-vms.md +++ b/docs/blog/posts/benchmark-amd-vms.md @@ -18,7 +18,7 @@ This is the first in our series of benchmarks exploring the performance of AMD G Our findings reveal that for single-GPU LLM training and inference, both setups deliver comparable performance. The subtle differences we observed highlight how virtualization overhead can influence performance under specific conditions, but for most practical purposes, the performance is nearly identical. -This benchmark was supported by [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}, +This benchmark was supported by [Hot Aisle](https://hotaisle.xyz/), a provider of AMD GPU bare-metal and VM infrastructure. ## Benchmark 1: Inference @@ -201,11 +201,11 @@ python3 trl/scripts/sft.py \ ## Source code -All source code and findings are available in our [GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/amd/single_gpu_vm_vs_bare-metal){:target="_blank"}. +All source code and findings are available in our [GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/single_gpu_vm_vs_bare-metal). 
## References -* [vLLM V1 Meets AMD Instinct GPUs: A New Era for LLM Inference Performance :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/software-tools-optimization/vllmv1-rocm-llm/README.html){:target="_blank"} +* [vLLM V1 Meets AMD Instinct GPUs: A New Era for LLM Inference Performance](https://rocm.blogs.amd.com/software-tools-optimization/vllmv1-rocm-llm/README.html) ## What's next? @@ -215,5 +215,5 @@ Our next steps are to benchmark VM vs. bare-metal performance in multi-GPU and m #### Hot Aisle -Big thanks to [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} for providing the compute power behind these benchmarks. +Big thanks to [Hot Aisle](https://hotaisle.xyz/) for providing the compute power behind these benchmarks. If you’re looking for fast AMD GPU bare-metal or VM instances, they’re definitely worth checking out. diff --git a/docs/blog/posts/benchmarking-pd-ratios.md b/docs/blog/posts/benchmarking-pd-ratios.md index 1069c5794..c303163e1 100644 --- a/docs/blog/posts/benchmarking-pd-ratios.md +++ b/docs/blog/posts/benchmarking-pd-ratios.md @@ -21,19 +21,19 @@ We evaluate different ratios across workload profiles and concurrency levels to ### What is Prefill–Decode disaggregation? -LLM inference has two distinct phases: prefill and decode. Prefill processes all prompt tokens in parallel and is compute-intensive. Decode generates tokens one by one, repeatedly accessing the KV-cache, making it memory- and bandwidth-intensive. DistServe ([Zhong et al., 2024 :material-arrow-top-right-thin:{ .external }](https://arxiv.org/pdf/2401.09670){:target="_blank"}) introduced prefill–decode disaggregation to separate these phases across dedicated workers, reducing interference and enabling hardware to be allocated more efficiently. +LLM inference has two distinct phases: prefill and decode. Prefill processes all prompt tokens in parallel and is compute-intensive. Decode generates tokens one by one, repeatedly accessing the KV-cache, making it memory- and bandwidth-intensive. DistServe ([Zhong et al., 2024](https://arxiv.org/pdf/2401.09670)) introduced prefill–decode disaggregation to separate these phases across dedicated workers, reducing interference and enabling hardware to be allocated more efficiently. ### What is the prefill–decode ratio? The ratio of prefill to decode workers determines how much capacity is dedicated to each phase. DistServe showed that for a workload with ISL=512 and OSL=64, a 2:1 ratio met both TTFT and TPOT targets. But this example does not answer how the ratio should be chosen more generally, or whether it needs to change at runtime. !!! info "Reasoning model example" - In the DeepSeek deployment ([LMSYS, 2025 :material-arrow-top-right-thin:{ .external }](https://lmsys.org/blog/2025-05-05-large-scale-ep){:target="_blank"}), the ratio was 1:3. This decode-leaning split reflects reasoning workloads, where long outputs dominate. Allocating more workers to decode reduces inter-token latency and keeps responses streaming smoothly. + In the DeepSeek deployment ([LMSYS, 2025](https://lmsys.org/blog/2025-05-05-large-scale-ep)), the ratio was 1:3. This decode-leaning split reflects reasoning workloads, where long outputs dominate. Allocating more workers to decode reduces inter-token latency and keeps responses streaming smoothly. 
### Dynamic ratio -Dynamic approaches, such as NVIDIA’s [SLA-based :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/dynamo/latest/architecture/sla_planner.html){:target="_blank"} -and [Load-based :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/dynamo/latest/architecture/load_planner.html){:target="_blank"} planners, adjust the ratio at runtime according to SLO targets or load. However, they do this in conjunction with auto-scaling, which increases orchestration complexity. This raises the question: does the prefill–decode ratio really need to be dynamic, or can a fixed ratio be chosen ahead of time and still provide robust performance? +Dynamic approaches, such as NVIDIA’s [SLA-based](https://docs.nvidia.com/dynamo/latest/architecture/sla_planner.html) +and [Load-based](https://docs.nvidia.com/dynamo/latest/architecture/load_planner.html) planners, adjust the ratio at runtime according to SLO targets or load. However, they do this in conjunction with auto-scaling, which increases orchestration complexity. This raises the question: does the prefill–decode ratio really need to be dynamic, or can a fixed ratio be chosen ahead of time and still provide robust performance? ## Benchmark purpose @@ -72,7 +72,7 @@ If a fixed ratio consistently performs well across these metrics, it would indic * **Model**: `openai/gpt-oss-120b` * **Backend**: SGLang -For full steps and raw data, see the [GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/comparison/pd_ratio){:target="_blank"}. +For full steps and raw data, see the [GitHub repo](https://github.com/dstackai/benchmarks/tree/main/comparison/pd_ratio). ## Finding 1: Prefill-heavy workloads @@ -134,8 +134,8 @@ Overall, more study on how the optimal ratio is found and what factors it depend ## References -* [DistServe :material-arrow-top-right-thin:{ .external }](https://arxiv.org/pdf/2401.09670){:target="_blank"} -* [DeepSeek deployment on 96 H100 GPUs :material-arrow-top-right-thin:{ .external }](https://lmsys.org/blog/2025-05-05-large-scale-ep/){:target="_blank"} -* [Dynamo disaggregated serving :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/dynamo/latest/architecture/disagg_serving.html#){:target="_blank"} -* [SGLang PD disaggregation :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/pd_disaggregation.html){:target="_blank"} -* [vLLM disaggregated prefilling :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/v0.9.2/features/disagg_prefill.html){:target="_blank"} +* [DistServe](https://arxiv.org/pdf/2401.09670) +* [DeepSeek deployment on 96 H100 GPUs](https://lmsys.org/blog/2025-05-05-large-scale-ep/) +* [Dynamo disaggregated serving](https://docs.nvidia.com/dynamo/latest/architecture/disagg_serving.html#) +* [SGLang PD disaggregation](https://docs.sglang.ai/advanced_features/pd_disaggregation.html) +* [vLLM disaggregated prefilling](https://docs.vllm.ai/en/v0.9.2/features/disagg_prefill.html) diff --git a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md index dec4945ed..4c6b43f9b 100644 --- a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md +++ b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -21,25 +21,25 @@ As 2024 comes to a close, we reflect on the milestones we've achieved and look a While `dstack` integrates with leading cloud GPU providers, we aim to 
 expand partnerships with more providers sharing our vision of simplifying AI infrastructure orchestration with a lightweight, efficient alternative to Kubernetes.
 
-This year, we’re excited to welcome our first partners: [Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"},
-[RunPod :material-arrow-top-right-thin:{ .external }](https://www.runpod.io/){:target="_blank"},
-[CUDO Compute :material-arrow-top-right-thin:{ .external }](https://www.cudocompute.com/){:target="_blank"},
-and [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}.
+This year, we’re excited to welcome our first partners: [Lambda](https://lambdalabs.com/),
+[RunPod](https://www.runpod.io/),
+[CUDO Compute](https://www.cudocompute.com/),
+and [Hot Aisle](https://hotaisle.xyz/).
 
-We’d also like to thank [Oracle :material-arrow-top-right-thin:{ .external }](https://www.oracle.com/cloud/){:target="_blank"}
+We’d also like to thank [Oracle](https://www.oracle.com/cloud/)
 for their collaboration, ensuring seamless integration between `dstack` and OCI.
 
-> Special thanks to [Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"} and
-> [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} for providing NVIDIA and AMD hardware, enabling us conducting
+> Special thanks to [Lambda](https://lambdalabs.com/) and
+> [Hot Aisle](https://hotaisle.xyz/) for providing NVIDIA and AMD hardware, enabling us to conduct
 > [benchmarks](/blog/category/benchmarks/), which
 > are essential to advancing open-source inference and training stacks for all accelerator chips.
 
 ## Community
 
 Thanks to your support, the project has
-reached [1.6K stars on GitHub :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack){:target="_blank"},
+reached [1.6K stars on GitHub](https://github.com/dstackai/dstack),
 reflecting the growing interest and trust in its mission.
-Your issues, pull requests, as well as feedback on [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}, play a
+Your issues, pull requests, as well as feedback on [Discord](https://discord.gg/u8SmfwPpMd), play a
 critical role in the project's development.
 
 ## Fleets
@@ -87,7 +87,7 @@ This turns your on-prem cluster into a `dstack` fleet, ready to run dev environm
 ### GPU blocks
 
 At `dstack`, when running a job on an instance, it uses all available GPUs on that instance. In Q1 2025, we will
-introduce [GPU blocks :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1780){:target="_blank"},
+introduce [GPU blocks](https://github.com/dstackai/dstack/issues/1780),
 allowing the allocation of instance GPUs into discrete blocks that can be reused by concurrent jobs. This will enable
 more cost-efficient utilization of expensive instances.
 
@@ -112,16 +112,16 @@ for model deployment, and we continue to enhance support for the rest of NVIDIA'
 
 This year, we’re particularly proud of our newly added integration with AMD. `dstack` works seamlessly with any
 on-prem AMD clusters. For example, you can rent such servers through our partner
-[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}.
+[Hot Aisle](https://hotaisle.xyz/).
 
-> Among cloud providers, [AMD :material-arrow-top-right-thin:{ .external }](https://www.amd.com/en/products/accelerators/instinct.html){:target="_blank"} is supported only through RunPod. In Q1 2025, we plan to extend it to -[Nscale :material-arrow-top-right-thin:{ .external }](https://www.nscale.com/){:target="_blank"}, -> [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}, and potentially other providers open to collaboration. +> Among cloud providers, [AMD](https://www.amd.com/en/products/accelerators/instinct.html) is supported only through RunPod. In Q1 2025, we plan to extend it to +[Nscale](https://www.nscale.com/), +> [Hot Aisle](https://hotaisle.xyz/), and potentially other providers open to collaboration. ### Intel In Q1 2025, our roadmap includes added integration with -[Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html){:target="_blank"} +[Intel Gaudi](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) among other accelerator chips. ## Join the community diff --git a/docs/blog/posts/changelog-07-25.md b/docs/blog/posts/changelog-07-25.md index 13bf67f54..909fa2859 100644 --- a/docs/blog/posts/changelog-07-25.md +++ b/docs/blog/posts/changelog-07-25.md @@ -112,7 +112,7 @@ resources: #### Tenstorrent -`dstack` remains committed to supporting multiple GPU vendors—including NVIDIA, AMD, TPUs, and more recently, [Tenstorrent :material-arrow-top-right-thin:{ .external }](https://tenstorrent.com/){:target="_blank"}. The latest release improves Tenstorrent support by handling hosts with multiple N300 cards and adds Docker-in-Docker support. +`dstack` remains committed to supporting multiple GPU vendors—including NVIDIA, AMD, TPUs, and more recently, [Tenstorrent](https://tenstorrent.com/). The latest release improves Tenstorrent support by handling hosts with multiple N300 cards and adds Docker-in-Docker support. @@ -192,7 +192,7 @@ Server-side performance has been improved. With optimized handling and backgroun #### Google SSO -Alongside the open-source version, `dstack` also offers [dstack Enterprise :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack-enterprise){:target="_blank"} — which adds dedicated support and extra integrations like Single Sign-On (SSO). The latest release introduces support for configuring your company’s Google account for authentication. +Alongside the open-source version, `dstack` also offers [dstack Enterprise](https://github.com/dstackai/dstack-enterprise) — which adds dedicated support and extra integrations like Single Sign-On (SSO). The latest release introduces support for configuring your company’s Google account for authentication. @@ -201,4 +201,4 @@ If you’d like to learn more about `dstack` Enterprise, [let us know](https://c That’s all for now. !!! info "What's next?" - Give dstack a try, and share your feedback—whether it’s [GitHub :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack){:target="_blank"} issues, PRs, or questions on [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}. We’re eager to hear from you! + Give dstack a try, and share your feedback—whether it’s [GitHub](https://github.com/dstackai/dstack) issues, PRs, or questions on [Discord](https://discord.gg/u8SmfwPpMd). We’re eager to hear from you! 
diff --git a/docs/blog/posts/cursor.md b/docs/blog/posts/cursor.md index a5f960469..4e8e01fb4 100644 --- a/docs/blog/posts/cursor.md +++ b/docs/blog/posts/cursor.md @@ -15,7 +15,7 @@ automatic repository fetching, and streamlined access via SSH or a preferred des Previously, support was limited to VS Code. However, as developers rely on a variety of desktop IDEs, we’ve expanded compatibility. With this update, dev environments now offer effortless access for users of -[Cursor :material-arrow-top-right-thin:{ .external }](https://www.cursor.com/){:target="_blank"}. +[Cursor](https://www.cursor.com/). @@ -79,8 +79,8 @@ Using Cursor over VS Code offers multiple benefits, particularly when it comes t enhanced developer experience. !!! info "What's next?" - 1. [Download :material-arrow-top-right-thin:{ .external }](https://www.cursor.com/){:target="_blank"} and install Cursor + 1. [Download](https://www.cursor.com/) and install Cursor 2. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 2. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/digitalocean-and-amd-dev-cloud.md b/docs/blog/posts/digitalocean-and-amd-dev-cloud.md index 4103e6e15..ce400899d 100644 --- a/docs/blog/posts/digitalocean-and-amd-dev-cloud.md +++ b/docs/blog/posts/digitalocean-and-amd-dev-cloud.md @@ -12,8 +12,8 @@ categories: Orchestration automates provisioning, running jobs, and tearing them down. While Kubernetes and Slurm are powerful in their domains, they lack the lightweight, GPU-native focus modern teams need to move faster. -`dstack` is built entirely around GPUs. Our latest update introduces native integration with [DigitalOcean :material-arrow-top-right-thin:{ .external }](https://www.digitalocean.com/products/gradient/gpu-droplets){:target="_blank"} and -[AMD Developer Cloud :material-arrow-top-right-thin:{ .external }](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html){:target="_blank"}, enabling teams to provision cloud GPUs and run workloads more cost-efficiently. +`dstack` is built entirely around GPUs. Our latest update introduces native integration with [DigitalOcean](https://www.digitalocean.com/products/gradient/gpu-droplets) and +[AMD Developer Cloud](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html), enabling teams to provision cloud GPUs and run workloads more cost-efficiently. @@ -143,9 +143,9 @@ $ dstack apply -f examples/models/gpt-oss/120b.dstack.yml !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) - 2. Learn more about [DigitalOcean :material-arrow-top-right-thin:{ .external }](https://www.digitalocean.com/products/gradient/gpu-droplets){:target="_blank"} and - [AMD Developer Cloud :material-arrow-top-right-thin:{ .external }](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html){:target="_blank"} + 2. Learn more about [DigitalOcean](https://www.digitalocean.com/products/gradient/gpu-droplets) and + [AMD Developer Cloud](https://www.amd.com/en/developer/resources/cloud-access/amd-developer-cloud.html) 3. 
Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md)
-    4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}
+    4. Join [Discord](https://discord.gg/u8SmfwPpMd)
diff --git a/docs/blog/posts/docker-inside-containers.md b/docs/blog/posts/docker-inside-containers.md
index 13af39030..699e75fe7 100644
--- a/docs/blog/posts/docker-inside-containers.md
+++ b/docs/blog/posts/docker-inside-containers.md
@@ -94,12 +94,12 @@ Last but not least, you can, of course, use the `docker run` command, for exampl
 
 ## Examples
 
-A few examples of using this feature can be found in [`examples/misc/docker-compose` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose){: target="_ blank"}.
+A few examples of using this feature can be found in [`examples/misc/docker-compose`](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose).
 
 ## Feedback
 
 If you find something not working as intended, please be sure to report it to
-our [bug tracker :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_ blank"}.
+our [bug tracker](https://github.com/dstackai/dstack/issues).
 Your feedback and feature requests are also very welcome on both
-[Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} and the
-[issue tracker :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_blank"}.
+[Discord](https://discord.gg/u8SmfwPpMd) and the
+[issue tracker](https://github.com/dstackai/dstack/issues).
diff --git a/docs/blog/posts/dstack-metrics.md b/docs/blog/posts/dstack-metrics.md
index f4647d782..4558cd0cc 100644
--- a/docs/blog/posts/dstack-metrics.md
+++ b/docs/blog/posts/dstack-metrics.md
@@ -63,7 +63,7 @@ Monitoring is a critical part of observability, and we have many more features o
 ## Feedback
 
 If you find something not working as intended, please be sure to report it to
-our [bug tracker :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_ blank"}.
+our [bug tracker](https://github.com/dstackai/dstack/issues).
 Your feedback and feature requests are also very welcome on both
-[Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} and the
-[issue tracker :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_blank"}.
+[Discord](https://discord.gg/u8SmfwPpMd) and the
+[issue tracker](https://github.com/dstackai/dstack/issues).
diff --git a/docs/blog/posts/dstack-sky-own-cloud-accounts.md b/docs/blog/posts/dstack-sky-own-cloud-accounts.md
index 16b68867c..13c927a31 100644
--- a/docs/blog/posts/dstack-sky-own-cloud-accounts.md
+++ b/docs/blog/posts/dstack-sky-own-cloud-accounts.md
@@ -9,7 +9,7 @@ categories:
 
 # dstack Sky now supports your own cloud accounts
 
-[dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}
+[dstack Sky](https://sky.dstack.ai)
 enables you to access GPUs from the global marketplace at the most competitive rates.
 However, sometimes you may want to use your own cloud accounts. With today's release, both options are now supported.
 
@@ -30,12 +30,12 @@ CUDO, RunPod, and Vast.ai.
Additionally, you can disable certain backends if you do not plan to use them. Typically, if you prefer using your own cloud accounts, it's recommended that you use the -[open-source version :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/){:target="_blank"} of `dstack`. +[open-source version](https://github.com/dstackai/dstack/) of `dstack`. However, if you prefer not to host it yourself, now you can use `dstack Sky` with your own cloud accounts as well. > Seeking the cheapest on-demand and spot cloud GPUs? -> [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} has you covered! +> [dstack Sky](https://sky.dstack.ai) has you covered! Need help, have a question, or just want to stay updated? diff --git a/docs/blog/posts/ea-gtc25.md b/docs/blog/posts/ea-gtc25.md index 262554866..b7a28ba59 100644 --- a/docs/blog/posts/ea-gtc25.md +++ b/docs/blog/posts/ea-gtc25.md @@ -12,7 +12,7 @@ links: # How EA uses dstack to fast-track AI development -At NVIDIA GTC 2025, Electronic Arts [shared :material-arrow-top-right-thin:{ .external }](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/){:target="_blank"} how they’re scaling AI development and managing infrastructure across teams. They highlighted using tools like `dstack` to provision GPUs quickly, flexibly, and cost-efficiently. This case study summarizes key insights from their talk. +At NVIDIA GTC 2025, Electronic Arts [shared](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/) how they’re scaling AI development and managing infrastructure across teams. They highlighted using tools like `dstack` to provision GPUs quickly, flexibly, and cost-efficiently. This case study summarizes key insights from their talk. @@ -80,7 +80,7 @@ Workflows became standardized, reproducible, and easier to trace—thanks to the By adopting tools that are cloud-agnostic and developer-friendly, EA has reduced friction—from provisioning GPUs to deploying models—and enabled teams to spend more time on actual ML work. -*Huge thanks to Kris and Keng from EA’s central tech team for sharing these insights. For more details, including the recording and slides, check out the full talk on the [NVIDIA GTC website :material-arrow-top-right-thin:{ .external }](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/){:target="_blank"}.* +*Huge thanks to Kris and Keng from EA’s central tech team for sharing these insights. For more details, including the recording and slides, check out the full talk on the [NVIDIA GTC website](https://www.nvidia.com/en-us/on-demand/session/gtc25-s73667/).* !!! info "What's next?" 1. 
Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/blog/posts/gh200-on-lambda.md b/docs/blog/posts/gh200-on-lambda.md index e7831a76c..1a87dc90e 100644 --- a/docs/blog/posts/gh200-on-lambda.md +++ b/docs/blog/posts/gh200-on-lambda.md @@ -42,7 +42,7 @@ The GH200 Superchip’s 450 GB/s bidirectional bandwidth enables KV cache offloa ## GH200 on Lambda -[Lambda :material-arrow-top-right-thin:{ .external }](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..){:target="_blank"} provides secure, user-friendly, reliable, and affordable cloud GPUs. Since end of last year, Lambda started to offer on-demand GH200 instances through their public cloud. Furthermore, they offer these instances at the promotional price of $1.49 per hour until June 30th 2025. +[Lambda](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..) provides secure, user-friendly, reliable, and affordable cloud GPUs. Since end of last year, Lambda started to offer on-demand GH200 instances through their public cloud. Furthermore, they offer these instances at the promotional price of $1.49 per hour until June 30th 2025. With the latest `dstack` update, it’s now possible to use these instances with your Lambda account whether you’re running a dev environment, task, or service: @@ -81,7 +81,7 @@ $ dstack apply -f .dstack.yml > If you have GH200 or GB200-powered hosts already provisioned via Lambda, another cloud provider, or on-prem, you can now use them with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). !!! info "What's next?" - 1. Sign up with [Lambda :material-arrow-top-right-thin:{ .external }](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..){:target="_blank"} + 1. Sign up with [Lambda](https://cloud.lambda.ai/sign-up?_gl=1*1qovk06*_gcl_au*MTg2MDc3OTAyOS4xNzQyOTA3Nzc0LjE3NDkwNTYzNTYuMTc0NTQxOTE2MS4xNzQ1NDE5MTYw*_ga*MTE2NDM5MzI0My4xNzQyOTA3Nzc0*_ga_43EZT1FM6Q*czE3NDY3MTczOTYkbzM0JGcxJHQxNzQ2NzE4MDU2JGo1NyRsMCRoMTU0Mzg1NTU1OQ..) 2. Set up the [Lambda](../../docs/concepts/backends.md#lambda) backend 3. Follow [Quickstart](../../docs/quickstart.md) 4. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/blog/posts/gpu-blocks-and-proxy-jump.md b/docs/blog/posts/gpu-blocks-and-proxy-jump.md index cbf9ab7dc..61f28ea81 100644 --- a/docs/blog/posts/gpu-blocks-and-proxy-jump.md +++ b/docs/blog/posts/gpu-blocks-and-proxy-jump.md @@ -39,7 +39,7 @@ enables optimal hardware utilization by allowing concurrent workloads to run on available resources on each host. 
> For example, imagine you’ve reserved a cluster with multiple bare-metal nodes, each equipped with 8x MI300X GPUs from -[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}. +[Hot Aisle](https://hotaisle.xyz/). With `dstack`, you can define your fleet configuration like this: @@ -108,7 +108,7 @@ The latest `dstack` release introduces the [`proxy_jump`](../../docs/concepts/fl through a login node. > For example, imagine you’ve reserved a 1-Click Cluster from -> [Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"} with multiple nodes, each equipped with 8x H100 GPUs from. +> [Lambda](https://lambdalabs.com/) with multiple nodes, each equipped with 8x H100 GPUs from. With `dstack`, you can define your fleet configuration like this: @@ -174,8 +174,8 @@ an AI-native experience, simplicity, and vendor-agnostic orchestration for both !!! info "Roadmap" We plan to further enhance `dstack`'s support for both cloud and on-premises setups. For more details on our roadmap, - refer to our [GitHub :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/2184){:target="_blank"}. + refer to our [GitHub](https://github.com/dstackai/dstack/issues/2184). > Have questions? You're welcome to join -> our [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} or talk -> directly to [our team :material-arrow-top-right-thin:{ .external }](https://calendly.com/dstackai/discovery-call){:target="_blank"}. +> our [Discord](https://discord.gg/u8SmfwPpMd) or talk +> directly to [our team](https://calendly.com/dstackai/discovery-call). diff --git a/docs/blog/posts/gpu-health-checks.md b/docs/blog/posts/gpu-health-checks.md index 1571935b6..cc28bb96a 100644 --- a/docs/blog/posts/gpu-health-checks.md +++ b/docs/blog/posts/gpu-health-checks.md @@ -64,10 +64,10 @@ Passive GPU health checks work on AWS (except with custom `os_images`), Azure (e This update is about visibility: giving engineers real-time insight into GPU health before jobs run. Next comes automation — policies to skip GPUs with warnings, and self-healing workflows that replace unhealthy instances without manual steps. If you have experience with GPU reliability or ideas for automated recovery, join the conversation on -[Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}. +[Discord](https://discord.gg/u8SmfwPpMd). !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) 2. Explore the [clusters](../../docs/guides/clusters.md) guide 3. Learn more about [metrics](../../docs/guides/metrics.md) - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/h100-mi300x-inference-benchmark.md b/docs/blog/posts/h100-mi300x-inference-benchmark.md index 350795684..2209393d1 100644 --- a/docs/blog/posts/h100-mi300x-inference-benchmark.md +++ b/docs/blog/posts/h100-mi300x-inference-benchmark.md @@ -22,8 +22,8 @@ Finally, we extrapolate performance projections for upcoming GPUs like NVIDIA H2 This benchmark is made possible through the generous support of our friends at -[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} and -[Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"}, +[Hot Aisle](https://hotaisle.xyz/) and +[Lambda](https://lambdalabs.com/), who provided high-end hardware. @@ -42,7 +42,7 @@ who provided high-end hardware. ### Benchmark modes 1. **Online inference**: Benchmarked across QPS 16, 32, and 1000 using - the [ShareGPT :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered){:target="_blank"} dataset. Execution used + the [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. Execution used vLLM’s [benchmark\_serving](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py). 2. **Offline inference**: Benchmarked with varying input/output lengths across different batch sizes, using vLLM’s [benchmark\_throughput.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_throughput.py). @@ -79,7 +79,7 @@ With large prompts and batch sizes, two replicas on 4xMI300x GPUs hit memory sat length x batch size) exceed the available memory for the KV cache. This forces the inference engine to compute KV tensors on-the-fly or offload them to CPU memory, degrading throughput. -In [Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/blog/partner-spotlight-evaluating-nvidia-h200-gpus-for-ai-inference-with-baseten){:target="_blank"}’ +In [Lambda](https://lambdalabs.com/blog/partner-spotlight-evaluating-nvidia-h200-gpus-for-ai-inference-with-baseten)’ benchmark, an 8xH200 setup processed 3.4 times more tokens per second than an 8xH100. Extrapolating to our setup, an 8xH200 would process around 2,186 tokens per second (3.4 × 643), though still lower than 8xMI300x. @@ -122,7 +122,7 @@ Despite offering more memory, 4xMI300x lacks the parallelism of 8xH100, leading Processing a single large prompt request with 8xMI300x takes around 11.25 seconds. This latency is mainly due to computational demands during the prefill phase, where KV tensors are computed. -Optimizations like [automatic prefix caching :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/automatic_prefix_caching/apc.html){:target="_blank"} +Optimizations like [automatic prefix caching](https://docs.vllm.ai/en/latest/automatic_prefix_caching/apc.html) could help reduce this time, but are outside the scope of this benchmark. ## Benchmark notes @@ -139,7 +139,7 @@ with batch size 16 due to memory saturation, resulting in slower generation time ### Model checkpoints -For AMD MI300x, we used [`amd/Llama-3.1-405B-Instruct-FP8-KV` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV){:target="_blank"} +For AMD MI300x, we used [`amd/Llama-3.1-405B-Instruct-FP8-KV`](https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV) to achieve optimal performance, relying on AMD for quantization. 
### vLLM configuration @@ -186,20 +186,20 @@ cost-efficiency. ## Source code All the source code and findings to help you replicate the results are available in -[our GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/comparison/h100sxm5_vs_mi300x){:target="_blank"}. +[our GitHub repo](https://github.com/dstackai/benchmarks/tree/main/comparison/h100sxm5_vs_mi300x). ## Thanks to our friends ### Hot Aisle -[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} sponsored this benchmark by providing access to 8x MI300x hardware. We’re deeply grateful for their support. +[Hot Aisle](https://hotaisle.xyz/) sponsored this benchmark by providing access to 8x MI300x hardware. We’re deeply grateful for their support. If you're looking for top-tier bare metal compute with AMD GPUs, we highly recommend Hot Aisle. With `dstack`, accessing your cluster via SSH is seamless and straightforward. ### Lambda -[Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"} sponsored this benchmark with credits for on-demand 8x H100 instances. +[Lambda](https://lambdalabs.com/) sponsored this benchmark with credits for on-demand 8x H100 instances. We’re truly thankful for their support. For top-tier cloud compute with NVIDIA GPUs, Lambda is an excellent choice. Once set up, you can easily provision diff --git a/docs/blog/posts/h200-mi300x-deepskeek-benchmark.md b/docs/blog/posts/h200-mi300x-deepskeek-benchmark.md index 0d9e6f1fe..b8587d43f 100644 --- a/docs/blog/posts/h200-mi300x-deepskeek-benchmark.md +++ b/docs/blog/posts/h200-mi300x-deepskeek-benchmark.md @@ -21,8 +21,8 @@ determine the optimal backend and hardware pairing for DeepSeek-R1's demanding r This benchmark was made possible through the generous support of our partners at -[Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/){:target="_blank"} and -[Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"}, +[Vultr](https://www.vultr.com/) and +[Lambda](https://lambdalabs.com/), who provided access to the necessary hardware. @@ -44,7 +44,7 @@ who provided access to the necessary hardware. **Online inference** -We utilized SGLang's [`Deepseek-R1/bench_serving.py` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/Deepseek-R1/bench_serving.py){:target="_blank"} +We utilized SGLang's [`Deepseek-R1/bench_serving.py`](https://github.com/dstackai/benchmarks/tree/main/Deepseek-R1/bench_serving.py) script, modified to incorporate TensorRT-LLM. Tests were conducted across multiple request concurrencies and output token lengths, with input token length fixed at 3200. @@ -59,9 +59,9 @@ To test prefix caching ability, about 62.5% of each ~3200-token prompt (i.e., 20 **Offline inference** -For offline inference, we used vLLM’s [`benchmark_throughput.py` :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_throughput.py){:target="_blank"}, +For offline inference, we used vLLM’s [`benchmark_throughput.py`](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_throughput.py), modified for SGLang. TensorRT-LLM was tested using a custom -[`benchmark_throughput_trt.py` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/blob/deepseek-r1-benchmark/Deepseek-R1/benchmark_throughput_trt.py){:target="_blank"}. 
+[`benchmark_throughput_trt.py`](https://github.com/dstackai/benchmarks/blob/deepseek-r1-benchmark/Deepseek-R1/benchmark_throughput_trt.py). The benchmark examined performance across various batch sizes and output token lengths. | Batch Sizes | Output Token Lengths | @@ -129,7 +129,7 @@ TensorRT-LLM maintained the lowest and most consistent TTFT up to concurrency 64 vLLM achieved the lowest TTFT at concurrency 128. Below 128, vLLM and SGLang had similar TTFT. -TTFT, being compute-intensive, highlights H200's advantage, aligning with [SemiAnalysis’s MI300X vs. H200 TFLOPS benchmark :material-arrow-top-right-thin:{ .external }](https://semianalysis.com/2024/12/22/mi300x-vs-h100-vs-h200-benchmark-part-1-training/){:target="_blank"}. +TTFT, being compute-intensive, highlights H200's advantage, aligning with [SemiAnalysis’s MI300X vs. H200 TFLOPS benchmark](https://semianalysis.com/2024/12/22/mi300x-vs-h100-vs-h200-benchmark-part-1-training/). However, at 128 concurrent requests, MI300X's memory capacity and bandwidth advantages become evident. ### Time Per Output Token (TPOT) @@ -194,10 +194,10 @@ TPOT increased after prefix caching, which requires further investigation. ## Limitations 1. The offline benchmark results for TensorRT-LLM were obtained using the DeepSeek-R1 model engine built from the - [`deepseek` branch :material-arrow-top-right-thin:{ .external }](https://github.com/NVIDIA/TensorRT-LLM/tree/deepseek){:target="_blank"}. + [`deepseek` branch](https://github.com/NVIDIA/TensorRT-LLM/tree/deepseek). However, the TensorRT-LLM team recommends using the TorchFlow-based approach for deployment. 2. The impact of dynamic batching on inference efficiency was not tested. -3. vLLM's prefix caching support for MI300X is a work in progress and can be tracked [here :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/vllm/issues/457){:target="_blank"}. +3. vLLM's prefix caching support for MI300X is a work in progress and can be tracked [here](https://github.com/ROCm/vllm/issues/457). 4. The inference backends are being optimized for the DeepSeek-R1 model. Given these continuous updates, the current results reflect only the performance tested at the time of the benchmark. Overall, performance for all backends is expected to improve as more optimizations are made by the backend teams. @@ -205,27 +205,27 @@ TPOT increased after prefix caching, which requires further investigation. ## Source code All source code and findings are available in -[our GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/deepseek-r1-benchmark/Deepseek-R1){:target="_blank"}. +[our GitHub repo](https://github.com/dstackai/benchmarks/tree/deepseek-r1-benchmark/Deepseek-R1). 
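For the offline mode described earlier (fixed 3200-token inputs swept over batch sizes and output lengths), a `benchmark_throughput.py` invocation could look roughly like the sketch below; the model path, output length, prompt count, and parallelism are placeholders, and flags may vary across vLLM releases:

```shell
# Illustrative offline-throughput sketch (placeholder values; flags follow vLLM's benchmark_throughput.py).
python benchmarks/benchmark_throughput.py \
  --backend vllm \
  --model deepseek-ai/DeepSeek-R1 \
  --input-len 3200 \
  --output-len 800 \
  --num-prompts 64 \
  --tensor-parallel-size 8
```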
## References -* [Unlock DeepSeek-R1 Inference Performance on AMD Instinct MI300X GPU :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html){:target="_blank"} -* [Deploy DeepSeek-R1 671B on 8x NVIDIA H200 with SGLang :material-arrow-top-right-thin:{ .external }](https://datacrunch.io/blog/deploy-deepseek-r1-on-8x-nvidia-h200){:target="_blank"} -* [vLLM Prefix Caching :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/design/automatic_prefix_caching.html#design-automatic-prefix-caching){:target="_blank"} -* [SgLang Prefix Caching :material-arrow-top-right-thin:{ .external }](https://lmsys.org/blog/2024-01-17-sglang/){:target="_blank"} +* [Unlock DeepSeek-R1 Inference Performance on AMD Instinct MI300X GPU](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html) +* [Deploy DeepSeek-R1 671B on 8x NVIDIA H200 with SGLang](https://datacrunch.io/blog/deploy-deepseek-r1-on-8x-nvidia-h200) +* [vLLM Prefix Caching](https://docs.vllm.ai/en/latest/design/automatic_prefix_caching.html#design-automatic-prefix-caching) +* [SgLang Prefix Caching](https://lmsys.org/blog/2024-01-17-sglang/) ## Acknowledgments ### Vultr -[Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/){:target="_blank"} provided access to 8x AMD MI300X GPUs. We are truly thankful for their support. +[Vultr](https://www.vultr.com/) provided access to 8x AMD MI300X GPUs. We are truly thankful for their support. If you're looking for top-tier bare metal compute with AMD GPUs, we highly recommend Vultr. With `dstack`, provisioning and accessing compute via `dstack` is seamless and straightforward. ### Lambda -[Lambda :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/){:target="_blank"} provided access to 8x +[Lambda](https://lambdalabs.com/) provided access to 8x NVIDIA H200 GPUs. We are truly thankful for their support Both Vultr and Lambda are natively supported and can be seamlessly integrated with `dstack`. diff --git a/docs/blog/posts/hotaisle.md b/docs/blog/posts/hotaisle.md index 4d2e761c0..928d0cf9c 100644 --- a/docs/blog/posts/hotaisle.md +++ b/docs/blog/posts/hotaisle.md @@ -16,7 +16,7 @@ As the ecosystem around AMD GPUs matures, developers are looking for easier ways -Today, we’re excited to announce native integration with [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://www.hotaisle.io/){:target="_blank"}, an AMD-only GPU neocloud offering VMs and clusters at highly competitive on-demand pricing. +Today, we’re excited to announce native integration with [Hot Aisle](https://www.hotaisle.io/), an AMD-only GPU neocloud offering VMs and clusters at highly competitive on-demand pricing. @@ -107,8 +107,8 @@ Currently, `dstack` supports 1xGPU Hot Aisle VMs. Support for 8xGPU VMs will be !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) - 2. Learn more about [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} + 2. Learn more about [Hot Aisle](https://hotaisle.xyz/) 3. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/inactivity-duration.md b/docs/blog/posts/inactivity-duration.md index d04a8eba4..7c3d88eb5 100644 --- a/docs/blog/posts/inactivity-duration.md +++ b/docs/blog/posts/inactivity-duration.md @@ -72,4 +72,4 @@ fleets by teams. 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 2. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/instance-volumes.md b/docs/blog/posts/instance-volumes.md index 07b82012c..36ead6930 100644 --- a/docs/blog/posts/instance-volumes.md +++ b/docs/blog/posts/instance-volumes.md @@ -82,6 +82,6 @@ volumes: ## Feedback If you find something not working as intended, please be sure to report it to -[GitHub issues :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_ blank"}. +[GitHub issues](https://github.com/dstackai/dstack/issues). Your feedback and feature requests is also very welcome on our -[Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} server. +[Discord](https://discord.gg/u8SmfwPpMd) server. diff --git a/docs/blog/posts/intel-gaudi.md b/docs/blog/posts/intel-gaudi.md index 887ae32a6..4ac0e6770 100644 --- a/docs/blog/posts/intel-gaudi.md +++ b/docs/blog/posts/intel-gaudi.md @@ -92,10 +92,10 @@ Provisioning... With your fleet provisioned, you can now run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md). -Below is an example of a task configuration for fine-tuning the [`DeepSeek-R1-Distill-Qwen-7B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B){:target="_blank"} -model using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"} -and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with -the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset: +Below is an example of a task configuration for fine-tuning the [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) +model using [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana) +and [DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide) with +the [`lvwerra/stack-exchange-paired`](https://huggingface.co/datasets/lvwerra/stack-exchange-paired) dataset:
@@ -163,7 +163,7 @@ $ dstack apply -f examples/single-node-training/trl/intel/.dstack.yml -R !!! info "Intel Tiber AI Cloud" At `dstack`, we’re grateful to be part of the Intel Liftoff program, which allowed us to access Intel Gaudi AI - accelerators via [Intel Tiber AI Cloud :material-arrow-top-right-thin:{ .external }](https://www.intel.com/content/www/us/en/developer/tools/tiber/ai-cloud.html){:target="_blank"}. + accelerators via [Intel Tiber AI Cloud](https://www.intel.com/content/www/us/en/developer/tools/tiber/ai-cloud.html). You can sign up if you’d like to access Intel Gaudi AI accelerators via the cloud. Native integration with Intel Tiber AI Cloud is also coming soon to `dstack`. @@ -171,4 +171,4 @@ $ dstack apply -f examples/single-node-training/trl/intel/.dstack.yml -R !!! info "What's next?" 1. Refer to [Quickstart](../../docs/quickstart.md) 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/kubernetes-beta.md b/docs/blog/posts/kubernetes-beta.md index cc2529e7f..6dfc7cd5b 100644 --- a/docs/blog/posts/kubernetes-beta.md +++ b/docs/blog/posts/kubernetes-beta.md @@ -29,11 +29,11 @@ A major advantage of Kubernetes is its portability. Whether you’re using manag !!! info "NVIDIA GPU Operator" For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the - [NVIDIA GPU Operator :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html){:target="_blank"} pre-installed. + [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. ### Nebius example -If you're using [Nebius :material-arrow-top-right-thin:{ .external }](https://nebius.com/){:target="_blank"}, the process of creating a Kubernetes cluster is straightforward. +If you're using [Nebius](https://nebius.com/), the process of creating a Kubernetes cluster is straightforward. Select the region of interest and click `Create cluster`. Once the cluster is created, switch to `Applications` and install the `nvidia-device-plugin` application — this can be done in one click. @@ -312,4 +312,4 @@ Support for AMD GPUs is coming soon — our team is actively working on it right [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 3. Read the the [clusters](../../docs/guides/clusters.md) guide - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/metrics-ui.md b/docs/blog/posts/metrics-ui.md index b15bbffc5..db21cf019 100644 --- a/docs/blog/posts/metrics-ui.md +++ b/docs/blog/posts/metrics-ui.md @@ -55,4 +55,4 @@ metrics from `dstack`. !!! info "What's next?" 1. See [Metrics](../../docs/guides/metrics.md) 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 3. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/mpi.md b/docs/blog/posts/mpi.md index 5473d64a2..7b4b3d64b 100644 --- a/docs/blog/posts/mpi.md +++ b/docs/blog/posts/mpi.md @@ -94,11 +94,11 @@ With this, now you can use such a task to run both NCCL or RCCL tests on both cl as well as use MPI for other tasks. > The `dstackai/efa` image used in the example comes with MPI and NCCL tests pre-installed. While it is optimized for -> [AWS EFA :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"}, it can also +> [AWS EFA](https://aws.amazon.com/hpc/efa/), it can also > be used with regular TCP/IP network adapters and InfiniBand. -> See the [source code :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/docker/efa) for the image. +> See the [source code](https://github.com/dstackai/dstack/blob/master/docker/efa) for the image. !!! info "What's next?" 1. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 2. Check the [NCCL tests](../../examples/clusters/nccl-tests/index.md) example - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nebius-in-dstack-sky.md b/docs/blog/posts/nebius-in-dstack-sky.md index e05bc2764..a65a06dcf 100644 --- a/docs/blog/posts/nebius-in-dstack-sky.md +++ b/docs/blog/posts/nebius-in-dstack-sky.md @@ -8,11 +8,11 @@ categories: - Changelog --- -# Nebius joins dstack Sky GPU marketplace, with production-ready GPU clusters +# Nebius in dstack Sky GPU marketplace, with production-ready GPU clusters -`dstack` is an [open-source :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack){:target="_blank"} control plane for orchestrating GPU workloads. It can provision cloud VMs, run on top of Kubernetes, or manage on-prem clusters. If you don’t want to self-host, you can use [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, the managed version of `dstack` that also provides access to cloud GPUs via its markfetplace. +`dstack` is an [open-source](https://github.com/dstackai/dstack) control plane for orchestrating GPU workloads. It can provision cloud VMs, run on top of Kubernetes, or manage on-prem clusters. If you don’t want to self-host, you can use [dstack Sky](https://sky.dstack.ai), the managed version of `dstack` that also provides access to cloud GPUs via its marketplace. -With our latest release, we’re excited to announce that [Nebius :material-arrow-top-right-thin:{ .external }](https://nebius.com/){:target="_blank"}, a purpose-built AI cloud for large scale training and inference, has joined the `dstack` Sky marketplace +With our latest release, we’re excited to announce that [Nebius](https://nebius.com/), a purpose-built AI cloud for large scale training and inference, has joined the `dstack` Sky marketplace to offer on-demand and spot GPUs, including clusters. @@ -26,7 +26,7 @@ Since early this year, the open-source `dstack` has supported Nebius, making it ## About dstack Sky -With this week's release, Nebius officially joins [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}.
Nebius can now be used not only with your own account, but also directly via the GPU marketplace. +With this week's release, Nebius officially joins [dstack Sky](https://sky.dstack.ai). Nebius can now be used not only with your own account, but also directly via the GPU marketplace. The marketplace lets you access Nebius GPUs without having a Nebius account. You can pay through `dstack Sky`, and switch to your own Nebius account anytime with just a few clicks. @@ -39,7 +39,7 @@ With Nebius, `dstack` Sky users can orchestrate NVIDIA GPUs provisioned in hours ## Getting started -After you [sign up :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} with `dstack` Sky, +After you [sign up](https://sky.dstack.ai) with `dstack` Sky, you’ll be prompted to create a project and choose between the GPU marketplace or your own cloud account: @@ -118,9 +118,9 @@ Our goal is to give teams maximum flexibility while removing the complexity of m providing a simple, multi-cloud interface for development, training, and inference. !!! info "What's next" - 1. Sign up with [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} + 1. Sign up with [dstack Sky](https://sky.dstack.ai) 2. Check [Quickstart](../../docs/quickstart.md) - 3. Learn more about [Nebius :material-arrow-top-right-thin:{ .external }](https://nebius.com/){:target="_blank"} + 3. Learn more about [Nebius](https://nebius.com/) 4. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/blog/posts/nebius.md b/docs/blog/posts/nebius.md index d24681959..c7a280971 100644 --- a/docs/blog/posts/nebius.md +++ b/docs/blog/posts/nebius.md @@ -17,7 +17,7 @@ alternative to Kubernetes and Slurm. -Today, we’re announcing native integration with [Nebius :material-arrow-top-right-thin:{ .external }](https://nebius.com/){:target="_blank"}, +Today, we’re announcing native integration with [Nebius](https://nebius.com/), offering a streamlined developer experience for teams using GPUs for AI workloads. @@ -44,7 +44,7 @@ long-running services—without the operational overhead. To use `dstack` with Nebius, configure your `nebius` backend: -1. Log in to your [Nebius AI Cloud :material-arrow-top-right-thin:{ .external }](https://console.eu.nebius.com/){:target="_blank"} account. +1. Log in to your [Nebius AI Cloud](https://console.eu.nebius.com/) account. 2. Navigate to `Access`, and select `Service Accounts`. 3. Create a new service account, assign it to the `editors` group, and upload an authorized key. @@ -108,8 +108,8 @@ interconnects is coming soon. !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) - 2. Sign up with [Nebius AI Cloud :material-arrow-top-right-thin:{ .external }](https://console.eu.nebius.com/){:target="_blank"} + 2. Sign up with [Nebius AI Cloud](https://console.eu.nebius.com/) 3. Read about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nvidia-and-amd-on-vultr.md b/docs/blog/posts/nvidia-and-amd-on-vultr.md index 2fb30ebbc..ed75607d4 100644 --- a/docs/blog/posts/nvidia-and-amd-on-vultr.md +++ b/docs/blog/posts/nvidia-and-amd-on-vultr.md @@ -15,7 +15,7 @@ increasingly important. At `dstack`, we’re committed to redefining AI container orchestration by prioritizing an AI-native, open-source-first approach. Today, we’re excited to share a new integration and partnership -with [Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/){:target="_blank"}. +with [Vultr](https://www.vultr.com/). @@ -26,9 +26,9 @@ and NVIDIA GPUs with greater flexibility and efficiency–using `dstack`. ## About Vultr -[Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/){:target="_blank"} provides cloud GPUs across 32 regions, supporting both NVIDIA and AMD hardware with on-demand and reserved +[Vultr](https://www.vultr.com/) provides cloud GPUs across 32 regions, supporting both NVIDIA and AMD hardware with on-demand and reserved capacity. Their offerings include AMD MI300X and NVIDIA GH200, H200, H100, A100, L40S, and A40, all available at -competitive [pricing :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/pricing/#cloud-gpu){:target="_blank"}. +competitive [pricing](https://www.vultr.com/pricing/#cloud-gpu). ## Why dstack @@ -47,7 +47,7 @@ and volumes—so you can focus on building instead of troubleshooting infrastruc To use `dstack` with your Vultr account, you need to [configure a `vultr` backend](../../docs/concepts/backends.md): -Log into your [Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. +Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. Then, go ahead and configure the backend: @@ -71,8 +71,8 @@ For more details, refer to [Installation](../../docs/installation/index.md). !!! info "What's next?" 1. Refer to [Quickstart](../../docs/quickstart.md) - 2. Sign up with [Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/) + 2. Sign up with [Vultr](https://www.vultr.com/) 3. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. 
Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nvidia-dgx-spark.md b/docs/blog/posts/nvidia-dgx-spark.md index a51fee474..60202c60c 100644 --- a/docs/blog/posts/nvidia-dgx-spark.md +++ b/docs/blog/posts/nvidia-dgx-spark.md @@ -10,7 +10,7 @@ image: https://dstack.ai/static-assets/static-assets/images/nvidia-dgx-spark.png # Orchestrating workloads on NVIDIA DGX Spark -With support from [Graphsignal :material-arrow-top-right-thin:{ .external }](https://x.com/GraphsignalAI/status/1986565583593197885){:target="_blank" }, our team gained access to the new [NVIDIA DGX Spark :material-arrow-top-right-thin:{ .external }](https://www.nvidia.com/en-us/products/workstations/dgx-spark/){:target="_blank"} and used it to validate how `dstack` operates on this hardware. This post walks through how to set it up with `dstack` and use it alongside existing on-prem clusters or GPU cloud environments to run workloads. +With support from [Graphsignal](https://x.com/GraphsignalAI/status/1986565583593197885), our team gained access to the new [NVIDIA DGX Spark](https://www.nvidia.com/en-us/products/workstations/dgx-spark/) and used it to validate how `dstack` operates on this hardware. This post walks through how to set it up with `dstack` and use it alongside existing on-prem clusters or GPU cloud environments to run workloads. @@ -121,12 +121,12 @@ To open in VS Code Desktop, use this link: > Running workloads on DGX Spark with `dstack` works the same way as on any other [backend](../../docs/concepts/backends.md) (including GPU clouds): you can run [dev environments](../../docs/concepts/dev-environments.md) for interactive development, [tasks](../../docs/concepts/tasks.md) for fine tuning, and [services](../../docs/concepts/services.md) for inference through the unified interface. -1. Read the [NVIDIA DGX Spark in-depth review :material-arrow-top-right-thin:{ .external }](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/){:target="_blank"} by the SGLang team. +1. Read the [NVIDIA DGX Spark in-depth review](https://lmsys.org/blog/2025-10-13-nvidia-dgx-spark/) by the SGLang team. 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 3. Follow [Quickstart](../../docs/quickstart.md) -4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} +4. Join [Discord](https://discord.gg/u8SmfwPpMd) !!! info "Aknowledgement" - Thanks to the [Graphsignal :material-arrow-top-right-thin:{ .external }](https://graphsignal.com/){:target="_blank"} team for access to DGX Spark and for supporting testing and validation. Graphsignal provides inference observability tooling used to profile CUDA workloads during both training and inference. + Thanks to the [Graphsignal](https://graphsignal.com/) team for access to DGX Spark and for supporting testing and validation. Graphsignal provides inference observability tooling used to profile CUDA workloads during both training and inference. diff --git a/docs/blog/posts/probes.md b/docs/blog/posts/probes.md index 428d0a7fa..d3d85335a 100644 --- a/docs/blog/posts/probes.md +++ b/docs/blog/posts/probes.md @@ -108,4 +108,4 @@ See [services](../../docs/concepts/services.md#probes) and the [reference](../.. !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) 2. Learn about [services](../../docs/concepts/services.md) - 3. 
Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/prometheus.md b/docs/blog/posts/prometheus.md index 2594619c0..8a4d579c0 100644 --- a/docs/blog/posts/prometheus.md +++ b/docs/blog/posts/prometheus.md @@ -63,4 +63,4 @@ For a full list of available metrics and labels, check out [Metrics](../../docs/ 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 2. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/state-of-cloud-gpu-2025.md b/docs/blog/posts/state-of-cloud-gpu-2025.md index c689c9c4b..238926ebf 100644 --- a/docs/blog/posts/state-of-cloud-gpu-2025.md +++ b/docs/blog/posts/state-of-cloud-gpu-2025.md @@ -135,11 +135,11 @@ This turns capacity from individual silos into one fungible pool. - **Next steps.** We plan to publish price normalization, hardware/network microbenchmarks, and a scheduler capability matrix; preliminary harnesses are linked in the appendix. Contributors welcome. -> If you need a lighter, simpler orchestration and control-plane alternative to Kubernetes or Slurm, consider [dstack :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/){:target="_blank"}. +> If you need a lighter, simpler orchestration and control-plane alternative to Kubernetes or Slurm, consider [dstack](https://github.com/dstackai/dstack/). It’s open-source and self-hosted. ??? info "dstack Sky" - If you want unified access to low-cost on-demand and spot GPUs across multiple clouds, try [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai/){:target="_blank"}. + If you want unified access to low-cost on-demand and spot GPUs across multiple clouds, try [dstack Sky](https://sky.dstack.ai/). diff --git a/docs/blog/posts/tpu-on-gcp.md b/docs/blog/posts/tpu-on-gcp.md index 8fff83cb4..4a45af000 100644 --- a/docs/blog/posts/tpu-on-gcp.md +++ b/docs/blog/posts/tpu-on-gcp.md @@ -14,8 +14,8 @@ If you’re using or planning to use TPUs with Google Cloud, you can now do so v Read below to find out how to use TPUs with `dstack` for fine-tuning and deploying LLMs, leveraging open-source tools like Hugging Face’s -[Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} -and [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html){:target="_blank"}. +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). @@ -45,8 +45,8 @@ If you've configured the `gcp` backend, `dstack` will automatically provision th You can use any serving framework, such as vLLM, TGI. Here's an example of a [service](https://dstack.ai/docs/services) that deploys Llama 3.1 8B using -[Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} -and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"}. +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://github.com/vllm-project/vllm). 
=== "Optimum TPU" @@ -79,7 +79,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm- ```
- Once the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/pull/87){:target="_blank"} is merged, + Once the [pull request](https://github.com/huggingface/optimum-tpu/pull/87) is merged, the official Docker image can be used instead of `dstackai/optimum-tpu:llama31`. === "vLLM" @@ -145,7 +145,7 @@ Note, `v5litepod` is optimized for serving transformer-based models. Each core i | Framework | Quantization | Note | |-----------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **TGI** | bfloat16 | To deploy with TGI, Optimum TPU must be used. | -| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/pull/7005){:target="_blank"} for more details. | +| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request](https://github.com/vllm-project/vllm/pull/7005) for more details. | ### Running a configuration @@ -154,8 +154,8 @@ cloud resources and run the configuration. ## Fine-tuning -Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} -and the [Abirate/english_quotes :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/Abirate/english_quotes){:target="_blank"} +Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU](https://github.com/huggingface/optimum-tpu) +and the [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset.
@@ -208,12 +208,12 @@ Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each co ## What's next? -1. Browse [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"}, - [Optimum TPU TGI :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference){:target="_blank"} and - [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html){:target="_blank"}. +1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), + [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and + [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](https://dstack.ai/docs/tasks), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). !!! info "Multi-host TPUs" If you’d like to use `dstack` with more than eight TPU cores, upvote the corresponding - [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1337){:target="_blank"}. + [issue](https://github.com/dstackai/dstack/issues/1337). diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index c89875486..291857139 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -10,7 +10,7 @@ They can be configured via `~/.dstack/server/config.yml` or through the [project * [On-prem](#on-prem) – use `dstack`'s native support for on-prem servers without needing Kubernetes. !!! info "dstack Sky" - If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, backend configuration is optional. dstack Sky lets you use pre-configured backends to access GPU marketplace. + If you're using [dstack Sky](https://sky.dstack.ai), backend configuration is optional. dstack Sky lets you use pre-configured backends to access GPU marketplace. See the examples of backend configuration below. @@ -46,7 +46,7 @@ There are two ways to configure AWS: using an access key or using the default cr === "Access key" - Create an access key by following the [this guide :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/cli/latest/userguide/cli-authentication-user.html#cli-authentication-user-get). + Create an access key by following the [this guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-authentication-user.html#cli-authentication-user-get). Once you've downloaded the `.csv` file with your IAM user's Access key ID and Secret access key, proceed to configure the backend. @@ -223,7 +223,7 @@ There are two ways to configure AWS: using an access key or using the default cr Additionally, private subnets must have outbound internet connectivity provided by NAT Gateway, Transit Gateway, or other mechanism. ??? info "OS images" - By default, `dstack` uses its own [AMI :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) + By default, `dstack` uses its own [AMI](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) optimized for `dstack`. To use your own or other third-party images, set the `os_images` property: @@ -282,7 +282,7 @@ There are two ways to configure Azure: using a client secret or using the defaul
- If you don't know your `subscription_id` and `tenant_id`, use [Azure CLI :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + If you don't know your `subscription_id` and `tenant_id`, use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): ```shell az account show --query "{subscription_id: id, tenant_id: tenantId}" @@ -290,7 +290,7 @@ There are two ways to configure Azure: using a client secret or using the defaul === "Client secret" - A client secret can be created using the [Azure CLI :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + A client secret can be created using the [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): ```shell SUBSCRIPTION_ID=... @@ -320,7 +320,7 @@ There are two ways to configure Azure: using a client secret or using the defaul
- If you don't know your `subscription_id`, use [Azure CLI :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): + If you don't know your `subscription_id`, use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli): ```shell az account show --query "{subscription_id: id}" @@ -440,7 +440,7 @@ There are two ways to configure GCP: using a service account or using the defaul === "Service account" - To create a service account, follow [this guide :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/iam/docs/service-accounts-create). After setting up the service account [create a key :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/iam/docs/keys-create-delete) for it and download the corresponding JSON file. + To create a service account, follow [this guide](https://cloud.google.com/iam/docs/service-accounts-create). After setting up the service account [create a key](https://cloud.google.com/iam/docs/keys-create-delete) for it and download the corresponding JSON file. Then go ahead and configure the backend by specifying the downloaded file path. @@ -488,7 +488,7 @@ There are two ways to configure GCP: using a service account or using the defaul -If you don't know your GCP project ID, use [Google Cloud CLI :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/sdk/docs/install-sdk): +If you don't know your GCP project ID, use [Google Cloud CLI](https://cloud.google.com/sdk/docs/install-sdk): ```shell gcloud projects list --format="json(projectId)" @@ -629,7 +629,7 @@ gcloud projects list --format="json(projectId)" ### Lambda -Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` +Log into your [Lambda Cloud](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` button to create a new API key. Then, go ahead and configure the backend: @@ -650,7 +650,7 @@ projects: ### Nebius -Log into your [Nebius AI Cloud :material-arrow-top-right-thin:{ .external }](https://console.eu.nebius.com/) account, navigate to Access, and select Service Accounts. Create a service account, add it to the editors group, and upload its authorized key. +Log into your [Nebius AI Cloud](https://console.eu.nebius.com/) account, navigate to Access, and select Service Accounts. Create a service account, add it to the editors group, and upload its authorized key. Then configure the backend: @@ -671,7 +671,7 @@ projects: ??? info "Credentials file" - It's also possible to configure the `nebius` backend using a credentials file [generated :material-arrow-top-right-thin:{ .external }](https://docs.nebius.com/iam/service-accounts/authorized-keys#create){:target="_blank"} by the `nebius` CLI: + It's also possible to configure the `nebius` backend using a credentials file [generated](https://docs.nebius.com/iam/service-accounts/authorized-keys#create) by the `nebius` CLI:
@@ -743,7 +743,7 @@ projects: ### Vultr -Log into your [Vultr :material-arrow-top-right-thin:{ .external }](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. +Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. Then, go ahead and configure the backend: @@ -763,7 +763,7 @@ projects: ### CUDO -Log into your [CUDO Compute :material-arrow-top-right-thin:{ .external }](https://compute.cudo.org/) account, click API keys in the sidebar, and click the `Create an API key` button. +Log into your [CUDO Compute](https://compute.cudo.org/) account, click API keys in the sidebar, and click the `Create an API key` button. Ensure you've created a project with CUDO Compute, then proceed to configuring the backend. @@ -804,7 +804,7 @@ There are two ways to configure OCI: using client credentials or using the defau === "Client credentials" - Log into the [OCI Console :material-arrow-top-right-thin:{ .external }](https://cloud.oracle.com), go to `My profile`, + Log into the [OCI Console](https://cloud.oracle.com), go to `My profile`, select `API keys`, and click `Add API key`. Once you add a key, you'll see the configuration file. Copy its values to configure the backend as follows: @@ -858,7 +858,7 @@ There are two ways to configure OCI: using client credentials or using the defau ### DataCrunch -Log into your [DataCrunch :material-arrow-top-right-thin:{ .external }](https://cloud.datacrunch.io/) account, click Keys in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. +Log into your [DataCrunch](https://cloud.datacrunch.io/) account, click Keys in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. Then, go ahead and configure the backend: @@ -878,7 +878,7 @@ projects:
### AMD Developer Cloud -Log into your [AMD Developer Cloud :material-arrow-top-right-thin:{ .external }](https://amd.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. +Log into your [AMD Developer Cloud](https://amd.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. Then, go ahead and configure the backend: @@ -912,7 +912,7 @@ projects: ### Digital Ocean -Log into your [Digital Ocean :material-arrow-top-right-thin:{ .external }](https://cloud.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. +Log into your [Digital Ocean](https://cloud.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. Then, go ahead and configure the backend: @@ -946,7 +946,7 @@ projects: ### Hot Aisle -Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). +Log in to the SSH TUI as described in the [Hot Aisle Quick Start](https://hotaisle.xyz/quick-start/). Create a new team and generate an API key for the member in the team. Then, go ahead and configure the backend: @@ -975,7 +975,7 @@ projects: ### CloudRift -Log into your [CloudRift :material-arrow-top-right-thin:{ .external }](https://console.cloudrift.ai/) console, click `API Keys` in the sidebar and click the button to create a new API key. +Log into your [CloudRift](https://console.cloudrift.ai/) console, click `API Keys` in the sidebar and click the button to create a new API key. Ensure you've created a project with CloudRift. @@ -1038,11 +1038,11 @@ projects: ??? info "NVIDIA GPU Operator" For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the - [NVIDIA GPU Operator :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html){:target="_blank"} pre-installed. + [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. @@ -1075,7 +1075,7 @@ projects: ### RunPod -Log into your [RunPod :material-arrow-top-right-thin:{ .external }](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click +Log into your [RunPod](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click the button to create a Read & Write key. Then proceed to configuring the backend. @@ -1096,7 +1096,7 @@ projects: ??? info "Community Cloud" By default, `dstack` considers instance offers from both the Secure Cloud and the - [Community Cloud :material-arrow-top-right-thin:{ .external }](https://docs.runpod.io/references/faq/#secure-cloud-vs-community-cloud). + [Community Cloud](https://docs.runpod.io/references/faq/#secure-cloud-vs-community-cloud). You can tell them apart by their regions. Secure Cloud regions contain datacenter IDs such as `CA-MTL-3`. @@ -1133,7 +1133,7 @@ projects: ### Vast.ai -Log into your [Vast.ai :material-arrow-top-right-thin:{ .external }](https://cloud.vast.ai/) account, click Account in the sidebar, and copy your +Log into your [Vast.ai](https://cloud.vast.ai/) account, click Account in the sidebar, and copy your API Key. 
Then, go ahead and configure the backend: diff --git a/docs/docs/concepts/dev-environments.md b/docs/docs/concepts/dev-environments.md index bee28cfbe..ca6485786 100644 --- a/docs/docs/concepts/dev-environments.md +++ b/docs/docs/concepts/dev-environments.md @@ -55,7 +55,7 @@ To open in VS Code Desktop, use this link: ??? info "Windows" On Windows, `dstack` works both natively and inside WSL. But, for dev environments, - it's recommended _not to use_ `dstack apply` _inside WSL_ due to a [VS Code issue :material-arrow-top-right-thin:{ .external }](https://github.com/microsoft/vscode-remote-release/issues/937){:target="_blank"}. + it's recommended _not to use_ `dstack apply` _inside WSL_ due to a [VS Code issue](https://github.com/microsoft/vscode-remote-release/issues/937). To open the dev environment in your desktop IDE, use the link from the output (such as `vscode://vscode-remote/ssh-remote+fast-moth-1/workflow`). @@ -159,7 +159,7 @@ If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. #### Default image -If you don't specify `image`, `dstack` uses its [base :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/docker/base){:target="_blank"} Docker image pre-configured with +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). Set the `python` property to pre-install a specific version of Python. diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 33824746d..58c0f29be 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -102,14 +102,14 @@ To ensure instances are interconnected (e.g., for This ensures all instances are provisioned with optimal inter-node connectivity. ??? info "AWS" - When you create a fleet with AWS, [Elastic Fabric Adapter networking :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html){:target="_blank"} is automatically configured if it’s supported for the corresponding instance type. + When you create a fleet with AWS, [Elastic Fabric Adapter networking](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) is automatically configured if it’s supported for the corresponding instance type. Note, EFA requires the `public_ips` to be set to `false` in the `aws` backend configuration. Otherwise, instances are only connected by the default VPC subnet. Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details. ??? info "GCP" - When you create a fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. + When you create a fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. !!! info "Backend configuration" You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. 
@@ -118,7 +118,7 @@ This ensures all instances are provisioned with optimal inter-node connectivity. [A3 High](../../examples/clusters/a3high/index.md) examples for more details. ??? info "Nebius" - When you create a fleet with Nebius, [InfiniBand networking :material-arrow-top-right-thin:{ .external }](https://docs.nebius.com/compute/clusters/gpu){:target="_blank"} is automatically configured if it’s supported for the corresponding instance type. + When you create a fleet with Nebius, [InfiniBand networking](https://docs.nebius.com/compute/clusters/gpu) is automatically configured if it’s supported for the corresponding instance type. Otherwise, instances are only connected by the default VPC subnet. An InfiniBand fabric for the cluster is selected automatically. If you prefer to use some specific fabrics, configure them in the @@ -304,12 +304,12 @@ Define a fleet configuration as a YAML file in your project directory. The file === "NVIDIA" 2. Hosts with NVIDIA GPUs must also be pre-installed with CUDA 12.1 and - [NVIDIA Container Toolkit :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). === "AMD" 2. Hosts with AMD GPUs must also be pre-installed with AMDGPU-DKMS kernel driver (e.g. via - [native package manager :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/index.html) - or [AMDGPU installer :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html).) + [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/index.html) + or [AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html).) === "Intel Gaudi" 2. Hosts with Intel Gaudi accelerators must be pre-installed with [Gaudi software and drivers](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#driver-installation). diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 03ddd10e5..eb433f7d3 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -1,8 +1,8 @@ # Gateways -Gateways manage ingress traffic for running [services](services.md), handle auto-scaling and rate limits, enable HTTPS, and allow you to configure a custom domain. They also support custom routers, such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}. +Gateways manage ingress traffic for running [services](services.md), handle auto-scaling and rate limits, enable HTTPS, and allow you to configure a custom domain. They also support custom routers, such as the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#). - ## Apply a configuration @@ -62,7 +62,7 @@ By default, the gateway uses its own load balancer to route traffic between repl #### SGLang -The `sglang` router delegates routing logic to the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}. +The `sglang` router delegates routing logic to the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#). 
To enable it, set `type` field under `router` to `sglang`: diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 09ff1fba8..716ab211e 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -100,12 +100,12 @@ If [authorization](#authorization) is not disabled, the service endpoint require However, you'll need a gateway in the following cases: * To use auto-scaling or rate limits - * To enable a support custom router, e.g. such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"} + * To enable support for a custom router, such as the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#) * To enable HTTPS for the endpoint and map it to your domain * If your service requires WebSockets * If your service cannot work with a [path prefix](#path-prefix) - If a [gateway](gateways.md) is configured, the service endpoint will be accessible at @@ -434,7 +434,7 @@ If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. #### Default image -If you don't specify `image`, `dstack` uses its [base :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/docker/base){:target="_blank"} Docker image pre-configured with +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). Set the `python` property to pre-install a specific version of Python. @@ -992,6 +992,6 @@ The rolling deployment stops when all replicas are updated or when a new deploym 1. Read about [dev environments](dev-environments.md) and [tasks](tasks.md) 2. Learn how to manage [fleets](fleets.md) 3. See how to set up [gateways](gateways.md) - 4. Check the [TGI :material-arrow-top-right-thin:{ .external }](../../examples/inference/tgi/index.md){:target="_blank"}, - [vLLM :material-arrow-top-right-thin:{ .external }](../../examples/inference/vllm/index.md){:target="_blank"}, and - [NIM :material-arrow-top-right-thin:{ .external }](../../examples/inference/nim/index.md){:target="_blank"} examples + 4. Check the [TGI](../../examples/inference/tgi/index.md), + [vLLM](../../examples/inference/vllm/index.md), and + [NIM](../../examples/inference/nim/index.md) examples diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index a196ec6fe..ea0d775fb 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -253,7 +253,7 @@ If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. #### Default image -If you don't specify `image`, `dstack` uses its [base :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/docker/base){:target="_blank"} Docker image pre-configured with +If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). Set the `python` property to pre-install a specific version of Python.
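For illustration, a minimal task that relies on the default image and only pins the Python version might look like the sketch below; the name, command, and GPU size are placeholders:

```yaml
type: task
name: train
# No `image` specified, so the default base image is used;
# `python` pins the interpreter pre-installed in it.
python: "3.12"
commands:
  - python train.py
resources:
  gpu: 24GB
```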
diff --git a/docs/docs/guides/clusters.md b/docs/docs/guides/clusters.md index f77bc44a8..2e008c945 100644 --- a/docs/docs/guides/clusters.md +++ b/docs/docs/guides/clusters.md @@ -18,14 +18,14 @@ Ensure a fleet is created before you run any distributed task. This can be eithe For cloud fleets, fast interconnect is currently supported only on the `aws`, `gcp`, `nebius`, and `runpod` backends. === "AWS" - When you create a cloud fleet with AWS, [Elastic Fabric Adapter :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html){:target="_blank"} networking is automatically configured if it’s supported for the corresponding instance type. + When you create a cloud fleet with AWS, [Elastic Fabric Adapter](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) networking is automatically configured if it’s supported for the corresponding instance type. !!! info "Backend configuration" Note, EFA requires the `public_ips` to be set to `false` in the `aws` backend configuration. Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details. === "GCP" - When you create a cloud fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. + When you create a cloud fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. !!! info "Backend configuration" You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. @@ -34,13 +34,13 @@ For cloud fleets, fast interconnect is currently supported only on the `aws`, `g [A3 High](../../examples/clusters/a3high/index.md) examples for more details. === "Nebius" - When you create a cloud fleet with Nebius, [InfiniBand :material-arrow-top-right-thin:{ .external }](https://docs.nebius.com/compute/clusters/gpu){:target="_blank"} networking is automatically configured if it’s supported for the corresponding instance type. + When you create a cloud fleet with Nebius, [InfiniBand](https://docs.nebius.com/compute/clusters/gpu) networking is automatically configured if it’s supported for the corresponding instance type. === "Runpod" When you run multinode tasks in a cluster cloud fleet with Runpod, `dstack` provisions [Runpod Instant Clusters](https://docs.runpod.io/instant-clusters) with InfiniBand networking configured. > To request fast interconnect support for other backends, -file an [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_ blank"}. +file an [issue](https://github.com/dstackai/dstack/issues).
## Distributed tasks diff --git a/docs/docs/guides/dstack-sky.md b/docs/docs/guides/dstack-sky.md index ec0fea776..a51054f70 100644 --- a/docs/docs/guides/dstack-sky.md +++ b/docs/docs/guides/dstack-sky.md @@ -26,7 +26,7 @@ Configuration is updated at ~/.dstack/config.yml ### Configure clouds -By default, [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"} +By default, [dstack Sky](https://sky.dstack.ai) uses the GPU from its marketplace, which requires a credit card to be attached in your account settings. @@ -41,4 +41,4 @@ the [server/config.yml reference](../reference/server/config.yml.md). 1. Follow [quickstart](../quickstart.md) 2. Browse [examples](https://dstack.ai/examples) -3. Join the community via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) +3. Join the community via [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/guides/kubernetes.md b/docs/docs/guides/kubernetes.md index 5e4ded65a..fa90e3c31 100644 --- a/docs/docs/guides/kubernetes.md +++ b/docs/docs/guides/kubernetes.md @@ -41,7 +41,7 @@ No additional setup is required — `dstack` configures and manages the proxy au ### NVIDIA GPU Operator > For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the -[NVIDIA GPU Operator :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html){:target="_blank"} pre-installed. +[NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. After the backend is set up, you interact with `dstack` just as you would with other backends or SSH fleets. You can run dev environments, tasks, and services. @@ -102,7 +102,7 @@ For more details on clusters, see the [corresponding guide](clusters.md). ??? info "Is managed Kubernetes with auto-scaling supported?" Managed Kubernetes is supported. However, the `kubernetes` backend can only run on pre-provisioned nodes. - Support for auto-scalable Kubernetes clusters is coming soon—you can track progress in the corresponding [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/3126){:target="_blank"}. + Support for auto-scalable Kubernetes clusters is coming soon—you can track progress in the corresponding [issue](https://github.com/dstackai/dstack/issues/3126). If on-demand provisioning is important, we recommend using [VM-based](../concepts/backends.md#vm-based) backends as they already support auto-scaling. diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index d676ddadf..167b8f1b4 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -273,7 +273,7 @@ $ dstack apply -e HF_TOKEN=... -f .dstack.yml - If you install [`direnv` :material-arrow-top-right-thin:{ .external }](https://direnv.net/){:target="_blank"}, + If you install [`direnv`](https://direnv.net/), it will automatically apply the environment variables from the `.envrc` file to the `dstack apply` command. Remember to add `.envrc` to `.gitignore` to avoid committing it to the repo. @@ -350,7 +350,7 @@ If you're using multiple `dstack` projects (e.g., from different `dstack` server you can switch between them using the [`dstack project`](../reference/cli/dstack/project.md) command. ??? 
info ".envrc" - Alternatively, you can install [`direnv` :material-arrow-top-right-thin:{ .external }](https://direnv.net/){:target="_blank"} + Alternatively, you can install [`direnv`](https://direnv.net/) to automatically apply environment variables from the `.envrc` file in your project directory.
@@ -490,7 +490,7 @@ If you're using your own AWS, GCP, Azure, or OCI accounts, before you can use GP corresponding service quotas for each type of instance in each region. ??? info "AWS" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html){:target="_blank"} on EC2 service quotas. + Check this [guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) on EC2 service quotas. The relevant service quotas include: - `Running On-Demand P instances` (on-demand V100, A100 80GB x8) @@ -501,7 +501,7 @@ corresponding service quotas for each type of instance in each region. - `All P5 Spot Instance Requests` (spot H100) ??? info "GCP" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/compute/resource-usage){:target="_blank"} on Compute Engine service quotas. + Check this [guide](https://cloud.google.com/compute/resource-usage) on Compute Engine service quotas. The relevant service quotas include: - `NVIDIA V100 GPUs` (on-demand V100) @@ -518,7 +518,7 @@ corresponding service quotas for each type of instance in each region. - `Preemtible H100 GPUs` (spot H100) ??? info "Azure" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://learn.microsoft.com/en-us/azure/quotas/quickstart-increase-quota-portal){:target="_blank"} on Azure service quotas. + Check this [guide](https://learn.microsoft.com/en-us/azure/quotas/quickstart-increase-quota-portal) on Azure service quotas. The relevant service quotas include: - `Total Regional Spot vCPUs` (any spot instances) @@ -531,7 +531,7 @@ corresponding service quotas for each type of instance in each region. - `Standard NDSH100v5 Family vCPUs` (on-demand H100 x8) ??? info "OCI" - Check this [guide :material-arrow-top-right-thin:{ .external }](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/servicelimits.htm#Requesti){:target="_blank"} on requesting OCI service limits increase. + Check this [guide](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/servicelimits.htm#Requesti) on requesting OCI service limits increase. The relevant service category is compute. The relevant resources include: - `GPUs for GPU.A10 based VM and BM instances` (on-demand A10) diff --git a/docs/docs/guides/server-deployment.md b/docs/docs/guides/server-deployment.md index 5edf8007b..fb50f5592 100644 --- a/docs/docs/guides/server-deployment.md +++ b/docs/docs/guides/server-deployment.md @@ -57,7 +57,7 @@ The minimum hardware requirements for running the server are 1 CPU and 1GB of RA ??? info "AWS CloudFormation" If you'd like to deploy the server to a private AWS VPC, you can use - our CloudFormation [template :material-arrow-top-right-thin:{ .external }](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml){:target="_blank"}. + our CloudFormation [template](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml). First, ensure you've set up a private VPC with public and private subnets. @@ -69,7 +69,7 @@ The minimum hardware requirements for running the server are 1 CPU and 1GB of RA To access the server URL, ensure you're connected to the VPC, e.g. via VPN client.
> If you'd like to adjust anything, the source code of the template can be found at - [`examples/server-deployment/cloudformation/template.yaml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml){:target="_blank"}. + [`examples/server-deployment/cloudformation/template.yaml`](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml). ## Backend configuration @@ -142,7 +142,7 @@ $ DSTACK_DATABASE_URL=postgresql+asyncpg://user:password@db-host:5432/dstack dst export DSTACK_DATABASE_URL="postgresql+asyncpg://..." alembic upgrade head ``` - 4. Install [pgloader :material-arrow-top-right-thin:{.external }](https://github.com/dimitri/pgloader){:target="_blank"} + 4. Install [pgloader](https://github.com/dimitri/pgloader) 5. Pass the path to the `~/.dstack/server/data/sqlite.db` file to `SOURCE_PATH` and set `TARGET_PATH` with the URL of the PostgreSQL database. Example: ```bash @@ -406,7 +406,7 @@ When upgrading the `dstack` server, follow these guidelines to ensure a smooth t ### Before upgrading -1. **Check the changelog**: Review the [release notes :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/releases){:target="_blank"} for breaking changes, new features, and migration notes. +1. **Check the changelog**: Review the [release notes](https://github.com/dstackai/dstack/releases) for breaking changes, new features, and migration notes. 2. **Review backward compatibility**: Understand the [backward compatibility](#backward-compatibility) policy. 3. **Back up your data**: Ensure you always create a backup before upgrading. diff --git a/docs/docs/guides/troubleshooting.md b/docs/docs/guides/troubleshooting.md index 581b8ce05..44d6c9814 100644 --- a/docs/docs/guides/troubleshooting.md +++ b/docs/docs/guides/troubleshooting.md @@ -3,7 +3,7 @@ ## Reporting issues When you encounter a problem, please report it as -a [GitHub issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/new/choose){:target="_blank"}. +a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). If you have a question or need help, feel free to ask it in our [Discord server](https://discord.gg/u8SmfwPpMd). @@ -23,8 +23,8 @@ environment variable to `DEBUG`. By default, it is set to `INFO`. CLI logs are located in `~/.dstack/logs/cli`, and the default log level is `DEBUG`. -> See these examples for well-reported issues: [this :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1640){:target="_blank"} and [this :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1551){:target="_blank"}. +> See these examples for well-reported issues: [this](https://github.com/dstackai/dstack/issues/1640) and [this](https://github.com/dstackai/dstack/issues/1551). ## Typical issues @@ -39,7 +39,7 @@ Below are some of the reasons why this might happen. Before you can run any workloads, you need to configure a [backend](../concepts/backends.md), create an [SSH fleet](../concepts/fleets.md#ssh-fleets), or sign up for -[dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}. +[dstack Sky](https://sky.dstack.ai). If you have configured a backend and still can't use it, check the output of `dstack server` for backend configuration errors.
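As a reference for the SSH fleet option mentioned above, a minimal fleet configuration looks roughly like the sketch below; the name, user, key path, and host addresses are placeholders:

```yaml
type: fleet
# Placeholder name
name: my-ssh-fleet
ssh_config:
  user: ubuntu
  identity_file: ~/.ssh/id_rsa
  hosts:
    - 203.0.113.10
    - 203.0.113.11
```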
@@ -116,7 +116,7 @@ one of these features, `dstack` will only select offers from the backends that s #### Cause 8: dstack Sky balance If you are using -[dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, +[dstack Sky](https://sky.dstack.ai), you will not see marketplace offers until you top up your balance. Alternatively, you can configure your own cloud accounts on the [project settings page](../concepts/projects.md#backends) @@ -188,7 +188,7 @@ If you interrupt the command, the port forwarding will be disconnected. To reatt #### Cause 2: Windows -If you're using the CLI on Windows, make sure to run it through WSL by following [these instructions:material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1644#issuecomment-2321559265){:target="_blank"}. +If you're using the CLI on Windows, make sure to run it through WSL by following [these instructions](https://github.com/dstackai/dstack/issues/1644#issuecomment-2321559265). Native support will be available soon. ### SSH fleet fails to provision diff --git a/docs/docs/index.md b/docs/docs/index.md index aaea12b1c..b0228fb2c 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -14,7 +14,7 @@ It streamlines development, training, and inference, and is compatible with any #### 1. Set up the server -> Before using `dstack`, ensure you've [installed](installation/index.md) the server, or signed up for [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}. +> Before using `dstack`, ensure you've [installed](installation/index.md) the server, or signed up for [dstack Sky](https://sky.dstack.ai). #### 2. Define configurations @@ -40,4 +40,4 @@ port-forwarding, ingress, and more. 1. Proceed to [installation](installation/index.md) 2. See [quickstart](quickstart.md) 3. Browse [examples](/examples) - 4. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} + 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/installation/index.md b/docs/docs/installation/index.md index 073c4b176..5d24373fd 100644 --- a/docs/docs/installation/index.md +++ b/docs/docs/installation/index.md @@ -2,7 +2,7 @@ + skip installation and proceed to [dstack Sky](https://sky.dstack.ai). --> ## Set up the server @@ -110,7 +110,7 @@ Once the server is up, you can access it via the `dstack` CLI. ??? info "Windows" To use the CLI on Windows, ensure you've installed Git and OpenSSH via - [Git for Windows:material-arrow-top-right-thin:{ .external }](https://git-scm.com/download/win){:target="_blank"}. + [Git for Windows](https://git-scm.com/download/win). When installing it, ensure you've checked `Git from the command line and also from 3-rd party software` @@ -207,4 +207,4 @@ This configuration is stored in `~/.dstack/config.yml`. 2. See [Backends](../concepts/backends.md) 3. Check the [server deployment](../guides/server-deployment.md) guide 4. Browse [examples](/examples) - 5. Join the community via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) + 5.
Join the community via [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 3cf74475f..7b71519db 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -259,7 +259,7 @@ Alternatively, you can create an [SSH fleet](concepts/fleets#ssh-fleets). [rate limits](concepts/services.md#rate-limits), or use a custom domain with HTTPS, set up a [gateway](concepts/gateways.md) before running the service. - If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, + If you're using [dstack Sky](https://sky.dstack.ai), a gateway is pre-configured for you. `dstack apply` automatically provisions instances and runs the workload according to the configuration. @@ -271,4 +271,4 @@ Something not working? See the [troubleshooting](guides/troubleshooting.md) guid !!! info "What's next?" 1. Read about [backends](concepts/backends.md), [dev environments](concepts/dev-environments.md), [tasks](concepts/tasks.md), [services](concepts/services.md), and [fleets](concepts/services.md) 2. Browse [examples](../examples.md) - 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) + 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/reference/cli/dstack/project.md b/docs/docs/reference/cli/dstack/project.md index 5023a3951..6e3e79556 100644 --- a/docs/docs/reference/cli/dstack/project.md +++ b/docs/docs/reference/cli/dstack/project.md @@ -18,7 +18,7 @@ The `dstack project set-default` command can be used to switch between multiple
- Also, you can install [`direnv` :material-arrow-top-right-thin:{ .external }](https://direnv.net/){:target="_blank"} + Also, you can install [`direnv`](https://direnv.net/) to automatically apply environment variables from the `.envrc` file in your project directory.
diff --git a/docs/docs/reference/dstack.yml/service.md b/docs/docs/reference/dstack.yml/service.md index 8d89b2d57..9c9a34ee0 100644 --- a/docs/docs/reference/dstack.yml/service.md +++ b/docs/docs/reference/dstack.yml/service.md @@ -64,7 +64,7 @@ The `service` configuration type allows running [services](../../concepts/servic 2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template). If you encounter any other issues, please make sure to file a - [GitHub issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/new/choose){:target="_blank"}. + [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). ### `scaling` diff --git a/docs/docs/reference/plugins/python/index.md b/docs/docs/reference/plugins/python/index.md index cf141bb18..a88e8716f 100644 --- a/docs/docs/reference/plugins/python/index.md +++ b/docs/docs/reference/plugins/python/index.md @@ -75,7 +75,7 @@ Then you can install the plugin package into your Python environment and enable ??? info "Docker" If you deploy `dstack` using a Docker image you can add plugins either - by including them in your custom image built upon the `dstack` [server image :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/docker/server){:target="_blank"}, or by mounting installed plugins as volumes. + by including them in your custom image built upon the `dstack` [server image](https://github.com/dstackai/dstack/tree/master/docker/server), or by mounting installed plugins as volumes. ## Apply policies @@ -123,4 +123,4 @@ Plugins implemented as API servers have advantages over plugins implemented as P * You can use any programming language. * If you run the `dstack` server via Docker, you don't need to extend the `dstack` server image with plugins or map them via volumes. -To get started, check out the [plugin server example :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/examples/plugins/example_plugin_server){:target="_blank"}. The `rest_plugin` server API is documented [here](../../plugins/rest/index.md). +To get started, check out the [plugin server example](https://github.com/dstackai/dstack/tree/master/examples/plugins/example_plugin_server). The `rest_plugin` server API is documented [here](../../plugins/rest/index.md). diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 970db2d98..863a0462b 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -7,7 +7,7 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the ## Deployment Most serving frameworks including vLLM and TGI have AMD support. Here's an example of a [service](https://dstack.ai/docs/services) that deploys -Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html){:target="_blank"}. +Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd) and [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html). 
=== "TGI" @@ -98,7 +98,7 @@ Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](h > To speed up the `vLLM-ROCm` installation, we use a pre-built binary from S3. > You can find the task to build and upload the binary in - > [`examples/inference/vllm/amd/` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd/){:target="_blank"}. + > [`examples/inference/vllm/amd/`](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd/). !!! info "Docker image" If you want to use AMD, specifying `image` is currently required. This must be an image that includes @@ -110,8 +110,8 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by === "TRL" - Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html){:target="_blank"} - and the [`mlabonne/guanaco-llama2-1k` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k){:target="_blank"} + Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html) + and the [`mlabonne/guanaco-llama2-1k`](https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k) dataset.
@@ -156,8 +156,8 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by
=== "Axolotl" - Below is an example of fine-tuning Llama 3.1 8B using [Axolotl :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/artificial-intelligence/axolotl/README.html){:target="_blank"} - and the [tatsu-lab/alpaca :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/tatsu-lab/alpaca){:target="_blank"} + Below is an example of fine-tuning Llama 3.1 8B using [Axolotl](https://rocm.blogs.amd.com/artificial-intelligence/axolotl/README.html) + and the [tatsu-lab/alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset.
@@ -212,11 +212,11 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ```
- Note, to support ROCm, we need to checkout to commit `d4f6c65`. This commit eliminates the need to manually modify the Axolotl source code to make xformers compatible with ROCm, as described in the [xformers workaround :material-arrow-top-right-thin:{ .external }](https://docs.axolotl.ai/docs/amd_hpc.html#apply-xformers-workaround). This installation approach is also followed for building Axolotl ROCm docker image. [(See Dockerfile) :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/blob/release/blogs/artificial-intelligence/axolotl/src/Dockerfile.rocm){:target="_blank"}. + Note, to support ROCm, we need to check out commit `d4f6c65`. This commit eliminates the need to manually modify the Axolotl source code to make xformers compatible with ROCm, as described in the [xformers workaround](https://docs.axolotl.ai/docs/amd_hpc.html#apply-xformers-workaround). The same installation approach is used to build the Axolotl ROCm Docker image ([see the Dockerfile](https://github.com/ROCm/rocm-blogs/blob/release/blogs/artificial-intelligence/axolotl/src/Dockerfile.rocm)). > To speed up installation of `flash-attention` and `xformers `, we use pre-built binaries uploaded to S3. > You can find the tasks that build and upload the binaries - > in [`examples/single-node-training/axolotl/amd/` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd/){:target="_blank"}. + > in [`examples/single-node-training/axolotl/amd/`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd/). ## Running a configuration @@ -239,17 +239,17 @@ $ dstack apply -f examples/inference/vllm/amd/.dstack.yml ## Source code The source-code of this example can be found in -[`examples/inference/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/amd){:target="_blank"}, -[`examples/inference/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd){:target="_blank"}, -[`examples/single-node-training/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd){:target="_blank"} and -[`examples/single-node-training/trl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl/amd){:target="_blank"} +[`examples/inference/tgi/amd`](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/amd), +[`examples/inference/vllm/amd`](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd), +[`examples/single-node-training/axolotl/amd`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd) and +[`examples/single-node-training/trl/amd`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl/amd) ## What's next? -1.
Browse [TGI :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi), - [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), - [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), - [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and - [ROCm Bitsandbytes :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/bitsandbytes) +1. Browse [TGI](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi), + [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), + [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), + [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and + [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) 2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). diff --git a/examples/accelerators/intel/README.md b/examples/accelerators/intel/README.md index 5d59e2f95..0220a74ce 100644 --- a/examples/accelerators/intel/README.md +++ b/examples/accelerators/intel/README.md @@ -7,9 +7,9 @@ Serving frameworks like vLLM and TGI have Intel Gaudi support. Here's an example of a service that deploys -[`DeepSeek-R1-Distill-Llama-70B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B){:target="_blank"} -using [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"} -and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"}. +[`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) +using [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi) +and [vLLM](https://github.com/HabanaAI/vllm-fork). === "TGI"
@@ -97,10 +97,10 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/Haban ## Fine-tuning -Below is an example of LoRA fine-tuning of [`DeepSeek-R1-Distill-Qwen-7B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B){:target="_blank"} -using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"} -and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with -the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset. +Below is an example of LoRA fine-tuning of [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) +using [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana) +and [DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide) with +the [`lvwerra/stack-exchange-paired`](https://huggingface.co/datasets/lvwerra/stack-exchange-paired) dataset.
@@ -178,11 +178,11 @@ Provisioning... ## Source code The source-code of this example can be found in -[`examples/llms/deepseek/tgi/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/tgi/intel){:target="_blank"}, -[`examples/llms/deepseek/vllm/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/vllm/intel){:target="_blank"} and -[`examples/llms/deepseek/trl/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/trl/intel){:target="_blank"}. +[`examples/llms/deepseek/tgi/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/tgi/intel), +[`examples/llms/deepseek/vllm/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/vllm/intel) and +[`examples/llms/deepseek/trl/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/trl/intel). !!! info "What's next?" 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). - 2. See also [Intel Gaudi Documentation :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/index.html), [vLLM Inference with Gaudi](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/vLLM_Inference.html) - and [Optimum for Gaudi examples :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana/blob/main/examples/trl/README.md). + 2. See also [Intel Gaudi Documentation](https://docs.habana.ai/en/latest/index.html), [vLLM Inference with Gaudi](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/vLLM_Inference.html) + and [Optimum for Gaudi examples](https://github.com/huggingface/optimum-habana/blob/main/examples/trl/README.md). diff --git a/examples/accelerators/tenstorrent/README.md b/examples/accelerators/tenstorrent/README.md index 5ce33567e..bbb0b2207 100644 --- a/examples/accelerators/tenstorrent/README.md +++ b/examples/accelerators/tenstorrent/README.md @@ -7,7 +7,7 @@ image: https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-m # Tenstorrent `dstack` supports running dev environments, tasks, and services on Tenstorrent -[Wormwhole :material-arrow-top-right-thin:{ .external }](https://tenstorrent.com/en/hardware/wormhole){:target="_blank"} accelerators via SSH fleets. +[Wormwhole](https://tenstorrent.com/en/hardware/wormhole) accelerators via SSH fleets. ??? info "SSH fleets" @@ -48,8 +48,8 @@ image: https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-m ## Services Here's an example of a service that deploys -[`Llama-3.2-1B-Instruct` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/meta-llama/Llama-3.2-1B){:target="_blank"} -using [Tenstorrent Inference Service :material-arrow-top-right-thin:{ .external }](https://github.com/tenstorrent/tt-inference-server){:target="_blank"}. +[`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B) +using [Tenstorrent Inference Service](https://github.com/tenstorrent/tt-inference-server).
@@ -195,5 +195,5 @@ If you run it via `dstack apply`, it will output the URL to access it via your d > Dev nevironments support many options, including inactivity and max duration, IDE configuration, etc. To learn more, refer to [Dev environments](https://dstack.ai/docs/concepts/tasks). ??? info "Feedback" - Found a bug, or want to request a feature? File it in the [issue tracker :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_blank"}, - or share via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}. + Found a bug, or want to request a feature? File it in the [issue tracker](https://github.com/dstackai/dstack/issues), + or share via [Discord](https://discord.gg/u8SmfwPpMd). diff --git a/examples/accelerators/tpu/README.md b/examples/accelerators/tpu/README.md index 2aa595099..da1632658 100644 --- a/examples/accelerators/tpu/README.md +++ b/examples/accelerators/tpu/README.md @@ -19,8 +19,8 @@ Below are a few examples on using TPUs for deployment and fine-tuning. Many serving frameworks including vLLM and TGI have TPU support. Here's an example of a [service](https://dstack.ai/docs/services) that deploys Llama 3.1 8B using -[Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} -and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"}. +[Optimum TPU](https://github.com/huggingface/optimum-tpu) +and [vLLM](https://github.com/vllm-project/vllm). === "Optimum TPU" @@ -52,7 +52,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm- ??? info "Docker image" The official Docker image `huggingface/optimum-tpu:latest` doesn’t support Llama 3.1-8B. We’ve created a custom image with the fix: `dstackai/optimum-tpu:llama31`. - Once the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/pull/92){:target="_blank"} is merged, + Once the [pull request](https://github.com/huggingface/optimum-tpu/pull/92) is merged, the official Docker image can be used. === "vLLM" @@ -114,7 +114,7 @@ Note, `v5litepod` is optimized for serving transformer-based models. Each core i | Framework | Quantization | Note | |-----------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **TGI** | bfloat16 | To deploy with TGI, Optimum TPU must be used. | -| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/pull/7005){:target="_blank"} for more details. | +| **vLLM** | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request](https://github.com/vllm-project/vllm/pull/7005) for more details. | ### Running a configuration @@ -123,8 +123,8 @@ cloud resources and run the configuration. 
## Fine-tuning with Optimum TPU -Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} -and the [`Abirate/english_quotes` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/Abirate/english_quotes){:target="_blank"} +Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU](https://github.com/huggingface/optimum-tpu) +and the [`Abirate/english_quotes`](https://huggingface.co/datasets/Abirate/english_quotes) dataset.
@@ -182,14 +182,14 @@ Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each co ## Source code The source-code of this example can be found in -[`examples/inference/tgi/tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/tpu){:target="_blank"}, -[`examples/inference/vllm/tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/tpu){:target="_blank"}, -and [`examples/single-node-training/optimum-tpu` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl){:target="_blank"}. +[`examples/inference/tgi/tpu`](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/tpu), +[`examples/inference/vllm/tpu`](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/tpu), +and [`examples/single-node-training/optimum-tpu`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl). ## What's next? -1. Browse [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu), - [Optimum TPU TGI :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and - [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). +1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), + [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and + [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). 2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/concepts/fleets). diff --git a/examples/clusters/a3high/README.md b/examples/clusters/a3high/README.md index bae58d07d..2794851f2 100644 --- a/examples/clusters/a3high/README.md +++ b/examples/clusters/a3high/README.md @@ -226,4 +226,4 @@ resources: ## Source code The source code for this example can be found in -[`examples/distributed-training/a3high-clusters` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/a3high-clusters). +[`examples/distributed-training/a3high-clusters`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/a3high-clusters). diff --git a/examples/clusters/a3mega/README.md b/examples/clusters/a3mega/README.md index a0c117553..e7db0bd2c 100644 --- a/examples/clusters/a3mega/README.md +++ b/examples/clusters/a3mega/README.md @@ -192,4 +192,4 @@ resources: ## Source code The source code for this example can be found in -[`examples/misc/a3mega-clusters` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/misc/a3mega-clusters). +[`examples/misc/a3mega-clusters`](https://github.com/dstackai/dstack/blob/master/examples/misc/a3mega-clusters). 
diff --git a/examples/clusters/efa/README.md b/examples/clusters/efa/README.md index 0df910a98..98198a838 100644 --- a/examples/clusters/efa/README.md +++ b/examples/clusters/efa/README.md @@ -1,6 +1,6 @@ # AWS EFA -In this guide, we’ll walk through how to run high-performance distributed training on AWS using [Amazon Elastic Fabric Adapter (EFA) :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"} with `dstack`. +In this guide, we’ll walk through how to run high-performance distributed training on AWS using [Amazon Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/) with `dstack`. ## Overview @@ -71,7 +71,7 @@ Provisioning... ??? info "Instance types" `dstack` selects suitable instances automatically, but not - [all types support EFA :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"}. + [all types support EFA](https://aws.amazon.com/hpc/efa/). To enforce EFA, you can specify `instance_types` explicitly: ```yaml diff --git a/examples/clusters/nccl-tests/README.md b/examples/clusters/nccl-tests/README.md index 4202f62f2..29b2d8ee2 100644 --- a/examples/clusters/nccl-tests/README.md +++ b/examples/clusters/nccl-tests/README.md @@ -1,6 +1,6 @@ # NCCL tests -This example shows how to run [NCCL tests :material-arrow-top-right-thin:{ .external }](https://github.com/NVIDIA/nccl-tests){:target="_blank"} on a cluster using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). +This example shows how to run [NCCL tests](https://github.com/NVIDIA/nccl-tests) on a cluster using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). !!! info "Prerequisites" Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). @@ -47,7 +47,7 @@ resources:
!!! info "Default image" - If you don't specify `image`, `dstack` uses its [base :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/tree/master/docker/base){:target="_blank"} Docker image pre-configured with + If you don't specify `image`, `dstack` uses its [base](https://github.com/dstackai/dstack/tree/master/docker/base) Docker image pre-configured with `uv`, `python`, `pip`, essential CUDA drivers, `mpirun`, and NCCL tests (under `/opt/nccl-tests/build`). !!! info "Privileged" @@ -75,7 +75,7 @@ Submit the run nccl-tests? [y/n]: y ## Source code The source-code of this example can be found in -[`examples/clusters/nccl-tests` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests). +[`examples/clusters/nccl-tests`](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests). ## What's next? diff --git a/examples/clusters/rccl-tests/README.md b/examples/clusters/rccl-tests/README.md index 3dfd45b1c..36c685701 100644 --- a/examples/clusters/rccl-tests/README.md +++ b/examples/clusters/rccl-tests/README.md @@ -1,6 +1,6 @@ # RCCL tests -This example shows how to run distributed [RCCL tests :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rccl-tests){:target="_blank"} using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). +This example shows how to run distributed [RCCL tests](https://github.com/ROCm/rccl-tests) using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). !!! info "Prerequisites" Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). @@ -68,7 +68,7 @@ resources: Other nodes use a `FIFO` pipe to wait for until the MPI run is finished. - There is an open [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/2467){:target="_blank"} to simplify the use of MPI with distributed tasks. + There is an open [issue](https://github.com/dstackai/dstack/issues/2467) to simplify the use of MPI with distributed tasks. !!! info "RoCE library" Broadcom RoCE drivers require the `libbnxt_re` userspace library inside the container to be compatible with the host’s Broadcom @@ -116,7 +116,7 @@ Submit the run rccl-tests? [y/n]: y ## Source code The source-code of this example can be found in -[`examples/distributed-training/rccl-tests` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/rccl-tests). +[`examples/distributed-training/rccl-tests`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/rccl-tests). ## What's next? diff --git a/examples/distributed-training/axolotl/README.md b/examples/distributed-training/axolotl/README.md index 7bbafe225..9ddd77a36 100644 --- a/examples/distributed-training/axolotl/README.md +++ b/examples/distributed-training/axolotl/README.md @@ -1,6 +1,6 @@ # Axolotl -This example walks you through how to run distributed fine-tune using [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/axolotl-ai-cloud/axolotl){:target="_blank"} and [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). 
+This example walks you through how to run distributed fine-tuning using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) and [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). !!! info "Prerequisites" Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). @@ -92,7 +92,7 @@ Provisioning... ## Source code The source-code of this example can be found in -[`examples/distributed-training/axolotl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/axolotl). +[`examples/distributed-training/axolotl`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/axolotl). !!! info "What's next?" 1. Read the [clusters](https://dstack.ai/docs/guides/clusters) guide diff --git a/examples/distributed-training/ray-ragen/README.md b/examples/distributed-training/ray-ragen/README.md index 2c4b8def6..e79f27f78 100644 --- a/examples/distributed-training/ray-ragen/README.md +++ b/examples/distributed-training/ray-ragen/README.md @@ -1,9 +1,9 @@ # Ray + RAGEN -This example shows how use `dstack` and [RAGEN :material-arrow-top-right-thin:{ .external }](https://github.com/RAGEN-AI/RAGEN){:target="_blank"} +This example shows how to use `dstack` and [RAGEN](https://github.com/RAGEN-AI/RAGEN) to fine-tune an agent on multiple nodes. -Under the hood `RAGEN` uses [verl :material-arrow-top-right-thin:{ .external }](https://github.com/volcengine/verl){:target="_blank"} for Reinforcement Learning and [Ray :material-arrow-top-right-thin:{ .external }](https://docs.ray.io/en/latest/){:target="_blank"} for distributed training. +Under the hood, `RAGEN` uses [verl](https://github.com/volcengine/verl) for Reinforcement Learning and [Ray](https://docs.ray.io/en/latest/) for distributed training. !!! info "Prerequisites" Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). @@ -60,7 +60,7 @@ volumes:
-We are using verl's docker image for vLLM with FSDP. See [Installation :material-arrow-top-right-thin:{ .external }](https://verl.readthedocs.io/en/latest/start/install.html){:target="_blank"} for more. +We use verl's Docker image for vLLM with FSDP. See [Installation](https://verl.readthedocs.io/en/latest/start/install.html) for more details. The `RAGEN` setup script `scripts/setup_ragen.sh` isolates dependencies within Conda environment. @@ -127,4 +127,4 @@ Using Ray via `dstack` is a powerful way to get access to the rich Ray ecosystem !!! info "What's next" 1. Check the [Clusters](https://dstack.ai/docs/guides/clusters) guide 2. Read about [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks) and [fleets](https://dstack.ai/docs/concepts/fleets) - 3. Browse Ray's [docs :material-arrow-top-right-thin:{ .external }](https://docs.ray.io/en/latest/train/examples.html){:target="_blank"} for other examples. + 3. Browse Ray's [docs](https://docs.ray.io/en/latest/train/examples.html) for other examples. diff --git a/examples/distributed-training/trl/README.md b/examples/distributed-training/trl/README.md index 8fe1615ec..c6231e517 100644 --- a/examples/distributed-training/trl/README.md +++ b/examples/distributed-training/trl/README.md @@ -1,6 +1,6 @@ # TRL -This example walks you through how to run distributed fine-tune using [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"}, [Accelerate :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/accelerate){:target="_blank"} and [Deepspeed :material-arrow-top-right-thin:{ .external }](https://github.com/deepspeedai/DeepSpeed){:target="_blank"}. +This example walks you through how to run distributed fine-tuning using [TRL](https://github.com/huggingface/trl), [Accelerate](https://github.com/huggingface/accelerate) and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). !!! info "Prerequisites" Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). @@ -152,7 +152,7 @@ Provisioning... ## Source code The source-code of this example can be found in -[`examples/distributed-training/trl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/trl){:target="_blank"}. +[`examples/distributed-training/trl`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/trl). !!! info "What's next?" 1. Read the [clusters](https://dstack.ai/docs/guides/clusters) guide diff --git a/examples/inference/nim/README.md b/examples/inference/nim/README.md index fe520e36b..1b125dda8 100644 --- a/examples/inference/nim/README.md +++ b/examples/inference/nim/README.md @@ -5,7 +5,7 @@ description: "This example shows how to deploy DeepSeek-R1-Distill-Llama-8B to a # NVIDIA NIM -This example shows how to deploy DeepSeek-R1-Distill-Llama-8B using [NVIDIA NIM :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html){:target="_blank"} and `dstack`. ???
info "Prerequisites" Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. @@ -112,7 +112,7 @@ is available at `https://gateway./`. ## Source code The source-code of this example can be found in -[`examples/inference/nim` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/nim){:target="_blank"}. +[`examples/inference/nim`](https://github.com/dstackai/dstack/blob/master/examples/inference/nim). ## What's next? diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index 1652b838c..d0c42bf2e 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -1,6 +1,6 @@ # SGLang -This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 70B using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and `dstack`. +This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 70B using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration @@ -105,14 +105,14 @@ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \
!!! info "SGLang Model Gateway" - If you'd like to use a custom routing policy, e.g. by leveraging the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}, create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. + If you'd like to use a custom routing policy, e.g. by leveraging the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#), create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. > If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling or HTTPs, rate-limits, etc), the OpenAI-compatible endpoint is available at `https://gateway./`. ## Source code The source-code of this example can be found in -[`examples/llms/deepseek/sglang` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang){:target="_blank"}. +[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang). ## What's next? diff --git a/examples/inference/tgi/README.md b/examples/inference/tgi/README.md index 8630473dd..6984ec2ff 100644 --- a/examples/inference/tgi/README.md +++ b/examples/inference/tgi/README.md @@ -5,7 +5,7 @@ description: "This example shows how to deploy Llama 4 Scout to any cloud or on- # HuggingFace TGI -This example shows how to deploy Llama 4 Scout with `dstack` using [HuggingFace TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/index){:target="_blank"}. +This example shows how to deploy Llama 4 Scout with `dstack` using [HuggingFace TGI](https://huggingface.co/docs/text-generation-inference/en/index). ??? info "Prerequisites" Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. @@ -21,7 +21,7 @@ This example shows how to deploy Llama 4 Scout with `dstack` using [HuggingFace ## Deployment -Here's an example of a service that deploys [`Llama-4-Scout-17B-16E-Instruct` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct){:target="_blank"} using TGI. +Here's an example of a service that deploys [`Llama-4-Scout-17B-16E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) using TGI.
@@ -116,7 +116,7 @@ is available at `https://gateway./`. ## Source code The source-code of this example can be found in -[`examples/inference/tgi` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi). +[`examples/inference/tgi`](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi). ## What's next? diff --git a/examples/inference/trtllm/README.md b/examples/inference/trtllm/README.md index 3d29ab0d9..02ae55f46 100644 --- a/examples/inference/trtllm/README.md +++ b/examples/inference/trtllm/README.md @@ -6,7 +6,7 @@ description: "This example shows how to deploy Deepseek models to any cloud or o # TensorRT-LLM This example shows how to deploy both DeepSeek R1 and its distilled version -using [TensorRT-LLM :material-arrow-top-right-thin:{ .external }](https://github.com/NVIDIA/TensorRT-LLM){:target="_blank"} and `dstack`. +using [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and `dstack`. ??? info "Prerequisites" Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. @@ -365,10 +365,10 @@ is available at `https://gateway./`. ## Source code The source-code of this example can be found in -[`examples/inference/trtllm` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/trtllm){:target="_blank"}. +[`examples/inference/trtllm`](https://github.com/dstackai/dstack/blob/master/examples/inference/trtllm). ## What's next? 1. Check [services](https://dstack.ai/docs/services) -2. Browse [Tensorrt-LLM DeepSeek-R1 with PyTorch Backend :material-arrow-top-right-thin:{ .external }](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/deepseek_v3){:target="_blank"} and [Prepare the Model Repository :material-arrow-top-right-thin:{ .external }](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository){:target="_blank"} -3. See also [`trtllm-serve` :material-arrow-top-right-thin:{ .external }](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html#trtllm-serve){:target="_blank"} +2. Browse [Tensorrt-LLM DeepSeek-R1 with PyTorch Backend](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/deepseek_v3) and [Prepare the Model Repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository) +3. See also [`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html#trtllm-serve) diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md index d646ea287..12bfd18ac 100644 --- a/examples/inference/vllm/README.md +++ b/examples/inference/vllm/README.md @@ -4,7 +4,7 @@ description: "This example shows how to deploy Llama 3.1 to any cloud or on-prem # vLLM -This example shows how to deploy Llama 3.1 8B with `dstack` using [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/){:target="_blank"}. +This example shows how to deploy Llama 3.1 8B with `dstack` using [vLLM](https://docs.vllm.ai/en/latest/). ??? info "Prerequisites" Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. @@ -112,7 +112,7 @@ is available at `https://gateway./`. ## Source code The source-code of this example can be found in -[`examples/inference/vllm` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm). 
+[`examples/inference/vllm`](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm).

 ## What's next?

diff --git a/examples/llms/deepseek/README.md b/examples/llms/deepseek/README.md
index ac098fa70..bca90ac35 100644
--- a/examples/llms/deepseek/README.md
+++ b/examples/llms/deepseek/README.md
@@ -1,7 +1,7 @@
 # Deepseek

 This example walks you through how to deploy and
-train [Deepseek :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai){:target="_blank"}
+train [Deepseek](https://huggingface.co/deepseek-ai)
 models with `dstack`.

 > We used Deepseek-R1 distilled models and Deepseek-V2-Lite, a 16B model with the same architecture as Deepseek-R1 (671B). Deepseek-V2-Lite retains MLA and DeepSeekMoE but requires less memory, making it ideal for testing and fine-tuning on smaller GPUs.

@@ -21,7 +21,7 @@ models with `dstack`.
 ### AMD

-Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B` using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} with AMD `MI300X` GPU. The below configurations also support `Deepseek-V2-Lite`.
+Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B` using [SGLang](https://github.com/sgl-project/sglang) and [vLLM](https://github.com/vllm-project/vllm) with an AMD `MI300X` GPU. The configurations below also support `Deepseek-V2-Lite`.

 === "SGLang"

@@ -81,13 +81,13 @@ Note, when using `Deepseek-R1-Distill-Llama-70B` with `vLLM` with a 192GB GPU, w
 ### Intel Gaudi

 Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B`
-using [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
-and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} (Gaudi fork) with Intel Gaudi 2.
+using [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi)
+and [vLLM](https://github.com/HabanaAI/vllm-fork) (Gaudi fork) with Intel Gaudi 2.

-> Both [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
-> and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} do not support `Deepseek-V2-Lite`.
-> See [this :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi/issues/271)
-> and [this :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork/issues/809#issuecomment-2652454824) issues.
+> Neither [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi)
+> nor [vLLM](https://github.com/HabanaAI/vllm-fork) supports `Deepseek-V2-Lite`.
+> See [this issue](https://github.com/huggingface/tgi-gaudi/issues/271)
+> and [this one](https://github.com/HabanaAI/vllm-fork/issues/809#issuecomment-2652454824).

 === "TGI"

@@ -170,8 +170,8 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/Haban
 ### NVIDIA

 Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-8B`
-using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"}
-and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} with NVIDIA GPUs.
+using [SGLang](https://github.com/sgl-project/sglang)
+and [vLLM](https://github.com/vllm-project/vllm) with NVIDIA GPUs.
 Both SGLang and vLLM also support `Deepseek-V2-Lite`.

 === "SGLang"

@@ -297,7 +297,7 @@ is available at `https://gateway./`.
 ### AMD

-Here are the examples of LoRA fine-tuning of `Deepseek-V2-Lite` and GRPO fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` on `MI300X` GPU using HuggingFace's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"}.
+Here are examples of LoRA fine-tuning of `Deepseek-V2-Lite` and GRPO fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` on an `MI300X` GPU using HuggingFace's [TRL](https://github.com/huggingface/trl).

 === "LoRA"

@@ -405,8 +405,8 @@ Note, the `GRPO` fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` consumes up to 1
 ### Intel Gaudi

 Here is an example of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-7B` on Intel Gaudi 2 GPUs using
-HuggingFace's [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"}
-and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://github.com/deepspeedai/DeepSpeed){:target="_blank"}. Both also support `LoRA`
+HuggingFace's [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana)
+and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). Both also support `LoRA`
 fine-tuning of `Deepseek-V2-Lite` with same configuration as below.

 === "LoRA"

@@ -464,7 +464,7 @@ fine-tuning of `Deepseek-V2-Lite` with same configuration as below.
 ### NVIDIA

 Here are examples of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` and QLoRA fine-tuning of `DeepSeek-V2-Lite`
-on NVIDIA GPU using HuggingFace's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"} library.
+on an NVIDIA GPU using HuggingFace's [TRL](https://github.com/huggingface/trl) library.

 === "LoRA"
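In outline, a TRL LoRA run like the ones referenced here can be expressed as a `dstack` task. The sketch below is illustrative only and is not the repository's configuration: the TRL CLI flags, LoRA hyperparameters, and GPU size are assumptions to be checked against the TRL documentation and `examples/llms/deepseek`.

```yaml
type: task
name: trl-lora-deepseek

python: "3.12"
env:
  - HF_TOKEN
commands:
  - pip install trl peft transformers datasets accelerate
  # LoRA SFT via the TRL CLI; flag names are assumptions
  - trl sft
      --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
      --dataset_name OpenAssistant/oasst_top1_2023-08-25
      --use_peft
      --lora_r 16
      --lora_alpha 32
      --output_dir output

resources:
  gpu: 24GB                  # enough for a 1.5B model with LoRA; adjust for larger models
```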
@@ -600,7 +600,7 @@ needs 7–10GB due to intermediate hidden states.
 ## Source code

 The source-code of this example can be found in
-[`examples/llms/deepseek` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek).
+[`examples/llms/deepseek`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek).

 !!! info "What's next?"
     1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks),
diff --git a/examples/llms/llama/README.md b/examples/llms/llama/README.md
index 89e716d40..573d4b803 100644
--- a/examples/llms/llama/README.md
+++ b/examples/llms/llama/README.md
@@ -18,8 +18,8 @@ This example walks you through how to deploy Llama 4 Scout model with `dstack`.
 ### AMD

 Here's an example of a service that deploys
-[`Llama-4-Scout-17B-16E-Instruct` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct){:target="_blank"}
-using [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"}
+[`Llama-4-Scout-17B-16E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
+using [vLLM](https://github.com/vllm-project/vllm)
 with AMD `MI300X` GPUs.
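A minimal sketch of what such a vLLM service on MI300X might look like follows. The ROCm image, GPU count, and serve flags are assumptions here; the maintained configuration is in `examples/llms/llama`.

```yaml
type: service
name: llama-4-scout-vllm-rocm

image: rocm/vllm:latest        # assumed ROCm build of vLLM
env:
  - HF_TOKEN
commands:
  # Serve the model with tensor parallelism across the attached GPUs
  - vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct
      --tensor-parallel-size 8
      --port 8000
port: 8000
model: meta-llama/Llama-4-Scout-17B-16E-Instruct

resources:
  gpu: MI300X:8                # illustrative count; scale to context length and batch size
```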
@@ -59,8 +59,8 @@ resources:
 ### NVIDIA

 Here's an example of a service that deploys
-[`Llama-4-Scout-17B-16E-Instruct` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct){:target="_blank"}
-using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"}
+[`Llama-4-Scout-17B-16E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
+using [SGLang](https://github.com/sgl-project/sglang) and [vLLM](https://github.com/vllm-project/vllm)
 with NVIDIA `H200` GPUs.

 === "SGLang"

@@ -128,7 +128,7 @@ with NVIDIA `H200` GPUs.
 !!! info "NOTE:"
     With vLLM, add `--override-generation-config='{"attn_temperature_tuning": true}'` to
-    improve accuracy for [contexts longer than 32K tokens :material-arrow-top-right-thin:{ .external }](https://blog.vllm.ai/2025/04/05/llama4.html){:target="_blank"}.
+    improve accuracy for [contexts longer than 32K tokens](https://blog.vllm.ai/2025/04/05/llama4.html).

 ### Memory requirements

@@ -201,7 +201,7 @@ is available at `https://./`.
 ## Fine-tuning

-Here's and example of FSDP and QLoRA fine-tuning of 4-bit Quantized [Llama-4-Scout-17B-16E :material-arrow-top-right-thin:{ .external }](https://huggingface.co/axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16) on 2xH100 NVIDIA GPUs using [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}
+Here's an example of FSDP and QLoRA fine-tuning of the 4-bit quantized [Llama-4-Scout-17B-16E](https://huggingface.co/axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16) on 2xH100 NVIDIA GPUs using [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl).
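The corresponding `dstack` task is roughly the kind of thing sketched below. This is not the maintained file (that is `examples/single-node-training/axolotl/.dstack.yml`); the image tag, environment variables, and GPU spec are assumptions, while the Axolotl config path is the upstream one referenced in the text.

```yaml
type: task
name: axolotl-scout-qlora-fsdp

image: axolotlai/axolotl:main-latest   # assumed Axolotl image tag
env:
  - HF_TOKEN
  - WANDB_API_KEY
commands:
  # Upstream Axolotl recipe for FSDP + QLoRA fine-tuning of Llama 4 Scout;
  # path is relative to the Axolotl checkout shipped inside the image
  - axolotl train examples/llama-4/scout-qlora-flexattn-fsdp2.yaml

resources:
  gpu: H100:2                          # the text assumes 2xH100
  shm_size: 24GB
```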
@@ -279,10 +279,10 @@ $ dstack apply -f examples/single-node-training/axolotl/.dstack.yml
 ## Source code

 The source-code for deployment examples can be found in
-[`examples/llms/llama` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/llama) and the source-code for the finetuning example can be found in [`examples/single-node-training/axolotl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl){:target="_blank"}.
+[`examples/llms/llama`](https://github.com/dstackai/dstack/blob/master/examples/llms/llama) and the source-code for the fine-tuning example can be found in [`examples/single-node-training/axolotl`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl).

 ## What's next?

 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips).
-2. Browse [Llama 4 with SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang/blob/main/docs/references/llama4.md), [Llama 4 with vLLM :material-arrow-top-right-thin:{ .external }](https://blog.vllm.ai/2025/04/05/llama4.html), [Llama 4 with AMD :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/artificial-intelligence/llama4-day-0-support/README.html) and [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}.
+2. Browse [Llama 4 with SGLang](https://github.com/sgl-project/sglang/blob/main/docs/references/llama4.md), [Llama 4 with vLLM](https://blog.vllm.ai/2025/04/05/llama4.html), [Llama 4 with AMD](https://rocm.blogs.amd.com/artificial-intelligence/llama4-day-0-support/README.html), and [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl).
diff --git a/examples/llms/llama31/README.md b/examples/llms/llama31/README.md
index bc07d74da..ff288e3c8 100644
--- a/examples/llms/llama31/README.md
+++ b/examples/llms/llama31/README.md
@@ -219,7 +219,7 @@ is available at `https://gateway./`.
 ### Running on multiple GPUs

 Below is the task configuration file of fine-tuning Llama 3.1 8B using TRL on the
-[`OpenAssistant/oasst_top1_2023-08-25` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/OpenAssistant/oasst_top1_2023-08-25) dataset.
@@ -374,13 +374,13 @@ resources: ## Source code The source-code of this example can be found in -[`examples/llms/llama31` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/llama31) and [`examples/single-node-training/trl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl). +[`examples/llms/llama31`](https://github.com/dstackai/dstack/blob/master/examples/llms/llama31) and [`examples/single-node-training/trl`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl). ## What's next? 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips). -2. Browse [Llama 3.1 on HuggingFace :material-arrow-top-right-thin:{ .external }](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f), - [HuggingFace's Llama recipes :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/huggingface-llama-recipes), - [Meta's Llama recipes :material-arrow-top-right-thin:{ .external }](https://github.com/meta-llama/llama-recipes) - and [Llama Agentic System :material-arrow-top-right-thin:{ .external }](https://github.com/meta-llama/llama-agentic-system/). +2. Browse [Llama 3.1 on HuggingFace](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f), + [HuggingFace's Llama recipes](https://github.com/huggingface/huggingface-llama-recipes), + [Meta's Llama recipes](https://github.com/meta-llama/llama-recipes) + and [Llama Agentic System](https://github.com/meta-llama/llama-agentic-system/). diff --git a/examples/llms/llama32/README.md b/examples/llms/llama32/README.md index 484720ee2..c60713932 100644 --- a/examples/llms/llama32/README.md +++ b/examples/llms/llama32/README.md @@ -122,11 +122,11 @@ is available at `https://./`. ## Source code The source-code of this example can be found in -[`examples/llms/llama32` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/llama32). +[`examples/llms/llama32`](https://github.com/dstackai/dstack/blob/master/examples/llms/llama32). ## What's next? 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips). -2. Browse [Llama 3.2 on HuggingFace :material-arrow-top-right-thin:{ .external }](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) - and [LLama 3.2 on vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-language-models). +2. Browse [Llama 3.2 on HuggingFace](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) + and [LLama 3.2 on vLLM](https://docs.vllm.ai/en/latest/models/supported_models.html#multimodal-language-models). diff --git a/examples/misc/airflow/README.md b/examples/misc/airflow/README.md index 21687de74..fa1a15f76 100644 --- a/examples/misc/airflow/README.md +++ b/examples/misc/airflow/README.md @@ -78,4 +78,4 @@ def pipeline(...): ## Source code The source code for this example can be found in -[`examples/misc/airflow` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/misc/airflow). 
+[`examples/misc/airflow`](https://github.com/dstackai/dstack/blob/master/examples/misc/airflow).
diff --git a/examples/misc/docker-compose/README.md b/examples/misc/docker-compose/README.md
index 262f2abfd..d74dba030 100644
--- a/examples/misc/docker-compose/README.md
+++ b/examples/misc/docker-compose/README.md
@@ -2,10 +2,10 @@
 All backends except `runpod`, `vastai`, and `kubernetes` allow using [Docker and Docker Compose](https://dstack.ai/docs/guides/protips#docker-and-docker-compose) inside `dstack` runs.

-This example shows how to deploy Hugging Face [Chat UI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/chat-ui/index){:target="_blank"}
-with [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/index){:target="_blank"}
-serving [Llama-3.2-3B-Instruct :material-arrow-top-right-thin:{ .external }](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct){:target="_blank"}
-using [Docker Compose :material-arrow-top-right-thin:{ .external }](https://docs.docker.com/compose/){:target="_blank"}.
+This example shows how to deploy Hugging Face [Chat UI](https://huggingface.co/docs/chat-ui/index)
+with [TGI](https://huggingface.co/docs/text-generation-inference/en/index)
+serving [Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)
+using [Docker Compose](https://docs.docker.com/compose/).

 ??? info "Prerequisites"
     Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples.

@@ -103,7 +103,7 @@ using [Docker Compose :material-arrow-top-right-thin:{ .external }](https://docs
 ### Deploying as a service

 If you'd like to deploy Chat UI as an auto-scalable and secure endpoint,
-use the service configuration. You can find it at [`examples/misc/docker-compose/service.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose/service.dstack.yml)
+use the service configuration. You can find it at [`examples/misc/docker-compose/service.dstack.yml`](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose/service.dstack.yml).

 ### Running a configuration

@@ -172,7 +172,7 @@ be persisted.
 ## Source code

 The source-code of this example can be found in
-[`examples/misc/docker-compose` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose).
+[`examples/misc/docker-compose`](https://github.com/dstackai/dstack/blob/master/examples/misc/docker-compose).

 ## What's next?

diff --git a/examples/models/wan22/README.md b/examples/models/wan22/README.md
index 352a3cd66..47e9f4230 100644
--- a/examples/models/wan22/README.md
+++ b/examples/models/wan22/README.md
@@ -1,6 +1,6 @@
 # Wan2.2

-[Wan2.2 :material-arrow-top-right-thin:{ .external }](https://github.com/Wan-Video/Wan2.2){:target="_blank"} is an open-source SOTA foundational video model. This example shows how to run the T2V-A14B model variant via `dstack` for text-to-video generation.
+[Wan2.2](https://github.com/Wan-Video/Wan2.2) is an open-source SOTA foundational video model. This example shows how to run the T2V-A14B model variant via `dstack` for text-to-video generation.

 ??? info "Prerequisites"
     Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples.
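In spirit, such a run boils down to a `dstack` task that fetches the weights and calls Wan2.2's generation script. The sketch below is an assumption-heavy illustration, not the maintained configuration in `examples/models/wan22`: the checkpoint repo id, the `generate.py` flags, and the GPU size should all be verified against the upstream Wan2.2 README.

```yaml
type: task
name: wan22-t2v

python: "3.12"
env:
  - HF_TOKEN
commands:
  - git clone https://github.com/Wan-Video/Wan2.2.git
  - pip install -r Wan2.2/requirements.txt
  # Checkpoint repo id is an assumption; confirm on the Hugging Face Hub
  - huggingface-cli download Wan-AI/Wan2.2-T2V-A14B --local-dir Wan2.2/Wan2.2-T2V-A14B
  # Flag names are modeled on the upstream generation script and may differ
  - cd Wan2.2 && python generate.py
      --task t2v-A14B
      --size "1280*720"
      --ckpt_dir ./Wan2.2-T2V-A14B
      --prompt "A cinematic shot of waves crashing against a rocky shore at sunset"

resources:
  gpu: 80GB                  # illustrative; the T2V-A14B variant needs a large-memory GPU
```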
@@ -137,4 +137,4 @@ wget https://bashupload.com/fIo7l/wan22.mp4
 ## Source code

 The source-code of this example can be found in
-[`examples/models/wan22` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/models/wan22){:target="_blank"}.
+[`examples/models/wan22`](https://github.com/dstackai/dstack/blob/master/examples/models/wan22).
diff --git a/examples/server-deployment/cloudformation/README.md b/examples/server-deployment/cloudformation/README.md
index a8ea773a8..2e2057df4 100644
--- a/examples/server-deployment/cloudformation/README.md
+++ b/examples/server-deployment/cloudformation/README.md
@@ -1,7 +1,7 @@
 # Deploying server to a private VPC via AWS CloudFormation

 If you'd like to deploy the server to a private AWS VPC, you can use
-our CloudFormation [template :material-arrow-top-right-thin:{ .external }](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml){:target="_blank"}.
+our CloudFormation [template](https://console.aws.amazon.com/cloudformation/home#/stacks/quickcreate?templateURL=https://get-dstack.s3.eu-west-1.amazonaws.com/cloudformation/template.yaml).

 First, ensure, you've set up a private VPC with public and private subnets.

@@ -14,4 +14,4 @@ Once, the stack is created, go to `Outputs` for the server URL and admin token.
 !!! info "Source code"
     If you'd like to adjust anything, the source code of the template can be found at
-    [`examples/server-deployment/cloudformation/template.yaml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml){:target="_blank"}.
+    [`examples/server-deployment/cloudformation/template.yaml`](https://github.com/dstackai/dstack/blob/master/examples/server-deployment/cloudformation/template.yaml).
diff --git a/examples/single-node-training/axolotl/README.md b/examples/single-node-training/axolotl/README.md
index e99be93de..c9c7917f0 100644
--- a/examples/single-node-training/axolotl/README.md
+++ b/examples/single-node-training/axolotl/README.md
@@ -1,6 +1,6 @@
 # Axolotl

-This example shows how to use [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"} with `dstack` to fine-tune 4-bit Quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA.
+This example shows how to use [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) with `dstack` to fine-tune the 4-bit quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA.

 ??? info "Prerequisites"
     Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples.

@@ -16,7 +16,7 @@ This example shows how to use [Axolotl :material-arrow-top-right-thin:{ .externa
 ## Define a configuration

-Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer configuration from a [`scout-qlora-flexattn-fsdp2.yaml` :material-arrow-top-right-thin:{ .external }](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml){:target="_blank"} file. The configuration uses 4-bit axolotl quantized version of `meta-llama/Llama-4-Scout-17B-16E`, requiring only ~43GB VRAM/GPU with 4K context length.
+Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer configuration from a [`scout-qlora-flexattn-fsdp2.yaml`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml) file. The configuration uses a 4-bit Axolotl-quantized version of `meta-llama/Llama-4-Scout-17B-16E`, requiring only ~43GB VRAM/GPU with 4K context length.

 Below is a task configuration that does fine-tuning.

@@ -90,7 +90,7 @@ Provisioning...
 ## Source code

 The source-code of this example can be found in
-[`examples/single-node-training/axolotl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl){:target="_blank"} and [`examples/distributed-training/axolotl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/axolotl){:target="_blank"}.
+[`examples/single-node-training/axolotl`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl) and [`examples/distributed-training/axolotl`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/axolotl).

 ## What's next?

diff --git a/examples/single-node-training/trl/README.md b/examples/single-node-training/trl/README.md
index f7431ad90..208408281 100644
--- a/examples/single-node-training/trl/README.md
+++ b/examples/single-node-training/trl/README.md
@@ -1,6 +1,6 @@
 # TRL

-This example walks you through how to use [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"} to fine-tune `Llama-3.1-8B` with `dstack` using SFT with QLoRA.
+This example walks you through how to use [TRL](https://github.com/huggingface/trl) to fine-tune `Llama-3.1-8B` with `dstack` using SFT with QLoRA.

 ## Define a configuration

@@ -106,7 +106,7 @@ Provisioning...
 ## Source code

 The source-code of this example can be found in
-[`examples/llms/llama31` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/llama31){:target="_blank"} and [`examples/single-node-training/trl` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl){:target="_blank"}.
+[`examples/llms/llama31`](https://github.com/dstackai/dstack/blob/master/examples/llms/llama31) and [`examples/single-node-training/trl`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl).

 ## What's next?