Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions .ci/scripts/test_model_e2e_windows.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#!/usr/bin/env pwsh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Script arguments, supplied by the CI workflow (.github/workflows/cuda-windows.yml).
param(
    # Target device string; only 'cuda-windows' is accepted (guarded below).
    [Parameter(Mandatory = $true)]
    [string]$Device,
    # Hugging Face model id, e.g. 'mistralai/Voxtral-Mini-3B-2507'.
    [Parameter(Mandatory = $true)]
    [string]$HfModel,
    # Quantization recipe name; informational only — echoed in the log banner.
    [Parameter(Mandatory = $true)]
    [string]$QuantName,
    # Directory holding the exported artifacts (model.pte, aoti_cuda_blob.ptd, ...).
    [string]$ModelDir = ".",
    # When non-empty, the CUDA version reported by nvcc must match this exactly.
    [string]$ExpectedCudaVersion = ""
)

# Fail fast: error on unset variables, make cmdlet and native-command failures
# terminating, and silence progress bars (they slow Invoke-WebRequest on CI).
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"
$PSNativeCommandUseErrorActionPreference = $true
$ProgressPreference = "SilentlyContinue"

# This script only knows how to run CUDA artifacts on Windows.
if ($Device -ne "cuda-windows") {
    throw "Unsupported device '$Device'. Expected 'cuda-windows'."
}

# Announce what is under test, resolve the artifact directory to an absolute
# path, and verify both artifacts produced by the Linux export job are present.
Write-Host "Testing model: $HfModel (quantization: $QuantName)"

$resolvedModelDir = (Resolve-Path -Path $ModelDir).Path

# Required export artifacts: the compiled program and the CUDA kernel blob.
$modelPte = Join-Path -Path $resolvedModelDir -ChildPath "model.pte"
$cudaBlob = Join-Path -Path $resolvedModelDir -ChildPath "aoti_cuda_blob.ptd"

foreach ($required in @(
        @{ File = $modelPte; Name = "model.pte" },
        @{ File = $cudaBlob; Name = "aoti_cuda_blob.ptd" }
    )) {
    if (-not (Test-Path -Path $required.File -PathType Leaf)) {
        throw "$($required.Name) not found in '$resolvedModelDir'"
    }
}

# Locate the ExecuTorch repo root relative to this script (.ci/scripts/../..).
$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
$executorchRoot = (Resolve-Path -Path (Join-Path -Path $scriptDir -ChildPath "..\..")).Path

# Per-model configuration table: cmake target/preset, runner example directory,
# the substring expected in the runner's output, and auxiliary artifact info.
$modelTable = @{
    "mistralai/Voxtral-Mini-3B-2507" = @{
        RunnerTarget   = "voxtral_runner"
        RunnerPath     = "voxtral"
        RunnerPreset   = "voxtral-cuda"
        ExpectedOutput = "identity"
        Preprocessor   = "voxtral_preprocessor.pte"
        TokenizerUrl   = "https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
        TokenizerFile  = "tekken.json"
        AudioUrl       = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
        AudioFile      = "poem.wav"
    }
    "nvidia/parakeet-tdt" = @{
        RunnerTarget   = "parakeet_runner"
        RunnerPath     = "parakeet"
        RunnerPreset   = "parakeet-cuda"
        ExpectedOutput = "Phoebe"
        Preprocessor   = ""
        TokenizerUrl   = ""
        TokenizerFile  = "tokenizer.model"
        AudioUrl       = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
        AudioFile      = "test_audio.wav"
    }
}

if (-not $modelTable.ContainsKey($HfModel)) {
    throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, nvidia/parakeet-tdt"
}

# Unpack into the script-scoped variables the remainder of the script consumes.
$modelConfig    = $modelTable[$HfModel]
$runnerTarget   = $modelConfig.RunnerTarget
$runnerPath     = $modelConfig.RunnerPath
$runnerPreset   = $modelConfig.RunnerPreset
$expectedOutput = $modelConfig.ExpectedOutput
$preprocessor   = $modelConfig.Preprocessor
$tokenizerUrl   = $modelConfig.TokenizerUrl
$tokenizerFile  = $modelConfig.TokenizerFile
$audioUrl       = $modelConfig.AudioUrl
$audioFile      = $modelConfig.AudioFile

function Download-IfNeeded {
    <#
    .SYNOPSIS
        Downloads a file from a URL, skipping the transfer if the destination
        already exists as a regular file.
    .PARAMETER Url
        Source URL to fetch.
    .PARAMETER OutFile
        Destination path on disk.
    #>
    param(
        [Parameter(Mandatory = $true)]
        [string]$Url,
        [Parameter(Mandatory = $true)]
        [string]$OutFile
    )

    if (Test-Path -Path $OutFile -PathType Leaf) {
        Write-Host "Using existing file: $OutFile"
        return
    }
    Write-Host "Downloading $Url -> $OutFile"
    try {
        Invoke-WebRequest -Uri $Url -OutFile $OutFile
    }
    catch {
        # A failed transfer can leave a truncated file at $OutFile; remove it so
        # a retry (or a later run) does not take the "Using existing file" path
        # with a corrupt artifact. Rethrow so the caller still fails loudly.
        Remove-Item -Path $OutFile -ErrorAction SilentlyContinue
        throw
    }
}

# Main flow: from the repo root, (1) sanity-check the CUDA toolchain,
# (2) build ExecuTorch and the model runner, (3) stage tokenizer/audio
# artifacts, and (4) run the runner and grep its output for the expected
# transcript substring. "::group::"/"::endgroup::" are GitHub Actions log
# folding markers. The outer finally always restores the original directory.
Push-Location $executorchRoot
try {
    Write-Host "::group::Check CUDA toolchain"
    # Out-String flattens nvcc's output so it can be regex-matched below.
    $nvccOutput = nvcc --version | Out-String
    Write-Host $nvccOutput
    nvidia-smi
    # Optional strict version gate: parse "release X.Y" out of nvcc's banner
    # and compare against the workflow-supplied expectation.
    if (-not [string]::IsNullOrWhiteSpace($ExpectedCudaVersion)) {
        $versionMatch = [Regex]::Match($nvccOutput, "release\s+(\d+\.\d+)")
        if (-not $versionMatch.Success) {
            throw "Failed to parse CUDA version from nvcc output."
        }
        $actualCudaVersion = $versionMatch.Groups[1].Value
        if ($actualCudaVersion -ne $ExpectedCudaVersion) {
            throw "CUDA version mismatch. Expected: $ExpectedCudaVersion, Actual: $actualCudaVersion"
        }
        Write-Host "CUDA version check passed: $actualCudaVersion"
    }
    Write-Host "::endgroup::"

    Write-Host "::group::Build ExecuTorch (CUDA)"
    # Leave one core free for the OS/agent; never go below 1.
    $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1)
    cmake --preset llm-release-cuda
    cmake --build cmake-out --target install --config Release -j $numCores
    Write-Host "::endgroup::"

    Write-Host "::group::Build $runnerTarget"
    # The runner's preset must be configured from the example's own directory;
    # the inner try/finally guarantees we pop back even if cmake fails.
    Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath")
    try {
        cmake --preset $runnerPreset
        cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores
    }
    finally {
        Pop-Location
    }
    Write-Host "::endgroup::"

    Write-Host "::group::Prepare Artifacts"
    # Preprocessor (if the model needs one) must already be in the artifact dir;
    # it is produced by the export job, not downloadable here.
    if ($preprocessor -ne "") {
        $preprocessorPath = Join-Path -Path $resolvedModelDir -ChildPath $preprocessor
        if (-not (Test-Path -Path $preprocessorPath -PathType Leaf)) {
            throw "Required preprocessor artifact not found: $preprocessorPath"
        }
    }
    # Tokenizer: must exist locally unless a download URL is configured.
    if ($tokenizerFile -ne "") {
        $tokenizerPath = Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile
        if (-not (Test-Path -Path $tokenizerPath -PathType Leaf) -and $tokenizerUrl -eq "") {
            throw "Required tokenizer artifact not found: $tokenizerPath"
        }
    }
    if ($tokenizerUrl -ne "") {
        Download-IfNeeded -Url "$tokenizerUrl/$tokenizerFile" -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile)
    }
    if ($audioUrl -ne "") {
        Download-IfNeeded -Url $audioUrl -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $audioFile)
    }
    # Log the artifact directory contents for debugging failed runs.
    Get-ChildItem -Path $resolvedModelDir
    Write-Host "::endgroup::"

    Write-Host "::group::Run $runnerTarget"
    # Multi-config generators (Visual Studio) place binaries under Release\;
    # single-config generators put them directly in the target directory.
    $runnerExeCandidates = @(
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\Release\$runnerTarget.exe"),
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\$runnerTarget.exe")
    )
    $runnerExe = $runnerExeCandidates | Where-Object { Test-Path -Path $_ -PathType Leaf } | Select-Object -First 1
    if (-not $runnerExe) {
        throw "Runner executable not found. Checked: $($runnerExeCandidates -join ', ')"
    }

    # Base argument list; the switch below adds (or, for parakeet, rebuilds
    # in its preferred order) the per-model arguments.
    $runnerArgs = @("--model_path", $modelPte, "--data_path", $cudaBlob)
    switch ($HfModel) {
        "mistralai/Voxtral-Mini-3B-2507" {
            $runnerArgs += @(
                "--temperature", "0",
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--processor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
            )
        }
        "nvidia/parakeet-tdt" {
            $runnerArgs = @(
                "--model_path", $modelPte,
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
                "--data_path", $cudaBlob
            )
        }
    }

    # Run the runner with stdout/stderr captured to unique temp files
    # (Start-Process cannot redirect both streams to the same file); the
    # files are always cleaned up in the finally block.
    $stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    $stderrFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stderr_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    try {
        $proc = Start-Process `
            -FilePath $runnerExe `
            -ArgumentList $runnerArgs `
            -NoNewWindow `
            -Wait `
            -PassThru `
            -RedirectStandardOutput $stdoutFile `
            -RedirectStandardError $stderrFile

        $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" }
        $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" }
        $output = @($stdout, $stderr) -join [Environment]::NewLine
        $exitCode = $proc.ExitCode
    }
    finally {
        Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue
        Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue
    }
    Write-Host "Runner output:"
    Write-Host $output

    # A non-zero exit is deliberately only a warning; the authoritative pass
    # criterion is the expected-output substring check below.
    if ($exitCode -ne 0) {
        Write-Warning "Runner exited with code $exitCode (may be benign)`n$output"
    }

    # Escape so the expected output is matched literally, not as a regex.
    if ($expectedOutput -ne "" -and $output -notmatch [Regex]::Escape($expectedOutput)) {
        throw "Expected output '$expectedOutput' not found in runner output"
    }
    Write-Host "Success: '$expectedOutput' found in output"
    Write-Host "::endgroup::"
}
finally {
    Pop-Location
}
57 changes: 50 additions & 7 deletions .github/workflows/cuda-windows.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# Test ExecuTorch CUDA Windows Cross-Compilation Export
# This workflow tests model export targeting CUDA Windows using optimum-executorch.
# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
# Docker image which has mingw and Windows CUDA SDK pre-installed for cross-compilation.
# Test ExecuTorch CUDA Windows Artifacts
# This workflow exports models targeting CUDA Windows using optimum-executorch on Linux.
# Then it runs those exported artifacts on a Windows CI machine.

name: Test CUDA Windows Export
name: Test CUDA Windows Export and E2E

on:
pull_request:
Expand Down Expand Up @@ -35,8 +34,8 @@ jobs:
- repo: "nvidia"
name: "parakeet-tdt"
quant:
- "non-quantized"
- "quantized-int4-weight-only"
- "non-quantized"
- "quantized-int4-weight-only"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -81,3 +80,47 @@ jobs:
echo "::endgroup::"

source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

test-model-cuda-windows-e2e:
name: test-model-cuda-windows-e2e
needs: export-model-cuda-windows-artifact
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
strategy:
fail-fast: false
matrix:
model:
- repo: "mistralai"
name: "Voxtral-Mini-3B-2507"
- repo: "nvidia"
name: "parakeet-tdt"
quant:
- "non-quantized"
- "quantized-int4-weight-only"
with:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.8
submodules: recursive
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
conda init powershell
powershell -Command "& {
Set-PSDebug -Trace 1
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
\$env:CUDA_PATH = \$env:CUDA_HOME
\$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
nvcc --version

.ci/scripts/setup-windows.ps1
\$artifactDir = \$env:RUNNER_ARTIFACT_DIR
if ([string]::IsNullOrWhiteSpace(\$artifactDir)) {
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
}

.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model.repo }}/${{ matrix.model.name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
}"
64 changes: 63 additions & 1 deletion examples/models/parakeet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,47 @@ This generates:
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
- `tokenizer.model` - SentencePiece tokenizer

### CUDA-Windows Export

Before running a `cuda-windows` export, make sure the following requirements are in place:
- `x86_64-w64-mingw32-g++` is installed and on `PATH` (mingw-w64 cross-compiler).
- `WINDOWS_CUDA_HOME` points to the extracted Windows CUDA package directory.

Example setup on Ubuntu:

```bash
# 1) Install cross-compiler + extraction tools
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
g++-mingw-w64-x86-64-posix mingw-w64-tools p7zip-full wget

# 2) Verify cross-compiler
x86_64-w64-mingw32-g++ --version

# 3) Download and extract Windows CUDA installer package
CUDA_VERSION=12.8.1
CUDA_DRIVER_VERSION=572.61
CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"

mkdir -p /opt/cuda-windows
cd /opt/cuda-windows
wget -q "${CUDA_URL}" -O "${CUDA_INSTALLER}"
7z x "${CUDA_INSTALLER}" -oextracted -y

# 4) Point WINDOWS_CUDA_HOME to extracted Windows CUDA payload
export WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
```

```bash
python export_parakeet_tdt.py --backend cuda-windows --output-dir ./parakeet_cuda_windows
```

This generates:
- `model.pte` - The compiled Parakeet TDT model
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
- `tokenizer.model` - SentencePiece tokenizer

## C++ Runner

### Building
Expand All @@ -149,6 +190,15 @@ make parakeet-metal
make parakeet-cuda
```

On Windows (PowerShell), use CMake workflow presets directly:

```powershell
cmake --workflow --preset llm-release-cuda
Push-Location examples/models/parakeet
cmake --workflow --preset parakeet-cuda
Pop-Location
```

### Running

From the executorch root directory:
Expand All @@ -174,12 +224,24 @@ DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner
--tokenizer_path examples/models/parakeet/parakeet_cuda/tokenizer.model
```

Windows (PowerShell):

```powershell
.\cmake-out\examples\models\parakeet\Release\parakeet_runner.exe `
--model_path C:\path\to\parakeet_cuda_windows\model.pte `
--data_path C:\path\to\parakeet_cuda_windows\aoti_cuda_blob.ptd `
--audio_path C:\path\to\audio.wav `
--tokenizer_path C:\path\to\parakeet_cuda_windows\tokenizer.model
```

If your generator is single-config, the runner may be at `.\cmake-out\examples\models\parakeet\parakeet_runner.exe` instead.

### Runner Arguments

| Argument | Description |
|----------|-------------|
| `--model_path` | Path to Parakeet model (.pte) |
| `--audio_path` | Path to input audio file (.wav) |
| `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.json`) |
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA) |
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) |
| `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
Loading
Loading