Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions .ci/scripts/test_model_e2e_windows.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#!/usr/bin/env pwsh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Script arguments, supplied by the CI workflow (.github/workflows/cuda-windows.yml).
param(
    # Target device string; only 'cuda-windows' is accepted (guarded below).
    [Parameter(Mandatory = $true)]
    [string]$Device,
    # Hugging Face model id, e.g. 'mistralai/Voxtral-Mini-3B-2507'.
    [Parameter(Mandatory = $true)]
    [string]$HfModel,
    # Quantization recipe name; informational only — echoed in the log banner.
    [Parameter(Mandatory = $true)]
    [string]$QuantName,
    # Directory holding the exported artifacts (model.pte, aoti_cuda_blob.ptd, ...).
    [string]$ModelDir = ".",
    # When non-empty, the CUDA version reported by nvcc must match this exactly.
    [string]$ExpectedCudaVersion = ""
)

# Fail fast: error on unset variables, make cmdlet and native-command failures
# terminating, and silence progress bars (they slow Invoke-WebRequest on CI).
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"
$PSNativeCommandUseErrorActionPreference = $true
$ProgressPreference = "SilentlyContinue"

# This script only knows how to run CUDA artifacts on Windows.
if ($Device -ne "cuda-windows") {
    throw "Unsupported device '$Device'. Expected 'cuda-windows'."
}

# Announce what is under test, resolve the artifact directory to an absolute
# path, and verify both artifacts produced by the Linux export job are present.
Write-Host "Testing model: $HfModel (quantization: $QuantName)"

$resolvedModelDir = (Resolve-Path -Path $ModelDir).Path

# Required export artifacts: the compiled program and the CUDA kernel blob.
$modelPte = Join-Path -Path $resolvedModelDir -ChildPath "model.pte"
$cudaBlob = Join-Path -Path $resolvedModelDir -ChildPath "aoti_cuda_blob.ptd"

foreach ($required in @(
        @{ File = $modelPte; Name = "model.pte" },
        @{ File = $cudaBlob; Name = "aoti_cuda_blob.ptd" }
    )) {
    if (-not (Test-Path -Path $required.File -PathType Leaf)) {
        throw "$($required.Name) not found in '$resolvedModelDir'"
    }
}

# Locate the ExecuTorch repo root relative to this script (.ci/scripts/../..).
$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
$executorchRoot = (Resolve-Path -Path (Join-Path -Path $scriptDir -ChildPath "..\..")).Path

# Per-model configuration table: cmake target/preset, runner example directory,
# the substring expected in the runner's output, and auxiliary artifact info.
$modelTable = @{
    "mistralai/Voxtral-Mini-3B-2507" = @{
        RunnerTarget   = "voxtral_runner"
        RunnerPath     = "voxtral"
        RunnerPreset   = "voxtral-cuda"
        ExpectedOutput = "identity"
        Preprocessor   = "voxtral_preprocessor.pte"
        TokenizerUrl   = "https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
        TokenizerFile  = "tekken.json"
        AudioUrl       = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
        AudioFile      = "poem.wav"
    }
    "nvidia/parakeet-tdt" = @{
        RunnerTarget   = "parakeet_runner"
        RunnerPath     = "parakeet"
        RunnerPreset   = "parakeet-cuda"
        ExpectedOutput = "Phoebe"
        Preprocessor   = ""
        TokenizerUrl   = ""
        TokenizerFile  = "tokenizer.model"
        AudioUrl       = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
        AudioFile      = "test_audio.wav"
    }
}

if (-not $modelTable.ContainsKey($HfModel)) {
    throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, nvidia/parakeet-tdt"
}

# Unpack into the script-scoped variables the remainder of the script consumes.
$modelConfig    = $modelTable[$HfModel]
$runnerTarget   = $modelConfig.RunnerTarget
$runnerPath     = $modelConfig.RunnerPath
$runnerPreset   = $modelConfig.RunnerPreset
$expectedOutput = $modelConfig.ExpectedOutput
$preprocessor   = $modelConfig.Preprocessor
$tokenizerUrl   = $modelConfig.TokenizerUrl
$tokenizerFile  = $modelConfig.TokenizerFile
$audioUrl       = $modelConfig.AudioUrl
$audioFile      = $modelConfig.AudioFile

function Download-IfNeeded {
    <#
    .SYNOPSIS
        Downloads a file from a URL, skipping the transfer if the destination
        already exists as a regular file.
    .PARAMETER Url
        Source URL to fetch.
    .PARAMETER OutFile
        Destination path on disk.
    #>
    param(
        [Parameter(Mandatory = $true)]
        [string]$Url,
        [Parameter(Mandatory = $true)]
        [string]$OutFile
    )

    if (Test-Path -Path $OutFile -PathType Leaf) {
        Write-Host "Using existing file: $OutFile"
        return
    }
    Write-Host "Downloading $Url -> $OutFile"
    try {
        Invoke-WebRequest -Uri $Url -OutFile $OutFile
    }
    catch {
        # A failed transfer can leave a truncated file at $OutFile; remove it so
        # a retry (or a later run) does not take the "Using existing file" path
        # with a corrupt artifact. Rethrow so the caller still fails loudly.
        Remove-Item -Path $OutFile -ErrorAction SilentlyContinue
        throw
    }
}

# Main flow: from the repo root, (1) sanity-check the CUDA toolchain,
# (2) build ExecuTorch and the model runner, (3) stage tokenizer/audio
# artifacts, and (4) run the runner and grep its output for the expected
# transcript substring. "::group::"/"::endgroup::" are GitHub Actions log
# folding markers. The outer finally always restores the original directory.
Push-Location $executorchRoot
try {
    Write-Host "::group::Check CUDA toolchain"
    # Out-String flattens nvcc's output so it can be regex-matched below.
    $nvccOutput = nvcc --version | Out-String
    Write-Host $nvccOutput
    nvidia-smi
    # Optional strict version gate: parse "release X.Y" out of nvcc's banner
    # and compare against the workflow-supplied expectation.
    if (-not [string]::IsNullOrWhiteSpace($ExpectedCudaVersion)) {
        $versionMatch = [Regex]::Match($nvccOutput, "release\s+(\d+\.\d+)")
        if (-not $versionMatch.Success) {
            throw "Failed to parse CUDA version from nvcc output."
        }
        $actualCudaVersion = $versionMatch.Groups[1].Value
        if ($actualCudaVersion -ne $ExpectedCudaVersion) {
            throw "CUDA version mismatch. Expected: $ExpectedCudaVersion, Actual: $actualCudaVersion"
        }
        Write-Host "CUDA version check passed: $actualCudaVersion"
    }
    Write-Host "::endgroup::"

    Write-Host "::group::Build ExecuTorch (CUDA)"
    # Leave one core free for the OS/agent; never go below 1.
    $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1)
    cmake --preset llm-release-cuda
    cmake --build cmake-out --target install --config Release -j $numCores
    Write-Host "::endgroup::"

    Write-Host "::group::Build $runnerTarget"
    # The runner's preset must be configured from the example's own directory;
    # the inner try/finally guarantees we pop back even if cmake fails.
    Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath")
    try {
        cmake --preset $runnerPreset
        cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores
    }
    finally {
        Pop-Location
    }
    Write-Host "::endgroup::"

    Write-Host "::group::Prepare Artifacts"
    # Preprocessor (if the model needs one) must already be in the artifact dir;
    # it is produced by the export job, not downloadable here.
    if ($preprocessor -ne "") {
        $preprocessorPath = Join-Path -Path $resolvedModelDir -ChildPath $preprocessor
        if (-not (Test-Path -Path $preprocessorPath -PathType Leaf)) {
            throw "Required preprocessor artifact not found: $preprocessorPath"
        }
    }
    # Tokenizer: must exist locally unless a download URL is configured.
    if ($tokenizerFile -ne "") {
        $tokenizerPath = Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile
        if (-not (Test-Path -Path $tokenizerPath -PathType Leaf) -and $tokenizerUrl -eq "") {
            throw "Required tokenizer artifact not found: $tokenizerPath"
        }
    }
    if ($tokenizerUrl -ne "") {
        Download-IfNeeded -Url "$tokenizerUrl/$tokenizerFile" -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile)
    }
    if ($audioUrl -ne "") {
        Download-IfNeeded -Url $audioUrl -OutFile (Join-Path -Path $resolvedModelDir -ChildPath $audioFile)
    }
    # Log the artifact directory contents for debugging failed runs.
    Get-ChildItem -Path $resolvedModelDir
    Write-Host "::endgroup::"

    Write-Host "::group::Run $runnerTarget"
    # Multi-config generators (Visual Studio) place binaries under Release\;
    # single-config generators put them directly in the target directory.
    $runnerExeCandidates = @(
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\Release\$runnerTarget.exe"),
        (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath\$runnerTarget.exe")
    )
    $runnerExe = $runnerExeCandidates | Where-Object { Test-Path -Path $_ -PathType Leaf } | Select-Object -First 1
    if (-not $runnerExe) {
        throw "Runner executable not found. Checked: $($runnerExeCandidates -join ', ')"
    }

    # Base argument list; the switch below adds (or, for parakeet, rebuilds
    # in its preferred order) the per-model arguments.
    $runnerArgs = @("--model_path", $modelPte, "--data_path", $cudaBlob)
    switch ($HfModel) {
        "mistralai/Voxtral-Mini-3B-2507" {
            $runnerArgs += @(
                "--temperature", "0",
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--processor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
            )
        }
        "nvidia/parakeet-tdt" {
            $runnerArgs = @(
                "--model_path", $modelPte,
                "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
                "--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
                "--data_path", $cudaBlob
            )
        }
    }

    # Run the runner with stdout/stderr captured to unique temp files
    # (Start-Process cannot redirect both streams to the same file); the
    # files are always cleaned up in the finally block.
    $stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    $stderrFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stderr_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
    try {
        $proc = Start-Process `
            -FilePath $runnerExe `
            -ArgumentList $runnerArgs `
            -NoNewWindow `
            -Wait `
            -PassThru `
            -RedirectStandardOutput $stdoutFile `
            -RedirectStandardError $stderrFile

        $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" }
        $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" }
        $output = @($stdout, $stderr) -join [Environment]::NewLine
        $exitCode = $proc.ExitCode
    }
    finally {
        Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue
        Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue
    }
    Write-Host "Runner output:"
    Write-Host $output

    # A non-zero exit is deliberately only a warning; the authoritative pass
    # criterion is the expected-output substring check below.
    if ($exitCode -ne 0) {
        Write-Warning "Runner exited with code $exitCode (may be benign)`n$output"
    }

    # Escape so the expected output is matched literally, not as a regex.
    if ($expectedOutput -ne "" -and $output -notmatch [Regex]::Escape($expectedOutput)) {
        throw "Expected output '$expectedOutput' not found in runner output"
    }
    Write-Host "Success: '$expectedOutput' found in output"
    Write-Host "::endgroup::"
}
finally {
    Pop-Location
}
57 changes: 50 additions & 7 deletions .github/workflows/cuda-windows.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# Test ExecuTorch CUDA Windows Cross-Compilation Export
# This workflow tests model export targeting CUDA Windows using optimum-executorch.
# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
# Docker image which has mingw and Windows CUDA SDK pre-installed for cross-compilation.
# Test ExecuTorch CUDA Windows Artifacts
# This workflow exports models targeting CUDA Windows using optimum-executorch on Linux.
# Then it runs those exported artifacts on a Windows CI machine.

name: Test CUDA Windows Export
name: Test CUDA Windows Export and E2E

on:
pull_request:
Expand Down Expand Up @@ -35,8 +34,8 @@ jobs:
- repo: "nvidia"
name: "parakeet-tdt"
quant:
- "non-quantized"
- "quantized-int4-weight-only"
- "non-quantized"
- "quantized-int4-weight-only"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -81,3 +80,47 @@ jobs:
echo "::endgroup::"

source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

test-model-cuda-windows-e2e:
name: test-model-cuda-windows-e2e
needs: export-model-cuda-windows-artifact
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
strategy:
fail-fast: false
matrix:
model:
- repo: "mistralai"
name: "Voxtral-Mini-3B-2507"
- repo: "nvidia"
name: "parakeet-tdt"
quant:
- "non-quantized"
- "quantized-int4-weight-only"
with:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.8
submodules: recursive
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
conda init powershell
powershell -Command "& {
Set-PSDebug -Trace 1
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
\$env:CUDA_PATH = \$env:CUDA_HOME
\$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
nvcc --version

.ci/scripts/setup-windows.ps1
\$artifactDir = \$env:RUNNER_ARTIFACT_DIR
if ([string]::IsNullOrWhiteSpace(\$artifactDir)) {
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
}

.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model.repo }}/${{ matrix.model.name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
}"
64 changes: 63 additions & 1 deletion examples/models/parakeet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,47 @@ This generates:
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
- `tokenizer.model` - SentencePiece tokenizer

### CUDA-Windows Export

Before running a `cuda-windows` export, make sure the following requirements are in place:
- `x86_64-w64-mingw32-g++` is installed and on `PATH` (mingw-w64 cross-compiler).
- `WINDOWS_CUDA_HOME` points to the extracted Windows CUDA package directory.

Example setup on Ubuntu:

```bash
# 1) Install cross-compiler + extraction tools
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
g++-mingw-w64-x86-64-posix mingw-w64-tools p7zip-full wget

# 2) Verify cross-compiler
x86_64-w64-mingw32-g++ --version

# 3) Download and extract Windows CUDA installer package
CUDA_VERSION=12.8.1
CUDA_DRIVER_VERSION=572.61
CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"

mkdir -p /opt/cuda-windows
cd /opt/cuda-windows
wget -q "${CUDA_URL}" -O "${CUDA_INSTALLER}"
7z x "${CUDA_INSTALLER}" -oextracted -y

# 4) Point WINDOWS_CUDA_HOME to extracted Windows CUDA payload
export WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
```

```bash
python export_parakeet_tdt.py --backend cuda-windows --output-dir ./parakeet_cuda_windows
```

This generates:
- `model.pte` - The compiled Parakeet TDT model
- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
- `tokenizer.model` - SentencePiece tokenizer

## C++ Runner

### Building
Expand All @@ -149,6 +190,15 @@ make parakeet-metal
make parakeet-cuda
```

On Windows (PowerShell), use CMake workflow presets directly:

```powershell
cmake --workflow --preset llm-release-cuda
Push-Location examples/models/parakeet
cmake --workflow --preset parakeet-cuda
Pop-Location
```

### Running

From the executorch root directory:
Expand All @@ -174,12 +224,24 @@ DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner
--tokenizer_path examples/models/parakeet/parakeet_cuda/tokenizer.model
```

Windows (PowerShell):

```powershell
.\cmake-out\examples\models\parakeet\Release\parakeet_runner.exe `
--model_path C:\path\to\parakeet_cuda_windows\model.pte `
--data_path C:\path\to\parakeet_cuda_windows\aoti_cuda_blob.ptd `
--audio_path C:\path\to\audio.wav `
--tokenizer_path C:\path\to\parakeet_cuda_windows\tokenizer.model
```

If your generator is single-config, the runner may be at `.\cmake-out\examples\models\parakeet\parakeet_runner.exe` instead.

### Runner Arguments

| Argument | Description |
|----------|-------------|
| `--model_path` | Path to Parakeet model (.pte) |
| `--audio_path` | Path to input audio file (.wav) |
| `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.json`) |
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA) |
| `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA/CUDA-Windows) |
| `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
Loading
Loading