Skip to content

Commit 56ff2f5

Browse files
committed
cleanup implementation
1 parent ff7d23a commit 56ff2f5

File tree

3 files changed

+404
-0
lines changed

3 files changed

+404
-0
lines changed

cluster-autoscaler/cloudprovider/azure/azure_config.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,16 @@ type Config struct {
115115

116116
// EnableLabelPredictionsOnTemplate defines whether to enable label predictions on the template when scaling from zero
117117
EnableLabelPredictionsOnTemplate bool `json:"enableLabelPredictionsOnTemplate,omitempty" yaml:"enableLabelPredictionsOnTemplate,omitempty"`
118+
119+
// EnableZombieCleanup defines whether to enable automatic cleanup of zombie VMSS instances
120+
EnableZombieCleanup bool `json:"enableZombieCleanup,omitempty" yaml:"enableZombieCleanup,omitempty"`
121+
122+
// ZombieCleanupDryRun defines whether zombie cleanup should run in dry-run mode (detect but don't delete)
123+
// In dry-run mode, zombies are detected and logged but NOT deleted
124+
ZombieCleanupDryRun bool `json:"zombieCleanupDryRun,omitempty" yaml:"zombieCleanupDryRun,omitempty"`
125+
126+
// ZombieMinAgeMinutes defines the minimum age (in minutes) before an instance is considered a zombie
127+
ZombieMinAgeMinutes int `json:"zombieMinAgeMinutes,omitempty" yaml:"zombieMinAgeMinutes,omitempty"`
118128
}
119129

120130
// These are only here for backward compatibility. Their equivalent exists in providerazure.Config with a different name.
@@ -146,6 +156,10 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
146156
cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
147157
cfg.StrictCacheUpdates = false
148158
cfg.EnableLabelPredictionsOnTemplate = true
159+
// Zombie cleanup disabled by default for safety
160+
cfg.EnableZombieCleanup = false
161+
cfg.ZombieCleanupDryRun = true // Default to dry-run mode for extra safety
162+
cfg.ZombieMinAgeMinutes = 5 // Default 5 minutes minimum age
149163

150164
// Config file overrides defaults
151165
if configReader != nil {

cluster-autoscaler/cloudprovider/azure/azure_manager.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,12 @@ func (m *AzureManager) forceRefresh() error {
232232
}
233233
m.lastRefresh = time.Now()
234234
klog.V(2).Infof("Refreshed Azure VM and VMSS list, next refresh after %v", m.lastRefresh.Add(m.azureCache.refreshInterval))
235+
236+
if err := m.cleanupZombieNodes(); err != nil {
237+
klog.Errorf("Failed to cleanup zombie nodes: %v", err)
238+
// Don't fail the refresh if zombie cleanup fails
239+
}
240+
235241
return nil
236242
}
237243

0 commit comments

Comments
 (0)