@@ -2,7 +2,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use anyhow::Result;
-use futures::FutureExt;
+
 use parking_lot::RwLock;
 use prometheus::{
     register_gauge, register_histogram, register_int_counter, Gauge, Histogram, IntCounter,
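
Aside: the `register_*` items imported here are the `prometheus` crate's convenience macros, which create a metric and register it with the default registry in one call. Below is a minimal sketch of the metrics holder this file appears to rely on; the `gpu_utilization` field matches the usage removed further down, the counter is purely illustrative, and since the removed code clones `self.metrics` into a spawned task, the holder is presumably shared behind the `Arc` imported at the top:

```rust
use prometheus::{register_gauge, register_int_counter, Gauge, IntCounter};

// Hypothetical metrics holder; only `gpu_utilization` is attested in this diff.
struct OptimizerMetrics {
    gpu_utilization: Gauge,
    inferences_total: IntCounter,
}

impl OptimizerMetrics {
    fn new() -> prometheus::Result<Self> {
        Ok(Self {
            // Each register_* macro creates the metric and adds it to the
            // default registry, erroring on duplicate metric names.
            gpu_utilization: register_gauge!(
                "ai_gpu_utilization_percent",
                "Most recent GPU utilization sampled via NVML"
            )?,
            inferences_total: register_int_counter!(
                "ai_inferences_total",
                "Total inference requests served"
            )?,
        })
    }
}
```
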
@@ -351,21 +351,7 @@ impl ModelOptimizer {
 
     // Quantization APIs – backend-specific implementations behind feature flags.
     pub fn quantize_fp16(&self) -> Result<()> {
-        #[cfg(feature = "tch")]
-        {
-            // With tch, a full-graph conversion requires model-specific access.
-            // For now, expose as not implemented until integrated with a model holder.
-            return Err(AiOptimizeError::NotImplemented("tch fp16 quantization").into());
-        }
-        #[cfg(feature = "onnx")]
-        {
-            // ONNX Runtime quantization should be handled offline or through tooling.
-            return Err(AiOptimizeError::NotImplemented("onnx fp16 quantization").into());
-        }
-        #[cfg(feature = "candle")]
-        {
-            return Err(AiOptimizeError::NotImplemented("candle fp16 quantization").into());
-        }
+
         #[allow(unreachable_code)]
         {
             warn!("fp16 quantization requested but no backend enabled");
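
Note on the pattern being removed: each backend was gated behind a Cargo feature, and the first enabled feature's block returned early, leaving the trailing `#[allow(unreachable_code)]` block as the no-backend fallback. A condensed sketch of that shape, with the allow hoisted to the function for clarity; `AiOptimizeError` is reproduced from the diff with only the relevant variant, the `warn!` source crate is assumed, and the final `Ok(())` is an assumption since the hunk truncates the fallback body:

```rust
use anyhow::Result;
use tracing::warn; // assumed; the file may use the `log` crate instead

// Reproduced from context: the crate's error enum, reduced to one variant.
#[derive(Debug, thiserror::Error)]
pub enum AiOptimizeError {
    #[error("not implemented: {0}")]
    NotImplemented(&'static str),
}

// With a backend feature enabled, the early return makes the tail dead
// code; the unreachable_code allowance suppresses that warning.
#[allow(unreachable_code)]
pub fn quantize_fp16() -> Result<()> {
    #[cfg(feature = "tch")]
    return Err(AiOptimizeError::NotImplemented("tch fp16 quantization").into());

    warn!("fp16 quantization requested but no backend enabled");
    Ok(()) // assumption: the truncated fallback presumably returns here
}
```
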
@@ -374,18 +360,7 @@ impl ModelOptimizer {
     }
 
     pub fn quantize_int8(&self) -> Result<()> {
-        #[cfg(feature = "onnx")]
-        {
-            return Err(AiOptimizeError::NotImplemented("onnx int8 quantization").into());
-        }
-        #[cfg(feature = "tch")]
-        {
-            return Err(AiOptimizeError::NotImplemented("tch int8 quantization").into());
-        }
-        #[cfg(feature = "candle")]
-        {
-            return Err(AiOptimizeError::NotImplemented("candle int8 quantization").into());
-        }
+
         #[allow(unreachable_code)]
         {
             warn!("int8 quantization requested but no backend enabled");
@@ -434,26 +409,7 @@ impl ModelOptimizer {
 
     pub fn start_monitoring(&self) {
         // GPU utilization polling via NVML if available
-        #[cfg(feature = "nvml")]
-        {
-            use nvml_wrapper::Nvml;
-            let nvml = Nvml::init().ok();
-            let metrics = self.metrics.clone();
-            tokio::spawn(async move {
-                if let Some(nvml) = nvml {
-                    loop {
-                        if let Ok(device) = nvml.device_by_index(0) {
-                            if let Ok(util) = device.utilization_rates() {
-                                metrics.gpu_utilization.set(util.gpu as f64);
-                            }
-                        }
-                        sleep(Duration::from_millis(1000)).await;
-                    }
-                } else {
-                    warn!("NVML init failed; GPU utilization metrics disabled");
-                }
-            });
-        }
+
 
         // Alerting loop to check thresholds; emits logs (integrate with Alertmanager externally)
         let thresholds = self.thresholds.clone();
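
For reference, the NVML polling loop deleted above can be lifted into a standalone task. This sketch keeps the old code's choices (device 0 only, 1 s interval, silently skipping per-sample errors) and assumes the `nvml-wrapper`, `prometheus`, and `tokio` crates with a gauge registered elsewhere:

```rust
use std::time::Duration;

use nvml_wrapper::Nvml;
use prometheus::Gauge;
use tokio::time::sleep;

// Mirrors the removed #[cfg(feature = "nvml")] block as a free function;
// spawn it with `tokio::spawn(poll_gpu_utilization(gauge))`.
async fn poll_gpu_utilization(gauge: Gauge) {
    let Ok(nvml) = Nvml::init() else {
        // Same behavior as the old code: no NVML, no GPU metrics.
        eprintln!("NVML init failed; GPU utilization metrics disabled");
        return;
    };
    loop {
        // Device 0 only, as in the removed code; a multi-GPU host would
        // iterate up to nvml.device_count() instead.
        if let Ok(device) = nvml.device_by_index(0) {
            if let Ok(util) = device.utilization_rates() {
                gauge.set(util.gpu as f64);
            }
        }
        sleep(Duration::from_secs(1)).await;
    }
}
```
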