Run TensorFlow models
Quantizing and converting a model
# Setup: create a working directory and virtual environment, then install the
# dependencies:
#
#   mkdir -p ~/post-training-quantization-tf
#   cd ~/post-training-quantization-tf
#   python3 -m venv .venv
#   source .venv/bin/activate
#   pip3 install tensorflow==2.20.0 tf_keras==2.20.1 ai-edge-litert==1.3.0
#
# Download the Keras model and the validation data:
#
#   mkdir -p models
#   wget -O models/cats.keras https://cdn.edgeimpulse.com/qc-ai-docs/models/cats.keras
#   wget -O models/cats_X_val.npy https://cdn.edgeimpulse.com/qc-ai-docs/models/cats_X_val.npy
#   wget -O models/cats_y_val.npy https://cdn.edgeimpulse.com/qc-ai-docs/models/cats_y_val.npy
#
# The script below is run as quantize.py (see the command at the bottom).

import os
import time

import numpy as np
import tensorflow as tf
import tf_keras as keras
from ai_edge_litert.interpreter import Interpreter, load_delegate

# Validation images. Shape: (444, 160, 160, 3)
X_val = np.load('models/cats_X_val.npy')
# Labels are stored with classes 1..6 -> shift to 0..5. The original note gives
# their shape as (444, 1); flatten so they compare element-wise with the
# (444,) argmax output instead of broadcasting to a (444, 444) matrix.
# (A no-op if the array is already 1-D.)
y_val = np.load('models/cats_y_val.npy').reshape(-1) - 1

# Load Keras model
model = keras.models.load_model("models/cats.keras")

# Calculate accuracy of the TF model (timed over the whole validation set)
tf_start = time.perf_counter()
y_pred = model.predict(X_val)
tf_end = time.perf_counter()

preds = np.argmax(y_pred, axis=1)
acc_tf = (preds == y_val).mean()
print(f"TF/Keras accuracy: {acc_tf*100:.2f}% (time per inference: {(tf_end - tf_start) * 1000 / X_val.shape[0]:.4g}ms)")
print('')

# Convert to a quantized TFLite file. Reuses the validation set loaded above
# as the representative dataset to improve accuracy. The conversion is skipped
# when the output file already exists.
TFLITE_FILE = 'cats_i8.tflite'
if not os.path.exists(TFLITE_FILE):
    print(f'Converting to TFLite file ({TFLITE_FILE})...')

    def rep_dataset():
        """Yield one batch-of-1 sample at a time for quantization calibration."""
        for i in range(X_val.shape[0]):
            yield [X_val[i:i+1]]

    # Build a fixed batch=1 input signature (QNN cannot handle dynamic dims)
    specs = []
    for t in model.inputs:
        if None in t.shape[1:]:
            raise ValueError(f"Non-batch dims must be known; got {t.shape}")
        specs.append(tf.TensorSpec([1, *t.shape[1:]], dtype=t.dtype, name=t.name.split(':')[0]))

    @tf.function(input_signature=specs)
    def serve(*xs):
        """Run the model on fixed-shape inputs and always return a tuple."""
        # A Keras model takes all of its inputs as the FIRST argument; further
        # positional arguments would be bound to `training`/`mask`. So pass a
        # single tensor as-is and several tensors as one list.
        # NOTE: rep_dataset and run_tflite_model still feed a single input.
        y = model(xs[0] if len(xs) == 1 else list(xs))
        return y if isinstance(y, (tuple, list)) else (y,)  # keep output order stable

    concrete = serve.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete], model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = rep_dataset
    # Full-integer quantization: int8 ops only, with int8 input and output
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8

    tflite_model = converter.convert()
    with open(TFLITE_FILE, "wb") as f:
        f.write(tflite_model)

    print(f"TFLite written: {TFLITE_FILE} ({os.path.getsize(TFLITE_FILE)/1e6:.2f} MB)")
else:
    print(f'TFLite file already exists ({TFLITE_FILE})')
print('')


def _quantize(x, scale, zero_point, dtype):
    """Map float values onto the interpreter's input dtype.

    A scale of 0 means the tensor carries no quantization parameters, in which
    case the values are only cast. For integer dtypes the rounded values are
    clipped to the representable range first, because a plain ``astype`` would
    wrap out-of-range values around (e.g. 130 -> -126 for int8).
    """
    if scale == 0:
        return x.astype(dtype)
    q = np.round(x / scale + zero_point)
    if np.issubdtype(dtype, np.integer):
        limits = np.iinfo(dtype)
        q = np.clip(q, limits.min, limits.max)
    return q.astype(dtype)


def _dequantize(q, scale, zero_point):
    """Map interpreter output back to float32 (cast only when scale is 0)."""
    values = q.astype(np.float32)
    if scale == 0:
        return values
    return (values - zero_point) * scale


def run_tflite_model(model_path, use_npu):
    """Evaluate a TFLite model on the validation set and print its accuracy.

    Args:
        model_path: Path to the .tflite file to load.
        use_npu: When true, run through the QNN delegate (HTP backend);
            otherwise run on the default CPU path.

    Prints the accuracy against ``y_val`` and the mean wall-clock time per
    inference. The timing covers input quantization, invoke and output
    dequantization for each sample.
    """
    # Use QNN to run this model on NPU
    experimental_delegates = []
    if use_npu:
        experimental_delegates = [load_delegate("libQnnTFLiteDelegate.so", options={"backend_type": "htp"})]

    # Construct the interpreter for the quantized TFLite file
    interpreter = Interpreter(model_path=model_path, experimental_delegates=experimental_delegates)
    interpreter.allocate_tensors()
    in_details = interpreter.get_input_details()[0]
    out_details = interpreter.get_output_details()[0]

    # You need to scale the input / output yourself using quantization params
    in_scale, in_zp = in_details["quantization"]
    out_scale, out_zp = out_details["quantization"]
    in_dtype = in_details['dtype']

    # Loop through one-by-one (most TFLite files have a fixed batch size of 1)
    preds_tflite = []
    tflite_start = time.perf_counter()
    for i in range(X_val.shape[0]):
        # Scale input and invoke
        x_q = _quantize(X_val[i:i+1], in_scale, in_zp, in_dtype)
        interpreter.set_tensor(in_details["index"], x_q)
        interpreter.invoke()

        # Scale output back to f32
        out = _dequantize(interpreter.get_tensor(out_details["index"]), out_scale, out_zp)

        # And add the outcome to the predictions
        preds_tflite.append(np.argmax(out, axis=1)[0])
    tflite_end = time.perf_counter()

    # Compare accuracy in the same way as above
    acc_tflite = (np.array(preds_tflite) == y_val).mean()
    target = "NPU" if use_npu else "CPU"
    print(f"Quantized TFLite accuracy ({target}): {acc_tflite*100:.2f}% (time per inference: {(tflite_end - tflite_start) * 1000 / X_val.shape[0]:.4g}ms)")


run_tflite_model(TFLITE_FILE, False)
run_tflite_model(TFLITE_FILE, True)

# Run the script:
#
#   python3 quantize.py
#
# Example output:
#
# TF/Keras accuracy: 94.37% (time per inference: 29.86ms)
#
# Converting to TFLite file (cats_i8.tflite)...
# ...
# Quantized TFLite accuracy (CPU): 87.16% (time per inference: 10.37ms)
# Quantized TFLite accuracy (NPU): 88.51% (time per inference: 3.809ms)
Last updated