File size: 1,283 Bytes
4e8c1b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# import onnx

# # Load the ONNX model
# model_path = "model_uint8.onnx"  # Replace with the path to your ONNX model
# onnx_model = onnx.load(model_path)

# # Print model's input and output shapes
# for input_tensor in onnx_model.graph.input:
#     print(f"Input Name: {input_tensor.name}")
#     print(
#         f"Input Shape: {[dim.dim_value for dim in input_tensor.type.tensor_type.shape.dim]}"
#     )

# for output_tensor in onnx_model.graph.output:
#     print(f"Output Name: {output_tensor.name}")
#     print(
#         f"Output Shape: {[dim.dim_value for dim in output_tensor.type.tensor_type.shape.dim]}"
#     )


# Dynamically quantize an ONNX embedding model's weights to uint8 using
# onnxruntime. Dynamic quantization converts weights to 8-bit integers
# ahead of time; activations are quantized on the fly at inference.

from pathlib import Path

from onnxruntime.quantization import quantize_dynamic, QuantType

# Path to the original (float) ONNX model and where to write the quantized one.
onnx_model_path = "./granite_embedding_model.onnx"  # Path to the original ONNX model
quantized_model_path = "./model_uint8.onnx"  # Path to save the quantized ONNX model


def main() -> None:
    """Quantize the model at ``onnx_model_path`` and report the output path.

    Raises:
        FileNotFoundError: if the input model file does not exist, giving a
            clearer error than the failure onnxruntime would raise internally.
    """
    if not Path(onnx_model_path).is_file():
        raise FileNotFoundError(f"Input ONNX model not found: {onnx_model_path}")

    # Perform dynamic quantization to UInt8 weights (matches the output filename).
    quantize_dynamic(
        model_input=onnx_model_path,  # Input ONNX model path
        model_output=quantized_model_path,  # Output quantized model path
        weight_type=QuantType.QUInt8,  # Use UInt8 for weights
    )

    # Print confirmation of quantization
    print(f"Quantized model saved to {quantized_model_path}")


# Entry guard: importing this module must not trigger quantization as a
# side effect; only direct execution runs it.
if __name__ == "__main__":
    main()