Update the python wrapper script to support weight sharing case (#22341)

Update the python wrapper script to support weight sharing case
### Description
Update the script to support a JSON file produced by the QNN converter as well as one extracted from a QNN context binary file, for the weight-sharing scenario.
This commit is contained in:
Hector Li 2024-10-18 11:16:20 -07:00 committed by GitHub
parent a732f7a4b3
commit d2a5ee2e5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -20,135 +20,158 @@ class QnnTensorStruct:
self.dim = []
def is_quantized_data_type(qnn_data_type):
    """Return True when the numeric QNN data type code is an 8- or 16-bit fixed-point type."""
    fixed_point_codes = (
        0x0408,  # QNN_DATATYPE_UFIXED_POINT_8
        0x0416,  # QNN_DATATYPE_UFIXED_POINT_16
        0x0308,  # QNN_DATATYPE_FIXED_POINT_8
        0x0316,  # QNN_DATATYPE_FIXED_POINT_16
    )
    return any(qnn_data_type == code for code in fixed_point_codes)
def qnn_data_type_to_onnx_data_type(qnn_data_type):
    """Map a numeric QNN data type code to the matching ONNX TensorProto data type.

    Fixed-point QNN codes map to the ONNX integer type of the same width and
    signedness as the plain integer code listed next to them.

    Args:
        qnn_data_type: numeric QNN data type code (for example 0x0408).

    Returns:
        The TensorProto data type for the code, or TensorProto.UNDEFINED when
        the code is not one of those listed below.
    """
    code_groups = (
        ((0x0408, 0x0108), TensorProto.UINT8),  # QNN_DATATYPE_UFIXED_POINT_8, QNN_DATATYPE_UINT_8
        ((0x0416, 0x0116), TensorProto.UINT16),  # QNN_DATATYPE_UFIXED_POINT_16, QNN_DATATYPE_UINT_16
        ((0x0432, 0x0132), TensorProto.UINT32),  # QNN_DATATYPE_UFIXED_POINT_32, QNN_DATATYPE_UINT_32
        ((0x0164,), TensorProto.UINT64),  # QNN_DATATYPE_UINT_64
        ((0x0308, 0x0008), TensorProto.INT8),  # QNN_DATATYPE_FIXED_POINT_8, QNN_DATATYPE_INT_8
        ((0x0316, 0x0016), TensorProto.INT16),  # QNN_DATATYPE_FIXED_POINT_16, QNN_DATATYPE_INT_16
        ((0x0332, 0x0032), TensorProto.INT32),  # QNN_DATATYPE_FIXED_POINT_32, QNN_DATATYPE_INT_32
        ((0x0064,), TensorProto.INT64),  # QNN_DATATYPE_INT_64
        ((0x0216,), TensorProto.FLOAT16),  # QNN_DATATYPE_FLOAT_16
        ((0x0232,), TensorProto.FLOAT),  # QNN_DATATYPE_FLOAT_32
        ((0x0508,), TensorProto.BOOL),  # QNN_DATATYPE_BOOL_8
    )
    for codes, onnx_type in code_groups:
        if qnn_data_type in codes:
            return onnx_type
    # Explicit fallback so an unknown code never yields an implicit None.
    return TensorProto.UNDEFINED
def is_quantized_data_type(qnn_data_type, is_converter_json):
    """Tell whether a QNN data type is an 8- or 16-bit fixed-point (quantized) type.

    Args:
        qnn_data_type: numeric type code when is_converter_json is true,
            otherwise the QNN data type name as a string.
        is_converter_json: selects which of the two representations is compared.

    Returns:
        True for the unsigned/signed 8- and 16-bit fixed-point types, else False.
    """
    if is_converter_json:
        # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_FIXED_POINT_16
        return qnn_data_type in (0x0408, 0x0416, 0x0308, 0x0316)
    # The string branch must return a bool; nothing may return before it.
    return qnn_data_type in (
        "QNN_DATATYPE_UFIXED_POINT_8",
        "QNN_DATATYPE_UFIXED_POINT_16",
        "QNN_DATATYPE_FIXED_POINT_8",
        "QNN_DATATYPE_FIXED_POINT_16",
    )
def parse_qnn_json_file(qnn_json_file_path, qnn_input_tensor_dic, qnn_output_tensor_dic):
with open(qnn_json_file_path) as qnn_json_file:
qnn_json = json.load(qnn_json_file)
assert "graph" in qnn_json, "QNN converted json file not valid. Can't find graph."
assert "tensors" in qnn_json["graph"], "QNN converted json file not valid. Can't find tensors."
for qnn_tensor_name, qnn_tensor_attribute in qnn_json["graph"]["tensors"].items():
# type:0 - QNN input tensor, type:1 - QNN output tensor
assert (
"type" in qnn_tensor_attribute
and "data_type" in qnn_tensor_attribute
and "dims" in qnn_tensor_attribute
), "QNN converted json file not valid. Can't find some keys from tensors"
def qnn_data_type_to_onnx_data_type(qnn_data_type, is_converter_json):
    """Translate a QNN data type into the matching ONNX TensorProto data type.

    Args:
        qnn_data_type: numeric type code when is_converter_json is true,
            otherwise the QNN data type name as a string.
        is_converter_json: selects which of the two representations is compared.

    Returns:
        The TensorProto data type, or TensorProto.UNDEFINED when the value is
        not one of the entries in the table below.
    """
    # Each row: (numeric codes, type names, ONNX type). Fixed-point entries
    # share a row with the plain integer type of the same width and signedness.
    type_table = (
        ((0x0408, 0x0108), ("QNN_DATATYPE_UFIXED_POINT_8", "QNN_DATATYPE_UINT_8"), TensorProto.UINT8),
        ((0x0416, 0x0116), ("QNN_DATATYPE_UFIXED_POINT_16", "QNN_DATATYPE_UINT_16"), TensorProto.UINT16),
        ((0x0432, 0x0132), ("QNN_DATATYPE_UFIXED_POINT_32", "QNN_DATATYPE_UINT_32"), TensorProto.UINT32),
        ((0x0164,), ("QNN_DATATYPE_UINT_64",), TensorProto.UINT64),
        ((0x0308, 0x0008), ("QNN_DATATYPE_FIXED_POINT_8", "QNN_DATATYPE_INT_8"), TensorProto.INT8),
        ((0x0316, 0x0016), ("QNN_DATATYPE_FIXED_POINT_16", "QNN_DATATYPE_INT_16"), TensorProto.INT16),
        ((0x0332, 0x0032), ("QNN_DATATYPE_FIXED_POINT_32", "QNN_DATATYPE_INT_32"), TensorProto.INT32),
        ((0x0064,), ("QNN_DATATYPE_INT_64",), TensorProto.INT64),
        ((0x0216,), ("QNN_DATATYPE_FLOAT_16",), TensorProto.FLOAT16),
        ((0x0232,), ("QNN_DATATYPE_FLOAT_32",), TensorProto.FLOAT),
        ((0x0508,), ("QNN_DATATYPE_BOOL_8",), TensorProto.BOOL),
    )
    for codes, names, onnx_type in type_table:
        candidates = codes if is_converter_json else names
        if qnn_data_type in candidates:
            return onnx_type
    return TensorProto.UNDEFINED
# Get all graph inputs
if qnn_tensor_attribute["type"] == 0:
qnn_tensor = QnnTensorStruct()
qnn_tensor.name = qnn_tensor_name
qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(qnn_tensor_attribute["data_type"])
qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"])
qnn_tensor.dim = qnn_tensor_attribute["dims"]
if (
qnn_tensor_attribute["quant_params"]["definition"] == 1
and qnn_tensor_attribute["quant_params"]["encoding"] == 0
):
qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"]
qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"]
qnn_input_tensor_dic[qnn_tensor_name] = qnn_tensor
# Get all graph outputs
if qnn_tensor_attribute["type"] == 1:
qnn_tensor = QnnTensorStruct()
qnn_tensor.name = qnn_tensor_name
qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(qnn_tensor_attribute["data_type"])
qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"])
qnn_tensor.dim = qnn_tensor_attribute["dims"]
if (
qnn_tensor_attribute["quant_params"]["definition"] == 1
and qnn_tensor_attribute["quant_params"]["encoding"] == 0
):
qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"]
qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"]
qnn_output_tensor_dic[qnn_tensor_name] = qnn_tensor
def parse_qnn_converter_json_file(qnn_convert_json, qnn_input_tensor_dic, qnn_output_tensor_dic):
    """Collect graph input and output tensors from a QNN converter json object.

    Args:
        qnn_convert_json: parsed json; its ["graph"]["tensors"] mapping is read.
        qnn_input_tensor_dic: dict filled with name -> QnnTensorStruct for tensors of type 0.
        qnn_output_tensor_dic: dict filled with name -> QnnTensorStruct for tensors of type 1.

    Raises:
        AssertionError: if a tensor lacks "type", "data_type" or "dims", or if
            either dict is still empty after all tensors were visited.
    """
    from_converter = True
    required_keys = ("type", "data_type", "dims")

    for tensor_name, attrs in qnn_convert_json["graph"]["tensors"].items():
        assert all(
            key in attrs for key in required_keys
        ), "QNN converted json file not valid. Can't find some keys from tensors"

        # type:0 - QNN input tensor, type:1 - QNN output tensor; anything else is skipped
        tensor_type = attrs["type"]
        if tensor_type == 0:
            destination = qnn_input_tensor_dic
        elif tensor_type == 1:
            destination = qnn_output_tensor_dic
        else:
            continue

        tensor = QnnTensorStruct()
        tensor.name = tensor_name
        data_type = attrs["data_type"]
        tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(data_type, from_converter)
        tensor.is_quantized = is_quantized_data_type(data_type, from_converter)
        tensor.dim = attrs["dims"]

        # Scale and offset are only taken when definition == 1 and encoding == 0.
        quant_params = attrs["quant_params"]
        if quant_params["definition"] == 1 and quant_params["encoding"] == 0:
            scale_offset = quant_params["scale_offset"]
            tensor.scale = scale_offset["scale"]
            # The json offset is stored with its sign flipped.
            tensor.offset = 0 - scale_offset["offset"]

        destination[tensor_name] = tensor

    assert (
        len(qnn_input_tensor_dic) >= 1 and len(qnn_output_tensor_dic) >= 1
    ), "Converted QNN model not valid. It should have at least 1 input & 1 output."
# Onnxruntime QNN EP can support context binary file generated by QNN tool chain. However QNN generated context binary file
# uses channel last data layout and 8 bits or 16 bits for input and output.
# This script gets the QNN model input & output information from the QNN converted model_net.json file, compares them with the Onnx model
# and inserts Cast, Transpose nodes into the Onnx model if required
def main():
parser = ArgumentParser("Generate Onnx model which includes the QNN context binary.")
parser.add_argument("-b", "--qnn_bin", help="Required. Path to Qnn context binary file.", required=True, type=str)
parser.add_argument(
"-q", "--qnn_json", help="Required. Path to Qnn converted model_net.json file.", required=True, type=str
)
parser.add_argument(
"--disable_embed_mode",
action="store_true",
default=False,
help="Set embed_mode=1 which mean embed Qnn context binary into the onnx model. Otherwise, set context binary file path in the onnx model",
)
args = parser.parse_args()
# Parse Qnn model_net.json file to get the graph input output information
qnn_input_tensor_dic = {}
qnn_output_tensor_dic = {}
parse_qnn_json_file(args.qnn_json, qnn_input_tensor_dic, qnn_output_tensor_dic)
if args.disable_embed_mode:
ep_cache_context_content = args.qnn_bin
ctx_embed_mode = 0
else:
with open(args.qnn_bin, "rb") as file:
ep_cache_context_content = file.read()
ctx_embed_mode = 1
def generate_wrapper_onnx_file(
grap_name,
model_file_name,
qnn_input_tensor_dic,
qnn_output_tensor_dic,
disable_embed_mode,
qnn_ctx_file,
quantized_IO,
qnn_sdk_version="unknown",
):
graph_nodes = []
ini_list = []
value_infos = []
model_inputs = []
for qnn_input in qnn_input_tensor_dic.values():
if qnn_input.is_quantized:
if qnn_input.is_quantized and not quantized_IO:
q_scale_input_name = qnn_input.name + "_scale"
q_offset_input_name = qnn_input.name + "_zp"
q_scale = helper.make_tensor(q_scale_input_name, TensorProto.FLOAT, [], [qnn_input.scale])
@ -170,13 +193,22 @@ def main():
else:
model_inputs.append(helper.make_tensor_value_info(qnn_input.name, qnn_input.onnx_data_type, qnn_input.dim))
if disable_embed_mode:
ep_cache_context_content = qnn_ctx_file
ctx_embed_mode = 0
else:
with open(qnn_ctx_file, "rb") as file:
ep_cache_context_content = file.read()
ctx_embed_mode = 1
qnn_ep_context_node = helper.make_node(
"EPContext",
name="QnnContext",
name=grap_name,
inputs=qnn_input_tensor_dic.keys(),
outputs=qnn_output_tensor_dic.keys(),
ep_cache_context=ep_cache_context_content,
embed_mode=ctx_embed_mode,
ep_sdk_version=qnn_sdk_version,
source="Qnn",
domain="com.microsoft",
)
@ -184,7 +216,7 @@ def main():
model_outputs = []
for qnn_output in qnn_output_tensor_dic.values():
if qnn_output.is_quantized:
if qnn_output.is_quantized and not quantized_IO:
dq_scale_input_name = qnn_output.name + "_scale"
dq_offset_input_name = qnn_output.name + "_zp"
dq_scale = helper.make_tensor(dq_scale_input_name, TensorProto.FLOAT, [], [qnn_output.scale])
@ -214,7 +246,120 @@ def main():
model_def = helper.make_model(graph_def, producer_name="MS")
onnx.save(model_def, args.qnn_json.replace(".json", "_qnn_ctx.onnx"))
onnx.save(model_def, model_file_name)
# Parse one Qnn graph entry from the json file that was extracted from a context binary file
def parse_qnn_graph(qnn_graph, qnn_input_tensor_dic, qnn_output_tensor_dic):
    """Collect the input and output tensors of one QNN graph entry.

    Args:
        qnn_graph: one graph object; its ["info"] section is read for
            "graphName", "graphInputs" and "graphOutputs".
        qnn_input_tensor_dic: dict filled with name -> QnnTensorStruct for every graph input.
        qnn_output_tensor_dic: dict filled with name -> QnnTensorStruct for every graph output.

    Returns:
        The graph name taken from qnn_graph["info"]["graphName"].

    Raises:
        AssertionError: if either dict is still empty after parsing.
    """
    from_converter = False
    graph_name = qnn_graph["info"]["graphName"]
    io_groups = (
        (qnn_graph["info"]["graphInputs"], qnn_input_tensor_dic),
        (qnn_graph["info"]["graphOutputs"], qnn_output_tensor_dic),
    )

    for raw_tensors, destination in io_groups:
        for raw_tensor in raw_tensors:
            info = raw_tensor["info"]

            tensor = QnnTensorStruct()
            tensor.name = info["name"]
            data_type = info["dataType"]
            tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(data_type, from_converter)
            tensor.is_quantized = is_quantized_data_type(data_type, from_converter)
            tensor.dim = info["dimensions"]

            # Scale and offset are only taken for a defined scale-offset encoding.
            quant = info["quantizeParams"]
            if (
                quant["definition"] == "QNN_DEFINITION_DEFINED"
                and quant["quantizationEncoding"] == "QNN_QUANTIZATION_ENCODING_SCALE_OFFSET"
            ):
                scale_offset = quant["scaleOffset"]
                tensor.scale = scale_offset["scale"]
                # The json offset is stored with its sign flipped.
                tensor.offset = 0 - scale_offset["offset"]

            destination[tensor.name] = tensor

    assert (
        len(qnn_input_tensor_dic) >= 1 and len(qnn_output_tensor_dic) >= 1
    ), "Converted QNN model not valid. It should have at least 1 input & 1 output."

    return graph_name
# Onnxruntime QNN EP can support context binary file generated by QNN tool chain. However QNN generated context binary file
# uses channel last data layout and 8 bits or 16 bits for input and output.
# This script reads the QNN graph input & output information from a json file (either the QNN converter json or the json
# extracted from a QNN context binary) and generates wrapper Onnx model(s) containing an EPContext node. Unless
# --quantized_IO is set, Q and DQ nodes are inserted so the graph inputs & outputs are float32.
def main():
    """Command line entry point: generate wrapper Onnx model(s) for a QNN context binary.

    Two json layouts are accepted:
      * QNN converter json ("graph" -> "tensors"): one model is written next to
        the json file as <json path without extension>_qnn_ctx.onnx.
      * json extracted from a context binary ("info" -> "graphs"): one model per
        graph is written as <graphName>_qnn_ctx.onnx, or
        <graphName>_qnn_ctx_fp32_io.onnx when --quantized_IO is not set. Every
        graph is pointed at the same context binary given by --qnn_bin.

    Any other json layout ends the program through parser.error (non-zero exit).
    """
    # Local import so this function does not depend on the file's import block.
    import os

    parser = ArgumentParser("Generate Onnx model which includes the QNN context binary.")
    parser.add_argument("-b", "--qnn_bin", help="Required. Path to Qnn context binary file.", required=True, type=str)
    parser.add_argument(
        "-q", "--qnn_json", help="Required. Path to Qnn converted model_net.json file.", required=True, type=str
    )
    parser.add_argument(
        "--disable_embed_mode",
        action="store_true",
        default=False,
        help="Do not embed the Qnn context binary into the onnx model (embed_mode=0); store the context binary file path instead. By default the context binary is embedded (embed_mode=1).",
    )
    parser.add_argument(
        "--quantized_IO",
        action="store_true",
        default=False,
        help="QNN converted context binary use quantized data as graph inputs and outputs. Will keep it if quantized_IO=True, otherwise, will insert Q and DQ nodes accordingly to make the graph inputs & outputs as float32 data type.",
    )

    args = parser.parse_args()

    # Parse the json file to get the graph input output information
    with open(args.qnn_json, encoding="utf-8") as qnn_json_file:
        qnn_json_obj = json.load(qnn_json_file)

    if "graph" in qnn_json_obj and "tensors" in qnn_json_obj["graph"]:
        print("This json file is from Qnn converter")
        qnn_input_tensor_dic = {}
        qnn_output_tensor_dic = {}
        parse_qnn_converter_json_file(qnn_json_obj, qnn_input_tensor_dic, qnn_output_tensor_dic)

        # Strip only the trailing extension. str.replace would touch every ".json"
        # in the path and would return the input path unchanged (so the json file
        # would be overwritten) when the path has no ".json" in it.
        json_root, _ = os.path.splitext(args.qnn_json)
        generate_wrapper_onnx_file(
            "QnnContext",
            json_root + "_qnn_ctx.onnx",
            qnn_input_tensor_dic,
            qnn_output_tensor_dic,
            args.disable_embed_mode,
            args.qnn_bin,
            args.quantized_IO,
        )
    elif "info" in qnn_json_obj and "graphs" in qnn_json_obj["info"]:
        print("This json file is extracted from QNN context binary file")
        # Fall back to "unknown" when the json carries no build id.
        qnn_version = qnn_json_obj["info"].get("buildId", "unknown")
        file_suffix = "_qnn_ctx.onnx" if args.quantized_IO else "_qnn_ctx_fp32_io.onnx"

        for qnn_graph in qnn_json_obj["info"]["graphs"]:
            qnn_input_tensor_dic = {}
            qnn_output_tensor_dic = {}
            graph_name = parse_qnn_graph(qnn_graph, qnn_input_tensor_dic, qnn_output_tensor_dic)

            generate_wrapper_onnx_file(
                graph_name,
                graph_name + file_suffix,
                qnn_input_tensor_dic,
                qnn_output_tensor_dic,
                args.disable_embed_mode,
                args.qnn_bin,
                args.quantized_IO,
                qnn_version,
            )
    else:
        # Report on stderr and exit non-zero instead of finishing "successfully".
        parser.error(
            f"json file unrecognized: {args.qnn_json}. Expected a Qnn converter json "
            "(graph/tensors) or a json extracted from a QNN context binary file (info/graphs)."
        )
if __name__ == "__main__":