From fe0c65cc286ce5f2f228632341174d914f56c1ce Mon Sep 17 00:00:00 2001
From: commc
Date: Wed, 4 Sep 2024 14:29:23 +0800
Subject: [PATCH 1/3] Add CLIP model compile scripts (customer request)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/multimodal/compile_aie.py | 169 ++++++++++++++++++
 .../built-in/multimodal/compile_ts.py  | 147 +++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
new file mode 100644
index 0000000000..35f13c9747
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
@@ -0,0 +1,169 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import json
+import logging
+import argparse
+import torch
+import mindietorch
+import torch.nn as nn
+from torch._export import export, dynamic_dim
+from mindietorch import _enums
+from transformers.models.auto.modeling_auto import AutoModel
+
+logging.basicConfig(level=logging.INFO)
+
+
+class CLIPWrapper(nn.Module):
+    def __init__(self, clip):
+        super(CLIPWrapper, self).__init__()
+        self.model = clip
+        self.logit_scale = clip.logit_scale.exp().to(self.model.device)
+
+    def forward(self, input_ids, pixel_values, attention_mask):
+        image_embeds = self.model.get_image_features(pixel_values)
+        text_embeds = self.model.get_text_features(input_ids, attention_mask)
+
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        logits_per_image = image_embeds @ text_embeds.transpose(1, 0).contiguous() * self.logit_scale
+        logits_per_text = logits_per_image.transpose(1, 0).contiguous()
+
+        return image_embeds, text_embeds, logits_per_text, logits_per_image
+
+
+def compile_clip(args):
+    # Load the PyTorch model
+    with torch.no_grad():
+        torch_model = AutoModel.from_pretrained(args.hf_model_path).float().eval()
+        torch_model = CLIPWrapper(torch_model)
+
+    hf_config_path = os.path.join(args.hf_model_path, "config.json")
+    if not os.path.exists(hf_config_path):
+        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
+    with open(hf_config_path, "r") as f:
+        config_dict = json.load(f)
+
+    # Build the model inputs
+    image_width = config_dict["vision_config"]["image_size"]
+    pixel_values_shape = (args.img_max_batch, 3, image_width, image_width)
+    input_ids_shape = (args.text_max_batch, args.max_token_len)
+    pixel_values = torch.ones(pixel_values_shape, dtype=torch.float32)
+    input_ids = torch.randint(high=1, size=input_ids_shape, dtype=torch.int32)
+    attention_mask = torch.ones_like(input_ids, dtype=torch.int32)
+
+    # Export the model as an FX graph and compile it with MindIE
+    constraints = [
+        # input ids
+        dynamic_dim(input_ids, 0) >= 1,
+        dynamic_dim(input_ids, 0) <= args.text_max_batch,
+        dynamic_dim(input_ids, 1) >= 1,
+        dynamic_dim(input_ids, 1) <= args.max_token_len,
+        # pixel input
+        dynamic_dim(pixel_values, 0) >= 1,
+        dynamic_dim(pixel_values, 0) <= args.img_max_batch,
+        # input ids attention mask
+        dynamic_dim(attention_mask, 0) == dynamic_dim(input_ids, 0),
+        dynamic_dim(attention_mask, 1) == dynamic_dim(input_ids, 1),
+    ]
+
+    logging.info("Starting to export dynamic clip ...")
+    intermediate_model = export(
+        torch_model,
+        args=(input_ids, pixel_values, attention_mask,),
+        constraints=constraints
+    )
+    logging.info("Successfully exported dynamic clip!")
+
+    mindietorch.set_device(args.device_id)
+    pixel_values_min_shape = (1, 3, image_width, image_width)
+    pixel_values_max_shape = (args.img_max_batch, 3, image_width, image_width)
+    input_ids_min_shape = (1, 1)
+    input_ids_max_shape = (args.text_max_batch, args.max_token_len)
+
+    # Run the MindIE Torch compilation
+    compile_inputs = [
+        mindietorch.Input(min_shape=input_ids_min_shape, max_shape=input_ids_max_shape),
+        mindietorch.Input(min_shape=pixel_values_min_shape, max_shape=pixel_values_max_shape),
+        mindietorch.Input(min_shape=input_ids_min_shape, max_shape=input_ids_max_shape),  # attention mask
+    ]
+
+    if args.precision == "fp16":
+        model_precision = _enums.PrecisionPolicy.FP16
+    elif args.precision == "fp32":
+        model_precision = _enums.PrecisionPolicy.FP32
+    else:
+        raise ValueError("Unsupported precision type!")
+
+    logging.info("Starting to compile mindietorch clip ...")
+    ts = time.time()
+    compiled_model = mindietorch.compile(
+        intermediate_model,
+        inputs=compile_inputs,
+        precision_policy=model_precision,
+        soc_version=args.soc_version,
+    )
+    compile_cost = time.time() - ts
+    logging.info("compile time cost: %f", compile_cost)
+    logging.info("Successfully compiled mindietorch clip!")
+
+    logging.info("Starting to save ...")
+    model_save_dir = f"{args.save_dir}"
+    if not os.path.exists(model_save_dir):
+        os.makedirs(model_save_dir)
+    compiled_file_name = f"CLIP-{args.model_version}-MindIE.pt"
+    torch.save(compiled_model, os.path.join(model_save_dir, compiled_file_name), pickle_protocol=4)
+    logging.info("Saving done!")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Compile Clip model")
+    parser.add_argument("--soc-version", type=str, default="Ascend910B4", help="NPU version")
+    parser.add_argument("--device-id", type=int, default=0)
+    parser.add_argument("--text-max-batch", type=int, default=80)
+    parser.add_argument("--img-max-batch", type=int, default=8)
+    parser.add_argument(
+        "--max-token-len",
+        type=int,
+        default=52,
+        help="The padded length of the input text (including [CLS] & [SEP] tokens)."
+    )
+    parser.add_argument(
+        "--model-version",
+        default="ViT-B-16",
+        choices=["ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"],
+        help="Specify the architecture of the CLIP model to be converted."
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument(
+        "--precision",
+        default="fp16",
+        choices=["fp16", "fp32"],
+        help="Specify the precision of the CLIP model to be converted."
+    )
+    parser.add_argument("--save-dir", type=str, default="./", help="Path to save the exported model")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    compile_args = parse_args()
+    compile_clip(compile_args)
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py b/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
new file mode 100644
index 0000000000..74f6c1892e
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
@@ -0,0 +1,147 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import json
+import logging
+import argparse
+import torch
+import mindietorch
+import torch.nn as nn
+from mindietorch import _enums
+from transformers.models.auto.modeling_auto import AutoModel
+
+logging.basicConfig(level=logging.INFO)
+
+
+class CLIPWrapper(nn.Module):
+    def __init__(self, clip):
+        super(CLIPWrapper, self).__init__()
+        self.model = clip
+        self.logit_scale = clip.logit_scale.exp()
+        self.logit_scale = self.logit_scale.to(self.model.device)
+
+    def forward(self, input_ids, pixel_values, attention_mask):
+        image_embeds = self.model.get_image_features(pixel_values)
+        text_embeds = self.model.get_text_features(input_ids, attention_mask)
+
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        logits_per_image = image_embeds @ text_embeds.transpose(1, 0).contiguous() * self.logit_scale
+        logits_per_text = logits_per_image.transpose(1, 0).contiguous()
+
+        return image_embeds, text_embeds, logits_per_text, logits_per_image
+
+
+def compile_clip(args):
+    # Load the PyTorch model
+    with torch.no_grad():
+        torch_model = AutoModel.from_pretrained(args.hf_model_path).float().eval()
+        torch_model = CLIPWrapper(torch_model)
+
+    hf_config_path = os.path.join(args.hf_model_path, "config.json")
+    if not os.path.exists(hf_config_path):
+        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
+    with open(hf_config_path, "r") as f:
+        config_dict = json.load(f)
+
+    # Build the model inputs
+    image_width = config_dict["vision_config"]["image_size"]
+    pixel_values_shape = (args.img_batch, 3, image_width, image_width)
+    input_ids_shape = (args.text_batch, args.token_len)
+    pixel_values = torch.ones(pixel_values_shape, dtype=torch.float32)
+    input_ids = torch.randint(high=1, size=input_ids_shape, dtype=torch.int32)
+    attention_mask = torch.ones_like(input_ids, dtype=torch.int32)
+
+    # Export the model as TorchScript and compile it with MindIE
+    input_data = [input_ids, pixel_values, attention_mask]
+    logging.info("Starting to trace clip ...")
+    intermediate_model = torch.jit.trace(torch_model, input_data)
+    logging.info("Successfully traced clip!")
+
+    mindietorch.set_device(args.device_id)
+
+    # Run the MindIE Torch compilation
+    compile_inputs = [
+        mindietorch.Input(shape=input_ids_shape, dtype=torch.int32),
+        mindietorch.Input(shape=pixel_values_shape, dtype=torch.float32),
+        mindietorch.Input(shape=input_ids_shape, dtype=torch.int32)  # attention mask
+    ]
+    if args.precision == "fp16":
+        model_precision = _enums.PrecisionPolicy.FP16
+    elif args.precision == "fp32":
+        model_precision = _enums.PrecisionPolicy.FP32
+    else:
+        raise ValueError("Unsupported precision type!")
+
+    logging.info("Starting to compile mindietorch clip ...")
+    ts = time.time()
+    compiled_model = mindietorch.compile(
+        intermediate_model,
+        inputs=compile_inputs,
+        precision_policy=model_precision,
+        soc_version=args.soc_version,
+    )
+    compile_cost = time.time() - ts
+    logging.info("compile time cost: %f", compile_cost)
+    logging.info("Successfully compiled mindietorch clip!")
+
+    logging.info("Starting to save ...")
+    model_save_dir = f"{args.save_dir}"
+    if not os.path.exists(model_save_dir):
+        os.makedirs(model_save_dir)
+    compiled_file_name = f"CLIP-{args.model_version}-MindIE.ts"
+    compiled_model.save(os.path.join(model_save_dir, compiled_file_name))
+    logging.info("Saving done!")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Compile Clip model")
+    parser.add_argument("--soc-version", default="Ascend910B4", help="NPU version")
+    parser.add_argument("--device-id", type=int, default=0)
+    parser.add_argument("--text-batch", type=int, default=80)
+    parser.add_argument("--img-batch", type=int, default=1)
+    parser.add_argument(
+        "--token-len",
+        type=int,
+        default=52,
+        help="The padded length of the input text (including [CLS] & [SEP] tokens)."
+    )
+    parser.add_argument(
+        "--model-version",
+        default="ViT-L-14",
+        choices=["ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14", "RN50"],
+        help="Specify the architecture of the CLIP model to be converted."
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument(
+        "--precision",
+        default="fp16",
+        choices=["fp16", "fp32"],
+        help="Specify the precision of the CLIP model to be converted."
+    )
+    parser.add_argument("--save-dir", type=str, default="./", help="Path to save the exported model")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    compile_args = parse_args()
+    compile_clip(compile_args)
\ No newline at end of file
-- 
Gitee

From 2fdd6876e6218578955eb3e4a7a96ffcf2f80fa8 Mon Sep 17 00:00:00 2001
From: commc
Date: Wed, 4 Sep 2024 18:28:59 +0800
Subject: [PATCH 2/3] Change how the model config is loaded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py | 10 +++-------
 MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py  | 10 +++-------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
index 35f13c9747..15f27de077 100644
--- a/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
+++ b/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
@@ -21,6 +21,7 @@ import mindietorch
 import torch.nn as nn
 from torch._export import export, dynamic_dim
 from mindietorch import _enums
+from transformers import AutoConfig
 from transformers.models.auto.modeling_auto import AutoModel
 
 logging.basicConfig(level=logging.INFO)
@@ -50,15 +51,10 @@ def compile_clip(args):
     with torch.no_grad():
         torch_model = AutoModel.from_pretrained(args.hf_model_path).float().eval()
         torch_model = CLIPWrapper(torch_model)
+        config = AutoConfig.from_pretrained(args.hf_model_path)
 
-    hf_config_path = os.path.join(args.hf_model_path, "config.json")
-    if not os.path.exists(hf_config_path):
-        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
-    with open(hf_config_path, "r") as f:
-        config_dict = json.load(f)
-
     # Build the model inputs
-    image_width = config_dict["vision_config"]["image_size"]
+    image_width = config.vision_config.image_size
     pixel_values_shape = (args.img_max_batch, 3, image_width, image_width)
     input_ids_shape = (args.text_max_batch, args.max_token_len)
     pixel_values = torch.ones(pixel_values_shape, dtype=torch.float32)
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py b/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
index 74f6c1892e..8f959ff3bd 100644
--- a/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
+++ b/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
@@ -20,6 +20,7 @@ import torch
 import mindietorch
 import torch.nn as nn
 from mindietorch import _enums
+from transformers import AutoConfig
 from transformers.models.auto.modeling_auto import AutoModel
 
 logging.basicConfig(level=logging.INFO)
@@ -50,15 +51,10 @@ def compile_clip(args):
     with torch.no_grad():
         torch_model = AutoModel.from_pretrained(args.hf_model_path).float().eval()
         torch_model = CLIPWrapper(torch_model)
+        config = AutoConfig.from_pretrained(args.hf_model_path)
 
-    hf_config_path = os.path.join(args.hf_model_path, "config.json")
-    if not os.path.exists(hf_config_path):
-        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
-    with open(hf_config_path, "r") as f:
-        config_dict = json.load(f)
-
     # Build the model inputs
-    image_width = config_dict["vision_config"]["image_size"]
+    image_width = config.vision_config.image_size
     pixel_values_shape = (args.img_batch, 3, image_width, image_width)
     input_ids_shape = (args.text_batch, args.token_len)
     pixel_values = torch.ones(pixel_values_shape, dtype=torch.float32)
-- 
Gitee
From 87c82ed0c0414a2813921cc29722f07ee6c4bc0b Mon Sep 17 00:00:00 2001
From: commc
Date: Thu, 5 Sep 2024 17:14:42 +0800
Subject: [PATCH 3/3] Move the scripts into the correct folder hierarchy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 MindIE/MindIE-Torch/built-in/multimodal/{ => CLIP}/compile_aie.py | 0
 MindIE/MindIE-Torch/built-in/multimodal/{ => CLIP}/compile_ts.py  | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename MindIE/MindIE-Torch/built-in/multimodal/{ => CLIP}/compile_aie.py (100%)
 rename MindIE/MindIE-Torch/built-in/multimodal/{ => CLIP}/compile_ts.py (100%)

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/compile_aie.py
similarity index 100%
rename from MindIE/MindIE-Torch/built-in/multimodal/compile_aie.py
rename to MindIE/MindIE-Torch/built-in/multimodal/CLIP/compile_aie.py
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/compile_ts.py
similarity index 100%
rename from MindIE/MindIE-Torch/built-in/multimodal/compile_ts.py
rename to MindIE/MindIE-Torch/built-in/multimodal/CLIP/compile_ts.py
-- 
Gitee
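
Usage note: both scripts are plain argparse CLIs, e.g. "python compile_aie.py --hf-model-path /path/to/clip --soc-version Ascend910B4 --precision fp16". The sketch below is a minimal eager-mode sanity check for the CLIPWrapper defined in these scripts; it is illustrative only and not part of the patch series. It assumes mindietorch is installed (compile_aie.py imports it at module level), the script is on PYTHONPATH, and the checkpoint path is a placeholder.

    # Sanity-check sketch: run CLIPWrapper in eager mode and inspect output shapes
    # before compiling. Paths, prompts, and batch sizes are illustrative placeholders.
    import torch
    from PIL import Image
    from transformers import AutoModel, AutoProcessor

    from compile_aie import CLIPWrapper  # assumes the script is importable

    model_path = "/Path/to/Huggingface_model_path"  # placeholder, same as the scripts' default
    clip = AutoModel.from_pretrained(model_path).float().eval()
    wrapper = CLIPWrapper(clip)

    processor = AutoProcessor.from_pretrained(model_path)
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=Image.new("RGB", (512, 512)),  # dummy image; the processor resizes it
        return_tensors="pt",
        padding="max_length",
        max_length=52,  # matches the --max-token-len / --token-len default
    )

    with torch.no_grad():
        image_embeds, text_embeds, logits_per_text, logits_per_image = wrapper(
            inputs["input_ids"].to(torch.int32),
            inputs["pixel_values"],
            inputs["attention_mask"].to(torch.int32),
        )

    # Expect (1, dim), (2, dim), (2, 1), (1, 2) for 1 image and 2 texts.
    print(image_embeds.shape, text_embeds.shape, logits_per_text.shape, logits_per_image.shape)

The same input layout (input_ids, pixel_values, attention_mask) is what the compiled artifacts expect: compile_ts.py builds a static-shape TorchScript engine whose batch sizes are fixed at trace time, while compile_aie.py exports a dynamic-shape FX graph whose batch and token dimensions may vary between 1 and the --text-max-batch / --img-max-batch / --max-token-len limits.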