Hi, I have a question. I am using Tutorial_for_Llama3_Compute with a Qwen model. When I run model_prepare following the official tutorial, the perplexity is normal, but when I load the ONNX model exported after the equivalent-module replacement, the output quality is very poor. Why would that be? The ONNX inference code is below; all of the input preprocessing functions call Qualcomm's custom helpers:
import contextlib

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Helper functions from the Qualcomm tutorial utilities (not shown here):
#   get_padded_kv_values, get_position_embeddings_from_position_ids,
#   prepare_combined_attention_mask, flatten_tensors


class LLMForwardPassManager:
    def __init__(self, cfg, model_path, tokenizer, separate_tuple_input_output, num_tokens):
        self.tokenizer = tokenizer
        self.session = ort.InferenceSession(model_path)
        self.device = 'cpu'  # ONNX typically uses CPU unless GPU is configured
        self.num_heads = getattr(cfg, 'num_attention_heads', 14)
        self.num_kv_heads = getattr(cfg, 'num_key_value_heads', 2)
        self.num_layers = getattr(cfg, 'num_hidden_layers', 24)
        self.embed_dim = getattr(cfg, 'hidden_size', 896)
        self.rope_theta = getattr(cfg, "rope_theta", 10000.0)
        self.max_tokens = getattr(cfg, 'max_tokens', 4096)
        self.num_tokens = num_tokens  # Fixed at 2073
        self.max_kv_size = self.max_tokens - self.num_tokens  # 2023
        self.use_position_embedding_input = True
        self.use_combined_mask_input = True
        self.transposed_key_cache = True
        self.mask_neg = getattr(cfg, 'mask_neg', -50)
        self.use_input_embeddings = False
        self.separate_tuple_input_output = separate_tuple_input_output
        self.dtype = np.float32

    def replace_model(self, new_model_path):
        self.session = ort.InferenceSession(new_model_path)

    @contextlib.contextmanager
    def place_on_device(self, device):
        original_device = self.device
        try:
            self.to(device)
            yield
        finally:
            self.to(original_device)

    def to(self, device):
        self.device = device  # Note: ONNX Runtime device handling may require additional setup for GPU

    def _tokenize_text(self, text, max_length=None):
        if self.tokenizer is None:
            raise ValueError("No tokenizer registered with forward pass manager.")
        encoded = self.tokenizer(text, return_tensors="np", padding="max_length" if max_length else False,
                                 truncation=True, max_length=max_length or self.max_tokens)
        return encoded

    def _update_kv_cache(self, prev_key_value, new_key_value, max_cache_size, is_concatenated=False):
        def _concat(a, b, dim):
            if isinstance(a, tuple):
                assert len(a) == len(b), 'Unexpected key/value pair'
                return tuple(_concat(ai, bi, dim) for ai, bi in zip(a, b))
            return np.concatenate((a, b), axis=dim)

        def _do_concat(a, b, key_dim, value_dim):
            return tuple((_concat(ak, bk, key_dim), _concat(av, bv, value_dim)) for (ak, av), (bk, bv) in zip(a, b))

        def _shift(a, dim, shift_size):
            if isinstance(a, tuple):
                return tuple(_shift(ai, dim, shift_size) for ai in a)
            assert dim in (2, 3), 'Unexpected shift axis'
            return a[:, :, shift_size:, :] if dim == 2 else a[:, :, :, shift_size:]

        def _do_shift(a, key_dim, value_dim, shift_size):
            return tuple((_shift(k, key_dim, shift_size), _shift(v, value_dim, shift_size)) for k, v in a)

        value_dim = 2
        key_dim = 3 if self.transposed_key_cache else 2
        if prev_key_value is None or is_concatenated:
            next_key_value = new_key_value
        elif new_key_value is None:
            next_key_value = prev_key_value
        else:
            next_key_value = _do_concat(prev_key_value, new_key_value, key_dim, value_dim)
        # Drop the oldest cache entries once the cache exceeds max_cache_size
        shift_size = next_key_value[0][1].shape[-2] - max_cache_size
        if shift_size > 0:
            next_key_value = _do_shift(next_key_value, key_dim, value_dim, shift_size)
        return next_key_value

    def validate_inputs(self, input_text=None, input_ids=None, input_embeddings=None, past_key_values=None):
        input_count = sum(1 for x in (input_text, input_ids, input_embeddings) if x is not None)
        if input_count != 1:
            print("Incorrect number of arguments: one of (input_text, input_ids, input_embeddings) expected.")
            return False
        if past_key_values is not None and past_key_values[0][1].shape[-2] > self.max_kv_size:
            print(f"Provided past_key_values are too long: {past_key_values[0][1].shape[-2]} > {self.max_kv_size}")
            return False
        return True

    def validate_input_lengths(self, input_length, mask_length, attn_length):
        if not (1 <= input_length <= self.num_tokens):
            print(f"input_length({input_length}) must be between 1 and num_tokens({self.num_tokens}).")
            return False
        if not (input_length <= mask_length <= attn_length):
            print(f"mask_length({mask_length}) must satisfy input_length <= mask_length <= attn_length({attn_length}).")
            return False
        return True

    def prepare_inputs(self, input_text=None, input_ids=None, input_embeddings=None, attention_mask=None,
                       past_key_values=None, **kwargs):
        assert self.validate_inputs(input_text, input_ids, input_embeddings, past_key_values)
        kvcache_info_bundle = {}
        if input_text is not None:
            encoded = self._tokenize_text(input_text, max_length=self.num_tokens)
            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']
            kvcache_info_bundle["input_length"] = input_ids.shape[1]
        else:
            kvcache_info_bundle["input_length"] = input_ids.shape[1]
        input = input_ids.astype(np.int64)
        batch_size = input.shape[0]
        input_length = input.shape[1]
        kv_length = past_key_values[0][1].shape[-2] if past_key_values is not None else 0
        attn_length = min(input_length + kv_length, self.max_tokens)
        if attention_mask is None:
            attention_mask = np.ones((batch_size, input_length), dtype=np.int64)
        attention_mask = attention_mask.astype(np.int64)
        mask_length = attention_mask.shape[1]
        assert self.validate_input_lengths(input_length, mask_length, attn_length)
        # Left-pad input ids and mask to the fixed num_tokens length expected by the graph
        if input_length < self.num_tokens:
            shape = (batch_size, self.num_tokens - input_length)
            input_extensions = np.full(shape, self.tokenizer.pad_token_id, dtype=np.int64)
            input = np.concatenate((input_extensions, input), axis=1)
            attention_mask_extension = np.zeros((batch_size, self.num_tokens - input_length), dtype=np.int64)
            attention_mask = np.concatenate((attention_mask_extension, attention_mask), axis=1)
        desired_length = self.max_tokens
        if mask_length < desired_length:
            attention_mask_extension = np.zeros((batch_size, desired_length - mask_length), dtype=np.int64)
            attention_mask = np.concatenate((attention_mask_extension, attention_mask), axis=1)
        elif mask_length > desired_length:
            attention_mask = attention_mask[:, -desired_length:]
        if past_key_values is None:
            past_key_values = get_padded_kv_values(past_size=self.max_kv_size, num_layers=self.num_layers,
                                                   hidden_size=self.embed_dim, num_attention_heads=self.num_heads,
                                                   num_kv_heads=self.num_kv_heads, device=self.device, dtype=self.dtype)
        else:
            past_key_values = self._update_kv_cache(None, past_key_values, self.max_kv_size)
        position_ids = np.cumsum(attention_mask, axis=1) - 1
        position_ids = np.clip(position_ids, 0, self.max_tokens - 1)
        position_ids = position_ids[:, -self.num_tokens:]
        position_ids_cos, position_ids_sin = get_position_embeddings_from_position_ids(position_ids,
                                                                                       head_dim=self.embed_dim // self.num_heads,
                                                                                       max_length=self.max_tokens,
                                                                                       rope_theta=self.rope_theta,
                                                                                       device=self.device,
                                                                                       dtype=self.dtype)
        attention_mask = prepare_combined_attention_mask(attention_mask, (batch_size, self.num_tokens), self.max_kv_size,
                                                         device=self.device, mask_neg=self.mask_neg, dtype=self.dtype)
        inputs = {
            'input_ids': input.reshape(1, self.num_tokens),
            'attention_mask': attention_mask.reshape(1, 1, self.num_tokens, self.max_tokens),
            'position_ids_cos': position_ids_cos.reshape(1, 1, self.num_tokens, 32),
            'position_ids_sin': position_ids_sin.reshape(1, 1, self.num_tokens, 32)
        }
        if self.separate_tuple_input_output:
            # Bind the flattened KV cache to the remaining graph inputs positionally
            input_names = [input.name for input in self.session.get_inputs()]
            flattened_key_values = list(flatten_tensors(past_key_values))
            for key, value in zip(input_names[4:], flattened_key_values):
                inputs[key] = value
        return inputs, kvcache_info_bundle

    def prepare_outputs(self, outputs, prepared_inputs, kvcache_info_bundle):
        lm_logits = outputs[0]

        def _get_past_kv_from_outputs(outputs):
            if self.separate_tuple_input_output:
                return tuple((outputs[(2 * i) + 1], outputs[(2 * i) + 2]) for i in range(self.num_layers))
            else:
                return outputs[1:]

        new_past_key_values = _get_past_kv_from_outputs(outputs)
        new_past_key_values = self._update_kv_cache(None, new_key_value=new_past_key_values, max_cache_size=self.num_tokens)
        old_past_key_values = tuple((prepared_inputs[f"past_key_{i}_in"], prepared_inputs[f"past_value_{i}_in"]) for i in range(self.num_layers))
        past_key_values = self._update_kv_cache(old_past_key_values, new_past_key_values, self.max_kv_size)
        return {'lm_logits': lm_logits, 'past_key_values': past_key_values}

    def __call__(self, *args, **kwargs):
        prepared_inputs, kvcache_info_bundle = self.prepare_inputs(*args, **kwargs)
        outputs = self.session.run(None, prepared_inputs)
        prepared_outputs = self.prepare_outputs(outputs, prepared_inputs, kvcache_info_bundle)
        return prepared_outputs

    def generate(self, input_text, max_new_tokens=3):
        encoded = self._tokenize_text(input_text)
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        past_key_values = get_padded_kv_values(past_size=self.max_kv_size, num_layers=self.num_layers,
                                               hidden_size=self.embed_dim, num_attention_heads=self.num_heads,
                                               num_kv_heads=self.num_kv_heads, device=self.device, dtype=self.dtype)
        generated_ids = input_ids[0].tolist()
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values)
        past_key_values = outputs['past_key_values']
        for _ in range(max_new_tokens):
            # Greedy decoding: always take the arg-max token
            next_token_logits = outputs['lm_logits'][:, -1, :]
            next_token = np.argmax(next_token_logits, axis=-1).item()
            generated_ids.append(next_token)
            if next_token == self.tokenizer.eos_token_id:
                break
            input_ids = np.array([[next_token]], dtype=np.int64)
            attention_mask = np.ones((1, 1), dtype=np.int64)
            outputs = self(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values)
            past_key_values = outputs['past_key_values']
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)


if __name__ == "__main__":
    class Config:
        num_attention_heads = 14
        num_key_value_heads = 2
        num_hidden_layers = 24
        hidden_size = 896
        rope_theta = 10000.0
        max_tokens = 4096

    cfg = Config()
    tokenizer = AutoTokenizer.from_pretrained('/home/Qwen2.5-0.5B-Instruct-old/huggface')
    model_path = '/home/Qwen2.5-0.5B-Instruct-old/onnx/qwen2.onnx'
    fpm = LLMForwardPassManager(cfg, model_path, tokenizer, separate_tuple_input_output=True, num_tokens=2073)
    prompt = "你能够做什么"
    generated_text = fpm.generate(prompt)
    print("Generated text:", generated_text)
When using Tutorial_for_Llama3_Compute with a Qwen model, if the perplexity is normal after running model_prepare per the official tutorial but the quality drops once the ONNX model exported after the equivalent-module replacement is loaded, the following are possible causes:

Export settings. When the model is exported to ONNX (for example with torch.onnx.export), incorrect parameters can cause model structure or weight information to be lost. For instance, failing to correctly specify the input/output names, shapes, or dynamic axes can leave the ONNX model unable to process its inputs properly at inference time, which degrades the results.
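As a point of reference, here is a minimal sketch of an export call with the names and shapes made explicit. The TinyToyLM module, the output file name, and the opset version are illustrative assumptions rather than the tutorial's actual export code; the real export wraps the prepared Qwen model and also exposes the attention mask, RoPE, and KV-cache inputs.

# Hypothetical sketch: export with explicit names/shapes so the ONNX graph
# matches what the inference code above expects. Module, paths, and opset
# are assumptions for illustration only.
import torch
import torch.nn as nn

class TinyToyLM(nn.Module):
    # Stand-in for the prepared model; kept tiny so the sketch runs quickly
    # (the real Qwen2.5 vocabulary is much larger).
    def __init__(self, vocab=1000, hidden=896):
        super().__init__()
        self.embed = nn.Embedding(vocab, hidden)
        self.head = nn.Linear(hidden, vocab)

    def forward(self, input_ids):
        return self.head(self.embed(input_ids))

model = TinyToyLM().eval()
num_tokens = 2073  # fixed sequence length baked into the exported graph
dummy_input_ids = torch.zeros(1, num_tokens, dtype=torch.int64)

torch.onnx.export(
    model,
    (dummy_input_ids,),
    "qwen2_sketch.onnx",           # illustrative output path
    export_params=True,            # embed the weights in the ONNX file
    do_constant_folding=True,
    opset_version=17,              # assumption; use the opset your toolchain expects
    input_names=["input_ids"],     # names must match what the runtime code feeds
    output_names=["lm_logits"],
    # No dynamic_axes here: the deployment above assumes a fixed (1, 2073) shape.
)

In the full model, the KV-cache tensors (e.g. past_key_{i}_in / past_value_{i}_in and the corresponding outputs) also need to appear in input_names/output_names in exactly the order the runtime relies on, since prepare_inputs binds them positionally through input_names[4:]; a mismatch there would silently feed the wrong tensors and could explain the degraded outputs.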