Run script
import argparse

import mindspore as ms
from mindspore import Model, Tensor
from mindspore.common import initializer

from mindformers import MindFormerConfig
from mindformers import build_context
from mindformers.tools.logger import logger
from mindformers.trainer.utils import transform_and_load_checkpoint
from mindformers.core.parallel_config import build_parallel_config
from mindformers.models.llama.llama_tokenizer_fast import LlamaTokenizerFast

from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_config import DeepseekV3Config


def run_predict(args):
    """Deepseek-V3/R1 predict"""
    # inputs
    input_questions = [args.input]

    # set model config
    yaml_file = args.config
    config = MindFormerConfig(yaml_file)
    build_context(config)
    build_parallel_config(config)
    model_config = config.model.model_config
    model_config.parallel_config = config.parallel_config
    model_config.moe_config = config.moe_config
    model_config = DeepseekV3Config(**model_config)

    # build tokenizer
    tokenizer = LlamaTokenizerFast(config.processor.tokenizer.vocab_file,
                                   config.processor.tokenizer.tokenizer_file,
                                   unk_token=config.processor.tokenizer.unk_token,
                                   bos_token=config.processor.tokenizer.bos_token,
                                   eos_token=config.processor.tokenizer.eos_token,
                                   fast_tokenizer=True)
    tokenizer.pad_token = tokenizer.eos_token

    # build model from config
    network = InferenceDeepseekV3ForCausalLM(model_config)
    ms_model = Model(network)
    if config.load_checkpoint:
        logger.info("----------------Transform and load checkpoint----------------")
        seq_length = model_config.seq_length
        # dummy full-shape input_ids, used only to trace the inference layout
        # before the checkpoint is loaded
        input_ids = Tensor(shape=(model_config.batch_size, seq_length), dtype=ms.int32, init=initializer.One())
        infer_data = network.prepare_inputs_for_predict_layout(input_ids)
        transform_and_load_checkpoint(config, ms_model, network, infer_data, do_predict=True)

    inputs = tokenizer(input_questions, max_length=64, padding="max_length")["input_ids"]
    outputs = network.generate(inputs,
                               max_length=1024,
                               do_sample=False,
                               top_k=5,
                               top_p=1,
                               max_new_tokens=128)
    answer = tokenizer.decode(outputs)
    print("answer: ", answer)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config',
        type=str,
        required=False,
        help='YAML config file, such as '
             '/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml',
        default="/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml")
    parser.add_argument(
        '--input',
        type=str,
        # "What is the difference between light soy sauce and dark soy sauce?"
        default="生抽和老抽的区别是什么?")
    args_ = parser.parse_args()
    run_predict(args_)
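To isolate the failure, the tokenizer half of the script can be exercised on its own, without building the model or loading the checkpoint. A minimal sketch, assuming the same tokenizer.json path that appears in the config below:

from mindformers.models.llama.llama_tokenizer_fast import LlamaTokenizerFast

# Same files as in the processor.tokenizer section of the config below;
# both vocab_file and tokenizer_file point at tokenizer.json.
TOKENIZER_JSON = "/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json"

tokenizer = LlamaTokenizerFast(TOKENIZER_JSON, TOKENIZER_JSON,
                               unk_token="<unk>",
                               bos_token="<|begin▁of▁sentence|>",
                               eos_token="<|end▁of▁sentence|>",
                               fast_tokenizer=True)
ids = tokenizer(["生抽和老抽的区别是什么?"])["input_ids"][0]
print("token ids:", ids)
print("round trip:", tokenizer.decode(ids))

If this round trip works, the tokenizer files can be ruled out; the traceback below indeed shows tokenizer construction succeeding and the failure occurring later, at checkpoint load.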
Config file
seed: 0
output_dir: './output'  # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: False

load_checkpoint: "/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json"
load_ckpt_format: "safetensors"
auto_trans_ckpt: False  # If true, auto transform load_checkpoint to load in distributed model

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'DeepSeekV3'

# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  vocab_emb_dp: False

# mindspore context init config
context:
  mode: 0  # 0--Graph Mode; 1--Pynative Mode
  max_device_memory: "50GB"
  device_id: 0
  affinity_cpu_list: None

# parallel context config
parallel:
  parallel_mode: "STAND_ALONE"  # use 'STAND_ALONE' mode for inference with parallelism in frontend
  full_batch: False
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"

# model config
model:
  model_config:
    type: DeepseekV3Config
    auto_register: deepseek3_config.DeepseekV3Config
    batch_size: 1  # add for incre predict
    seq_length: 4096
    hidden_size: 7168
    num_layers: 10
    num_heads: 128
    max_position_embeddings: 163840
    intermediate_size: 18432
    kv_lora_rank: 512
    q_lora_rank: 1536
    qk_rope_head_dim: 64
    v_head_dim: 128
    qk_nope_head_dim: 128
    vocab_size: 129280
    multiple_of: 256
    rms_norm_eps: 1.0e-6
    bos_token_id: 0
    eos_token_id: 1
    pad_token_id: 1
    ignore_token_id: -100
    compute_dtype: "bfloat16"
    layernorm_compute_type: "bfloat16"
    softmax_compute_type: "bfloat16"
    rotary_dtype: "bfloat16"
    router_dense_type: "bfloat16"
    param_init_type: "bfloat16"
    scaling_factor:
      beta_fast: 32.0
      beta_slow: 1.0
      factor: 40.0
      mscale: 1.0
      mscale_all_dim: 1.0
      original_max_position_embeddings: 4096
    use_past: True
    extend_method: "YARN"
    use_flash_attention: True
    block_size: 16
    num_blocks: 512
    offset: 0
    checkpoint_name_or_path: ""
    repetition_penalty: 1
    max_decode_length: 1024
    top_k: 1
    top_p: 1
    theta: 10000.0
    do_sample: False
    is_dynamic: True
    qkv_concat: False
    ffn_concat: False
    auto_map:
      AutoConfig: deepseek3_config.DeepseekV3Config
      AutoModel: deepseek3.DeepseekV3ForCausalLM
  arch:
    type: DeepseekV3ForCausalLM
    auto_register: deepseek3.DeepseekV3ForCausalLM

moe_config:
  expert_num: 256
  num_experts_chosen: 8
  routing_policy: "TopkRouterV2"
  shared_expert_num: 1
  routed_scaling_factor: 2.5
  first_k_dense_replace: 0
  moe_intermediate_size: 2048
  topk_group: 4
  n_group: 8

processor:
  return_tensors: ms
  tokenizer:
    unk_token: '<unk>'
    bos_token: '<|begin▁of▁sentence|>'
    eos_token: '<|end▁of▁sentence|>'
    pad_token: '<|end▁of▁sentence|>'
    type: LlamaTokenizerFast
    vocab_file: '/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json'
    tokenizer_file: '/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json'
Error message
2025-09-09 22:59:06,762 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:191] - INFO - ......Start load checkpoint from safetensors......
2025-09-09 22:59:06,762 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:196] - INFO - Load checkpoint from /home/ma-user/work/models/deepseekv3_bf16/tokenizer.json.
2025-09-09 22:59:06,763 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:205] - INFO - Set network.set_train=False, reduce compile time in prediction.
2025-09-09 22:59:06,775 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:211] - INFO - ......Use single checkpoint file mode......
2025-09-09 22:59:06,775 - mindformers./output/log[mindformers/models/modeling_utils.py:1531] - INFO - InferenceDeepseekV3ForCausalLM does not support qkv concat check, skipping...
2025-09-09 22:59:06,776 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:378] - INFO - ......Start load checkpoint to model......
Traceback (most recent call last):
  File "/home/ma-user/work/mindarmour/examples/model_protection/deepseekv3/infer/run_deepseekv3_predict.py", line 79, in <module>
    run_predict(args_)
  File "/home/ma-user/work/mindarmour/examples/model_protection/deepseekv3/infer/run_deepseekv3_predict.py", line 50, in run_predict
    transform_and_load_checkpoint(config, ms_model, network, infer_data, do_predict=True)
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/trainer/utils.py", line 400, in transform_and_load_checkpoint
    load_checkpoint_with_safetensors(config, model, network, dataset, do_eval=do_eval,
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/utils/load_checkpoint_utils.py", line 275, in load_checkpoint_with_safetensors
    load_safetensors_checkpoint(config, load_checkpoint_files, network, strategy_path, load_checkpoint, optimizer)
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/utils/load_checkpoint_utils.py", line 382, in load_safetensors_checkpoint
    with safe_open(checkpoint_file, framework='np') as f:
safetensors_rust.SafetensorError: Error while deserializing header: header too large
I have tried the bf16 weights from both the Modelers (魔乐) community and Hugging Face, so it is probably not a problem with the weight files themselves. Any help on how to resolve this would be appreciated.
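As a quick sanity check on that assumption: a safetensors file begins with a little-endian uint64 that declares the length of its JSON header, and safe_open raises "header too large" when those first 8 bytes decode to an implausibly large length, which typically happens when the file being opened is not a safetensors file at all (the leading ASCII bytes of a plain JSON file, for instance, decode to an astronomical number). A minimal sketch that prints what the file named by load_checkpoint declares:

import json
import struct

# The `load_checkpoint` value from the config above.
ckpt_path = "/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json"

with open(ckpt_path, "rb") as f:
    # safetensors layout: 8-byte little-endian header length, then that many
    # bytes of JSON metadata describing the tensors.
    (header_len,) = struct.unpack("<Q", f.read(8))
    print("declared header length:", header_len)
    if header_len > 100 * 1024 * 1024:  # far beyond any plausible header size
        print("first 8 bytes do not look like a safetensors header")
    else:
        header = json.loads(f.read(header_len))
        print("tensor entries in header:", len(header))

If the declared length comes out absurd, the file that load_checkpoint points to is the thing to re-examine, rather than the weights downloaded from either source.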