Deepseek V3 inference fails with "header too large"

Run script

import argparse

import mindspore as ms
from mindspore import Model, Tensor
from mindspore.common import initializer

from mindformers import MindFormerConfig
from mindformers import build_context
from mindformers.tools.logger import logger
from mindformers.trainer.utils import transform_and_load_checkpoint
from mindformers.core.parallel_config import build_parallel_config
from mindformers.models.llama.llama_tokenizer_fast import LlamaTokenizerFast

from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM
from research.deepseek3.deepseek3_config import DeepseekV3Config


def run_predict(args):
    """Deepseek-V3/R1 predict"""
    # inputs
    input_questions = [args.input]

    # set model config
    yaml_file = args.config
    config = MindFormerConfig(yaml_file)
    build_context(config)
    build_parallel_config(config)
    model_config = config.model.model_config
    model_config.parallel_config = config.parallel_config
    model_config.moe_config = config.moe_config
    model_config = DeepseekV3Config(**model_config)

    # build tokenizer
    tokenizer = LlamaTokenizerFast(config.processor.tokenizer.vocab_file,
                                   config.processor.tokenizer.tokenizer_file,
                                   unk_token=config.processor.tokenizer.unk_token,
                                   bos_token=config.processor.tokenizer.bos_token,
                                   eos_token=config.processor.tokenizer.eos_token,
                                   fast_tokenizer=True)
    tokenizer.pad_token = tokenizer.eos_token

    # build model from config
    network = InferenceDeepseekV3ForCausalLM(model_config)
    ms_model = Model(network)
    if config.load_checkpoint:
        logger.info("----------------Transform and load checkpoint----------------")
        seq_length = model_config.seq_length
        input_ids = Tensor(shape=(model_config.batch_size, seq_length), dtype=ms.int32, init=initializer.One())
        infer_data = network.prepare_inputs_for_predict_layout(input_ids)
        transform_and_load_checkpoint(config, ms_model, network, infer_data, do_predict=True)

    inputs = tokenizer(input_questions, max_length=64, padding="max_length")["input_ids"]
    outputs = network.generate(inputs,
                               max_length=1024,
                               do_sample=False,
                               top_k=5,
                               top_p=1,
                               max_new_tokens=128)
    answer = tokenizer.decode(outputs)
    print("answer: ", answer)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config',
        type=str,
        required=False,
        help='YAML config file, such as '
             '/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml',
        default="/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/research/deepseek3/deepseek3_671b/predict_deepseek3_671b.yaml")
    parser.add_argument(
        '--input',
        type=str,
        default="生抽和老抽的区别是什么?")
    args_ = parser.parse_args()

    run_predict(args_)

Config file

seed: 0
output_dir: './output' # path to save checkpoint/strategy
run_mode: 'predict'
use_parallel: False

load_checkpoint: "/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json"
load_ckpt_format: "safetensors"
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'DeepSeekV3'

# default parallel of device num = 32 for Atlas 800T A2
parallel_config:
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  vocab_emb_dp: False

# mindspore context init config
context:
  mode: 0 # 0--Graph Mode; 1--Pynative Mode
  max_device_memory: "50GB"
  device_id: 0
  affinity_cpu_list: None

# parallel context config
parallel:
  parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend
  full_batch: False
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"

# model config
model:
  model_config:
    type: DeepseekV3Config
    auto_register: deepseek3_config.DeepseekV3Config
    batch_size: 1 # add for incre predict
    seq_length: 4096
    hidden_size: 7168
    num_layers: 10
    num_heads: 128
    max_position_embeddings: 163840
    intermediate_size: 18432
    kv_lora_rank:  512
    q_lora_rank: 1536
    qk_rope_head_dim: 64
    v_head_dim: 128
    qk_nope_head_dim: 128
    vocab_size: 129280
    multiple_of: 256
    rms_norm_eps: 1.0e-6
    bos_token_id: 0
    eos_token_id: 1
    pad_token_id: 1
    ignore_token_id: -100
    compute_dtype: "bfloat16"
    layernorm_compute_type: "bfloat16"
    softmax_compute_type: "bfloat16"
    rotary_dtype: "bfloat16"
    router_dense_type: "bfloat16"
    param_init_type: "bfloat16"
    scaling_factor:
      beta_fast: 32.0
      beta_slow: 1.0
      factor: 40.0
      mscale: 1.0
      mscale_all_dim: 1.0
      original_max_position_embeddings: 4096
    use_past: True
    extend_method: "YARN"
    use_flash_attention: True
    block_size: 16
    num_blocks: 512
    offset: 0
    checkpoint_name_or_path: ""
    repetition_penalty: 1
    max_decode_length: 1024
    top_k: 1
    top_p: 1
    theta: 10000.0
    do_sample: False
    is_dynamic: True
    qkv_concat: False
    ffn_concat: False
    auto_map:
      AutoConfig: deepseek3_config.DeepseekV3Config
      AutoModel: deepseek3.DeepseekV3ForCausalLM
  arch:
    type: DeepseekV3ForCausalLM
    auto_register: deepseek3.DeepseekV3ForCausalLM

moe_config:
  expert_num: 256
  num_experts_chosen: 8
  routing_policy: "TopkRouterV2"
  shared_expert_num: 1
  routed_scaling_factor: 2.5
  first_k_dense_replace: 0
  moe_intermediate_size: 2048
  topk_group: 4
  n_group: 8

processor:
  return_tensors: ms
  tokenizer:
    unk_token: '<unk>'
    bos_token: '<|begin▁of▁sentence|>'
    eos_token: '<|end▁of▁sentence|>'
    pad_token: '<|end▁of▁sentence|>'
    type: LlamaTokenizerFast
    vocab_file: '/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json'
    tokenizer_file: '/home/ma-user/work/models/deepseekv3_bf16/tokenizer.json'

Error message

2025-09-09 22:59:06,762 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:191] - INFO - ......Start load checkpoint from safetensors......
2025-09-09 22:59:06,762 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:196] - INFO - Load checkpoint from /home/ma-user/work/models/deepseekv3_bf16/tokenizer.json.
2025-09-09 22:59:06,763 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:205] - INFO - Set network.set_train=False, reduce compile time in prediction.
2025-09-09 22:59:06,775 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:211] - INFO - ......Use single checkpoint file mode......
2025-09-09 22:59:06,775 - mindformers./output/log[mindformers/models/modeling_utils.py:1531] - INFO - InferenceDeepseekV3ForCausalLM does not support qkv concat check, skipping...
2025-09-09 22:59:06,776 - mindformers./output/log[mindformers/utils/load_checkpoint_utils.py:378] - INFO - ......Start load checkpoint to model......
Traceback (most recent call last):
  File "/home/ma-user/work/mindarmour/examples/model_protection/deepseekv3/infer/run_deepseekv3_predict.py", line 79, in <module>
    run_predict(args_)
  File "/home/ma-user/work/mindarmour/examples/model_protection/deepseekv3/infer/run_deepseekv3_predict.py", line 50, in run_predict
    transform_and_load_checkpoint(config, ms_model, network, infer_data, do_predict=True)
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/trainer/utils.py", line 400, in transform_and_load_checkpoint
    load_checkpoint_with_safetensors(config, model, network, dataset, do_eval=do_eval,
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/utils/load_checkpoint_utils.py", line 275, in load_checkpoint_with_safetensors
    load_safetensors_checkpoint(config, load_checkpoint_files, network, strategy_path, load_checkpoint, optimizer)
  File "/home/ma-user/work/vllm-mindspore/install_depend_pkgs/mindformers-br_infer_boom/mindformers/utils/load_checkpoint_utils.py", line 382, in load_safetensors_checkpoint
    with safe_open(checkpoint_file, framework='np') as f:
safetensors_rust.SafetensorError: Error while deserializing header: header too large

I have tried the bf16 weights from both the Modelers (魔乐) community and Hugging Face, so it should not be a problem with the weight files. How can I resolve this?

Hello, and welcome to MindSpore. We have received your question and will analyze it and respond as soon as possible.

A single card with 64 GB cannot run 10 layers of DeepSeek; you need at least 2 cards.

SafetensorError: HeaderTooLarge

Please work through the following possible causes.

1. Model files not downloaded completely

The most common case is that the model parameter files were not downloaded completely. Model files are usually large, so an unstable network connection or an interrupted download can leave them incomplete. In particular, for projects managed through Git or another version control system, if Git Large File Storage (git-lfs) is not installed, large files may fail to download or be truncated, leaving corrupted or incomplete files. A quick way to spot this is sketched below.
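For reference, a weight file fetched without git-lfs is usually a small text pointer rather than the real binary, which can be detected from its size and first bytes. A minimal sketch (the file path is a placeholder, not one from this thread):

import os

def check_lfs_pointer(path):
    # Heuristic check for a git-lfs pointer file left behind by a clone made without git-lfs.
    size = os.path.getsize(path)
    with open(path, "rb") as f:
        head = f.read(64)
    # A real safetensors shard is hundreds of MB or more; an LFS pointer is ~130 bytes of text.
    if size < 1024 and head.startswith(b"version https://git-lfs.github.com/spec/v1"):
        print(f"{path}: git-lfs pointer, not the real weight file")
    else:
        print(f"{path}: {size / 2**30:.2f} GiB on disk")

check_lfs_pointer("/path/to/model.safetensors")  # placeholder path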

2. Incompatible file format

Another possible cause is an incompatible file format. Although the .safetensors format is designed to make storing and transferring large files efficient, problems during generation or conversion of the file, for example using different versions of the tools or libraries, can produce a file that does not match the current environment. In that case, even though the download succeeded, the file may still fail to parse because of the format mismatch. A simple way to inspect the header is sketched below.
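A valid safetensors file starts with an 8-byte little-endian integer giving the length of the JSON header that follows. When the loader opens a file that is not actually in this format, or is truncated or corrupted, that prefix decodes to a nonsensical value, which the Rust parser reports as "header too large". A minimal inspection sketch, assuming the path points at the file the loader actually opens (the 100 MB cutoff is an assumption mirroring the limit the safetensors Rust implementation is believed to enforce):

import json
import struct

def inspect_safetensors_header(path, max_header_size=100_000_000):
    # Read only the length prefix and the JSON header of a candidate .safetensors file.
    with open(path, "rb") as f:
        prefix = f.read(8)
        if len(prefix) < 8:
            raise ValueError(f"{path}: too small to be a safetensors file")
        header_len = struct.unpack("<Q", prefix)[0]  # u64, little-endian
        print(f"declared header length: {header_len} bytes")
        if header_len > max_header_size:
            raise ValueError(f"{path}: implausible header length; "
                             "not a valid (or not an intact) safetensors file")
        header = json.loads(f.read(header_len))
        tensor_names = [k for k in header if k != "__metadata__"]
        print(f"tensors listed in header: {len(tensor_names)}")

inspect_safetensors_header("/path/to/model.safetensors")  # placeholder path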

3. System resource limits

In addition, system resource limits should not be overlooked. Some operating systems or runtime environments impose strict limits on file size or memory usage, and handling very large model files can trigger those limits. For example, some cloud servers or virtual machines are configured with limited memory or disk space, which can cause exceptions while loading the files.

To verify the assumptions above, first check the integrity of the model files. Comparing file hashes (such as MD5 or SHA256) against the published values confirms whether a file is intact; a sketch follows. Also make sure the network connection was stable and the download was not interrupted. If the files really were downloaded incompletely, installing and configuring git-lfs is an effective fix.
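A hash comparison along those lines might look like this (the path and the expected digest are placeholders; compare against the checksum published alongside the weights):

import hashlib

def sha256_of(path, chunk_size=1024 * 1024):
    # Stream the file in chunks so multi-GB shards do not need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

digest = sha256_of("/path/to/model.safetensors")  # placeholder path
print(digest)
# assert digest == "<expected sha256 from the model card>"  # placeholder digest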

OK, I'll give that a try. I'd also like to ask roughly how long building the model should take; on my machine it can take up to an hour.

What type of disk are you using, and how fast are its read/write speeds?

On my side I use an SSD and 8 x 910B cards with 32 GB of device memory each.

A 10-layer A8W8 DeepSeek takes about 25 GB per card, roughly 200 GB across the 8 cards.

Loading takes about 6 minutes before inference can start.

A trimmed-down model will certainly load faster.
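As a rough sanity check of those numbers, the routed MoE experts dominate the weight footprint. Using values from the config above (hidden_size 7168, moe_intermediate_size 2048, expert_num 256) and assuming roughly one byte per parameter for A8W8, a back-of-envelope estimate is:

hidden_size = 7168
moe_intermediate_size = 2048
expert_num = 256
num_layers = 10

# Each routed expert has gate, up and down projections: 3 matrices of hidden_size x moe_intermediate_size.
params_per_expert = 3 * hidden_size * moe_intermediate_size   # ~44M parameters
params_per_layer = params_per_expert * expert_num             # ~11.3B parameters
bytes_per_layer = params_per_layer                            # ~1 byte per parameter for A8W8
print(f"routed experts per layer: ~{bytes_per_layer / 2**30:.1f} GiB")
print(f"{num_layers} layers: ~{num_layers * bytes_per_layer / 2**30:.0f} GiB")
# ~105 GiB for the routed experts alone; attention, shared experts, embeddings,
# the KV cache and runtime overhead bring the total into the ~200 GB range quoted above.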

Thanks. Reinstalling safetensors from conda-forge solved the problem.
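For reference, the conda-forge reinstall is typically done with conda install -c conda-forge safetensors (assuming a conda environment). A quick way to confirm which build the interpreter picks up afterwards:

import safetensors
from safetensors import safe_open  # confirms the Rust-backed binding imports cleanly

print(safetensors.__version__)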
