模型推理报错

https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5

命令

python run_mindformer.py   --config /home/mindspore/work/mindformers/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml   --predict_data "帮助我制定一份去上海的旅游攻略"   --register_path "/home/mindspore/work/mindformers/research/qwen2_5/"

predict_qwen2_5_7b_instruct.yaml

seed: 0
output_dir: "./output" # path to save checkpoint/strategy
# NOTE(review): empty — no weights are loaded, the model runs with random
# init; point this at the safetensors checkpoint file/dir for real inference.
load_checkpoint: ""
load_ckpt_format: "safetensors"
src_strategy_path_or_dir: ""
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False
use_parallel: False
run_mode: "predict"

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: "qwen2_5_7b"

# runner config (training-oriented keys; mostly unused in predict mode)
runner_config:
  epochs: 5
  batch_size: 1
  sink_mode: True
  sink_size: 2
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 65536
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

# default parallel of device num = 8 for Atlas 800T A2
parallel_config:
  data_parallel: 1
  model_parallel: 1
  pipeline_stage: 1
  micro_batch_num: 1
  vocab_emb_dp: False
  gradient_aggregation_group: 4
# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
micro_batch_interleave_num: 1

model:
  model_config:
    type: LlamaConfig # can be switched to Qwen2Config later (if the framework supports it)
    batch_size: 1
    seq_length: 32768
    hidden_size: 3584
    num_layers: 28
    num_heads: 28
    n_kv_heads: 4
    vocab_size: 152064
    intermediate_size: 18944
    max_position_embeddings: 32768
    qkv_has_bias: True
    rms_norm_eps: 1.0e-6
    theta: 1000000.0
    emb_dropout_prob: 0.0
    eos_token_id: [151645, 151643]
    pad_token_id: 151643
    bos_token_id: 151643
    compute_dtype: "bfloat16"
    layernorm_compute_type: "float32"
    softmax_compute_type: "float32"
    rotary_dtype: "bfloat16"
    param_init_type: "bfloat16"
    use_past: True
    use_flash_attention: False # flash attention disabled to avoid compatibility issues
    block_size: 32
    num_blocks: 1024
    use_past_shard: False
    offset: 0
    checkpoint_name_or_path: ""
    repetition_penalty: 1.05
    max_decode_length: 512
    top_k: 20
    top_p: 0.8
    temperature: 0.7
    do_sample: True
    is_dynamic: True
    qkv_concat: True
    auto_map:
      AutoTokenizer: [qwen2_5_tokenizer.Qwen2Tokenizer, null]

  arch:
    type: LlamaForCausalLM # can be switched to Qwen2ForCausalLM later (if the framework supports it)

processor:
  return_tensors: ms
  tokenizer:
    model_max_length: 131072
    bos_token: null
    eos_token: "<|im_end|>"
    unk_token: null
    pad_token: "<|endoftext|>"
    vocab_file: "/home/mindspore/work/qwen2_5/vocab.json" # make sure this path is correct
    merges_file: "/home/mindspore/work/qwen2_5/merges.txt" # make sure this path is correct
    # NOTE(review): this double-quoted scalar wraps across two physical lines
    # inside the default system prompt; YAML folds the break into a single
    # space — verify the rendered prompt matches the official template.
    chat_template: "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
    type: Qwen2Tokenizer # must match the tokenizer's registered name
  type: Qwen2Processor

# mindspore context init config
context:
  mode: 0 #0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  ascend_config:
    precision_mode: "must_keep_origin_dtype"
  max_call_depth: 10000
  max_device_memory: "25GB" # make sure this does not exceed the device memory
  save_graphs: False
  save_graphs_path: "./graph"
  device_id: 0

# parallel context config (ignored for single-device predict since use_parallel is False)
parallel:
  parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
  gradients_mean: False
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: False
  strategy_ckpt_config:
    save_file: "/home/mindspore/work/qwen_model.ckpt"
    only_trainable_params: False
  parallel_optimizer_config:
    gradient_accumulation_shard: False
    parallel_optimizer_threshold: 64

qwen2_5_tokenizer.py

# Copyright 2024 Huawei Technologies Co., Ltd
# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenization classes for Qwen2."""

import json
import os
import unicodedata
from functools import lru_cache
from typing import Dict, Optional, Tuple
import regex as re

from mindspore import log as logger
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from mindformers.models.tokenization_utils import PreTrainedTokenizer
from mindformers.models.tokenization_utils_base import AddedToken
from mindformers.tools.utils import check_file


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"qwen/qwen-tokenizer": "vocab.json"},
    "merges_file": {"qwen/qwen-tokenizer": "merges.txt"},
}

MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}

# Byte-level BPE pre-tokenization pattern (same as the official Qwen2 tokenizer).
PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

# Special-token strings; each pairs with the *ID constant of the same name below.
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
REFSTART = "<|object_ref_start|>"
REFEND = "<|object_ref_end|>"
BOXSTART = "<|box_start|>"
BOXEND = "<|box_end|>"
QUADSTART = "<|quad_start|>"
QUADEND = "<|quad_end|>"
VISIONSTART = "<|vision_start|>"
VISIONEND = "<|vision_end|>"
VISIONPAD = "<|vision_pad|>"
IMAGEPAD = "<|image_pad|>"
VIDEOPAD = "<|video_pad|>"
TOOLCALLSTART = "<tool_call>"
TOOLCALLEND = "</tool_call>"
FIMPREFIX = "<|fim_prefix|>"
FIMMIDDLE = "<|fim_middle|>"
FIMSUFFIX = "<|fim_suffix|>"
FIMPAD = "<|fim_pad|>"
REPONAME = "<|repo_name|>"
FILESEP = "<|file_sep|>"
TOOLRESPONSESTART = "<tool_response>"
TOOLRESPONSEEND = "</tool_response>"
THINKSTART = "<think>"
# BUG FIX: THINKEND was "<think>", identical to THINKSTART. That made the two
# entries collide in the `special_tokens` dict (both resolved to id 151668)
# and left the closing "</think>" tag unregistered.
THINKEND = "</think>"
ENDOFTEXTID = 151643
IMSTARTID = 151644
IMENDID = 151645
REFSTARTID = 151646
REFENDID = 151647
BOXSTARTID = 151648
BOXENDID = 151649
QUADSTARTID = 151650
QUADENDID = 151651
VISIONSTARTID = 151652
VISIONENDID = 151653
VISIONPADID = 151654
IMAGEPADID = 151655
VIDEOPADID = 151656
TOOLCALLSTARTID = 151657
TOOLCALLENDID = 151658
FIMPREFIXID = 151659
FIMMIDDLEID = 151660
FIMSUFFIXID = 151661
FIMPADID = 151662
REPONAMEID = 151663
FILESEPID = 151664
TOOLRESPONSESTARTID = 151665
TOOLRESPONSEENDID = 151666
THINKSTARTID = 151667
THINKENDID = 151668


@lru_cache()
def bytes_to_unicode():
    """Build the byte -> unicode-character table used by byte-level BPE.

    Printable latin-1 byte values map to their own character; every other
    byte is shifted past 0xFF so the resulting vocabulary contains no
    control or whitespace characters.
    """
    keep = set(
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    table = {}
    shifted = 0
    for byte in range(2**8):
        if byte in keep:
            table[byte] = chr(byte)
        else:
            # n-th non-printable byte maps to code point 256 + n
            table[byte] = chr(2**8 + shifted)
            shifted += 1
    return table


def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: a sequence of symbols (variable-length strings), e.g. a tuple.

    Returns:
        Set of ``(left, right)`` tuples, one per adjacent pair. Empty for
        words with fewer than two symbols (the previous implementation
        raised IndexError on an empty word).
    """
    return set(zip(word, word[1:]))


# Registered under the name "Qwen2Tokenizer" so it matches the `type` field in the YAML config.
@MindFormerRegister.register(MindFormerModuleType.TOKENIZER, name="Qwen2Tokenizer")
class Qwen2Tokenizer(PreTrainedTokenizer):
    """Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    Loads the byte-level BPE vocabulary from ``vocab.json`` / ``merges.txt``
    and registers the fixed Qwen2 special-token ids (151643 and up) as
    "added" tokens on top of the base vocabulary.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors="replace",
                 unk_token="<|endoftext|>",
                 bos_token=None,
                 eos_token="<|endoftext|>",
                 pad_token="<|endoftext|>",
                 clean_up_tokenization_spaces=False,
                 split_special_tokens=False,
                 **kwargs,
                 ):
        """Initialize the tokenizer.

        Args:
            vocab_file: path to the ``vocab.json`` token-to-id mapping.
            merges_file: path to the ``merges.txt`` BPE merge-rule list.
            errors: error mode forwarded to ``bytes.decode`` during detokenization.
            unk_token / bos_token / eos_token / pad_token: special-token strings
                (or ``AddedToken`` objects). ``bos_token`` defaults to ``None``
                because Qwen2 models use no BOS token.
            clean_up_tokenization_spaces: whether decode cleans up extra spaces.
            split_special_tokens: whether special tokens may be split during
                tokenization.
        """
        # Qwen vocab does not contain control tokens; added tokens need to be special
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False,
                       special=True, normalized=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False,
                       special=True, normalized=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False,
                       special=True, normalized=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False,
                       special=True, normalized=False)
            if isinstance(pad_token, str)
            else pad_token
        )
        # One AddedToken per fixed Qwen2 special token; all marked special and
        # non-normalized so they survive tokenization untouched.
        end_of_text_token = AddedToken(
            ENDOFTEXT, lstrip=False, rstrip=False, special=True, normalized=False)
        im_start_token = AddedToken(
            IMSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        im_end_token = AddedToken(
            IMEND, lstrip=False, rstrip=False, special=True, normalized=False)
        ref_start_token = AddedToken(
            REFSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        ref_end_token = AddedToken(
            REFEND, lstrip=False, rstrip=False, special=True, normalized=False)
        box_start_token = AddedToken(
            BOXSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        box_end_token = AddedToken(
            BOXEND, lstrip=False, rstrip=False, special=True, normalized=False)
        quad_start_token = AddedToken(
            QUADSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        quad_end_token = AddedToken(
            QUADEND, lstrip=False, rstrip=False, special=True, normalized=False)
        vision_start_token = AddedToken(
            VISIONSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        vision_end_token = AddedToken(
            VISIONEND, lstrip=False, rstrip=False, special=True, normalized=False)
        vision_pad_token = AddedToken(
            VISIONPAD, lstrip=False, rstrip=False, special=True, normalized=False)
        image_pad_token = AddedToken(
            IMAGEPAD, lstrip=False, rstrip=False, special=True, normalized=False)
        video_pad_token = AddedToken(
            VIDEOPAD, lstrip=False, rstrip=False, special=True, normalized=False)
        toolcall_start_token = AddedToken(
            TOOLCALLSTART, lstrip=False, rstrip=False, special=True, normalized=False)
        toolcall_end_token = AddedToken(
            TOOLCALLEND, lstrip=False, rstrip=False, special=True, normalized=False)
        fim_prefix_token = AddedToken(
            FIMPREFIX, lstrip=False, rstrip=False, special=True, normalized=False)
        fim_middle_token = AddedToken(
            FIMMIDDLE, lstrip=False, rstrip=False, special=True, normalized=False)
        fim_suffix_token = AddedToken(
            FIMSUFFIX, lstrip=False, rstrip=False, special=True, normalized=False)
        fim_pad_token = AddedToken(
            FIMPAD, lstrip=False, rstrip=False, special=True, normalized=False)
        repo_name_token = AddedToken(
            REPONAME, lstrip=False, rstrip=False, special=True, normalized=False)
        file_sep_token = AddedToken(
            FILESEP, lstrip=False, rstrip=False, special=True, normalized=False)
        tool_response_start_token = AddedToken(
            TOOLRESPONSESTART, lstrip=False, rstrip=False, special=True, normalized=False)  # fixed earlier bug (FILESEP was used by mistake)
        tool_response_end_token = AddedToken(
            TOOLRESPONSEEND, lstrip=False, rstrip=False, special=True, normalized=False)  # fixed earlier bug
        think_start_token = AddedToken(
            THINKSTART, lstrip=False, rstrip=False, special=True, normalized=False)  # fixed earlier bug
        think_end_token = AddedToken(
            THINKEND, lstrip=False, rstrip=False, special=True, normalized=False)  # fixed earlier bug

        # Map each special-token string to its fixed id in the Qwen2 vocab.
        # NOTE(review): if THINKSTART and THINKEND hold the same string, the
        # two entries collide here — verify the module-level constants differ.
        self.special_tokens = {
            ENDOFTEXT: ENDOFTEXTID,
            IMSTART: IMSTARTID,
            IMEND: IMENDID,
            REFSTART: REFSTARTID,
            REFEND: REFENDID,
            BOXSTART: BOXSTARTID,
            BOXEND: BOXENDID,
            QUADSTART: QUADSTARTID,
            QUADEND: QUADENDID,
            VISIONSTART: VISIONSTARTID,
            VISIONEND: VISIONENDID,
            VISIONPAD: VISIONPADID,
            IMAGEPAD: IMAGEPADID,
            VIDEOPAD: VIDEOPADID,
            TOOLCALLSTART: TOOLCALLSTARTID,
            TOOLCALLEND: TOOLCALLENDID,
            FIMPREFIX: FIMPREFIXID,
            FIMMIDDLE: FIMMIDDLEID,
            FIMSUFFIX: FIMSUFFIXID,
            FIMPAD: FIMPADID,
            REPONAME: REPONAMEID,
            FILESEP: FILESEPID,
            TOOLRESPONSESTART: TOOLRESPONSESTARTID,
            TOOLRESPONSEEND: TOOLRESPONSEENDID,
            THINKSTART: THINKSTARTID,
            THINKEND: THINKENDID
        }
        # Convenience id attributes, one per special token.
        self.end_of_text_id = self.special_tokens[ENDOFTEXT]
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]
        self.ref_start_id = self.special_tokens[REFSTART]
        self.ref_end_id = self.special_tokens[REFEND]
        self.box_start_id = self.special_tokens[BOXSTART]
        self.box_end_id = self.special_tokens[BOXEND]
        self.quad_start_id = self.special_tokens[QUADSTART]
        self.quad_end_id = self.special_tokens[QUADEND]
        self.vision_start_id = self.special_tokens[VISIONSTART]
        self.vision_end_id = self.special_tokens[VISIONEND]
        self.vision_pad_id = self.special_tokens[VISIONPAD]
        self.image_pad_id = self.special_tokens[IMAGEPAD]
        self.video_pad_id = self.special_tokens[VIDEOPAD]
        self.toolcall_start_id = self.special_tokens[TOOLCALLSTART]
        self.toolcall_end_id = self.special_tokens[TOOLCALLEND]
        self.fim_prefix_id = self.special_tokens[FIMPREFIX]
        self.fim_middle_id = self.special_tokens[FIMMIDDLE]
        self.fim_suffix_id = self.special_tokens[FIMSUFFIX]
        self.fim_pad_id = self.special_tokens[FIMPAD]
        self.repo_name_id = self.special_tokens[REPONAME]
        self.file_sep_id = self.special_tokens[FILESEP]
        self.tool_response_start_id = self.special_tokens[TOOLRESPONSESTART]
        self.tool_response_end_id = self.special_tokens[TOOLRESPONSEEND]
        self.think_start_id = self.special_tokens[THINKSTART]
        self.think_end_id = self.special_tokens[THINKEND]
        # Load the base vocabulary (token string -> id) and its inverse.
        check_file(vocab_file, "tokenizer")
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Parse merges.txt: skip blank lines and the leading "#version" header;
        # earlier merges get lower ranks (higher priority).
        bpe_merges = []
        with open(merges_file, encoding="utf-8") as merges_handle:
            for line in merges_handle:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                bpe_merges.append(tuple(line.split()))
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        # Memoizes bpe() results per token string.
        self.cache = {}

        self.pat = re.compile(PRETOKENIZE_REGEX)

        if kwargs.get("add_prefix_space", False):
            logger.warning(
                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
            )

        # id -> AddedToken for every special token; consumed by the base class.
        self._added_tokens_decoder: Dict[int, AddedToken] = {
            self.end_of_text_id: end_of_text_token,
            self.im_start_id: im_start_token,
            self.im_end_id: im_end_token,
            self.ref_start_id: ref_start_token,
            self.ref_end_id: ref_end_token,
            self.box_start_id: box_start_token,
            self.box_end_id: box_end_token,
            self.quad_start_id: quad_start_token,
            self.quad_end_id: quad_end_token,
            self.vision_start_id: vision_start_token,
            self.vision_end_id: vision_end_token,
            self.vision_pad_id: vision_pad_token,
            self.image_pad_id: image_pad_token,
            self.video_pad_id: video_pad_token,
            self.toolcall_start_id: toolcall_start_token,
            self.toolcall_end_id: toolcall_end_token,
            self.fim_prefix_id: fim_prefix_token,
            self.fim_middle_id: fim_middle_token,
            self.fim_suffix_id: fim_suffix_token,
            self.fim_pad_id: fim_pad_token,
            self.repo_name_id: repo_name_token,
            self.file_sep_id: file_sep_token,
            self.tool_response_start_id: tool_response_start_token,
            self.tool_response_end_id: tool_response_end_token,
            self.think_start_id: think_start_token,
            self.think_end_id: think_end_token,
        }

        # Base-class init runs last so the attributes above are already set.
        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,** kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Size of the base vocabulary (excluding added special tokens)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the full vocabulary (base tokens plus added tokens)."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """byte pair encoding

        Repeatedly merges the lowest-ranked adjacent symbol pair until no
        ranked pair remains; returns the merged symbols joined by spaces.
        Results are cached per token string.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(
                pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word_tuple = tuple(new_word)
            word = new_word_tuple
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word_str = " ".join(word)
        self.cache[token] = word_str
        return word_str

    def _tokenize(self, text):
        """Tokenize a string into BPE sub-tokens."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(
                bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        # Undo the byte-to-unicode mapping, then decode UTF-8 per self.errors.
        text = bytearray([self.byte_decoder[c]
                          for c in text]).decode("utf-8", errors=self.errors)
        return text

    def decode(self,
               token_ids,
               skip_special_tokens: bool = False,
               clean_up_tokenization_spaces: Optional[bool] = False,
               spaces_between_special_tokens: bool = False,** kwargs,
               ) -> str:
        """decode token ids"""
        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
        # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
        return super().decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save vocab.json and merges.txt into `save_directory`.

        Returns the (vocab_file, merge_file) paths, or None if the target
        directory does not exist.
        """
        if not os.path.isdir(save_directory):
            logger.error(
                f"Vocabulary path ({save_directory}) should be a directory")
            return None
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") +
            VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") +
            VOCAB_FILES_NAMES["merges_file"]
        )

        # os.open with explicit 0o750 permissions instead of builtin open().
        flags_ = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        with os.fdopen(os.open(vocab_file, flags_, 0o750), "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with os.fdopen(os.open(merge_file, flags_, 0o750), "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Write merges in rank order; warn if ranks are not consecutive.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text,** kwargs):
        """Apply NFC unicode normalization before tokenization."""
        text = unicodedata.normalize("NFC", text)
        return text, kwargs

用户您好,欢迎安装使用MindSpore。已经收到您的问题,会尽快分析和答复问题。

有具体的报错信息吗?

mindformers套件的问题的话,也可以在mindformers仓库下提issue,有专门该套件的工程师处理的

1 Like