
'''A module for dataloader'''
import random
from typing import Optional, Any, Union, Sequence, Dict, Tuple, Iterable, List
from collections import Counter, OrderedDict
from itertools import chain
import logging
from hashlib import sha256

import numpy as np

from .._utils import trim_before_target
from .._utils.unordered_hash import UnorderedSha256, dumps
from .._utils.metaclass import DocStringInheritor, LoadClassInterface, copy_func, copy_property
from .._utils.typehint import OrderedDictType
from ..file_utils import get_resource_file_path
from .tokenizer import Tokenizer
from .field import Field, SentenceDefault, _FieldContent, Sentence
from .vocab import Vocab, GeneralVocab
from .context import FieldContext, VocabContext

class Dataloader(LoadClassInterface, metaclass=DocStringInheritor):
    '''Base class of Dataloader.
    '''
class LanguageProcessing(Dataloader):
    """Bases: :class:`.dataloader.Dataloader`

    Base class for all language processing tasks. This is an abstract class.

    During the initialization of a dataloader, :class:`Vocab`, :class:`Tokenizer` or :class:`Field`
    may be created. See :ref:`how to create a dataloader<customized_tasks_ref>`.

    Arguments:{FILE_ID_DOCS}{FIELD_DETAILS}
    """

    FILE_ID_DOCS = r"""
            file_id (str): A string indicating the source (path) of the dataset. It can be a local path (``"./data"``),
                a resource name (``"resources://dataset"``), or a url (``"http://test.com/dataset.zip"``).
                See :ref:`the details of file id<file_id>`."""

    FIELD_DETAILS = r"""
            fields (List, OrderedDict, Dict): This argument supports multiple input types:

                * If ``OrderedDict`` or ``List``, it specifies the ``data format`` of the ``"train"``, ``"dev"``, ``"test"`` sets.

                    * A ``data format`` should be an ``OrderedDict`` or a ``List[Tuple]`` that can be converted to an ``OrderedDict``.
                    * The ``key`` of ``data format`` is the name of a Field (used by :meth:`.get_batch`),
                      and the ``value`` is either the class name of a Field or a :class:`Field` object.
                    * Examples:

                        >>> postField = SentenceDefault(...)
                        >>> respField = SentenceDefault(...)
                        >>> data_format = [("post", postField), ("resp", respField)]

                      or

                        >>> data_format = [("post", "SentenceDefault"), ("resp", "SentenceDefault")]

                    * In this case,

                        >>> fields = data_format

                      is equivalent to

                        >>> fields = {"train": data_format, "dev": data_format, "test": data_format}

                * If ``Dict``, ``fields[key]`` describes the ``data format`` of the set named ``key``. Examples:

                    >>> fields = {"train": data_format, "extra": data_format}

                * See :ref:`how to create a dataloader<customized_tasks_ref>`."""

    FIELD_REF = r"""
            fields (List, OrderedDict, Dict): See the initialization of :class:`LanguageProcessing` for explanation."""

    SHARED_ARGUMENTS = r'''{LanguageProcessing.FILE_ID_DOCS}
            {_FILE_ID_DEFAULT}
        {LanguageProcessing.TOKENIZER_DOCS}
            {_TOKENIZER_DEFAULT}
        {LanguageProcessing.MAX_SENT_LENGTH_DOCS}
            {_MAX_SENT_LENGTH_DEFAULT}
        {LanguageProcessing.CONVERT_TO_LOWER_LETTER_DOCS}
            {_CONVERT_TO_LOWER_LETTER_DEFAULT}
        {LanguageProcessing.MIN_FREQUENT_VOCAB_TIMES_DOCS}
            {_MIN_FREQUENT_VOCAB_TIMES_DEFAULT}
        {LanguageProcessing.MIN_RARE_VOCAB_TIMES_DOCS}
            {_MIN_RARE_VOCAB_TIMES_DEFAULT}
        {LanguageProcessing.PRETRAINED_DOCS}
            {_PRETAINED_DEFAULT}'''
    _FILE_ID_DEFAULT = ""

    TOKENIZER_DOCS = Sentence.TOKENIZER_DOCS
    _TOKENIZER_DEFAULT = Sentence.TOKENIZER_DEFAULT
    MAX_SENT_LENGTH_DOCS = Sentence.MAX_SENT_LENGTH_DOCS
    _MAX_SENT_LENGTH_DEFAULT = Sentence.MAX_SENT_LENGTH_DEFAULT
    CONVERT_TO_LOWER_LETTER_DOCS = Sentence.CONVERT_TO_LOWER_LETTER_DOCS
    _CONVERT_TO_LOWER_LETTER_DEFAULT = Sentence.CONVERT_TO_LOWER_LETTER_DEFAULT
    MIN_FREQUENT_VOCAB_TIMES_DOCS = GeneralVocab.MIN_FREQUENT_VOCAB_TIMES_DOCS
    _MIN_FREQUENT_VOCAB_TIMES_DEFAULT = GeneralVocab.MIN_FREQUENT_VOCAB_TIMES_DEFAULT
    MIN_RARE_VOCAB_TIMES_DOCS = GeneralVocab.MIN_RARE_VOCAB_TIMES_DOCS
    _MIN_RARE_VOCAB_TIMES_DEFAULT = GeneralVocab.MIN_RARE_VOCAB_TIMES_DEFAULT

    PRETRAINED_DOCS = r'''
            pretrained (str, optional): Use a :ref:`pretrained field<pretrained_field_ref>` instead of :class:`SentenceDefault`.'''
    _PRETAINED_DEFAULT = "Default: If ``None``, no pretrained field is used."

    # for docstring
    fields: Dict[str, "OrderedDict[str, Union[str, Field]]"] = {}
    '''This instance attribute shows the fields of the dataloader (see the initialization of
    :class:`LanguageProcessing`). For example, the fields can be printed as follows:

    .. code-block:: python

        {
            'train': OrderedDict([('sent', <cotk.dataloader.field.SentenceDefault object at 0x000001E170F8B588>)]),
            'dev': OrderedDict([('sent', <cotk.dataloader.field.SentenceDefault object at 0x000001E170F8BB48>)]),
            'test': OrderedDict([('sent', <cotk.dataloader.field.SentenceDefault object at 0x000001E170F8BEC8>)])
        }
    '''

    def __init__(self, file_id: str, \
            fields: Union["OrderedDict[str, Union[str, Field]]", List[Tuple[str, Union[str, Field]]], \
                Dict[str, Union["OrderedDict[str, Union[str, Field]]", List[Tuple[str, Union[str, Field]]]]]], \
            ):
        self.file_id = file_id
        self.file_path = get_resource_file_path(file_id)

        with FieldContext.set_parameters(vocab=GeneralVocab(), weak=True) as field_context:
            fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]] = {}
            self.fields: Dict[str, OrderedDictType[str, Field]] = {}

            if isinstance(fields, (OrderedDict, list)):
                fields = {set_name: fields for set_name in ["train", "dev", "test"]}
            if isinstance(fields, dict):
                for set_name, fields_in_one_set in fields.items():
                    one_fields, one_fieldcontents = self._fill_field_and_create_content(set_name, fields_in_one_set)
                    self.fields[set_name] = one_fields
                    fieldcontents[set_name] = one_fieldcontents
            else:
                raise TypeError("Unknown type for fields")

            self._load_data(fieldcontents)

        self.vocabs = self._collect_vocabs_from_fields(self.fields)
        # self.default_vocab_id = 0 if len(self.vocabs) == 1 else None
        self.tokenizers = self._collect_tokenizers_from_fields(self.fields)
        # self.default_tokenizer_id = 0 if len(self.tokenizers) == 1 else None
        self.default_field_set_name: Optional[str] = None
        self.default_field_name: Optional[str] = None
        self._build_vocabs()

        self._setting_hash = self._create_setting_hash()
        self._vocab_hash = self._create_vocab_hash()
        self.data = self._get_data(fieldcontents)
        self._raw_data_hash, self._data_hash = self._create_data_hash(fieldcontents)
        self.index, self.batch_id, self.batch_size = self._init_batch(fieldcontents)
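    # A minimal construction sketch, assuming a hypothetical directory "./data"
    # containing train.txt / dev.txt / test.txt with one sentence per line (the
    # path and the field name "sent" are illustrative, not part of this module):
    #
    #     >>> field = SentenceDefault(tokenizer="space", vocab=GeneralVocab())
    #     >>> loader = LanguageProcessing("./data", [("sent", field)])
    #     >>> loader.fields["train"]["sent"] is field
    #     True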
    @staticmethod
    def simple_create(file_id: str, \
            fields: Union[OrderedDictType[str, Union[str, Field]], \
                Dict[str, OrderedDictType[str, Union[str, Field]]]], \
            **kwargs) -> "LanguageProcessing":
        '''A simple way to create a dataloader. Instead of using :class:`VocabContext`
        and :class:`FieldContext`, all the possible parameters can be specified here.

        Arguments:{FILE_ID_DOCS}{FIELD_REF}
            **kwargs: Arguments passed to the created :class:`Vocab` and :class:`Field`.
        '''
        with VocabContext.set_parameters(**kwargs):
            with FieldContext.set_parameters(**kwargs):
                with FieldContext.set_parameters(tokenizer="space", weak=True):
                    return LanguageProcessing(file_id, fields)
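    # Usage sketch for simple_create (hypothetical file_id; the keyword arguments
    # are forwarded to the created Vocab and Field via the contexts above):
    #
    #     >>> loader = LanguageProcessing.simple_create(
    #     ...     "./data",
    #     ...     [("sent", "SentenceDefault")],
    #     ...     max_sent_length=50,
    #     ...     min_frequent_vocab_times=3)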
    def _load_data(self, fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]]):
        '''Load data from file.

        Arguments:
            fieldcontents (Dict[str, OrderedDictType[str, _FieldContent]]): fieldcontents for each set.
        '''
        for set_name, fieldcontents_in_one_set in fieldcontents.items():
            if not fieldcontents_in_one_set:
                raise RuntimeError("no field specified")
            with open("%s/%s.txt" % (self.file_path, set_name), encoding='utf-8') as f_file:
                line_cnt = 0
                file_iterator = iter(f_file)
                while True:
                    try:
                        for _, fieldcontent in fieldcontents_in_one_set.items():
                            line_add = fieldcontent.read_next(file_iterator)
                            if line_add == 0:
                                while True:
                                    if next(file_iterator):
                                        raise RuntimeError("the file %s corrupted at line %d" % (set_name, line_cnt))
                            line_cnt += line_add
                    except StopIteration:
                        break

            sample_nums = [fieldcontent.get_data_number() for _, fieldcontent in fieldcontents_in_one_set.items()]
            if not all([sample_num == sample_nums[0] for sample_num in sample_nums]):
                raise RuntimeError("the file %s corrupted at end of the file" % set_name)

        for _, fieldcontents_in_one_set in fieldcontents.items():
            for _, fieldcontent in fieldcontents_in_one_set.items():
                fieldcontent.process_before_vocab()

    def _init_batch(self, fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]]) -> \
            Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, Optional[int]]]:
        '''Initialize the batches. Return a tuple containing ``index``, ``batch_id``, ``batch_size``
        for each set.

        Arguments:
            fieldcontents (Dict[str, OrderedDictType[str, _FieldContent]]): fieldcontents for each set.
        '''
        index: Dict[str, List[int]] = {}
        batch_id: Dict[str, int] = {}
        batch_size: Dict[str, Optional[int]] = {}
        for set_name, fieldcontents_in_one_set in fieldcontents.items():
            sample_nums = [fieldcontent.get_data_number() \
                for _, fieldcontent in fieldcontents_in_one_set.items()]
            batch_id[set_name] = 0
            batch_size[set_name] = None
            index[set_name] = list(range(sample_nums[0]))
        return index, batch_id, batch_size

    def _get_data(self, fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]]) -> \
            Dict[str, Dict[str, Any]]:
        '''Get the data from fieldcontents.

        Arguments:
            fieldcontents (Dict[str, OrderedDictType[str, _FieldContent]]): fieldcontents for each set.
        '''
        data: Dict[str, Dict[str, Any]] = {}
        for set_name, fieldcontents_in_one_set in sorted(fieldcontents.items()):
            data[set_name] = {}
            for field_name, fieldcontent in fieldcontents_in_one_set.items():
                data[set_name][field_name] = fieldcontent.get_data()
        return data

    def _build_vocabs(self):
        '''Invoke build_vocab for each vocabulary.'''
        for vocab in self.vocabs:
            vocab.build_vocab()

    def _collect_vocabs_from_fields(self, fields: Dict[str, OrderedDictType[str, Field]]) \
            -> List[Vocab]:
        '''Collect all vocabulary instances (deduplicated).

        Arguments:
            fields (Dict[str, OrderedDictType[str, Field]]): fields for each set.
        '''
        vocabs: List[Vocab] = []
        for _, fields_in_one_set in sorted(fields.items()):  # sort to keep order
            for _, field in fields_in_one_set.items():
                vocab = field.get_vocab()
                if vocab is not None and vocab not in vocabs:
                    vocabs.append(vocab)
        return vocabs

    def _collect_tokenizers_from_fields(self, fields: Dict[str, OrderedDictType[str, Field]]) \
            -> List[Tokenizer]:
        '''Collect all tokenizer instances (deduplicated).

        Arguments:
            fields (Dict[str, OrderedDictType[str, Field]]): fields for each set.
        '''
        tokenizers: List[Tokenizer] = []
        tokenizers_setting_hash: List[str] = []
        for _, fields_in_one_set in sorted(fields.items()):  # sort to keep order
            for _, field in fields_in_one_set.items():
                tokenizer = field.get_tokenizer()
                if tokenizer is not None and tokenizer.get_setting_hash() not in tokenizers_setting_hash:
                    tokenizers.append(tokenizer)
                    tokenizers_setting_hash.append(tokenizer.get_setting_hash())
        return tokenizers

    def _fill_field_and_create_content(self, set_name: str, fields: \
                Union[OrderedDictType[str, Union[str, Field]], List[Tuple[str, Union[str, Field]]]], \
            ) -> \
            Tuple[OrderedDictType[str, Field], OrderedDictType[str, _FieldContent]]:
        '''Create and return fields and field contents.

        Arguments:
            set_name (str): name of the set.
            fields (OrderedDictType[str, Union[str, Field]]): fields for the set.
        '''
        fieldcontents: OrderedDictType[str, _FieldContent] = OrderedDict()
        new_fields: OrderedDictType[str, Field] = OrderedDict()

        fields_iter: Iterable[Tuple[str, Union[str, Field]]]
        if isinstance(fields, OrderedDict):
            fields_iter = fields.items()
        elif isinstance(fields, list):
            fields_iter = fields
        else:
            raise TypeError("Unexpected type for fields")

        for name, field_name in fields_iter:
            if isinstance(field_name, str):
                field = Field.load_class(field_name)()
            elif isinstance(field_name, Field):
                field = field_name
            else:
                raise TypeError("Each value of `fields` must be a Field object or a string indicating the name of a Field class.")
            fieldcontent = field._create(set_name)  #pylint: disable=protected-access
            fieldcontents[name] = fieldcontent
            new_fields[name] = field
        return new_fields, fieldcontents

    def _create_data_hash(self, fieldcontents):
        raw_data_hash = sha256()
        data_hash = sha256()
        for _, fieldcontents_in_one_set in sorted(fieldcontents.items()):
            for _, fieldcontent in fieldcontents_in_one_set.items():
                raw_data_hash.update(dumps(fieldcontent.get_raw_data_hash()))
                data_hash.update(dumps(fieldcontent.get_data_hash()))
        return raw_data_hash.hexdigest(), data_hash.hexdigest()

    def _create_setting_hash(self):
        setting_hash = sha256()
        for _, fields_in_one_set in sorted(self.fields.items()):
            for _, field in fields_in_one_set.items():
                setting_hash.update(dumps(field._get_setting_hash(self.vocabs)))  #pylint: disable=protected-access
        for vocab in self.vocabs:
            setting_hash.update(dumps(vocab.get_setting_hash()))
        for tokenizer in self.tokenizers:
            setting_hash.update(dumps(tokenizer.get_setting_hash()))
        return setting_hash.hexdigest()

    def _create_vocab_hash(self):
        vocab_hash = sha256()
        for vocab in self.vocabs:
            vocab_hash.update(dumps(vocab.get_vocab_hash()))
        return vocab_hash.hexdigest()
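    # Rough shape of the resulting ``self.data`` (illustrative only; the actual
    # inner keys depend on what each Field's get_data returns):
    #
    #     {
    #         "train": {"sent": {...}},
    #         "dev":   {"sent": {...}},
    #         "test":  {"sent": {...}},
    #     }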
    def get_default_vocab(self) -> Vocab:
        '''Get the default :class:`Vocab` in this dataloader.
        It can be set by :meth:`.set_default_field`.
        '''
        vocab = self.get_default_field().get_vocab()
        if vocab is None:
            raise ValueError("This field does not have a vocab.")
        return vocab
    def get_default_tokenizer(self) -> Tokenizer:
        '''Get the default :class:`Tokenizer` in this dataloader.
        It can be set by :meth:`.set_default_field`.
        '''
        tokenizer = self.get_default_field().get_tokenizer()
        if tokenizer is None:
            raise ValueError("This field does not have a tokenizer.")
        return tokenizer
    def get_default_field(self) -> Field:
        '''Get the default :class:`Field` in this dataloader.
        It can be set by :meth:`.set_default_field`.
        '''
        if self.default_field_name is None or self.default_field_set_name is None:
            raise RuntimeError("No default field. Specify the default field by set_default_field.")
        return self.fields[self.default_field_set_name][self.default_field_name]
    SET_NAME_DESCRIPTION = '''set_name (str): The name of the set. For example: ``"train"``, ``"dev"``, ``"test"``.'''
    FIELD_NAME_DESCRIPTION = '''field_name (str): The name of the field.'''
    def set_default_field(self, set_name: str, field_name: str):
        '''Set the default :class:`Field` in this dataloader. Meanwhile, the default
        :class:`Vocab` and :class:`Tokenizer` are also set according to the field
        (if the field has a vocab or tokenizer).

        The default field affects the behavior of the following methods:

        * :meth:`get_default_field`
        * :meth:`tokenize`
        * :meth:`tokenize_sentences`
        * :meth:`convert_tokens_to_ids`
        * :meth:`convert_ids_to_tokens`
        * :meth:`convert_ids_to_sentence`
        * :meth:`convert_sentence_to_ids`
        * :meth:`add_special_to_ids`
        * :meth:`remove_special_in_ids`
        * :meth:`process_sentences`
        * :meth:`trim_in_ids`
        * :meth:`get_default_vocab`
        * :meth:`get_special_tokens_mapping`
        * :meth:`get_special_tokens_id`
        * :meth:`get_default_tokenizer`

        Arguments:

            {SET_NAME_DESCRIPTION}
            {FIELD_NAME_DESCRIPTION}
        '''
        if set_name not in self.fields:
            raise KeyError("No such set named %s" % set_name)
        elif field_name not in self.fields[set_name]:
            raise KeyError("No such field named %s" % field_name)
        self.default_field_set_name = set_name
        self.default_field_name = field_name
        # tokenizer = self.fields[set_name][field_name].get_tokenizer()
        # if tokenizer:
        #     self.set_default_tokenizer(tokenizer)
        # vocab = self.fields[set_name][field_name].get_vocab()
        # if vocab:
        #     self.set_default_vocab(vocab)
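    # Usage sketch (hypothetical set/field names): once a default field is set,
    # the convenience wrappers at the bottom of this class delegate to it:
    #
    #     >>> loader.set_default_field("train", "sent")
    #     >>> loader.tokenize("A sentence to split .")  # uses the field's tokenizer
    #     >>> loader.frequent_vocab_size                # uses the field's vocab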
    def get_field(self, set_name: str, field_name: str) -> Field:
        '''Get a :class:`Field` by the name of the set and the name of the field.

        Arguments:

            {SET_NAME_DESCRIPTION}
            {FIELD_NAME_DESCRIPTION}
        '''
        return self.fields[set_name][field_name]
    def get_general_hash(self) -> str:
        '''General hash. Identifies all details of the dataloader, including the raw data
        before processing, the tokenized data, the vocabulary, and the settings.

        See :ref:`dataloader hash<dataloader_hash_ref>` for explanation.
        '''
        general_hash = sha256()
        general_hash.update(dumps(self._raw_data_hash))
        general_hash.update(dumps(self._data_hash))
        general_hash.update(dumps(self._vocab_hash))
        general_hash.update(dumps(self._setting_hash))
        return general_hash.hexdigest()
    def get_raw_data_hash(self) -> str:
        '''Raw data hash. Identifies the raw data before processing.

        See :ref:`dataloader hash<dataloader_hash_ref>` for explanation.
        '''
        return self._raw_data_hash

    def get_data_hash(self) -> str:
        '''Data hash. Identifies the data after processing (tokenization).

        See :ref:`dataloader hash<dataloader_hash_ref>` for explanation.
        '''
        return self._data_hash

    def get_vocab_hash(self) -> str:
        '''Vocab hash. Identifies the vocabulary.

        See :ref:`dataloader hash<dataloader_hash_ref>` for explanation.
        '''
        return self._vocab_hash

    def get_setting_hash(self) -> str:
        '''Setting hash. Identifies the settings used to create the dataloader.

        See :ref:`dataloader hash<dataloader_hash_ref>` for explanation.
        '''
        return self._setting_hash
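    # Usage sketch: the hashes support reproducibility checks. Comparing the
    # general hashes of two (hypothetical) dataloader instances verifies that
    # they were built from the same raw data, tokenization, vocabulary, and
    # settings:
    #
    #     >>> loader_a.get_general_hash() == loader_b.get_general_hash()
    #     True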
    def restart(self, set_name, batch_size=None, shuffle=True):
        '''Initialize batches. This function must be called before :func:`get_next_batch`,
        or after an epoch ends. See :meth:`get_next_batch` for examples.

        Arguments:

            {SET_NAME_DESCRIPTION}
            batch_size (int): The number of samples in a batch.
                Default: if ``None``, the last ``batch_size`` is used.
            shuffle (bool): Whether to shuffle the data. Default: ``True``.
        '''
        if set_name not in self.fields:
            raise ValueError("No set named %s." % set_name)
        if batch_size is None and self.batch_size[set_name] is None:
            raise ValueError("You need batch_size to initialize.")
        if shuffle:
            # rng_state = random.getstate()
            random.shuffle(self.index[set_name])
            # random.setstate(rng_state)

        self.batch_id[set_name] = 0
        if batch_size is not None:
            self.batch_size[set_name] = batch_size
        batch_size_div = self.batch_size[set_name]
        assert batch_size_div is not None
        print("%s set restart, %d batches and %d left" % (set_name, \
            len(self.index[set_name]) // batch_size_div, \
            len(self.index[set_name]) % batch_size_div))
    _GET_BATCH_MORE_DOC = "Return a merged dict containing all the data from each field by calling :meth:`.field.get_batch`. " \
        "See examples in subclasses for the return value of predefined tasks."
    _GET_BATCH_EXAMPLE = ""
    def get_batch(self, set_name: str, indexes: List[int]) -> Dict[str, Any]:
        '''Get a batch of data with the specified ``indexes``. {_GET_BATCH_MORE_DOC}

        :meth:`get_next_batch`, :meth:`get_batches`, and :meth:`get_all_batch` provide other
        ways to get batched data; their return values are consistent with this method's.

        Arguments:

            {SET_NAME_DESCRIPTION}
            indexes (list): a list of indexes specifying the samples in the batch.
        {_GET_BATCH_EXAMPLE}
        '''
        if set_name not in self.fields:
            raise ValueError("No set named %s." % set_name)
        res: Dict[str, Any] = {}
        for field_name, field_obj in self.fields[set_name].items():
            res.update(field_obj.get_batch(field_name, self.data[set_name][field_name], indexes))
        return res
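    # Usage sketch (hypothetical set/field names; the exact keys of the returned
    # dict depend on each field's get_batch, see the subclasses for the keys of
    # predefined tasks):
    #
    #     >>> batch = loader.get_batch("train", [0, 2, 5])
    #     >>> sorted(batch.keys())  # doctest: +SKIP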
    IGNORE_LEFT_SAMPLES = "ignore_left_samples (bool): If the number of samples is not divisible by ``batch_size``, " \
        "ignore the leftover samples fewer than ``batch_size``. " \
        "Setting it to ``True`` makes every batch have the same number of samples. " \
        "Default: ``False``."
    def get_next_batch(self, set_name, ignore_left_samples=False) -> Optional[Dict[str, Any]]:
        '''Get the next batch. It can only be called after initializing batches (:func:`restart`).
        Return a dict like :func:`get_batch`, or ``None`` if the epoch has ended.

        Arguments:

            {SET_NAME_DESCRIPTION}
            {IGNORE_LEFT_SAMPLES}

        Examples:
            >>> dataloader.restart("train")
            >>> while True:
            ...     data = dataloader.get_next_batch("train")
            ...     if data is None:
            ...         break
            ...     print(data)
        '''
        if set_name not in self.fields:
            raise ValueError("No set named %s." % set_name)
        batch_size = self.batch_size[set_name]
        if batch_size is None:
            raise RuntimeError("Please run restart before calling this function.")
        batch_id = self.batch_id[set_name]

        start, end = batch_id * batch_size, (batch_id + 1) * batch_size
        if start >= len(self.index[set_name]):
            return None
        if ignore_left_samples and end > len(self.index[set_name]):
            return None
        index = self.index[set_name][start:end]
        res = self.get_batch(set_name, index)
        self.batch_id[set_name] += 1
        return res
    def get_batches(self, set_name, batch_size=None, shuffle=True, ignore_left_samples=False) -> Iterable[Dict[str, Any]]:
        '''An iterable generator over batches. It first calls :func:`restart`, and then
        :func:`get_next_batch` until no more data is available. Returns an iterable
        generator where each element is like :func:`get_batch`.

        Arguments:

            {SET_NAME_DESCRIPTION}
            batch_size (int, optional): If ``None``, the last ``batch_size`` is used. Default: ``None``.
            shuffle (bool): Whether to shuffle the data. Default: ``True``.
            {IGNORE_LEFT_SAMPLES}
        '''
        self.restart(set_name, batch_size, shuffle)
        while True:
            res = self.get_next_batch(set_name, ignore_left_samples)
            if res is None:
                break
            yield res
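    # Usage sketch (hypothetical loader): get_batches wraps the restart /
    # get_next_batch loop shown above into a single generator:
    #
    #     >>> for batch in loader.get_batches("train", batch_size=32, ignore_left_samples=True):
    #     ...     pass  # every batch here contains exactly 32 samples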
    def get_all_batch(self, set_name) -> Dict[str, List[Any]]:
        r'''Concatenate all batches into a single dict, where padding is not applied.

        Returns a dict like :func:`get_batch` with all valid ``indexes``, but the
        sentences are not padded and their type is converted to list. Precisely, this
        function calls :func:`get_batch` with ``len(indexes) == 1`` multiple times and
        concatenates all the values in the returned dicts.

        Arguments:

            {SET_NAME_DESCRIPTION}
        '''
        res: Dict[str, List[Any]] = {}
        for idx in self.index[set_name]:
            batch = self.get_batch(set_name, [idx])
            for attr, val in batch.items():
                if attr not in res:
                    res[attr] = []
                if not isinstance(val, (list, np.ndarray)):
                    val = [val]
                res[attr].extend(val)
        return res
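    # Usage sketch (hypothetical loader with a field named "sent"): since each
    # sample is fetched as a batch of one, every key holds one entry per sample:
    #
    #     >>> all_data = loader.get_all_batch("test")
    #     >>> len(all_data["sent"]) == len(loader.index["test"])
    #     True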
    # copy some functions from vocab
    _VOCAB_MORE_DOCSTRING = '''It calls the identical method of the :class:`Vocab` instance ``vocab``, \
from :meth:`.get_default_vocab()`.'''
    frequent_vocab_size = copy_property(get_default_vocab, Vocab, "frequent_vocab_size")
    all_vocab_size = copy_property(get_default_vocab, Vocab, "all_vocab_size")
    frequent_vocab_list = copy_property(get_default_vocab, Vocab, "frequent_vocab_list")
    all_vocab_list = copy_property(get_default_vocab, Vocab, "all_vocab_list")
    get_special_tokens_mapping = copy_func(get_default_vocab, Vocab, "get_special_tokens_mapping")
    get_special_tokens_id = copy_func(get_default_vocab, Vocab, "get_special_tokens_id")
    pad_id = copy_property(get_default_vocab, Vocab, "pad_id")
    unk_id = copy_property(get_default_vocab, Vocab, "unk_id")
    go_id = copy_property(get_default_vocab, Vocab, "go_id")
    eos_id = copy_property(get_default_vocab, Vocab, "eos_id")

    _SENTENCE_MORE_DOCSTRING = '''It calls the identical method of the :class:`Sentence` instance ``sentence``, \
from :meth:`.get_default_field()`.'''
    _SESSION_MORE_DOCSTRING = '''It calls the identical method of the :class:`Session` instance ``session``, \
from :meth:`.get_default_field()`.'''
    tokenize = copy_func(get_default_field, Sentence, "tokenize")
    tokenize_sentences = copy_func(get_default_field, Sentence, "tokenize_sentences")
    convert_tokens_to_ids = copy_func(get_default_field, Sentence, "convert_tokens_to_ids")
    convert_ids_to_tokens = copy_func(get_default_field, Sentence, "convert_ids_to_tokens")
    convert_ids_to_sentence = copy_func(get_default_field, Sentence, "convert_ids_to_sentence")
    convert_sentence_to_ids = copy_func(get_default_field, Sentence, "convert_sentence_to_ids")
    add_special_to_ids = copy_func(get_default_field, Sentence, "add_special_to_ids")
    remove_special_in_ids = copy_func(get_default_field, Sentence, "remove_special_in_ids")
    process_sentences = copy_func(get_default_field, Sentence, "process_sentences")
    trim_in_ids = copy_func(get_default_field, Sentence, "trim_in_ids")
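    # Usage sketch for the copied convenience methods (hypothetical loader with
    # a default field already set; the exact ids depend on the vocabulary):
    #
    #     >>> tokens = loader.tokenize("hello world")
    #     >>> ids = loader.convert_tokens_to_ids(tokens)
    #     >>> loader.convert_ids_to_tokens(ids)
    #     ['hello', 'world']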