Source code for cotk.wordvector.wordvector

'''
A module for loading pretrained word vectors.
'''
import numpy as np
import os
from .._utils.metaclass import DocStringInheritor, LoadClassInterface
from typing import List, Dict, Union, Optional, Any

from ..file_utils import get_resource_file_path


class WordVector(LoadClassInterface, metaclass=DocStringInheritor):
    r'''Base of all word vector loader.

    The class defines no members of its own; it only combines
    ``LoadClassInterface`` with the ``DocStringInheritor`` metaclass so that
    concrete loaders share a common ancestor.
    '''
class GeneralWordVector(WordVector):
    r'''Bases: :class:`.dataloader.WordVector`

    This class is a general pretrained word vector.

    Arguments: {FILE_ID_DOCS} {_FILE_ID_DEFAULT}

    {INPUT_FORMAT}
    '''

    # The three attributes below are the text fragments named by the ``{...}``
    # placeholders in the class docstrings of this class and its subclasses.
    # NOTE(review): presumably substituted by the DocStringInheritor metaclass -- confirm.
    FILE_ID_DOCS = r'''
        file_id (str, ``None``): A str indicates the source of word vectors. It can be local path (``"./data"``), a resource name
            (``"resources://dataset"``), or an url (``"http://test.com/dataset.zip"``).
            See :meth:`cotk.file_utils.get_resource_file_path` for further details.
            If ``None``, do not use pretrained word vector.'''
    _FILE_ID_DEFAULT = ""

    INPUT_FORMAT = r'''
    Input Format
        A text file named ``wordvec.txt`` should be contained in the path.
        In the file, each word vec should be described in two lines. The first line is the word (or phrase),
        then the next line is multiple floats indicating the embedding.

        Example of ``wordvec.txt``:

        .. code-block:: none

            word
            0.0 1.0 -2.3
            phrases
            0.3 -1.2 3.4
    '''

    def __init__(self, file_id: Union[str, None]):
        '''Remember ``file_id`` and resolve it to a local path when it is not empty.'''
        super().__init__()
        self.file_id: Optional[str] = file_id
        # A falsy id (``None`` or an empty string) means "no pretrained vectors",
        # so the resource lookup is skipped entirely.
        self.file_path = get_resource_file_path(file_id) if file_id else None

    def _load_raw_word2vec(self) -> Dict[str, np.ndarray]:
        '''Load raw word vectors from file.

        The file is read as consecutive (word line, vector line) pairs.
        If ``self.file_path`` is a directory, ``wordvec.txt`` inside it is read.

        Returns:
            (dict): maps a word (str) to its embedding (:class:`numpy.ndarray`).
            Empty if ``self.file_path`` is not set.

        Raises:
            ValueError: if the file, ignoring trailing blank lines, contains an
                odd number of lines (a word without its vector line).
        '''
        raw_word2vec: Dict[str, np.ndarray] = {}
        if not self.file_path:
            return raw_word2vec

        file_path = self.file_path
        if os.path.isdir(file_path):
            file_path = os.path.join(file_path, "wordvec.txt")
        with open(file_path, 'r', encoding='utf-8') as glove_file:
            lines = glove_file.readlines()

        # Trailing blank lines are harmless padding; without this step they
        # would be treated as a word that has no vector line.
        while lines and not lines[-1].strip():
            lines.pop()
        if len(lines) % 2 != 0:
            raise ValueError(
                "Malformed word vector file %s: expected pairs of (word, vector) lines, "
                "but got %d lines" % (file_path, len(lines)))

        # Sharing one iterator between both zip arguments yields
        # (line 0, line 1), (line 2, line 3), ...
        line_iter = iter(lines)
        for word_line, vec_line in zip(line_iter, line_iter):
            raw_word2vec[word_line.strip()] = np.fromstring(vec_line.strip(), sep=" ")
        return raw_word2vec

    def load_matrix(self, n_dims: int, vocab_list: List[str],
                    mean: Optional[Union[float, List, np.ndarray]] = None,
                    std: Optional[Union[float, List, np.ndarray]] = None,
                    default_embeddings: Optional[Union[List, np.ndarray]] = None) -> np.ndarray:
        r'''Load pretrained word vector and return a numpy 2-d array. The ith row is the feature
        of the ith word in ``vocab_list``. If some feature is not included in pretrained
        word vector, it will be initialized by:

        * ``default_embeddings``, if it is not ``None``.
        * normal distribution with ``mean`` and ``std``, otherwise.

        Arguments:
            n_dims (int): specify the dimension size of word vector. If ``n_dims`` is bigger than
                size of pretrained word vector, the rest embedding will be initialized by
                ``default_embeddings`` or a normal distribution.
            vocab_list (list): specify the vocab list used in data loader. If there
                is any word not appeared in pretrained word vector, the embedding will be
                initialized by ``default_embeddings`` or a normal distribution.
            mean (float, Any, None): The mean of normal distribution. It can be a float,
                or an array whose shape is ``[n_dims]``. if ``None``, it will be set by the mean
                of loaded word vector embedding. Default: ``None``.
            std (float, Any, None): The standard deviation of normal distribution. It can be a float,
                or an array whose shape is ``[n_dims]``. if ``None``, it will be set by the
                standard deviation of loaded word vector embedding. Default: ``None``.
            default_embeddings (Any, optional): The default embeddings, its size should be
                ``[len(vocab_list), n_dims]``. The input is never modified; integer or boolean
                input is converted to float64 so that pretrained values are not truncated.
                Default: None, which indicates initializing the embeddings from the normal
                distribution with ``mean`` and ``std``.

        Returns:
            (:class:`numpy.ndarray`): A 2-d array. Size:``[len(vocab_list), n_dims]``.

        Raises:
            ValueError: if ``mean`` or ``std`` is neither a scalar nor of shape ``[n_dims]``,
                or if the shape of ``default_embeddings`` is not ``[len(vocab_list), n_dims]``.
            TypeError: if ``default_embeddings`` is neither a list nor a :class:`numpy.ndarray`.
        '''
        if mean is not None:
            mean = np.array(mean)
            if mean.shape != () and mean.shape != (n_dims,):
                raise ValueError("The shape of mean must be () or (n_dims,), but got %s" % (mean.shape, ))
        if std is not None:
            std = np.array(std)
            if std.shape != () and std.shape != (n_dims,):
                raise ValueError("The shape of std must be () or (n_dims,), but got %s" % (std.shape, ))

        raw_word2vec = self._load_raw_word2vec()
        n_vocab = len(vocab_list)

        if default_embeddings is not None:
            if isinstance(default_embeddings, list):
                default_embeddings = np.array(default_embeddings)
            elif not isinstance(default_embeddings, np.ndarray):
                raise TypeError("Unkown type for default_embeddings")
            if default_embeddings.shape != (n_vocab, n_dims):
                raise ValueError("default_embeddings.shape should be equal to [len(vocab_list), n_dims]")
            if default_embeddings.dtype.kind in "iub":
                # Writing float vectors into an integer/bool array would silently
                # truncate them, so promote to float (astype also makes the copy).
                default_embeddings = default_embeddings.astype(np.float64)
            else:
                # Work on a copy so the caller's array is left untouched.
                default_embeddings = default_embeddings.copy()
        else:
            loaded_vecs = list(raw_word2vec.values())
            if loaded_vecs:
                all_embedding = np.stack(loaded_vecs)
                now_dims = min(n_dims, all_embedding.shape[1])
            if mean is None:
                mean = np.zeros(n_dims)
                if loaded_vecs:
                    mean[:now_dims] = np.mean(all_embedding, axis=0)[:now_dims]
            if std is None:
                std = np.ones(n_dims) / np.sqrt(n_dims)
                # The deviation over a single vector is 0, which would remove all
                # randomness; keep the fallback unless at least two vectors exist.
                if len(loaded_vecs) > 1:
                    std[:now_dims] = np.std(all_embedding, axis=0)[:now_dims]
            default_embeddings = np.random.randn(n_vocab, n_dims) * std + mean

        oov_cnt = 0
        have_warned = False
        for i, vocab in enumerate(vocab_list):
            vec = raw_word2vec.get(vocab, None)
            if vec is None:
                oov_cnt += 1
                continue
            # Report a dimension mismatch only once, for the first affected word.
            if len(vec) != n_dims and not have_warned:
                have_warned = True
                if len(vec) > n_dims:
                    print(("Warning: Dimension of loaded wordvec is %d, but ``n_dims`` is set to %d. "
                           "The redundant dimension is trimmed.") % (len(vec), n_dims))
                else:
                    print(("Warning: Dimension of loaded wordvec is %d, but ``n_dims`` is set to %d. "
                           "The extra dimension is initialized by normal distribution.")
                          % (len(vec), n_dims))
            now_dims = min(len(vec), n_dims)
            default_embeddings[i, :now_dims] = vec[:now_dims]

        # An empty vocab has nothing to cover; report 0 instead of dividing by zero.
        oov_ratio = float(oov_cnt) / n_vocab if n_vocab else 0.0
        print("wordvec cannot cover %f vocab" % oov_ratio)
        return default_embeddings

    def load_dict(self, vocab_list: List[str]) -> Dict[str, np.ndarray]:
        r'''Load word vector and return a dict that maps words to vectors.

        Arguments:
            vocab_list (list): specify the vocab list used in data loader. If there
                is any word not appeared in pretrained word vector, the feature will
                not be returned.

        Returns:
            (dict): maps a word (str) to its pretrained embedding (:class:`numpy.ndarray`)
            where its shape is [ndims].
        '''
        raw_word2vec = self._load_raw_word2vec()
        return {vocab: raw_word2vec[vocab] for vocab in vocab_list if vocab in raw_word2vec}
class Glove(GeneralWordVector):
    r'''Bases: :class:`.dataloader.GeneralWordVector`, :class:`.dataloader.WordVector`

    GloVe is pre-trained word vector named `Global Vectors for Word Representation`.

    References:
        [1] Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014.
        GloVe: Global Vectors for Word Representation.

    Arguments: {FILE_ID_DOCS} {_FILE_ID_DEFAULT}
    '''

    # Overrides the empty default text of the parent class; it is the fragment
    # named by the ``_FILE_ID_DEFAULT`` placeholder in the docstring above.
    _FILE_ID_DEFAULT = "Default: ``resources://Glove300d``. A 300-d pretrained GloVe will be downloaded (or loaded from cache) and used."

    def __init__(self, file_id="resources://Glove300d"):
        # Only the default resource differs from the parent; loading is inherited.
        super().__init__(file_id)