I am training word2vec on my own text corpus using Mikolov's implementation from here. Not all unique words from the corpus get a vector, even though I have set min-count to 1. Are there any parameters I may have missed that might be the reason not all unique words get a vector? What else might be the reason?
To test word2vec's behavior, I have written the following script and run it on a text file with 20058 sentences and 278896 words (all words and punctuation are space-separated, and there is one sentence per line).
import subprocess
def get_w2v_vocab(path_embs):
    """Collect the words that have a vector in a word2vec text-format file.

    The first space-separated field of each line is taken as the word. If
    the very first line consists of exactly two integers, it is treated as
    the "<vocab size> <vector size>" header that word2vec writes in text
    mode and is skipped, so the header is not counted as a word.

    Args:
        path_embs: Path to the UTF-8 encoded embeddings file.

    Returns:
        The set of words found in the file, excluding the '</s>' token.
    """
    vocab = set()
    with open(path_embs, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                fields = line.split()
                if len(fields) == 2 and all(field.isdigit() for field in fields):
                    # Header line, not an embedding row.
                    continue
            word = line.split(' ')[0]
            # Without this the function always returned an empty set.
            vocab.add(word)
    return vocab - {'</s>'}
def train(path_corpus, path_embs):
    """Run the local ./word2vec binary on a corpus and write the embeddings.

    The binary is invoked with 6 threads and a min-count of 1.

    Args:
        path_corpus: Path to the training text file.
        path_embs: Path the embeddings are written to.

    Raises:
        subprocess.CalledProcessError: If word2vec exits with a non-zero
            status. Failing here prevents a stale or missing embeddings
            file from being read afterwards.
    """
    subprocess.check_call(["./word2vec", "-threads", "6", "-train", path_corpus,
                           "-output", path_embs, "-min-count", "1"])
def get_unique_words_in_corpus(path_corpus):
    """Return the set of distinct space-separated tokens in a corpus file.

    Each line has its newline removed and is split on single spaces. Empty
    tokens (from blank lines, trailing spaces or consecutive spaces) are
    not words and are dropped.

    NOTE(review): only ' ' is treated as a separator here. If the trainer
    also splits on other whitespace such as tabs, the two vocabularies can
    still differ; confirm against the word2vec tokenizer.

    Args:
        path_corpus: Path to the UTF-8 encoded corpus, one sentence per line.

    Returns:
        A set containing every distinct non-empty token.
    """
    vocab = set()
    with open(path_corpus, 'r', encoding='utf8') as f:
        for line in f:
            # Accumulate straight into the set rather than a list of all tokens.
            vocab.update(line.strip('\n').split(' '))
    vocab.discard('')
    return vocab
def check_equality(expected, actual):
    """Compare the expected vocabulary with the one actually produced.

    Prints a confirmation when both are equal. Otherwise raises an
    Exception reporting the size of each vocabulary and the number of
    expected entries that are absent from the actual one.

    Args:
        expected: Set of words expected to have a vector.
        actual: Set of words that actually have a vector.

    Raises:
        Exception: If the two vocabularies are not equal.
    """
    if expected == actual:
        print('Expected vocab and actual vocab are equal.')
        return
    # One-directional difference: expected words with no counterpart in actual.
    missing_count = len(expected - actual)
    message = 'Not equal! Vocab expected: {}, Vocab actual: {}, Diff: {}'.format(
        len(expected), len(actual), missing_count)
    raise Exception(message)
def main():
    """Train word2vec on the test corpus and verify every word got a vector.

    The expected vocabulary is read from the corpus before training; the
    actual vocabulary is read from the embeddings file afterwards, and the
    two are compared.
    """
    corpus_file = 'test_corpus2.txt'
    embeddings_file = 'embeddings.vec'

    words_in_corpus = get_unique_words_in_corpus(corpus_file)
    train(corpus_file, embeddings_file)
    words_with_vector = get_w2v_vocab(embeddings_file)

    check_equality(words_in_corpus, words_with_vector)
# Run the check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This script gives me the following output:
Starting training using file test_corpus2.txt
Vocab size: 33651
Words in train file: 298954
Alpha: 0.000048 Progress: 99.97% Words/thread/sec: 388.16k Traceback (most recent call last):
File "test_w2v_behaviour.py", line 44, in <module>
File "test_w2v_behaviour.py", line 40, in main
check_equality(vocab_expected, vocab_actual)
File "test_w2v_behaviour.py", line 29, in check_equality
raise Exception('Not equal! Vocab expected: {}, Vocab actual: {}, Diff: {}'.format(len(expected), len(actual), diff))
Exception: Not equal! Vocab expected: 42116, Vocab actual: 33650, Diff: 17316