Skip to content

Commit

Permalink
Added public/ to serve dictionary files
Browse files Browse the repository at this point in the history
  • Loading branch information
Cameron Chambers committed Jun 10, 2021
1 parent 276da00 commit 1101e7a
Show file tree
Hide file tree
Showing 24 changed files with 1,017 additions and 0 deletions.
37 changes: 37 additions & 0 deletions public/dict/CharacterClass.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"use strict";

/**
 * Plain record describing one character category.
 * Every argument is stored unchanged under a property of the same name.
 * @param {number} class_id Numeric ID of the category
 * @param {string} class_name Name of the category
 * @param {boolean} is_always_invoke INVOKE flag of the category
 * @param {boolean} is_grouping GROUP flag of the category
 * @param {number} max_length LENGTH value of the category
 * @constructor
 */
function CharacterClass(class_id, class_name, is_always_invoke, is_grouping, max_length) {
    // Keys are listed in the order the properties must appear on the instance
    Object.assign(this, {
        class_id: class_id,
        class_name: class_name,
        is_always_invoke: is_always_invoke,
        is_grouping: is_grouping,
        max_length: max_length
    });
}

module.exports = CharacterClass;
205 changes: 205 additions & 0 deletions public/dict/CharacterDefinition.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"use strict";

var InvokeDefinitionMap = require("./InvokeDefinitionMap");
var CharacterClass = require("./CharacterClass");
var SurrogateAwareString = require("../util/SurrogateAwareString");

var DEFAULT_CATEGORY = "DEFAULT";

/**
 * CharacterDefinition mirrors a char.def file and controls how
 * unknown words are processed.
 * @constructor
 */
function CharacterDefinition() {
    // One table slot for every UCS2 code unit (0x0000 - 0xFFFF)
    var UCS2_TABLE_SIZE = 0x10000;

    // code unit -> class ID of its primary category
    this.character_category_map = new Uint8Array(UCS2_TABLE_SIZE);
    // code unit -> bitset of compatible categories (bit N stands for class ID N)
    this.compatible_category_map = new Uint32Array(UCS2_TABLE_SIZE);
    // Filled in later (see CharacterDefinition.load)
    this.invoke_definition_map = null;
}

/**
 * Create a CharacterDefinition from previously built buffers.
 * @param {Uint8Array} cat_map_buffer Installed as character_category_map
 * @param {Uint32Array} compat_cat_map_buffer Installed as compatible_category_map
 * @param {*} invoke_def_buffer Handed to InvokeDefinitionMap.load; its result
 *   becomes invoke_definition_map (presumably a serialized buffer - confirm
 *   against InvokeDefinitionMap)
 * @returns {CharacterDefinition}
 */
CharacterDefinition.load = function (cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) {
    var definition = new CharacterDefinition();

    // The buffers are adopted as-is; no copy is made
    definition.character_category_map = cat_map_buffer;
    definition.compatible_category_map = compat_cat_map_buffer;

    var invoke_map = InvokeDefinitionMap.load(invoke_def_buffer);
    definition.invoke_definition_map = invoke_map;

    return definition;
};

/**
 * Convert one parsed category-definition line of char.def into a CharacterClass.
 * @param {number} class_id ID to give the resulting class
 * @param {Array} parsed_category_def Indexable fields: [1] category name,
 *   [2] INVOKE flag, [3] GROUP flag, [4] LENGTH
 * @returns {CharacterClass|null} null (after logging to the console) when
 *   INVOKE or GROUP is not 0/1, or LENGTH is negative or not a finite number
 */
CharacterDefinition.parseCharCategory = function (class_id, parsed_category_def) {
    var class_name = parsed_category_def[1];
    var invoke_flag = parseInt(parsed_category_def[2]);
    var group_flag = parseInt(parsed_category_def[3]);
    var length_limit = parseInt(parsed_category_def[4]);

    // NaN is neither 0 nor 1, so unparsable input is rejected here as well
    var invoke_ok = (invoke_flag === 0 || invoke_flag === 1);
    if (!invoke_ok) {
        console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke_flag);
        return null;
    }

    var group_ok = (group_flag === 0 || group_flag === 1);
    if (!group_ok) {
        console.log("char.def parse error. GROUP is 0 or 1 in:" + group_flag);
        return null;
    }

    // Zero is accepted; only negative or non-finite values are rejected
    var length_ok = isFinite(length_limit) && length_limit >= 0;
    if (!length_ok) {
        console.log("char.def parse error. LENGTH is 1 to n:" + length_limit);
        return null;
    }

    return new CharacterClass(class_id, class_name, invoke_flag === 1, group_flag === 1, length_limit);
};

/**
 * Interpret a parsed single-code mapping line of char.def.
 * A code that is unparsable or outside 0x0000-0xFFFF is reported on the
 * console, but the mapping is returned regardless.
 * @param {Array} parsed_category_mapping Indexable fields: [1] code,
 *   [2] default category, [3..] compatible categories
 * @returns {{start: number, default: string, compatible: Array}}
 */
CharacterDefinition.parseCategoryMapping = function (parsed_category_mapping) {
    // No radix is passed, so parseInt also accepts a "0x"-prefixed hex code
    var code = parseInt(parsed_category_mapping[1]);
    var primary = parsed_category_mapping[2];

    var others;
    if (parsed_category_mapping.length > 3) {
        others = parsed_category_mapping.slice(3);
    } else {
        others = [];
    }

    // Positive form of the range test: NaN fails both comparisons
    var in_range = (code >= 0 && code <= 0xFFFF);
    if (!in_range) {
        console.log("char.def parse error. CODE is invalid:" + code);
    }

    return { start: code, default: primary, compatible: others };
};

/**
 * Interpret a parsed code-range mapping line of char.def.
 * Either bound that is unparsable or outside 0x0000-0xFFFF is reported on
 * the console (start first, then end), but the mapping is returned regardless.
 * @param {Array} parsed_category_mapping Indexable fields: [1] first code,
 *   [2] last code, [3] default category, [4..] compatible categories
 * @returns {{start: number, end: number, default: string, compatible: Array}}
 */
CharacterDefinition.parseRangeCategoryMapping = function (parsed_category_mapping) {
    // No radix is passed, so parseInt also accepts "0x"-prefixed hex codes
    var first = parseInt(parsed_category_mapping[1]);
    var last = parseInt(parsed_category_mapping[2]);
    var primary = parsed_category_mapping[3];

    var others;
    if (parsed_category_mapping.length > 4) {
        others = parsed_category_mapping.slice(4);
    } else {
        others = [];
    }

    // Positive form of the range test: NaN fails both comparisons
    var bounds = [first, last];
    for (var i = 0; i < bounds.length; i++) {
        var in_range = (bounds[i] >= 0 && bounds[i] <= 0xFFFF);
        if (!in_range) {
            console.log("char.def parse error. CODE is invalid:" + bounds[i]);
        }
    }

    return { start: first, end: last, default: primary, compatible: others };
};

/**
 * Fill the category tables from parsed char.def mappings.
 *
 * For every code point covered by a mapping, character_category_map receives
 * the class ID of the mapping's default category, and compatible_category_map
 * gets one bit set per compatible category (bit N stands for class ID N).
 * Afterwards every code point that no mapping assigned, and whose slot is
 * still 0, receives the class ID of the DEFAULT category.
 *
 * Category names are resolved through invoke_definition_map.lookup(), which
 * is expected to return a class ID or null/undefined for an unknown name.
 *
 * @param {Array} category_mapping Array of mappings shaped like
 *   {start, [end], default, compatible}; null/undefined is treated as empty
 */
CharacterDefinition.prototype.initCategoryMappings = function (category_mapping) {
    var category_map = this.character_category_map;
    var compatible_map = this.compatible_category_map;
    var invoke_map = this.invoke_definition_map;
    var code_point;

    // Remembers which code points were explicitly assigned a category, so a
    // code point mapped to class ID 0 is not mistaken for "unmapped" below.
    var is_mapped = new Uint8Array(category_map.length);

    if (category_mapping != null) {
        for (var i = 0; i < category_mapping.length; i++) {
            var mapping = category_mapping[i];
            var end = mapping.end || mapping.start;
            var compatible = mapping.compatible || [];

            // Both values below are the same for the whole range, so the
            // names are resolved once per mapping instead of per code point.
            var primary_id = invoke_map.lookup(mapping.default);
            var compatible_bits = 0;
            for (var j = 0; j < compatible.length; j++) {
                var compatible_category = compatible[j];
                if (compatible_category == null) {
                    continue;
                }
                var class_id = invoke_map.lookup(compatible_category);
                if (class_id == null) {
                    continue;
                }
                // Set the bit of this class ID, e.g. class ID 3 sets bit 3
                compatible_bits |= 1 << class_id;
            }

            for (code_point = mapping.start; code_point <= end; code_point++) {
                if (primary_id == null) {
                    // Unknown category name: clear the slot and leave it
                    // unmarked so the DEFAULT pass below picks it up
                    category_map[code_point] = 0;
                } else {
                    category_map[code_point] = primary_id;
                    is_mapped[code_point] = 1;
                }
                if (compatible_bits !== 0) {
                    compatible_map[code_point] |= compatible_bits;
                }
            }
        }
    }

    var default_id = invoke_map.lookup(DEFAULT_CATEGORY);
    if (default_id == null) {
        return;
    }
    for (code_point = 0; code_point < category_map.length; code_point++) {
        // DEFAULT applies only where no other class was defined
        if (!is_mapped[code_point] && category_map[code_point] === 0) {
            // This table holds class IDs (lookup() reads it that way), so the
            // ID itself is stored - not a bitmask such as 1 << default_id.
            category_map[code_point] = default_id;
        }
    }
};

/**
 * Collect the compatible categories of a character. The character's primary
 * category is not part of the result.
 * @param {string} ch UCS2 character (only the first code unit is examined)
 * @returns {Array.<CharacterClass>} character classes, ordered by class ID;
 *   empty when the character has no compatible category
 */
CharacterDefinition.prototype.lookupCompatibleCategory = function (ch) {
    var found = [];
    var char_code = ch.charCodeAt(0);

    // Codes outside the table (and NaN from an empty string) have no bitset
    if (!(char_code < this.compatible_category_map.length)) {
        return found;
    }

    var bitset = this.compatible_category_map[char_code];
    if (bitset == null || bitset === 0) {
        return found;
    }

    // Bit position doubles as class ID
    for (var class_id = 0; class_id < 32; class_id++) {
        var is_set = ((bitset >>> class_id) & 1) === 1;
        if (!is_set) {
            continue;
        }
        var character_class = this.invoke_definition_map.getCharacterClass(class_id);
        if (character_class != null) {
            found.push(character_class);
        }
    }
    return found;
};


/**
 * Find the primary category of a character.
 * @param {string} ch UCS2 character (only the first code unit is used for
 *   the table lookup)
 * @returns {CharacterClass} character class; the DEFAULT category is used
 *   for surrogate pairs and for codes the table does not cover
 */
CharacterDefinition.prototype.lookup = function (ch) {
    var definitions = this.invoke_definition_map;
    var char_code = ch.charCodeAt(0);
    var category_id;

    if (SurrogateAwareString.isSurrogatePair(ch)) {
        // char.def cannot define surrogate pair codes, so they are DEFAULT
        category_id = definitions.lookup(DEFAULT_CATEGORY);
    } else if (char_code < this.character_category_map.length) {
        // The table entry is the class ID itself
        category_id = this.character_category_map[char_code];
    }

    // Nothing found above: fall back to DEFAULT
    if (category_id == null) {
        category_id = definitions.lookup(DEFAULT_CATEGORY);
    }

    return definitions.getCharacterClass(category_id);
};

module.exports = CharacterDefinition;
59 changes: 59 additions & 0 deletions public/dict/ConnectionCosts.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"use strict";

/**
 * Connection cost matrix, as stored in a cc.dat file.
 * Conceptually a 2 dimensional matrix [forward_id][backward_id] -> cost,
 * kept flat in a single Int16Array.
 * @constructor
 * @param {number} forward_dimension
 * @param {number} backward_dimension
 */
function ConnectionCosts(forward_dimension, backward_dimension) {
    // Two leading slots carry forward_dimension and backward_dimension
    var HEADER_LENGTH = 2;
    var cell_count = forward_dimension * backward_dimension;

    this.forward_dimension = forward_dimension;
    this.backward_dimension = backward_dimension;

    var storage = new Int16Array(cell_count + HEADER_LENGTH);
    storage[0] = forward_dimension;
    storage[1] = backward_dimension;
    this.buffer = storage;
}

/**
 * Store the cost of connecting forward_id to backward_id.
 * @param {number} forward_id Row of the matrix
 * @param {number} backward_id Column of the matrix
 * @param {number} cost Value written into the buffer
 * @throws {RangeError} when the ids do not address a cost slot of the buffer
 *   (past the end, inside the two header slots, negative, or not an integer)
 */
ConnectionCosts.prototype.put = function (forward_id, backward_id, cost) {
    // Slots 0 and 1 hold the dimensions; cost cells start at offset 2
    var index = forward_id * this.backward_dimension + backward_id + 2;

    // Written as a positive range test so that NaN is rejected too. The
    // lower bound keeps negative ids from overwriting the header slots.
    var in_range = index >= 2 && index < this.buffer.length;
    if (!in_range || index % 1 !== 0) {
        throw new RangeError("ConnectionCosts buffer overflow");
    }
    this.buffer[index] = cost;
};

/**
 * Read the cost of connecting forward_id to backward_id.
 * @param {number} forward_id Row of the matrix
 * @param {number} backward_id Column of the matrix
 * @returns {number} The stored cost
 * @throws {RangeError} when the ids do not address a cost slot of the buffer
 *   (past the end, inside the two header slots, negative, or not an integer)
 */
ConnectionCosts.prototype.get = function (forward_id, backward_id) {
    // Slots 0 and 1 hold the dimensions; cost cells start at offset 2
    var index = forward_id * this.backward_dimension + backward_id + 2;

    // Written as a positive range test so that NaN is rejected too. The
    // lower bound keeps a header value (or undefined) from being returned
    // as if it were a cost.
    var in_range = index >= 2 && index < this.buffer.length;
    if (!in_range || index % 1 !== 0) {
        throw new RangeError("ConnectionCosts buffer overflow");
    }
    return this.buffer[index];
};

/**
 * Adopt an existing cost buffer, replacing the current one.
 * The dimensions are taken from the buffer's first two elements.
 * @param {Int16Array} connection_costs_buffer Buffer laid out as
 *   [forward_dimension, backward_dimension, ...costs]; kept by reference
 */
ConnectionCosts.prototype.loadConnectionCosts = function (connection_costs_buffer) {
    var stored_forward = connection_costs_buffer[0];
    var stored_backward = connection_costs_buffer[1];

    this.forward_dimension = stored_forward;
    this.backward_dimension = stored_backward;
    this.buffer = connection_costs_buffer;
};

module.exports = ConnectionCosts;
82 changes: 82 additions & 0 deletions public/dict/DynamicDictionaries.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"use strict";

var doublearray = require("doublearray");
var TokenInfoDictionary = require("./TokenInfoDictionary");
var ConnectionCosts = require("./ConnectionCosts");
var UnknownDictionary = require("./UnknownDictionary");

/**
 * Container bundling the dictionaries used by the Tokenizer.
 * Any argument that is null or undefined is replaced by a freshly created
 * default instance.
 * @param {DoubleArray} trie
 * @param {TokenInfoDictionary} token_info_dictionary
 * @param {ConnectionCosts} connection_costs
 * @param {UnknownDictionary} unknown_dictionary
 * @constructor
 */
function DynamicDictionaries(trie, token_info_dictionary, connection_costs, unknown_dictionary) {
    // Default trie: built from the single entry { k: "", v: 1 }
    this.trie = (trie != null)
        ? trie
        : doublearray.builder(0).build([
            {k: "", v: 1}
        ]);

    this.token_info_dictionary = (token_info_dictionary != null)
        ? token_info_dictionary
        : new TokenInfoDictionary();

    // Default cost matrix is 0 x 0
    this.connection_costs = (connection_costs != null)
        ? connection_costs
        : new ConnectionCosts(0, 0);

    this.unknown_dictionary = (unknown_dictionary != null)
        ? unknown_dictionary
        : new UnknownDictionary();
}

/**
 * Replace the trie with one loaded from the contents of base.dat & check.dat.
 * @param {*} base_buffer First argument for doublearray.load
 * @param {*} check_buffer Second argument for doublearray.load
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadTrie = function (base_buffer, check_buffer) {
    var loaded_trie = doublearray.load(base_buffer, check_buffer);
    this.trie = loaded_trie;
    return this;
};

/**
 * Feed the token info dictionary with its three buffers, in the order
 * dictionary, POS vector, target map.
 * @param {*} token_info_buffer Passed to loadDictionary
 * @param {*} pos_buffer Passed to loadPosVector
 * @param {*} target_map_buffer Passed to loadTargetMap
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadTokenInfoDictionaries = function (token_info_buffer, pos_buffer, target_map_buffer) {
    var token_info = this.token_info_dictionary;

    token_info.loadDictionary(token_info_buffer);
    token_info.loadPosVector(pos_buffer);
    token_info.loadTargetMap(target_map_buffer);

    return this;
};

/**
 * Hand a cost buffer to the connection costs object.
 * @param {*} cc_buffer Passed to connection_costs.loadConnectionCosts
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadConnectionCosts = function (cc_buffer) {
    var costs = this.connection_costs;
    costs.loadConnectionCosts(cc_buffer);
    return this;
};

/**
 * Forward all six buffers, unchanged and in the same order, to
 * unknown_dictionary.loadUnknownDictionaries.
 * @param {*} unk_buffer
 * @param {*} unk_pos_buffer
 * @param {*} unk_map_buffer
 * @param {*} cat_map_buffer
 * @param {*} compat_cat_map_buffer
 * @param {*} invoke_def_buffer
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadUnknownDictionaries = function (unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) {
    var unknown = this.unknown_dictionary;
    unknown.loadUnknownDictionaries(
        unk_buffer,
        unk_pos_buffer,
        unk_map_buffer,
        cat_map_buffer,
        compat_cat_map_buffer,
        invoke_def_buffer
    );
    return this;
};

module.exports = DynamicDictionaries;
Loading

0 comments on commit 1101e7a

Please sign in to comment.