-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added public/ to serve dictionary files
- Loading branch information
Cameron Chambers
committed
Jun 10, 2021
1 parent
276da00
commit 1101e7a
Showing
24 changed files
with
1,017 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
* Copyright 2014 Takuya Asano | ||
* Copyright 2010-2014 Atilika Inc. and contributors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
"use strict"; | ||
|
||
/**
 * Plain record describing one character category.
 * Every argument is stored unchanged on the new instance.
 * @param {number} class_id Numeric ID of the category
 * @param {string} class_name Name of the category
 * @param {boolean} is_always_invoke INVOKE flag of the category
 * @param {boolean} is_grouping GROUP flag of the category
 * @param {number} max_length LENGTH value of the category
 * @constructor
 */
function CharacterClass(class_id, class_name, is_always_invoke, is_grouping, max_length) {
    // Identity of the category
    this.class_id = class_id;
    this.class_name = class_name;

    // Flags and length limit attached to the category
    this.is_always_invoke = is_always_invoke;
    this.is_grouping = is_grouping;
    this.max_length = max_length;
}
|
||
module.exports = CharacterClass; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
/* | ||
* Copyright 2014 Takuya Asano | ||
* Copyright 2010-2014 Atilika Inc. and contributors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
"use strict"; | ||
|
||
var InvokeDefinitionMap = require("./InvokeDefinitionMap"); | ||
var CharacterClass = require("./CharacterClass"); | ||
var SurrogateAwareString = require("../util/SurrogateAwareString"); | ||
|
||
var DEFAULT_CATEGORY = "DEFAULT"; | ||
|
||
/**
 * CharacterDefinition mirrors the char.def file and drives how unknown
 * words are processed.
 *
 * A fresh instance owns two zero-filled lookup tables with one slot per
 * UCS2 code point and has no invoke definition map yet.
 * @constructor
 */
function CharacterDefinition() {
    var table_size = 0x10000; // number of UCS2 code points (65536)

    this.character_category_map = new Uint8Array(table_size);
    this.compatible_category_map = new Uint32Array(table_size);
    this.invoke_definition_map = null;
}
|
||
/**
 * Create a CharacterDefinition from already-built tables.
 *
 * The two category tables are adopted by reference (not copied); the invoke
 * definitions are produced by InvokeDefinitionMap.load.
 * @param {Uint8Array} cat_map_buffer Table used as character_category_map
 * @param {Uint32Array} compat_cat_map_buffer Table used as compatible_category_map
 * @param {*} invoke_def_buffer Serialized invoke definitions, passed straight
 *     to InvokeDefinitionMap.load (exact type not visible from this file)
 * @returns {CharacterDefinition}
 */
CharacterDefinition.load = function (cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) {
    var definition = new CharacterDefinition();

    definition.character_category_map = cat_map_buffer;
    definition.compatible_category_map = compat_cat_map_buffer;

    var invoke_definitions = InvokeDefinitionMap.load(invoke_def_buffer);
    definition.invoke_definition_map = invoke_definitions;

    return definition;
};
|
||
/**
 * Build a CharacterClass from the fields of one category definition line
 * of char.def.
 *
 * INVOKE, GROUP and LENGTH are read as base-10 integers. The radix is passed
 * explicitly so that a "0x"-prefixed or leading-zero field can never be
 * interpreted as hexadecimal or octal.
 *
 * @param {number} class_id ID given to the resulting class
 * @param {Array.<string>} parsed_category_def Indexable list of fields:
 *     [1] category name, [2] INVOKE (0 or 1), [3] GROUP (0 or 1), [4] LENGTH (>= 0)
 * @returns {CharacterClass|null} The parsed class, or null (after logging
 *     the problem with console.log) when a field is invalid
 */
CharacterDefinition.parseCharCategory = function (class_id, parsed_category_def) {
    var category = parsed_category_def[1];
    var invoke = parseInt(parsed_category_def[2], 10);
    var grouping = parseInt(parsed_category_def[3], 10);
    var max_length = parseInt(parsed_category_def[4], 10);

    // NaN is neither 0 nor 1, so unparsable flags are rejected by the same test
    if (invoke !== 0 && invoke !== 1) {
        console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke);
        return null;
    }
    if (grouping !== 0 && grouping !== 1) {
        console.log("char.def parse error. GROUP is 0 or 1 in:" + grouping);
        return null;
    }
    // Written positively so that NaN fails the test; 0 is an accepted length
    if (!(max_length >= 0)) {
        console.log("char.def parse error. LENGTH is 1 to n:" + max_length);
        return null;
    }

    return new CharacterClass(class_id, category, invoke === 1, grouping === 1, max_length);
};
|
||
/**
 * Convert the fields of a single-code-point mapping line into a mapping object.
 *
 * The code point is parsed with parseInt WITHOUT a radix on purpose, so a
 * "0x"-prefixed value is read as hexadecimal. A code point outside
 * 0..0xFFFF (or an unparsable one) is only reported with console.log; the
 * mapping is returned regardless.
 *
 * @param {Array.<string>} parsed_category_mapping Indexable list of fields:
 *     [1] code point, [2] default category, [3..] compatible categories
 * @returns {{start: number, default: string, compatible: Array.<string>}}
 */
CharacterDefinition.parseCategoryMapping = function (parsed_category_mapping) {
    var code = parseInt(parsed_category_mapping[1]);
    var primary = parsed_category_mapping[2];

    var others = [];
    if (parsed_category_mapping.length > 3) {
        others = parsed_category_mapping.slice(3);
    }

    var in_ucs2_range = isFinite(code) && code >= 0 && code <= 0xFFFF;
    if (!in_ucs2_range) {
        console.log("char.def parse error. CODE is invalid:" + code);
    }

    return { start: code, default: primary, compatible: others};
};
|
||
/**
 * Convert the fields of a code-point-range mapping line into a mapping object.
 *
 * Both bounds are parsed with parseInt WITHOUT a radix on purpose, so
 * "0x"-prefixed values are read as hexadecimal. A bound outside 0..0xFFFF
 * (or an unparsable one) is only reported with console.log; the mapping is
 * returned regardless, and the order of the two bounds is not checked.
 *
 * @param {Array.<string>} parsed_category_mapping Indexable list of fields:
 *     [1] first code point, [2] last code point, [3] default category,
 *     [4..] compatible categories
 * @returns {{start: number, end: number, default: string, compatible: Array.<string>}}
 */
CharacterDefinition.parseRangeCategoryMapping = function (parsed_category_mapping) {
    var bounds = [
        parseInt(parsed_category_mapping[1]),
        parseInt(parsed_category_mapping[2])
    ];
    var primary = parsed_category_mapping[3];

    var others = [];
    if (parsed_category_mapping.length > 4) {
        others = parsed_category_mapping.slice(4);
    }

    // Report each bound that is not a UCS2 code point (start first, then end)
    for (var k = 0; k < bounds.length; k++) {
        var code = bounds[k];
        if (!(isFinite(code) && code >= 0 && code <= 0xFFFF)) {
            console.log("char.def parse error. CODE is invalid:" + code);
        }
    }

    return { start: bounds[0], end: bounds[1], default: primary, compatible: others};
};
|
||
/**
 * Fill the category tables from parsed char.def mappings.
 *
 * For every code point in a mapping's range, the class ID of the mapping's
 * default category is stored in character_category_map, and one bit per
 * resolvable compatible category (bit position = class ID) is OR-ed into
 * compatible_category_map. Afterwards every code point that no mapping
 * assigned, and whose entry is still 0, receives the class ID of the
 * DEFAULT category.
 *
 * character_category_map holds class IDs (that is how lookup() reads it),
 * so the fallback stores the DEFAULT class ID itself, not a bit mask.
 *
 * @param {Array} category_mapping Array of category mapping objects of the
 *     form { start, end (optional), default, compatible }; may be null
 */
CharacterDefinition.prototype.initCategoryMappings = function (category_mapping) {
    var category_map = this.character_category_map;
    var compatible_map = this.compatible_category_map;
    var definitions = this.invoke_definition_map;
    // Flags code points that received an explicit class, so that a
    // legitimately assigned class ID 0 is not mistaken for "unset" below.
    var is_assigned = new Uint8Array(category_map.length);
    var code_point;

    if (category_mapping != null) {
        for (var i = 0; i < category_mapping.length; i++) {
            var mapping = category_mapping[i];
            // A mapping without an end covers the single code point at start
            var end = mapping.end || mapping.start;
            if (!(mapping.start <= end)) {
                continue; // empty or unparsable (NaN) range: nothing to write
            }

            // Both results depend only on the mapping, so resolve them once
            // instead of once per code point.
            var default_class_id = definitions.lookup(mapping.default);
            var compatible_bits = 0;
            for (var j = 0; j < mapping.compatible.length; j++) {
                var compatible_category = mapping.compatible[j];
                if (compatible_category == null) {
                    continue;
                }
                var class_id = definitions.lookup(compatible_category);
                if (class_id == null) {
                    continue;
                }
                // e.g. class ID 3 sets bit 3
                compatible_bits = compatible_bits | (1 << class_id);
            }

            for (code_point = mapping.start; code_point <= end; code_point++) {
                // Default category class ID
                category_map[code_point] = default_class_id;
                if (default_class_id != null) {
                    is_assigned[code_point] = 1;
                }
                if (compatible_bits !== 0) {
                    compatible_map[code_point] = compatible_map[code_point] | compatible_bits;
                }
            }
        }
    }

    var default_id = definitions.lookup(DEFAULT_CATEGORY);
    if (default_id == null) {
        return;
    }
    for (code_point = 0; code_point < category_map.length; code_point++) {
        // Fall back to DEFAULT only where no other class was defined
        if (!is_assigned[code_point] && category_map[code_point] === 0) {
            category_map[code_point] = default_id;
        }
    }
};
|
||
/**
 * Collect the compatible categories of a character (the primary category
 * is not part of the result).
 *
 * Only the first UTF-16 code unit of ch is inspected; surrogate pairs get
 * no special treatment here. The table entry for that code unit is a
 * bitset in which bit N stands for class ID N.
 *
 * @param {string} ch UCS2 character (just 1st character is effective)
 * @returns {Array.<CharacterClass>} character classes, in ascending class
 *     ID order; empty when nothing is registered for the character
 */
CharacterDefinition.prototype.lookupCompatibleCategory = function (ch) {
    var found = [];

    var code = ch.charCodeAt(0);
    if (!(code < this.compatible_category_map.length)) {
        // Outside the table (or NaN for an empty string)
        return found;
    }

    var bitset = this.compatible_category_map[code];
    if (bitset == null || bitset === 0) {
        return found;
    }

    for (var class_id = 0; class_id < 32; class_id++) {
        var is_set = ((bitset >>> class_id) & 1) === 1;
        if (!is_set) {
            continue;
        }
        var character_class = this.invoke_definition_map.getCharacterClass(class_id);
        if (character_class != null) {
            found.push(character_class);
        }
    }
    return found;
};
|
||
|
||
/**
 * Find the primary category of a character.
 *
 * Surrogate pairs cannot be described by char.def and therefore resolve to
 * the DEFAULT category. Any other character is looked up by its first
 * UTF-16 code unit; DEFAULT is also used when that lookup yields nothing.
 *
 * @param {string} ch UCS2 character (just 1st character is effective)
 * @returns {CharacterClass} character class
 */
CharacterDefinition.prototype.lookup = function (ch) {
    var first_unit = ch.charCodeAt(0);
    var class_id;

    if (SurrogateAwareString.isSurrogatePair(ch)) {
        class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY);
    } else {
        var table = this.character_category_map;
        if (first_unit < table.length) {
            class_id = table[first_unit]; // table entries are class IDs
        }
    }

    // No usable entry so far: fall back to DEFAULT
    if (class_id == null) {
        class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY);
    }

    return this.invoke_definition_map.getCharacterClass(class_id);
};
|
||
module.exports = CharacterDefinition; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
/* | ||
* Copyright 2014 Takuya Asano | ||
* Copyright 2010-2014 Atilika Inc. and contributors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
"use strict"; | ||
|
||
/**
 * Connection cost matrix as stored in the cc.dat file.
 * Conceptually a 2-dimensional matrix [forward_id][backward_id] -> cost,
 * kept in one flat Int16Array whose first two cells hold the dimensions.
 * @constructor
 * @param {number} forward_dimension Number of rows
 * @param {number} backward_dimension Number of columns
 */
function ConnectionCosts(forward_dimension, backward_dimension) {
    this.forward_dimension = forward_dimension;
    this.backward_dimension = backward_dimension;

    // Layout: [forward_dimension, backward_dimension, cost cells...]
    var header_cells = 2;
    var cells = new Int16Array(forward_dimension * backward_dimension + header_cells);
    cells[0] = forward_dimension;
    cells[1] = backward_dimension;
    this.buffer = cells;
}
|
||
/**
 * Store the connection cost of a (forward_id, backward_id) pair.
 *
 * @param {number} forward_id Row of the matrix
 * @param {number} backward_id Column of the matrix
 * @param {number} cost Value to store
 * @throws {string} "ConnectionCosts buffer overflow" when the computed cell
 *     is not one of the cost cells of the buffer. A plain string is thrown,
 *     as before, so existing callers that compare the thrown value keep
 *     working.
 */
ConnectionCosts.prototype.put = function (forward_id, backward_id, cost) {
    // The first two cells hold the dimensions, so cost cells start at offset 2
    var index = forward_id * this.backward_dimension + backward_id + 2;
    // Positive range test: also rejects NaN and negative ids, which would
    // otherwise overwrite the dimension header or be silently dropped.
    if (!(index >= 2 && index < this.buffer.length)) {
        throw "ConnectionCosts buffer overflow";
    }
    this.buffer[index] = cost;
};
|
||
/**
 * Read the connection cost of a (forward_id, backward_id) pair.
 *
 * @param {number} forward_id Row of the matrix
 * @param {number} backward_id Column of the matrix
 * @returns {number} The stored cost
 * @throws {string} "ConnectionCosts buffer overflow" when the computed cell
 *     is not one of the cost cells of the buffer. A plain string is thrown,
 *     as before, so existing callers that compare the thrown value keep
 *     working.
 */
ConnectionCosts.prototype.get = function (forward_id, backward_id) {
    // The first two cells hold the dimensions, so cost cells start at offset 2
    var index = forward_id * this.backward_dimension + backward_id + 2;
    // Positive range test: also rejects NaN and negative ids, which would
    // otherwise return a dimension cell or undefined as if it were a cost.
    if (!(index >= 2 && index < this.buffer.length)) {
        throw "ConnectionCosts buffer overflow";
    }
    return this.buffer[index];
};
|
||
/**
 * Replace the matrix with an already-built buffer.
 * The buffer is adopted by reference; its first two cells are taken as the
 * forward and backward dimensions.
 * @param {Int16Array} connection_costs_buffer Buffer laid out as
 *     [forward_dimension, backward_dimension, cost cells...]
 */
ConnectionCosts.prototype.loadConnectionCosts = function (connection_costs_buffer) {
    var forward = connection_costs_buffer[0];
    var backward = connection_costs_buffer[1];

    this.forward_dimension = forward;
    this.backward_dimension = backward;
    this.buffer = connection_costs_buffer;
};
|
||
module.exports = ConnectionCosts; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
/* | ||
* Copyright 2014 Takuya Asano | ||
* Copyright 2010-2014 Atilika Inc. and contributors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
"use strict"; | ||
|
||
var doublearray = require("doublearray"); | ||
var TokenInfoDictionary = require("./TokenInfoDictionary"); | ||
var ConnectionCosts = require("./ConnectionCosts"); | ||
var UnknownDictionary = require("./UnknownDictionary"); | ||
|
||
/**
 * Container bundling the dictionaries a Tokenizer works with.
 * Every argument is optional: a missing (null or undefined) one is replaced
 * by a freshly created empty counterpart.
 * @param {DoubleArray} trie
 * @param {TokenInfoDictionary} token_info_dictionary
 * @param {ConnectionCosts} connection_costs
 * @param {UnknownDictionary} unknown_dictionary
 * @constructor
 */
function DynamicDictionaries(trie, token_info_dictionary, connection_costs, unknown_dictionary) {
    // Default trie: built from a single entry mapping the empty key to 1
    this.trie = (trie != null)
        ? trie
        : doublearray.builder(0).build([
            {k: "", v: 1}
        ]);

    this.token_info_dictionary = (token_info_dictionary != null)
        ? token_info_dictionary
        : new TokenInfoDictionary();

    // Default cost matrix: 0 x 0
    this.connection_costs = (connection_costs != null)
        ? connection_costs
        : new ConnectionCosts(0, 0);

    this.unknown_dictionary = (unknown_dictionary != null)
        ? unknown_dictionary
        : new UnknownDictionary();
}
|
||
/**
 * Replace the trie with one loaded from the double-array buffers
 * (the contents of base.dat and check.dat).
 * @param {*} base_buffer Buffer passed to doublearray.load as the base array
 * @param {*} check_buffer Buffer passed to doublearray.load as the check array
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadTrie = function (base_buffer, check_buffer) {
    var loaded_trie = doublearray.load(base_buffer, check_buffer);
    this.trie = loaded_trie;
    return this;
};
|
||
/**
 * Feed the three token-info buffers to the current token info dictionary,
 * in this order: dictionary, POS vector, target map.
 * @param {*} token_info_buffer Passed to loadDictionary
 * @param {*} pos_buffer Passed to loadPosVector
 * @param {*} target_map_buffer Passed to loadTargetMap
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadTokenInfoDictionaries = function (token_info_buffer, pos_buffer, target_map_buffer) {
    var dictionary = this.token_info_dictionary;

    dictionary.loadDictionary(token_info_buffer);
    dictionary.loadPosVector(pos_buffer);
    dictionary.loadTargetMap(target_map_buffer);

    return this;
};
|
||
/**
 * Hand the connection cost buffer to the current ConnectionCosts object.
 * @param {*} cc_buffer Passed to connection_costs.loadConnectionCosts
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadConnectionCosts = function (cc_buffer) {
    var costs = this.connection_costs;
    costs.loadConnectionCosts(cc_buffer);
    return this;
};
|
||
/**
 * Forward the six unknown-word buffers, unchanged and in the same order, to
 * the current unknown dictionary.
 * @param {*} unk_buffer
 * @param {*} unk_pos_buffer
 * @param {*} unk_map_buffer
 * @param {*} cat_map_buffer
 * @param {*} compat_cat_map_buffer
 * @param {*} invoke_def_buffer
 * @returns {DynamicDictionaries} this, for chaining
 */
DynamicDictionaries.prototype.loadUnknownDictionaries = function (unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) {
    var unknown = this.unknown_dictionary;
    unknown.loadUnknownDictionaries(
        unk_buffer,
        unk_pos_buffer,
        unk_map_buffer,
        cat_map_buffer,
        compat_cat_map_buffer,
        invoke_def_buffer
    );
    return this;
};
|
||
module.exports = DynamicDictionaries; |
Oops, something went wrong.