Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes excessive line wrapping. #529

Merged
merged 8 commits into from
Apr 9, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Fixes excessive line wrapping.
  • Loading branch information
kylebgorman committed Apr 8, 2024
commit aaa2ce0e85e7bdf271a6f2a2cc6db1cdabf1448b
53 changes: 11 additions & 42 deletions data/scrape/lib/languages_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,11 @@ def _detect_best_script_name(

Example: "ژۇرنال" -> ("Arabic", 1.0).
"""
script_counts: DefaultDict[
str,
float,
] = collections.defaultdict(float)
script_counts: DefaultDict[str, float] = collections.defaultdict(float)
for char in word:
script_counts[unicodedataplus.script(char)] += 1.0
script_probs = [
(
s,
script_counts[s] / len(word),
)
for s in script_counts
]
script_probs.sort(
key=operator.itemgetter(1),
reverse=True,
)
script_probs = [(s, script_counts[s] / len(word)) for s in script_counts]
script_probs.sort(key=operator.itemgetter(1), reverse=True)
if strict and len(script_probs) != 1:
return None
else:
Expand All @@ -79,26 +67,14 @@ def _get_alias(


def _remove_mismatch_ids(
script_dict: Dict[
str,
Dict[
str,
str,
],
]
script_dict: Dict[str, Dict[str, str]]
) -> Dict[str, Dict[str, str]]:
"""Removes [key:value] pairs when the key does not
match the ISO 15924 code alias for script.
"""
remove = []
for (
key,
value,
) in script_dict["script"].items():
value = value.replace(
" ",
"_",
)
for key, value in script_dict["script"].items():
value = value.replace(" ", "_")
if _get_alias(value) != key:
remove.append(key)
for i in remove:
Expand All @@ -107,11 +83,7 @@ def _remove_mismatch_ids(


def main():
with open(
LANGUAGES_PATH,
"r",
encoding="utf-8",
) as source:
with open(LANGUAGES_PATH, "r", encoding="utf-8") as source:
languages = json.load(source)
for filename in os.listdir(TSV_DIRECTORY):
if filename.endswith(".tsv"):
Expand All @@ -126,20 +98,17 @@ def main():
) as source:
for line in source:
if line is not None:
word = line.split(
"\t",
1,
)[0]
word = line.split("\t", 1)[0]
script = _detect_best_script_name(word)
if script is not None:
if "script" not in lang:
lang["script"] = {}
# Uses property_value_aliases to get
# ISO-15924 code.
if script not in lang["script"]:
lang["script"][
_get_alias(script)
] = script.replace("_", " ")
lang["script"][_get_alias(script)] = (
script.replace("_", " ")
)
_remove_mismatch_ids(lang)
with open(LANGUAGES_PATH, "w", encoding="utf-8") as sink:
json.dump(languages, sink, ensure_ascii=False, indent=4)
Expand Down