Help: Extracting Translations from a Chinese Localization
To build a bilingual reference table, you need to extract the text from the localization packages released by Chinese localization teams.
Well-known Chinese localizations for Skyrim include the ANK localization and the 大学汉化 ("University" localization); see the related questions on Zhihu.
Requirements
- Ability to Read
- Ability to Use Python
- Ability to Use Translator
- Ability to Download xTranslator
xTranslator supports Chinese: under Options - Options, set the interface language to Simplified or Traditional Chinese.
Steps
1. Export the XML translation files
TL;DR: load the esm, load the localization package's Strings as the translation, export XML.
Detailed steps:
- Download the localization package and unpack it into its own folder.
- Locate the game's installation path; you will need the *.esm files under its Data folder later.
- Open xTranslator.
- File - Load esm file: in the game's Data folder, select an esm file, e.g. Skyrim.esm.
- Tools - Load Strings as translation: select the matching Strings file from the localization package, e.g. Skyrim_chinese.STRINGS.
- File - Export translation - XML file: choose a directory to save the translation file.
- Repeat the previous steps (load esm, load Strings, export XML) until every Strings file in the localization package has been exported.
2. Convert the XML into a dictionary with Python
Goal: the exported XML is awkward to use directly (for searching, or as a machine-translation glossary),
so it needs to be parsed into a format that is easier to work with.
For example, convert it into a few plain-text (txt) files that are convenient to search through.
Or use it to assist machine translation: for an English paragraph, walk through the dictionary and replace part of the English with Chinese. For example, "The Skyrim is awesome!" becomes "the 天际 is awesome!", which then machine-translates to "天际很赞!"; finally, polish the result by hand. A minimal sketch of this replacement step is shown below.
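A rough sketch of that pre-replacement step, assuming the machine_trans_file.txt produced by the script in the next section (each line is an English phrase followed by its tab-separated Chinese translations, with longer phrases sorted toward the top); the path and the pre_replace helper here are only examples:

import re
from pathlib import Path

def pre_replace(paragraph, trans_file=r"C:\Skyrim_Trans\汉化\machine_trans_file.txt"):
    # Replace known English phrases with Chinese before sending the text to machine translation.
    for line in Path(trans_file).read_text("UTF8").splitlines():
        fields = line.split("\t")
        if len(fields) < 2 or not fields[0] or not fields[1]:
            continue
        en, cn = fields[0], fields[1]  # English phrase, first available Chinese translation
        # Whole-word, case-insensitive replacement; longer phrases are handled first
        # because the file is sorted that way.
        paragraph = re.sub(r"\b" + re.escape(en) + r"\b", cn, paragraph, flags=re.IGNORECASE)
    return paragraph

print(pre_replace("The Skyrim is awesome!"))
# e.g. "The 天际 is awesome!" -- machine-translate this, then polish by hand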
The code previously used to build the dictionary:
""" xml to trans dict"""
import json
from pathlib import Path
from collections import defaultdict
import xmltodict # 第三方库,需要 pip install

def main():
    # each of these folders contains XML files exported from xTranslator
    trans_xml_folders = {
        "ANK": r"C:\Skyrim_Trans\汉化\ANK\xml",
        "DS": r"C:\Skyrim_Trans\汉化\DS-景赞汉化\xml",
        "Unofficial": r"C:\Skyrim_Trans\汉化\Unofficial\xml",
    }
    out_folder = r"C:\Skyrim_Trans\汉化"
    # 1. xml to dict
    pkg_data_dict = xml_to_dict(trans_xml_folders, out_folder)
    # 2. dict_by_pkg >> dict_by_type
    #    dict[pkg][type][id] >> dict[type][id][pkg]
    trans_by_type = dict_to_trans_by_type(pkg_data_dict, out_folder)
    # 3. to simple trans_dict (keep en >> cn only)
    #    dict[type][id][pkg] >> dict[type][id]
    #    item = {en: str, ank: str, uno: str, ...}
    simple_trans_dict = to_simple_item(trans_by_type, out_folder)
    # 4. convert to machine-use trans file, only keep en >> cn:
    #    en \t ank \t uno \t ...
    to_trans_file(simple_trans_dict, out_folder)
    # write a json file for every type
    # to_type_trans_files(simple_trans_dict, out_folder)

def xml_to_dict(trans_xml_folders, out_folder):
    """
    Convert the XML files in every folder into a python dict.
    return pkg_data_dict:
        pkg_data_dict[pkg][type_key][edid] = item
        pkg_data_dict[localization package][record type][editor id] = translation item
    """
    pkg_data_dict = {}  # name: pkg_data
    for pkg_name, xml_folder in trans_xml_folders.items():
        print("process trans folder: ", xml_folder)
        """ pkg_data: all data of one localization package
        pkg_data[type_key][edid] = item
        e.g. {
            FACT:FULL: {...},
            QUST:NNAM: {...},
            BOOK:FULL: {...},
        }
        """
        pkg_data = defaultdict(dict)
        for xml_file in Path(xml_folder).iterdir():
            is_xml = xml_file.name.lower().endswith("xml")
            if not is_xml:
                continue
            print("\t\tconvert xml to dict: ", xml_file)
            # 1. xml >> python dict
            xml_text = xml_file.read_text("UTF8")
            data_dict = xmltodict.parse(xml_text)
            dlc_name = data_dict["SSTXMLRessources"]["Params"]["Addon"]
            item_list = data_dict["SSTXMLRessources"]["Content"]["String"]
            # xmltodict returns a single dict (not a list) when there is only one <String>
            if isinstance(item_list, dict):
                item_list = [item_list]
            # 2. xml dict >>>> pkg data dict
            #    list of items >>>> pkg_data[type][id] = item
            for item in item_list:
                type_key = item["REC"]
                if not isinstance(item["REC"], str):
                    type_key = item["REC"]["#text"]
                edid = item["EDID"]
                item["DLC"] = dlc_name
                item["PKG"] = pkg_name
                pkg_data[type_key][edid] = item
        pkg_data_dict[pkg_name] = pkg_data
    return pkg_data_dict
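
# For reference, xmltodict.parse() turns each exported XML file into a nested
# dict shaped roughly like this (keys taken from the function above; the
# concrete values are made-up examples):
#   {"SSTXMLRessources": {
#       "Params": {"Addon": "Skyrim.esm"},
#       "Content": {"String": [
#           {"REC": "BOOK:FULL", "EDID": "SomeEditorID",
#            "Source": "Some English text", "Dest": "对应的中文文本"},
#       ]}}}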

def dict_to_trans_by_type(pkg_data_dict, out_folder):
    """
    dict[pkg][type][id] >> dict[type][id][pkg]
    """
    type_keys = []  # e.g. FACT:MNAM BOOK:DESC QUST:FULL
    for _, pkg_data in pkg_data_dict.items():
        type_keys += list(pkg_data.keys())
    type_keys = list(set(type_keys))
    print("convert trans dict format......")
    trans_by_type = {}  # trans_by_type[type_key][edid][pkg_name] = item
    for type_key in type_keys:
        trans_by_type[type_key] = defaultdict(dict)
        for pkg_name, pkg_data in pkg_data_dict.items():
            for edid, item in pkg_data[type_key].items():
                trans_by_type[type_key][edid][pkg_name] = item
    # optionally save the full dict as a json file (left disabled here)
    # print("save full json file......")
    # json_str = json.dumps(trans_by_type, ensure_ascii=False, indent=4)
    # json_file = Path(out_folder) / "full_dict.json"
    # json_file.write_text(json_str, encoding="UTF8")
    return trans_by_type

def to_simple_item(trans_by_type, out_folder):
    """ convert to simple trans_dict
    trans_dict >> [type_key][edid] = new_item
    new_item = {En, ank, uno, ...}
    """
    simple_trans_dict = defaultdict(dict)
    for type_key, type_value in trans_by_type.items():
        for edid, edid_value in type_value.items():
            new_edid_value = {}
            for pkg, item in edid_value.items():
                new_edid_value["En"] = item["Source"]
                new_edid_value[pkg] = item["Dest"] if item["Dest"] else ""
            if new_edid_value["En"]:
                simple_trans_dict[type_key][edid] = new_edid_value
    print("save dict json file......")
    json_str = json.dumps(simple_trans_dict, ensure_ascii=False, indent=4)
    json_file = Path(out_folder) / "trans_dict_simple.json"
    json_file.write_text(json_str, encoding="UTF8")
    return simple_trans_dict
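
# For reference, the trans_dict_simple.json written above ends up shaped roughly
# like this (the package keys come from trans_xml_folders in main(); the editor
# ID and texts are made-up examples):
#   {"BOOK:FULL": {
#       "SomeEditorID": {"En": "Some English name",
#                        "ANK": "中文译名", "DS": "中文译名", "Unofficial": "中文译名"}},
#    ...}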

def to_trans_file(simple_trans_dict, out_folder):
    """convert to machine-use trans file
    only keep en >> cn, e.g.:
    en \t ank \t uno \t ...
    """
    print("convert and save simple trans dict txt file......")
    splitter = "\t"
    order_trans_file = Path(out_folder) / "order_trans_file.txt"
    lines = []
    for type_key, type_data in simple_trans_dict.items():
        # skip record types whose text is too long for a glossary (dialogue etc.);
        # keep only the name-like ":FULL" fields
        if type_key.startswith("DIAL") or not type_key.endswith("FULL"):
            continue
        for _, item in type_data.items():
            line = ""
            for k, v in item.items():
                line += v + splitter
            # skip entries that have no Chinese translation at all
            if not is_cn(line):
                continue
            lines.append(line)
    lines = list(set(lines))  # remove duplicates
    lines.sort(key=lambda s: s.lower())  # alphabetical order, for manual lookup
    file_str = "\n".join(lines)
    order_trans_file.write_text(file_str, "UTF8")

    print("convert and save machine trans txt file......")
    machine_trans_file = Path(out_folder) / "machine_trans_file.txt"
    # longest entries first, so that longer English phrases get replaced before
    # shorter ones when walking the file top-down before machine translation
    lines.sort(key=lambda s: (99999 - len(s.split(" ")), s.lower()))
    file_str = "\n".join(lines)
    machine_trans_file.write_text(file_str, "UTF8")

def to_type_trans_files(simple_trans_dict, out_folder):
    print("convert and save type dict txt file......")
    splitter = "\t"
    for type_key, type_data in simple_trans_dict.items():
        (Path(out_folder) / "type_trans").mkdir(exist_ok=True)
        type_file = Path(out_folder) / "type_trans" / (type_key.replace(":", "_") + ".txt")
        file_str = ""
        for edid, item in type_data.items():
            line = edid + splitter
            for k, v in item.items():
                try:
                    line += v + splitter
                except TypeError:
                    print(item)
            if not is_cn(line):
                continue
            line = line.replace("\n" + u'\u3000', "<break_line>")
            line = line.replace("\n", "<break_line>")
            line = line.replace(u'\u3000', " ")
            file_str += line + "\n"
        type_file.write_text(file_str, "UTF8")

def is_cn(data: str):
    import re
    try:
        if data and re.search(u'[\u4e00-\u9fff]', data):
            return True
    except TypeError:
        print(data)
        exit(1)
    return False


if __name__ == '__main__':
    main()
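
Once the script has run, the generated trans_dict_simple.json can be queried directly. A minimal sketch, assuming the out_folder path used above and the "ANK" package key from trans_xml_folders (the keyword is just an example):

import json
from pathlib import Path

out_folder = Path(r"C:\Skyrim_Trans\汉化")
trans = json.loads((out_folder / "trans_dict_simple.json").read_text("UTF8"))

# Print every name-type (":FULL") entry whose English text contains a keyword.
keyword = "dragon"
for type_key, entries in trans.items():
    if not type_key.endswith("FULL"):
        continue
    for edid, item in entries.items():
        if keyword.lower() in item["En"].lower():
            print(type_key, edid, item["En"], "->", item.get("ANK", ""))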
