Help: Extracting translations from localization packages
To build a translation reference table, you need to extract the text from the localization package (汉化包) distributed by a localization team.
Well-known Skyrim Chinese localizations include ANK汉化 and 大学汉化 (see the related questions on Zhihu for background).
Requirements
- Ability to Read
- Ability to Use Python
- Ability to Use Translator
- Ability to Download xTranslator
xTranslator itself supports Chinese: in its Options dialog you can set the interface language to Simplified or Traditional Chinese.
Steps
1. Export the XML translation files
TL;DR: load an esm, load the localization package's Strings file as the translation, then export to XML.
Detailed steps:
- Download the localization package and unzip it into a separate folder.
- Confirm the game's installation path; the later steps need the *.esm files under its Data folder.
- Open xTranslator.
- File - Load esm file - pick an esm file from the game's Data folder, e.g. Skyrim.esm
- Tools - Load Strings as translation - select the corresponding Strings file from the localization package, e.g. Skyrim_chinese.STRINGS
- File - Export translation - XML file, then choose a directory to save the translation file (a sketch of the exported XML structure follows this list).
- Repeat the xTranslator steps (load esm, load Strings as translation, export XML) until every Strings file in the localization package has been exported.
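Before writing the parser in step 2, it helps to peek at what the exported XML actually contains. A minimal sketch, assuming xmltodict is installed; the file name below is hypothetical, but the field names match exactly what the step-2 script reads:

import xmltodict  # third-party: pip install xmltodict
from pathlib import Path

# Hypothetical export path; point this at one of the XML files you just exported.
xml_file = Path(r"C:\Skyrim_Trans\汉化\ANK\xml\Skyrim_english_chinese.xml")
data = xmltodict.parse(xml_file.read_text("UTF8"))

print(data["SSTXMLRessources"]["Params"]["Addon"])       # source plugin, e.g. "Skyrim.esm"
strings = data["SSTXMLRessources"]["Content"]["String"]  # list of translated string entries
first = strings[0]
print(first["EDID"], first["REC"])  # string id and record type such as "BOOK:FULL"
                                    # (REC may also be a dict with a "#text" key; the full script handles both)
print(first["Source"])              # English source text
print(first["Dest"])                # Chinese translation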
2. Convert the XML into a dictionary with Python
The exported XML is awkward to use directly, whether for searching or as a machine-translation glossary,
so it needs to be parsed into a format that is easier to work with.
For example, it can be converted into plain txt files that are easy to search through.
Or it can assist machine translation: for an English paragraph, walk the dictionary and replace known English terms with their Chinese translations, e.g. "The Skyrim is awesome!" becomes "the 天际 is awesome!", which then machine-translates to "天际很赞!" and is finally polished by hand. A sketch of this pre-replacement step is shown below.
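A minimal sketch of that pre-replacement step, assuming the tab-separated machine_trans_file.txt produced by the script in the next section (first column English, following columns the Chinese translations from each package); the helper names here are only for illustration:

from pathlib import Path

def load_pairs(trans_file):
    """Read tab-separated lines and keep (English, first non-empty Chinese) pairs."""
    pairs = []
    for line in Path(trans_file).read_text("UTF8").splitlines():
        cols = line.split("\t")
        en = cols[0].strip()
        cn = next((c.strip() for c in cols[1:] if c.strip()), "")
        if en and cn:
            pairs.append((en, cn))
    # Longer terms first, so multi-word names are replaced before their parts.
    pairs.sort(key=lambda p: len(p[0]), reverse=True)
    return pairs

def pre_translate(paragraph, pairs):
    """Naive, case-sensitive term replacement applied before machine translation."""
    for en, cn in pairs:
        paragraph = paragraph.replace(en, cn)
    return paragraph

# pairs = load_pairs(r"C:\Skyrim_Trans\汉化\machine_trans_file.txt")
# print(pre_translate("The Skyrim is awesome!", pairs))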
Code previously used to export the dictionary:
""" xml to trans dict""" import json from pathlib import Path from collections import defaultdict import xmltodict # 第三方库,需要 pip install def main(): # 这些文件夹中包含若干从 xTranslator 中导出的 XML 文件 trans_xml_folders = { "ANK": r"C:\Skyrim_Trans\汉化\ANK\xml", "DS": r"C:\Skyrim_Trans\汉化\DS-景赞汉化\xml", "Unofficial": r"C:\Skyrim_Trans\汉化\Unofficial\xml", } out_folder = r"C:\Skyrim_Trans\汉化" # 1. xml to dict pkg_data_dict = xml_to_dict(trans_xml_folders, out_folder) # 2. dict_by_pkg >> dict_by_type # dict[pkg][type][id] >> dict[type][id][pkg] trans_by_type = dict_to_trans_by_type(pkg_data_dict, out_folder) # 3. to simple trans_dict, (keep en>>cn only) # dict[type][id][pkg] >> dict[type][id] # item = {en:str, ank:str, uno:str, ...} simple_trans_dict = to_simple_item(trans_by_type, out_folder) # 4. convert to machine use trans file # only keep en >> cn # en \t ank \t uno \t ... to_trans_file(simple_trans_dict, out_folder) # write json file for every type # to_type_trans_files(simple_trans_dict, out_folder) def xml_to_dict(trans_xml_folders, out_folder): """ 将所有文件中的 XML 文件转化为python dict return pkg_data_dict pkg_data_dict[pkg][type_key][edid] = item pkg_data_dict[汉化包][类型][翻译编号] = 翻译信息 """ pkg_data_dict = {} # name: pkg_data for pkg_name, xml_folder in trans_xml_folders.items(): print("process trans folder: ", xml_folder) """ pkg_data 一个汉化包的所有数据 pkg_data[type_key][list of items] e.g. { FACT:FULL [item, item, ...] QUST:NNAM [item, item, ...] BOOK:FULL [item, item, ...] } """ pkg_data = defaultdict(dict) for xml_file in Path(xml_folder).iterdir(): is_xml = xml_file.name.lower().endswith("xml") if not is_xml: continue print("\t\tconvert xml to dict: ", xml_file) # 1. xml >> python dict xml_text = xml_file.read_text("UTF8") data_dict = xmltodict.parse(xml_text) dlc_name = data_dict["SSTXMLRessources"]["Params"]["Addon"] item_list = data_dict["SSTXMLRessources"]["Content"]["String"] # 2. xml dict >>>> pkg data dict # list of item >>>> pkg_data[type][id]=item for item in item_list: type_key = item["REC"] if not isinstance(item["REC"], str): type_key = item["REC"]["#text"] edid = item["EDID"] item["DLC"] = dlc_name item["PKG"] = pkg_name pkg_data[type_key][edid] = item pkg_data_dict[pkg_name] = pkg_data return pkg_data_dict def dict_to_trans_by_type(pkg_data_dict, out_folder): """ dict[pkg][type][id] >> dict[type][id][pkg] """ type_keys = [] # e.g. 
FACT:MNAM BOOK:DESC QUST:FULL for _, pkg_data in pkg_data_dict.items(): type_keys += list(pkg_data.keys()) type_keys = list(set(type_keys)) print("convert trans dict format......") trans_by_type = {} # trans_dict[type_key][edid][pkg_name] = item for type_key in type_keys: trans_by_type[type_key] = defaultdict(dict) for pkg_name, pkg_data in pkg_data_dict.items(): for edid, item in pkg_data[type_key].items(): trans_by_type[type_key][edid][pkg_name] = item print("save full json file......") # save json file # json_str = json.dumps(trans_dict, ensure_ascii=False, indent=4) # json_file = Path(out_folder) / "full_dict.json" # json_file.write_text(json_str, encoding="UTF8") return trans_by_type def to_simple_item(trans_by_type, out_folder): """ convert to simple trans_dict trans_dict >> [type_key][edid] = new_item new_item {en, ank, uno, ...} """ simple_trans_dict = defaultdict(dict) for type_key, type_value in trans_by_type.items(): for edid, edid_value in type_value.items(): new_edid_value = {} for pkg, item in edid_value.items(): new_edid_value["En"] = item["Source"] new_edid_value[pkg] = item["Dest"] if item["Dest"] else "" if new_edid_value["En"]: simple_trans_dict[type_key][edid] = new_edid_value print("save dict json file......") json_str = json.dumps(simple_trans_dict, ensure_ascii=False, indent=4) json_file = Path(out_folder) / "trans_dict_simple.json" json_file.write_text(json_str, encoding="UTF8") return simple_trans_dict def to_trans_file(simple_trans_dict, out_folder): """convert to machine use trans file only keep en >> cn e.g. : en \t ank \t uno \t ... """ print("convert and save simple trans dict txt file......") splitter = "\t" order_trans_file = Path(out_folder) / "order_trans_file.txt" lines = [] for type_key, type_data in simple_trans_dict.items(): # these type text is too long for trans dict if type_key.startswith("DIAL") or not type_key.endswith("FULL"): continue for _, item in type_data.items(): line = "" for k, v in item.items(): line += v + splitter # skip no trans line if not is_cn(line): continue lines.append(line) lines = list(set(lines)) # 去重 lines.sort(key=lambda s: s.lower()) # 字母排序 file_str = "\n".join(lines) order_trans_file.write_text(file_str, "UTF8") print("convert and save machine trans txt file......") machine_trans_file = Path(out_folder) / "machine_trans_file.txt" lines.sort(key=lambda s: (99999 - len(s.split(" ")), s.lower())) # 长度逆序,用于机翻前从上到下遍历替换 file_str = "\n".join(lines) machine_trans_file.write_text(file_str, "UTF8") def to_type_trans_files(simple_trans_dict, out_folder): print("convert and save type dict txt file......") splitter = "\t" for type_key, type_data in simple_trans_dict.items(): (Path(out_folder) / "type_trans").mkdir(exist_ok=True) type_file = Path(out_folder) / "type_trans" / (type_key.replace(":","_") + ".txt") file_str = "" for edid, item in type_data.items(): line = edid + splitter for k, v in item.items(): try: line += v + splitter except TypeError: print(item) if not is_cn(line): continue line = line.replace("\n"+u'\u3000', "<break_line>") line = line.replace("\n", "<break_line>") line = line.replace(u'\u3000', " ") file_str += line + "\n" type_file.write_text(file_str, "UTF8") def is_cn(data: str): import re try: if data and re.search(u'[\u4e00-\u9fff]', data): return True except TypeError: print(data) exit(1) return False if __name__ == '__main__': main()
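After the script runs, trans_dict_simple.json can be queried directly. A minimal lookup sketch based on the structure the script writes (the search term "Whiterun" is only an example):

import json
from pathlib import Path

trans = json.loads(Path(r"C:\Skyrim_Trans\汉化\trans_dict_simple.json").read_text("UTF8"))

# Structure: trans[type_key][edid] = {"En": ..., "ANK": ..., "DS": ..., "Unofficial": ...}
for type_key, entries in trans.items():
    for edid, item in entries.items():
        if item.get("En") == "Whiterun":
            print(type_key, edid, item)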