百度输入法的词库 bdict 转 txt
Python
#encoding:utf-8
import struct
import binascii
class Baidu(object):
    """Convert a Baidu input-method ``.bdict`` dictionary into a text word list.

    The conversion is a two-step pipeline:

    1. :meth:`be2le` swaps every pair of bytes of the source file and stores
       the result next to it as ``<originfile>.le``.
    2. :meth:`le2txt` scans the swapped file as big-endian 16-bit code units
       and writes every run of CJK ideographs (U+4E00..U+9FFF) as one line
       of the output text file.
    """

    # Byte offset at which the dictionary payload starts, per the original
    # author's note ("the Baidu dictionary starts at 0x350").
    DATA_OFFSET = 0x350

    def __init__(self, originfile):
        """Derive the intermediate and output file names from *originfile*.

        Args:
            originfile: Path of the ``.bdict`` file to convert.
        """
        self.originfile = originfile
        self.lefile = originfile + '.le'
        # Drops the last five characters ("bdict") and keeps the dot, so
        # "name.bdict" becomes "name.txt".
        self.txtfile = originfile[0:len(originfile) - 5] + 'txt'
        # Scratch attribute kept for backward compatibility; no longer used.
        self.buf = [b'0' for x in range(0, 2)]
        # Characters of the word currently being accumulated by le2txt().
        self.listwords = []

    def be2le(self):
        """Write a byte-swapped copy of ``originfile`` to ``lefile``.

        Every 16-bit unit has its two bytes exchanged.  An odd-length input
        is padded with a single zero byte first, so the output always has an
        even length.

        Raises:
            OSError: If the source cannot be read or the target not written.
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()
        # Guarantee an even length.  The previous code appended the whole
        # buffer to itself here, so the "padding" byte was really byte 0.
        if len(contents) % 2:
            contents += b'\x00'
        # Swap the two bytes of each pair with slice assignment.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract runs of CJK ideographs from ``lefile`` into ``txtfile``.

        The swapped file is read from byte ``DATA_OFFSET`` onwards as
        big-endian 16-bit units.  Consecutive units in U+4E00..U+9FFF form
        one word; any other unit ends the word, which is then written on its
        own line.  The output file is written as UTF-8.

        Raises:
            OSError: If ``lefile`` cannot be read or ``txtfile`` not written.
        """
        with open(self.lefile, 'rb') as lef:
            # Slice the raw bytes: slicing the hex text at 0x350 (as before)
            # actually started at byte 0x1A8, inside the header.
            data = lef.read()[self.DATA_OFFSET:]
        # iter_unpack needs a whole number of 2-byte units.
        data = data[:len(data) - len(data) % 2]
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            # Compare integer code units instead of decoding each one, so a
            # lone surrogate value cannot raise UnicodeDecodeError.
            for (unit,) in struct.iter_unpack('>H', data):
                if 0x4e00 <= unit <= 0x9fff:
                    self.listwords.append(chr(unit))
                elif self.listwords:
                    txtf.write(''.join(self.listwords) + '\n')
                    self.listwords = []
            # Flush a word that runs up to the very end of the data.
            if self.listwords:
                txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')
if __name__ == '__main__':
    # Script entry point: byte-swap the sample dictionary into its ".le"
    # companion file, then extract the words from it into the text file.
    converter = Baidu('./dict_file_565_20111206100521_1.0.0.bdict')
    converter.be2le()
    converter.le2txt()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
#encoding:utf-8
import struct
import binascii
class Baidu(object):
    """Convert a Baidu input-method ``.bdict`` dictionary into a text word list.

    The conversion is a two-step pipeline:

    1. :meth:`be2le` swaps every pair of bytes of the source file and stores
       the result next to it as ``<originfile>.le``.
    2. :meth:`le2txt` scans the swapped file as big-endian 16-bit code units
       and writes every run of CJK ideographs (U+4E00..U+9FFF) as one line
       of the output text file.
    """

    # Byte offset at which the dictionary payload starts, per the original
    # author's note ("the Baidu dictionary starts at 0x350").
    DATA_OFFSET = 0x350

    def __init__(self, originfile):
        """Derive the intermediate and output file names from *originfile*.

        Args:
            originfile: Path of the ``.bdict`` file to convert.
        """
        self.originfile = originfile
        self.lefile = originfile + '.le'
        # Drops the last five characters ("bdict") and keeps the dot, so
        # "name.bdict" becomes "name.txt".
        self.txtfile = originfile[0:len(originfile) - 5] + 'txt'
        # Scratch attribute kept for backward compatibility; no longer used.
        self.buf = [b'0' for x in range(0, 2)]
        # Characters of the word currently being accumulated by le2txt().
        self.listwords = []

    def be2le(self):
        """Write a byte-swapped copy of ``originfile`` to ``lefile``.

        Every 16-bit unit has its two bytes exchanged.  An odd-length input
        is padded with a single zero byte first, so the output always has an
        even length.

        Raises:
            OSError: If the source cannot be read or the target not written.
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()
        # Guarantee an even length.  The previous code appended the whole
        # buffer to itself here, so the "padding" byte was really byte 0.
        if len(contents) % 2:
            contents += b'\x00'
        # Swap the two bytes of each pair with slice assignment.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]
        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Extract runs of CJK ideographs from ``lefile`` into ``txtfile``.

        The swapped file is read from byte ``DATA_OFFSET`` onwards as
        big-endian 16-bit units.  Consecutive units in U+4E00..U+9FFF form
        one word; any other unit ends the word, which is then written on its
        own line.  The output file is written as UTF-8.

        Raises:
            OSError: If ``lefile`` cannot be read or ``txtfile`` not written.
        """
        with open(self.lefile, 'rb') as lef:
            # Slice the raw bytes: slicing the hex text at 0x350 (as before)
            # actually started at byte 0x1A8, inside the header.
            data = lef.read()[self.DATA_OFFSET:]
        # iter_unpack needs a whole number of 2-byte units.
        data = data[:len(data) - len(data) % 2]
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            # Compare integer code units instead of decoding each one, so a
            # lone surrogate value cannot raise UnicodeDecodeError.
            for (unit,) in struct.iter_unpack('>H', data):
                if 0x4e00 <= unit <= 0x9fff:
                    self.listwords.append(chr(unit))
                elif self.listwords:
                    txtf.write(''.join(self.listwords) + '\n')
                    self.listwords = []
            # Flush a word that runs up to the very end of the data.
            if self.listwords:
                txtf.write(''.join(self.listwords) + '\n')
                self.listwords = []
        print('写入txt成功')
if __name__ == '__main__':
    # Script entry point: byte-swap the sample dictionary into its ".le"
    # companion file, then extract the words from it into the text file.
    converter = Baidu('./dict_file_565_20111206100521_1.0.0.bdict')
    converter.be2le()
    converter.le2txt()
|
-
zeropython 微信公众号
5868037 QQ号
[email protected] QQ邮箱