Python 判断文件编码
import chardet
import config
from chardet.universaldetector import UniversalDetector
"""性能比较差"""
def detectFile(file_name):
    """Detect the character encoding of a file with chardet's UniversalDetector.

    The file is read as raw bytes and fed to the detector one line at a
    time; feeding stops as soon as the detector reports it is done.

    Args:
        file_name: Path of the file to inspect.

    Returns:
        A dict copied from the detector's result. Its "encoding" entry is
        normalised as follows:
          * missing or empty -> config.DEFAULT_ENCODING
          * "gb2312" (any case) -> "gbk"
          * a name starting with "asc" (any case) -> "utf8"
    """
    detector = UniversalDetector()
    # Binary mode: the detector works on undecoded bytes.
    with open(file_name, "rb") as file_obj:
        # Iterate lazily so the early exit below actually saves reading.
        for line in file_obj:
            detector.feed(line)
            if detector.done:
                break
    # Finalise detection so that detector.result is populated.
    detector.close()

    result = dict(detector.result)
    # The key may be present but hold None when nothing was detected, so test
    # the value rather than the key's existence.
    encoding = result.get("encoding")
    if not encoding:
        encoding = config.DEFAULT_ENCODING

    lowered = str(encoding).lower()
    if lowered == "gb2312":
        encoding = "gbk"
    elif lowered.startswith("asc"):
        encoding = "utf8"

    result["encoding"] = encoding
    return result
def detectUTF8(file_name):
    """Report whether a file's bytes follow the UTF-8 lead/continuation layout.

    Each byte is classified as either the lead byte of a 1- to 4-byte
    sequence or a continuation byte (bit pattern 10xxxxxx). The file passes
    when every lead byte is followed by the right number of continuation
    bytes and the data does not stop in the middle of a sequence.

    Only the byte layout is checked; the decoded code points themselves are
    not validated.

    Args:
        file_name: Path of the file to inspect.

    Returns:
        True if the whole file matches the layout (an empty file does),
        False otherwise.
    """
    # Read raw bytes; bytearray yields integers on both Python 2 and 3.
    with open(file_name, "rb") as file_obj:
        data = bytearray(file_obj.read())

    # Number of continuation bytes still owed to the current sequence.
    pending = 0
    for byte in data:
        if pending:
            if byte & 0xC0 != 0x80:
                return False
            pending -= 1
        elif byte & 0x80 == 0x00:
            # Single-byte (ASCII) character.
            continue
        elif byte & 0xE0 == 0xC0:
            pending = 1
        elif byte & 0xF0 == 0xE0:
            pending = 2
        elif byte & 0xF8 == 0xF0:
            pending = 3
        else:
            # A continuation byte with no lead byte, or an invalid lead byte.
            return False

    # A non-zero count here means the file ended inside a multi-byte sequence.
    return pending == 0
def detectFile2(full_path):
    """Read a file using the encoding reported by get_encoding_type_of_file.

    The original snippet stored its results on an undefined ``self``; as a
    standalone function it returns them instead.

    Args:
        full_path: Path of the file to read.

    Returns:
        A tuple ``(file_buffer, file_lines_buffer)``: the full content of the
        file, and the same content as a list of lines.
    """
    # Local import: codecs is not among the imports visible at the top of
    # this file.
    import codecs

    encoding_type = get_encoding_type_of_file(full_path)
    with codecs.open(full_path, 'r', encoding_type) as file_handler:
        file_buffer = file_handler.read()
        # Rewind so the second read starts from the beginning again.
        file_handler.seek(0)
        file_lines_buffer = file_handler.readlines()
    return file_buffer, file_lines_buffer
def get_encoding_type_of_file(full_path):
    """Return the encoding name chardet detects for a file.

    Args:
        full_path: Path of the file to inspect.

    Returns:
        The detected encoding name, with "gb2312" (any case) reported as
        'gbk'. Returns None when chardet's result carries no encoding.
    """
    # chardet.detect works on raw bytes, so read the file undecoded.
    with open(full_path, 'rb') as file_handler:
        file_content = file_handler.read()
    encoding_type = chardet.detect(file_content).get("encoding")
    if encoding_type is None:
        # Nothing detected; avoid calling .lower() on None.
        return None
    return 'gbk' if encoding_type.lower() == 'gb2312' else encoding_type
如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。

上一篇: Python Selenium 小栗子
下一篇: Python 排序字典的 key
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
{{ commentTitle }}