帮师妹写的裁判文书内容提取:可做进一步提取模板
一、提取结果 二、提取内容 需求内容为:被告、裁判文书案号、时间、地域、刑事强制措施、罪名、刑罚 整体运用的技术:主要采取的是正则匹配,因为裁判文书的这些表述是存在一定的模式的,法言法语的要求是的语言表述一般比较固定;当然在被告名字处正则匹配可能会遗漏,于是通过pyhanlp实体识别进行了进一步的精确化 遇到的一些问题:多主体不同罪名 这一问题主要通过被告名称进行定位,如果名字在该段文字中且存在这一罪名,则形成指向关系,被告犯xx罪 三、代码 ```python # -*- coding: utf-8 -*- # Author: cw # Datetime : 2020 # software: PyCharm # 收获: import time,json,random,re from fuzzywuzzy import fuzz from tqdm import tqdm from pyhanlp import * from 数据集处理过程.非结巴而是符号分词 import Symbol_cutword as sy #自行封装的读取docx内容的函数 import pandas as pd from common.线程方法化 import Thread_Pool as t #自行封装的多线程函数 from 数据集处理过程.中文数字转阿拉伯 import simple_math as sm #自行封装的中文数字转阿拉伯数字的函数 class Get_message(): def __init__(self): self.file = os.path.dirname(__file__) def get_name(self,file): #实体识别:名称 global name_o names,names_o1,names_o2 = [],[],[] CRFLexicalAnalyzer = JClass("com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer") analyzer = CRFLexicalAnalyzer() for t in analyzer.analyze(file): if "nr" in str(t): name_o = "".join(re.findall(r'[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+',str(t).replace("犯", ""))) if name_o == "某某": pass else: if len(name_o)<=3: names_o1.append(name_o + "(人名)") else: name_o = name_o[:3] names_o1.append(name_o + "(人名)") elif "nt" in str(t): name_o = "".join(re.findall(r'[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+',str(t).replace("犯", ""))) if name_o == "某某": pass else: names_o1.append(name_o + "(机构名)") #print(names_o1) if names_o1 != []: names = list(set(names_o1)) return names def get_text_from_wenshu(self, file): names,user_dict = [],[] dataframe= {} texts = sy().period_cutword(file) # 公开与否 if "不宜在互联网公布" in texts or len(texts) <= 20: dataframe['公开状况'] = "不公开" dataframe['案件名称'] = file.split("\\")[-1].split(".")[0] names = self.get_name(file.split("\\")[-1].split(".")[0]) if names == []: dataframe["被告"] = "不明" dataframe["被告"] = "".join(names) """excel写入""" dataframes = pd.DataFrame(dataframe, index=[0]) writer = pd.ExcelWriter(r"E:\Firefox\shimei2\{name}.xls".format(name=file.split("\\")[-1].split(".")[0])) dataframes.to_excel(writer, sheet_name='数据', index=False) writer.save() else: dataframe['公开状况'] = "公开" for i in texts[:5]: try: name_ = re.search("被告人(.*?),",i).group().replace("被告人","").replace(",","") names.append(name_) except: pass #地域准备 try: area = re.search("湖南省(.*?)书", i).group().replace("刑事判决书","").replace("刑事裁定书","") dataframe["地域"] = area dataframe["指控机关"] = area+"人民检察院" dataframe["审判法院"] = area+"人民法院" except: pass content = ["被告人", "原告人", "被上诉人", "上诉人"] for n in content: if fuzz.partial_ratio(n, i) > 50: for nrt in self.get_name(i): names.append(nrt) #异常 if names == []: for i in texts[:10]: for nrt in self.get_name(i): names.append(nrt) names = list(set(names)) #print(names) for yuan in names:#列表前移出错 if "检察院" in yuan: if "人名" in yuan: yuan = yuan.split("人名") for yuan_1 in yuan: if "检察" in yuan_1: yuan2= yuan_1.replace("(机构名)","").replace("(","").replace(")","") dataframe["指控机关"] = yuan2 dataframe["地域"] = yuan2.replace("人民检察院", "").replace('检察院', '').replace("(机构名)","") else: names.append(yuan_1.replace("(", "").replace(")", "")) else: dataframe["指控机关"] = yuan.replace("(机构名)","") dataframe["地域"] = yuan.replace("人民检察院", "").replace('检察院', '').replace("(机构名)","") try: for i in range(2): names.remove(yuan) except: pass if "法院" in yuan: if "人名" in yuan: yuan = yuan.split("人名") for yuan_1 in yuan: if "法院" in yuan_1: yuan2 = yuan_1.replace("(机构名)","").replace("(","").replace(")","") dataframe["审判法院"]= yuan2 dataframe["地域"] = yuan2.replace("法院", "").replace('人民法院', '').replace("(机构名)","") else: names.append(yuan_1.replace("(", "").replace(")", "")) else: dataframe["审判法院"] = yuan.replace("人民法院", "").replace('法院', '').replace("(机构名)","").replace("(","").replace(")","") dataframe["地域"] = yuan.replace("法院", "").replace('人民法院', '').replace("(机构名)","").replace("(","").replace(")","") try: for i in range(2): names.remove(yuan) except: pass if "市" in yuan and len(yuan) <=3: try: for i in range(2): names.remove(yuan) except: pass if names == []: print(file) names = ["不明"] dataframe["被告"] = "".join(names).replace("[", "").replace("]", "") dataframe["刑事强制措施"] = "无" # 裁判文书名字 dataframe['案件名称'] = file.split("\\")[-1].split(".")[0] # 案号 try: number = re.search("((.*?)号", texts[0].strip()).group() dataframe["案号"] = number #dataframe["时间"] = re.search("((.*?))", number).group().replace("(", "").replace(")", "") except: number = re.search("湘(.*?)号", texts[0]).group() dataframe["案号"] = number for i in texts: if '〇' in i: reg = '([\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u3007]+)年' time = "".join(re.findall(reg, i))[:4] time = sm(time) dataframe["时间"] = time for na in names: if na in i: if fuzz.partial_ratio("查封", i) > 80: dataframe["刑事强制措施"] = f"查封:{na}" elif fuzz.partial_ratio("扣押", i) > 80: dataframe["刑事强制措施"] = f"扣押:{na}" elif fuzz.partial_ratio("冻结", i) > 80: dataframe["刑事强制措施"] = f"冻结:{na}" elif fuzz.partial_ratio("逮捕", i) > 80: dataframe["刑事强制措施"] = f"逮捕:{na}" if fuzz.partial_ratio("查封", i) > 80: dataframe["刑事强制措施"] = f"查封:{na}" elif fuzz.partial_ratio("扣押", i) > 80: dataframe["刑事强制措施"] = f"扣押:{na}" elif fuzz.partial_ratio("冻结", i) > 80: dataframe["刑事强制措施"] = f"冻结:{na}" elif fuzz.partial_ratio("逮捕", i) > 80: dataframe["刑事强制措施"] = f"逮捕:{na}" zuimings= str(re.findall("判决如下(.*?)如不服本判决", "。".join(texts))).replace("[","").replace("]","")+"。" zuiming=zuimings.split("。") #准备 dataframe["罪名"] = "不明" dataframe["刑罚"] = "不明" for zuim in zuiming: #有就更新 for name_c in names: name_c = name_c.replace("(人名)", "").replace("(机构名)", "") if name_c in zuim and "罪" in zuim: try: zui1 = re.search(f"{name_c}(.*?)罪", zuim).group() dataframe["罪名"] = zui1.replace("[", "").replace("]", "").replace("犯", "-") except: pass if "判处" in zuim and "。" in zuim: pen1 = re.search("判处(.*?)。", zuim).group() dataframe["刑罚"] = str(pen1) else: try: pen1 = re.search("判处(.*?))", str(zuim)+")#").group() dataframe["刑罚"] = str(pen1) except: pass #异常情况 if dataframe["罪名"] == "不明": try: cri = re.search("犯(.*?)罪", zuimings).group().replace("犯", "") except: #print(file) cri = re.search("被告人(.*?)罪", "。".join(texts)).group().replace("被告人", "").replace("一","") if cri != []: dataframe["罪名"] = cri if dataframe["刑罚"] == "不明": try:#应对裁判文书出现序列错误 pen1 = re.search("判处(.*?)。", zuimings).group().replace("判处", "") except: try: pen1 = re.search("判(.*?)。", "。".join(texts)).group().replace("判", "") except: try: pen1 = re.search("判(.*?))", "。".join(texts)).group().replace("判", "") except: pen1 = "没有提取到相关信息" if pen1 != "没有提取到相关信息" : if len(pen1)>30: pen1 = re.search("判(.*?))", "。".join(texts)+")").group().replace("判处", "") dataframe["刑罚"] = pen1[:30] else: dataframe["刑罚"] = pen1 """excel写入""" #print(dataframe) dataframes = pd.DataFrame(dataframe, index=[0]) writer = pd.ExcelWriter(r"E:\Firefox\shimei2\{name}.xls".format(name=file.split("\\")[-1].split(".")[0])) dataframes.to_excel(writer, sheet_name='数据', index=False) writer.save() def threading(self,files): threads = glob.glob(os.path.join(files,"*.docx")) #print(threads[168:171]) t(2,self.get_text_from_wenshu,threads).concurrent_Thread_package() if __name__ == "__main__": os.makedirs("E:\Firefox\shimei2") #excel文件的存储目录 files = r"E:\Firefox\shimei_wenshu15" #files为目标文件的目录,里面必须是.docx格式的文件,不能是.doc文件格式,因为python没有处理.docg Get_message(index).threading(files)