1. 首页
  2. web前端

python读取word文档识别字段颜色,解析字段

python版本3.7.3,读取的文档格式为.docx

文中带有简单注释

如果看不懂,可以从百度网盘下载源码直接查看,按需修改后运行其中的 py 文件

# -*- coding: utf-8 -*-
"""Convert multiple-choice quiz .docx files into JSON question banks.

Each source document is read paragraph by paragraph.  After the first
paragraph (skipped) and ignoring blank paragraphs, every group of five
paragraphs is treated as one question followed by four answer options.
The option containing a run coloured pure green (RGB 00FF00) is recorded
as the correct answer.
"""
import codecs
import collections
import io
import json
import os
import string
import sys

import docx
import xlrd
from docx import Document
from docx.shared import RGBColor  # python-docx colour class

maxLength = 0  # length of the longest question paragraph seen so far
id = 1  # running question id; reset by writeDocx()
convert_list = []  # questions parsed for the bank currently being built
type_list = []  # reset by writeDocx(); not otherwise used in this script
curPath = os.path.dirname(os.path.abspath(__file__))

# str() of the RGBColor that marks a correct answer.
_CORRECT_RGB = "00FF00"
# Trailing "(correct)" annotations, checked in this order; each one found
# truncates the option text at the marker.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
# Candidate separators between the question number and the question text,
# in priority order.
_SEPARATORS = ("-", ".", " ")
# One question paragraph plus four option paragraphs.
_PARAGRAPHS_PER_QUESTION = 5


def _strip_number_label(text):
    """Drop a leading "Número" label (at offset 0 or 1) and the char after it."""
    label = "Número"
    at = text.find(label)
    if at in (0, 1):
        return text[at + len(label) + 1:]
    return text


def _question_body(text):
    """Return the question text without its number prefix or "|subtype" suffix.

    The prefix ends at the first occurrence of the highest-priority
    separator present in ``text``.  If no separator is present at all the
    whole text is kept (the original code raised ValueError here).
    """
    for separator in _SEPARATORS:
        cut = text.find(separator)
        if cut != -1:
            text = text[cut + len(separator):]
            break
    return text.split("|")[0]


def _strip_answer_markers(text):
    """Truncate ``text`` at each known "(correct)" marker, in order."""
    for marker in _ANSWER_MARKERS:
        cut = text.find(marker)
        if cut != -1:
            text = text[:cut]
    return text


def readDocx(fileName, type):
    """Parse one .docx file and append its questions to ``convert_list``.

    Args:
        fileName: Path of the document relative to this script's
            directory, without the ".docx" extension.  Backslashes are
            accepted as directory separators on every platform.
        type: Category label stored under "subType" for every question
            read from this file.

    Side effects:
        Appends one dict per complete question to ``convert_list``,
        advances the global ``id`` and updates ``maxLength``.  A trailing
        group with fewer than five paragraphs is discarded.
    """
    global id
    global maxLength

    # os.path.join replaces the original hand-built "\" concatenation,
    # which did not even parse ('\' escapes the closing quote).
    docx_path = os.path.join(curPath, fileName.replace("\\", os.sep) + ".docx")
    print("xlsFile: " + docx_path)
    document = docx.Document(docx_path)

    slot = 0  # 0 = question paragraph, 1..4 = option paragraphs
    data = {}
    for line_no, paragraph in enumerate(document.paragraphs, start=1):
        if line_no <= 1:  # skip the first paragraph of the document
            continue
        text = paragraph.text
        if not text.strip():
            continue

        if slot == 0:  # question paragraph
            text = _strip_number_label(text)
            if len(text) > maxLength:
                maxLength = len(text)
            data["question"] = _question_body(text)
            data["subType"] = type
        else:  # option paragraph; a green run marks the right answer
            for run in paragraph.runs:
                if str(run.font.color.rgb) == _CORRECT_RGB:
                    data["rightIndex"] = slot
            data["option" + str(slot)] = _strip_answer_markers(text)

        slot += 1
        if slot >= _PARAGRAPHS_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            slot = 0
            id += 1
            data = {}


def writeDocx(fileList, name):
    """Build one question bank from several documents and write it as JSON.

    Args:
        fileList: Iterable of dicts with keys "path" (passed to readDocx
            as ``fileName``) and "type" (passed as ``type``).
        name: Base name of the output file; the bank is written to
            ``<script dir>/topic/<name>.txt`` as UTF-8 JSON.
    """
    global id
    global convert_list
    global type_list

    # Start every bank from a clean state so ids restart at 1.
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    json_path = os.path.join(curPath, "topic", name + ".txt")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with io.open(json_path, "w", encoding="utf-8") as handle:
        handle.write(
            json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True)
        )


def _topic_files(folder, entries):
    """Return readDocx/writeDocx file descriptors for ``entries`` in ``folder``."""
    return [
        {"path": os.path.join(folder, stem), "type": topic}
        for stem, topic in entries
    ]


def main():
    """Generate the Portuguese, Spanish and English question banks."""
    # File stems are reproduced exactly, including double and trailing
    # spaces, because they must match the names of the files on disk.
    en_name = "en_us_topic"
    en_fileList = _topic_files(en_name, [
        ("地理(英)Respueda G .es.en", "World"),
        ("科学与技术(英)", "Technology"),
        ("历史(英)Resupeda H.es.en", "History"),
        ("艺术和文学(英)Respueda A&L.es.en", "ArtAndLiterature"),
        ("娱乐(英)Respueda E.es.en", "Fashion"),
        ("运动(英)Respueda  D.es.en", "Sports"),
    ])

    es_name = "es_es_topic"
    es_fileList = _topic_files(es_name, [
        ("地理(西)Respueda G ", "World"),
        ("科学与技术(西)Respueda C&T", "Technology"),
        ("历史(西)Resupeda H", "History"),
        ("艺术和文学(西)Respueda A&L", "ArtAndLiterature"),
        ("娱乐(西)Respueda E", "Fashion"),
        ("运动(西)Respueda  D", "Sports"),
    ])

    pt_name = "pt_br_topic"
    pt_fileList = _topic_files(pt_name, [
        ("地理(葡)Respueda G .es.pt", "World"),
        ("科学与技术(葡)", "Technology"),
        ("历史(葡)Resupeda H.es.pt", "History"),
        ("艺术和文学(葡)Respueda A&L.es.pt", "ArtAndLiterature"),
        ("娱乐(葡)Respueda E.es.pt", "Fashion"),
        ("运动(葡)Respueda  D.es.pt", "Sports"),
    ])

    writeDocx(pt_fileList, pt_name)
    writeDocx(es_fileList, es_name)
    writeDocx(en_fileList, en_name)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()

原文始发于:python读取word文档识别字段颜色,解析字段

主题测试文章,只做测试使用。发布者:开发工程师,转载请注明出处:http://www.cxybcw.com/3689.html

联系我们

13687733322

在线咨询:点击这里给我发消息

邮件:1877088071@qq.com

工作时间:周一至周五,9:30-18:30,节假日休息

QR code