使用pywin32处理excel文件¶
Windows官方接口 https://docs.microsoft.com/zh-cn/office/vba/api/word.document.exportasfixedformat https://docs.microsoft.com/en-us/office/vba/api/excel.range.exportasfixedformat
#!/usr/bin/env python
#-*- coding:utf-8 -*-
#######################################################
# 用于批量删除excel的指定行 #
# 适用于所有office,前提需要安装pywin32和office软件 #
#######################################################
import os
import sys
import time
import glob
import shutil
import string
import os.path
import traceback
import ConfigParser
import win32com.client
SPATH = "" #需处理的excel文件目录
DPATH = "" #处理后的excel存放目录
SKIP_FILE_LIST = [] #需要跳过的文件列表
MAX_SHEET_INDEX = 1 #每个excel文件的前几个表需要处理
DELETE_ROW_LIST = [] #需要删除的行号
def dealPath(pathname=''):
'''deal with windows file path'''
if pathname:
pathname = pathname.strip()
if pathname:
pathname = r'%s'%pathname
pathname = string.replace(pathname, r'/', '\\')
pathname = os.path.abspath(pathname)
if pathname.find(":\\") == -1:
pathname = os.path.join(os.getcwd(), pathname)
return pathname
class EasyExcel(object):
'''class of easy to deal with excel'''
def __init__(self):
'''initial excel application'''
self.m_filename = ''
self.m_exists = False
self.m_excel = win32com.client.DispatchEx('Excel.Application') #也可以用Dispatch,前者开启新进程,后者会复用进程中的excel进程
self.m_excel.DisplayAlerts = False #覆盖同名文件时不弹出确认框
def open(self, filename=''):
'''open excel file'''
if getattr(self, 'm_book', False):
self.m_book.Close()
self.m_filename = dealPath(filename) or ''
self.m_exists = os.path.isfile(self.m_filename)
if not self.m_filename or not self.m_exists:
self.m_book = self.m_excel.Workbooks.Add()
else:
self.m_book = self.m_excel.Workbooks.Open(self.m_filename)
def reset(self):
'''reset'''
self.m_excel = None
self.m_book = None
self.m_filename = ''
def save(self, newfile=''):
'''save the excel content'''
assert type(newfile) is str, 'filename must be type string'
newfile = dealPath(newfile) or self.m_filename
if not newfile or (self.m_exists and newfile == self.m_filename):
self.m_book.Save()
return
pathname = os.path.dirname(newfile)
if not os.path.isdir(pathname):
os.makedirs(pathname)
self.m_filename = newfile
self.m_book.SaveAs(newfile)
def close(self):
'''close the application'''
self.m_book.Close(SaveChanges=1)
self.m_excel.Quit()
time.sleep(2)
self.reset()
def addSheet(self, sheetname=None):
'''add new sheet, the name of sheet can be modify,but the workbook can't '''
sht = self.m_book.Worksheets.Add()
sht.Name = sheetname if sheetname else sht.Name
return sht
def getSheet(self, sheet=1):
'''get the sheet object by the sheet index'''
assert sheet > 0, 'the sheet index must bigger then 0'
return self.m_book.Worksheets(sheet)
def getSheetByName(self, name):
'''get the sheet object by the sheet name'''
for i in xrange(1, self.getSheetCount()+1):
sheet = self.getSheet(i)
if name == sheet.Name:
return sheet
return None
def getCell(self, sheet=1, row=1, col=1):
'''get the cell object'''
assert row>0 and col>0, 'the row and column index must bigger then 0'
return self.getSheet(sheet).Cells(row, col)
def getRow(self, sheet=1, row=1):
'''get the row object'''
assert row>0, 'the row index must bigger then 0'
return self.getSheet(sheet).Rows(row)
def getCol(self, sheet, col):
'''get the column object'''
assert col>0, 'the column index must bigger then 0'
return self.getSheet(sheet).Columns(col)
def getRange(self, sheet, row1, col1, row2, col2):
'''get the range object'''
sht = self.getSheet(sheet)
return sht.Range(self.getCell(sheet, row1, col1), self.getCell(sheet, row2, col2))
def getCellValue(self, sheet, row, col):
'''Get value of one cell'''
return self.getCell(sheet,row, col).Value
def setCellValue(self, sheet, row, col, value):
'''set value of one cell'''
self.getCell(sheet, row, col).Value = value
def getRowValue(self, sheet, row):
'''get the row values'''
return self.getRow(sheet, row).Value
def setRowValue(self, sheet, row, values):
'''set the row values'''
self.getRow(sheet, row).Value = values
def getColValue(self, sheet, col):
'''get the row values'''
return self.getCol(sheet, col).Value
def setColValue(self, sheet, col, values):
'''set the row values'''
self.getCol(sheet, col).Value = values
def getRangeValue(self, sheet, row1, col1, row2, col2):
'''return a tuples of tuple)'''
return self.getRange(sheet, row1, col1, row2, col2).Value
def setRangeValue(self, sheet, row1, col1, data):
'''set the range values'''
row2 = row1 + len(data) - 1
col2 = col1 + len(data[0]) - 1
range = self.getRange(sheet, row1, col1, row2, col2)
range.Clear()
range.Value = data
def getSheetCount(self):
'''get the number of sheet'''
return self.m_book.Worksheets.Count
def getMaxRow(self, sheet):
'''get the max row number, not the count of used row number'''
return self.getSheet(sheet).Rows.Count
def getMaxCol(self, sheet):
'''get the max col number, not the count of used col number'''
return self.getSheet(sheet).Columns.Count
def clearCell(self, sheet, row, col):
'''clear the content of the cell'''
self.getCell(sheet,row,col).Clear()
def deleteCell(self, sheet, row, col):
'''delete the cell'''
self.getCell(sheet, row, col).Delete()
def clearRow(self, sheet, row):
'''clear the content of the row'''
self.getRow(sheet, row).Clear()
def deleteRow(self, sheet, row):
'''delete the row'''
self.getRow(sheet, row).Delete()
def clearCol(self, sheet, col):
'''clear the col'''
self.getCol(sheet, col).Clear()
def deleteCol(self, sheet, col):
'''delete the col'''
self.getCol(sheet, col).Delete()
def clearSheet(self, sheet):
'''clear the hole sheet'''
self.getSheet(sheet).Clear()
def deleteSheet(self, sheet):
'''delete the hole sheet'''
self.getSheet(sheet).Delete()
def deleteRows(self, sheet, fromRow, count=1):
'''delete count rows of the sheet'''
maxRow = self.getMaxRow(sheet)
maxCol = self.getMaxCol(sheet)
endRow = fromRow+count-1
if fromRow > maxRow or endRow < 1:
return
self.getRange(sheet, fromRow, 1, endRow, maxCol).Delete()
def deleteCols(self, sheet, fromCol, count=1):
'''delete count cols of the sheet'''
maxRow = self.getMaxRow(sheet)
maxCol = self.getMaxCol(sheet)
endCol = fromCol + count - 1
if fromCol > maxCol or endCol < 1:
return
self.getRange(sheet, 1, fromCol, maxRow, endCol).Delete()
def echo(msg):
'''echo message'''
print msg
def dealSingle(excel, sfile, dfile):
'''deal with single excel file'''
echo("deal with %s"%sfile)
basefile = os.path.basename(sfile)
excel.open(sfile)
sheetcount = excel.getSheetCount()
if not (basefile in SKIP_FILE_LIST or file in SKIP_FILE_LIST):
for sheet in range(1, sheetcount+1):
if sheet > MAX_SHEET_INDEX:
continue
reduce = 0
for row in DELETE_ROW_LIST:
excel.deleteRow(sheet, row-reduce)
reduce += 1
#excel.deleteRows(sheet, 2, 2)
excel.save(dfile)
def dealExcel(spath, dpath):
'''deal with excel files'''
start = time.time()
#check source path exists or not
spath = dealPath(spath)
if not os.path.isdir(spath):
echo("No this directory :%s"%spath)
return
#check destination path exists or not
dpath = dealPath(dpath)
if not os.path.isdir(dpath):
os.makedirs(dpath)
shutil.rmtree(dpath)
#list the excel file
filelist = glob.glob(os.path.join(spath, '*.xlsx'))
if not filelist:
echo('The path of %s has no excel file'%spath)
return
#deal with excel file
excel = EasyExcel()
for file in filelist:
basefile = os.path.basename(file)
destfile = os.path.join(dpath, basefile)
dealSingle(excel, file, destfile)
echo('Use time:%s'%(time.time()-start))
excel.close()
def loadConfig(configfile='./config.ini'):
'''parse config file'''
global SPATH
global DPATH
global SKIP_FILE_LIST
global MAX_SHEET_INDEX
global DELETE_ROW_LIST
file = dealPath(configfile)
if not os.path.isfile(file):
echo('Can not find the config.ini')
return False
parser = ConfigParser.ConfigParser()
parser.read(file)
SPATH = parser.get('pathconfig', 'spath').strip()
DPATH = parser.get('pathconfig', 'dpath').strip()
filelist = parser.get('otherconfig', 'filelist').strip()
index = parser.get('otherconfig', 'maxindex').strip()
rowlist = parser.get('otherconfig', 'deleterows').strip()
if filelist:
SKIP_FILE_LIST = filelist.split(";")
if rowlist:
DELETE_ROW_LIST = map(int, rowlist.split(";"))
MAX_SHEET_INDEX = int(index) if index else MAX_SHEET_INDEX
def main():
'''main function'''
loadConfig()
if SPATH and DPATH and MAX_SHEET_INDEX:
dealExcel(SPATH, DPATH)
raw_input("Please press any key to exit!")
if __name__=="__main__":
main()
config.ini文件如下:
[pathconfig]
#;spath表示需要处理的excel文件目录
spath=./tests
#;dpath表示处理后的excel文件目录
dpath=./dest
[otherconfig]
#;filelist表示不需要做特殊处理的excel文件列表,以英文分号分隔
filelist=
#;maxindex表示需要处理每个excel文件的前几张表
maxindex=1
#;deleterows表示需要删除的阿拉伯数字行号,用英文分号分隔
deleterows=2;3
Win32将doc, xlsx, ppt转换成pdf¶
#-*- coding:utf-8 -*-
import os
from win32com.client import Dispatch, constants, gencache, DispatchEx
class PDFConverter():
def __init__(self, pathname, export='.'):
self._handle_postfix = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx']
self._filename_list = list()
self._export_folder = os.path.join(os.path.abspath('.'), 'pdfconver')
if not os.path.exists(self._export_folder):
os.mkdir(self._export_folder)
self._enumerate_filename(pathname)
def _enumerate_filename(self, pathname):
'''
读取所有文件名
'''
full_pathname = os.path.abspath(pathname)
if os.path.isfile(full_pathname):
if self._is_legal_postfix(full_pathname):
self._filename_list.append(full_pathname)
else:
raise TypeError('文件{}后缀名不合法! 仅支持如下文件类型: {}.'.format(pathname, ','.join(self._handle_postfix)))
elif os.path.isfile(full_pathname):
for relpath, _, files in os.walk(full_pathname):
for name in files:
filename = os.path.join(full_pathname, relpath, name)
if self._is_legal_postfix(filename):
self._filename_list.append(filename)
else:
raise TypeError('文件/文件夹{}不存在或不合法!'.format(pathname))
def _is_legal_postfix(self, filename):
return filename.split('.')[-1].lower() in self._handle_postfix and not os.path.basename(filename).startswith('~')
def run_conver(self):
'''
进行批量处理, 根据后缀名调用函数执行转换
'''
print('需要转换的文件数: {}'.format(len(self._filename_list)))
for filename in self._filename_list:
postfix = filename.split('.')[1].lower()
funCall = getattr(self, postfix)
print('原文件: {}'.format(filename))
funCall(filename)
print('转换完成')
def doc(self, filename):
'''
doc和docx文件转换
'''
name = os.path.basename(filename).split('.')[0] + '.pdf'
exportfile = os.path.join(self._export_folder, name)
print('保存PDF文件: {}'.format(exportfile))
gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
w = Dispatch('Word.Application')
doc = w.Documents.Open(filename)
doc.ExportAsFixedFormat(exportfile, contants.wdExportFormatPDF,
Item=contants.wdExportDocumentWithMarkup,
CreateBookmarks=constants.wdExportCreateHeadingBookmarks)
w.Quit(constants.wdDoNotSaveChanges)
def docx(self, filename):
self.doc(filename)
def xls(self, filename):
'''
xls和xlsx文件转换
'''
name = os.path.basename(filename).split('.')[0] + '.pdf'
exportfile = os.path.join(self._export_folder, name)
print('保存PDF文件: {}'.format(exportfile))
xlApp = DispatchEx('Excel.Application')
xlApp.Visible = False
xlApp.DisplayAlerts = 0
books = xlApp.Workbooks.Open(filename, False)
books.ExportAsFixedFormat(0, exportfile)
books.Close(False)
xlApp.Quit()
def xlsx(self, filename):
self.xls(filename)
def ppt(self, filename):
'''
ppt和pptx文件转换
'''
name = os.path.basename(filename).split('.')[0] + '.pdf'
exportfile = os.path.join(self._export_folder, name)
print('保存PDF文件: {}'.format(exportfile))
gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
p = Dispatch('PowerPoint.Application')
ppt = p.Presentations.Open(filename, False, False, False)
ppt.ExportAsFixedFormat(exportfile, 2, PrintRange=None)
p.Quit()
def pptx(self, filename):
self.ppt(filename)
if __name__ == '__main__':
# 支持文件夹批量导入
# folder = 'tmp'
# pathname = os.path.join(os.path.abspath('.'), folder)
# 也支持单个文件的转换
pathname = 'inwarehouse_templet.xlsx'
pdfConver = PDFConverter(pathname)
pdfConver.run_conver()
excel的不同sheet存为pdf¶
#! -*- coding:utf-8 -*-
import os
import xlrd
from win32com.client import Dispatch, constants, gencache, DispatchEx
class PDFConverter():
def __init__(self, pathname, sheetnum, export='.'):
self.sheetnum = sheetnum
self._handle_postfix = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx']
self._filename_list = list()
self._export_folder = os.path.join(os.path.abspath('.'), 'pdfconver')
if not os.path.exists(self._export_folder):
os.mkdir(self._export_folder)
self._enumerate_filename(pathname)
def _enumerate_filename(self, pathname):
'''
读取所有文件名
'''
full_pathname = os.path.abspath(pathname)
if os.path.isfile(full_pathname):
if self._is_legal_postfix(full_pathname):
self._filename_list.append(full_pathname)
else:
raise TypeError('文件{}后缀名不合法! 仅支持如下文件类型: {}.'.format(pathname, ','.join(self._handle_postfix)))
elif os.path.isfile(full_pathname):
for relpath, _, files in os.walk(full_pathname):
for name in files:
filename = os.path.join(full_pathname, relpath, name)
if self._is_legal_postfix(filename):
self._filename_list.append(filename)
else:
raise TypeError('文件/文件夹{}不存在或不合法!'.format(pathname))
def _is_legal_postfix(self, filename):
return filename.split('.')[-1].lower() in self._handle_postfix and not os.path.basename(filename).startswith('~')
def run_conver(self):
'''
进行批量处理, 根据后缀名调用函数执行转换
'''
print('需要转换的文件数: {}'.format(len(self._filename_list)))
for filename in self._filename_list:
postfix = filename.split('.')[1].lower()
funCall = getattr(self, postfix)
print('原文件: {}'.format(filename))
funCall(filename)
print('转换完成')
def xls(self, filename):
'''
xls 和 xlsx 文件转换
'''
xlApp = DispatchEx('Excel.Application')
xlApp.Visible = False
xlApp.DisplayAlerts = 0
books = xlApp.Workbooks.Open(filename, False)
# 循环保存每一个sheet
for i in range(1, self.sheetnum+1):
sheetName = books.Sheet(i).Name
xlSheet = books.Worksheets(sheetName)
name = sheetName + '.pdf'
exportfile = os.path.join(self._export_folder, name)
xlSheet.ExportAsFixedFormat(0, exportfile)
print('保存PDF文件: {}'.format(exportfile))
books.Close(False)
xlApp.Quit()
def xlsx(self, filename):
self.xls(filename)
if __name__ == '__main__':
# 支持单个文件的转换
pathname = u'原始数据.xlsx'
# 获取到文件的sheet数
b = xlrd.open_workbook(pathname)
sheetnum = len(b.sheets())
pdfconverter = PDFConverter(pathname, sheetnum)
pdfconverter.run_conver()