#!/usr/bin/python
#
#   Copyright information
#
#       Copyright (C) 2010-2018 Dilshod Temirkhodjaev <tdilshod@gmail.com>
#
#   License
#
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#       GNU General Public License for more details.
#
#       You should have received a copy of the GNU General Public License
#       along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Convert xlsx workbooks to CSV, usable as a library (Xlsx2csv) or as a script."""

from __future__ import print_function

__author__ = "Dilshod Temirkhodjaev <tdilshod@gmail.com>"
__license__ = "GPL-2+"
__version__ = "0.7.6"

import csv
import datetime
import zipfile
import string
import sys
import os
import re
import signal
import xml.parsers.expat
from xml.dom import minidom

try:
    # python2.4
    from cStringIO import StringIO
except ImportError:
    pass
try:
    from argparse import ArgumentParser
except ImportError:
    # python2.4
    from optparse import OptionParser

# see also ruby-roo lib at: http://github.com/hmcgowan/roo
# Maps a (lower-cased) number format code to the kind of value it represents.
FORMATS = {
    'general': 'float',
    '0': 'float',
    '0.00': 'float',
    '#,##0': 'float',
    '#,##0.00': 'float',
    '0%': 'percentage',
    '0.00%': 'percentage',
    '0.00e+00': 'float',
    'mm-dd-yy': 'date',
    'd-mmm-yy': 'date',
    'd-mmm': 'date',
    'mmm-yy': 'date',
    'h:mm am/pm': 'date',
    'h:mm:ss am/pm': 'date',
    'h:mm': 'time',
    'h:mm:ss': 'time',
    'm/d/yy h:mm': 'date',
    '#,##0 ;(#,##0)': 'float',
    '#,##0 ;[red](#,##0)': 'float',
    '#,##0.00;(#,##0.00)': 'float',
    '#,##0.00;[red](#,##0.00)': 'float',
    'mm:ss': 'time',
    '[h]:mm:ss': 'time',
    'mmss.0': 'time',
    '##0.0e+0': 'float',
    '@': 'float',
    'yyyy\\-mm\\-dd': 'date',
    'dd/mm/yy': 'date',
    'hh:mm:ss': 'time',
    "dd/mm/yy\\ hh:mm": 'date',
    'dd/mm/yyyy hh:mm:ss': 'date',
    'yy-mm-dd': 'date',
    'd-mmm-yyyy': 'date',
    'm/d/yy': 'date',
    'm/d/yyyy': 'date',
    'dd-mmm-yyyy': 'date',
    'dd/mm/yyyy': 'date',
    'mm/dd/yy h:mm am/pm': 'date',
    'mm/dd/yy hh:mm': 'date',
    'mm/dd/yyyy h:mm am/pm': 'date',
    'mm/dd/yyyy hh:mm:ss': 'date',
    'yyyy-mm-dd hh:mm:ss': 'date',
    '#,##0;(#,##0)': 'float',
    '_(* #,##0_);_(* (#,##0);_(* "-"??_);_(@_)': 'float',
    '_(* #,##0.00_);_(* (#,##0.00);_(* "-"??_);_(@_)': 'float'
}

# Built-in number format ids and their format codes.
STANDARD_FORMATS = {
    0: 'general',
    1: '0',
    2: '0.00',
    3: '#,##0',
    4: '#,##0.00',
    9: '0%',
    10: '0.00%',
    11: '0.00e+00',
    12: '# ?/?',
    13: '# ??/??',
    14: 'mm-dd-yy',
    15: 'd-mmm-yy',
    16: 'd-mmm',
    17: 'mmm-yy',
    18: 'h:mm am/pm',
    19: 'h:mm:ss am/pm',
    20: 'h:mm',
    21: 'h:mm:ss',
    22: 'm/d/yy h:mm',
    37: '#,##0 ;(#,##0)',
    38: '#,##0 ;[red](#,##0)',
    39: '#,##0.00;(#,##0.00)',
    40: '#,##0.00;[red](#,##0.00)',
    45: 'mm:ss',
    46: '[h]:mm:ss',
    47: 'mmss.0',
    48: '##0.0e+0',
    49: '@',
}

CONTENT_TYPES = {
    'shared_strings',
    'styles',
    'workbook',
    'worksheet',
    'relationships',
}

DEFAULT_APP_PATH = "/xl"
DEFAULT_WORKBOOK_PATH = DEFAULT_APP_PATH + "/workbook.xml"

# (literal character, numeric character reference, escaped replacement) used
# when escaping cell values in raw worksheet XML.
_VALUE_ESCAPES = (
    ("\r", "&#13;", "\\r"),
    ("\n", "&#10;", "\\n"),
    ("\t", "&#9;", "\\t"),
)


def eprint(*args, **kwargs):
    """Print to stderr; takes the same arguments as print().

    Defined before any caller so it is available while the script's
    ``__main__`` section is still executing.
    """
    print(*args, file=sys.stderr, **kwargs)


def _find_elements(node, namespace, tag):
    """Return descendants of *node* named *tag*, namespace-aware when *namespace* is set."""
    if namespace:
        return node.getElementsByTagNameNS(namespace, tag)
    return node.getElementsByTagName(tag)


def _escape_cell_values(filedata):
    """Escape CR/LF/TAB inside ``<v>...</v>`` elements of raw worksheet XML.

    Both the literal control character and its numeric character reference
    are replaced by the two-character sequences ``\\r``, ``\\n`` and ``\\t``.
    The input may be bytes (as read from the zip) or text; the same type is
    returned.  Data that is not valid UTF-8 is returned unchanged.
    """
    is_bytes = hasattr(filedata, "decode")
    text = filedata
    if is_bytes:
        try:
            text = filedata.decode("utf-8")
        except UnicodeDecodeError:
            return filedata

    def escape(match):
        value = match.group(1)
        for raw, entity, escaped in _VALUE_ESCAPES:
            value = value.replace(raw, escaped).replace(entity, escaped)
        return "<v>" + value + "</v>"

    text = re.sub(r"<v>([^<>]+)</v>", escape, text)
    if is_bytes:
        return text.encode("utf-8")
    return text


class XlsxException(Exception):
    pass


class InvalidXlsxFileException(XlsxException):
    pass


class SheetNotFoundException(XlsxException):
    pass


class OutFileAlreadyExistsException(XlsxException):
    pass


class Xlsx2csv:
    """
     Usage: Xlsx2csv("test.xlsx", **params).convert("test.csv", sheetid=1)
     Input:
       xlsxfile - path to file or filehandle
     options:
       sheetid - sheet no to convert (0 for all sheets)
       dateformat - override date/time format
       timeformat - override time format
       floatformat - override float format
       quoting - if and how to quote
       delimiter - csv columns delimiter symbol
       sheetdelimiter - sheets delimiter used when processing all sheets
       skip_empty_lines - skip empty lines
       skip_trailing_columns - skip trailing columns
       hyperlinks - include hyperlinks
       include_sheet_pattern - only include sheets named matching given pattern
       exclude_sheet_pattern - exclude sheets named matching given pattern
       outputencoding - encoding of files opened for output (Python 3 only)
    """

    def __init__(self, xlsxfile, **options):
        """Open *xlsxfile* and parse its workbook-level parts.

        Raises InvalidXlsxFileException when the file cannot be opened as a zip.
        """
        options.setdefault("delimiter", ",")
        options.setdefault("quoting", csv.QUOTE_MINIMAL)
        options.setdefault("sheetdelimiter", "--------")
        options.setdefault("dateformat", None)
        options.setdefault("timeformat", None)
        options.setdefault("floatformat", None)
        options.setdefault("scifloat", False)
        options.setdefault("skip_empty_lines", False)
        options.setdefault("skip_trailing_columns", False)
        options.setdefault("escape_strings", False)
        options.setdefault("hyperlinks", False)
        options.setdefault("include_sheet_pattern", ["^.*$"])
        options.setdefault("exclude_sheet_pattern", [])
        options.setdefault("merge_cells", False)
        options.setdefault("ignore_formats", [''])
        options.setdefault("lineterminator", "\n")
        # _convert() reads this key when it opens an output path on Python 3.
        options.setdefault("outputencoding", "utf-8")

        self.options = options
        # Set before opening so __del__ is safe when ZipFile() raises.
        self.ziphandle = None
        try:
            self.ziphandle = zipfile.ZipFile(xlsxfile)
        except (zipfile.BadZipfile, IOError):
            raise InvalidXlsxFileException("Invalid xlsx file: " + str(xlsxfile))

        self.py3 = sys.version_info[0] == 3

        self.content_types = self._parse(ContentTypes, "/[Content_Types].xml")
        types = self.content_types.types
        self.shared_strings = self._parse(SharedStrings, types["shared_strings"])
        self.styles = self._parse(Styles, types["styles"])
        self.workbook = self._parse(Workbook, types["workbook"])

        workbook_rels = [r for r in (types["relationships"] or []) if "book" in r]
        if workbook_rels:
            workbook_rels_path = workbook_rels[0]
        else:
            # No declared relationships part mentions the workbook: use the
            # conventional location next to the workbook part.
            workbook_path = types["workbook"] or DEFAULT_WORKBOOK_PATH
            workbook_rels_path = os.path.dirname(workbook_path) + "/_rels/" + \
                os.path.basename(workbook_path) + ".rels"
        self.workbook.relationships = self._parse(Relationships, workbook_rels_path)

        if self.options['escape_strings']:
            self.shared_strings.escape_strings()

    def __del__(self):
        # make sure to close zip file, ziphandler does have a close() method
        ziphandle = getattr(self, "ziphandle", None)
        if ziphandle is not None:
            ziphandle.close()

    def getSheetIdByName(self, name):
        """Return the 1-based index of the sheet called *name*, or None."""
        for s in self.workbook.sheets:
            if s['name'] == name:
                return s['index']
        return None

    def convert(self, outfile, sheetid=1):
        """outfile - path to file or filehandle

        With sheetid > 0 only that sheet is written.  Otherwise every sheet
        passing the include/exclude patterns is written: into one file per
        sheet inside directory *outfile* when it is a path, or one after
        another (separated by the sheet delimiter) when it is a filehandle.
        """
        if sheetid > 0:
            self._convert(sheetid, outfile)
            return

        if isinstance(outfile, str):
            if not os.path.exists(outfile):
                os.makedirs(outfile)
            elif os.path.isfile(outfile):
                raise OutFileAlreadyExistsException("File " + str(outfile) + " already exists!")

        for s in self.workbook.sheets:
            sheetname = s['name']
            if not self._sheet_selected(sheetname):
                continue

            if not self.py3:
                sheetname = sheetname.encode('utf-8')

            of = outfile
            if isinstance(outfile, str):
                of = os.path.join(outfile, sheetname + '.csv')
            elif self.options['sheetdelimiter'] and len(self.options['sheetdelimiter']):
                of.write(self.options['sheetdelimiter'] + " " + str(s['index']) + " - " + sheetname +
                         self.options['lineterminator'])
            self._convert(s['index'], of)

    def _sheet_selected(self, sheetname):
        """Return True when *sheetname* passes the include and exclude patterns."""
        include_sheet_pattern = self.options['include_sheet_pattern']
        if isinstance(include_sheet_pattern, str):  # optparser lib fix
            include_sheet_pattern = [include_sheet_pattern]
        if len(include_sheet_pattern) > 0:
            included = False
            for pattern in include_sheet_pattern:
                if pattern and re.match(pattern, sheetname):
                    included = True
                    break
            if not included:
                return False

        exclude_sheet_pattern = self.options['exclude_sheet_pattern']
        if isinstance(exclude_sheet_pattern, str):  # optparser lib fix
            exclude_sheet_pattern = [exclude_sheet_pattern]
        for pattern in exclude_sheet_pattern:
            if pattern and re.match(pattern, sheetname):
                return False
        return True

    def _open_sheet(self, sheet_info, sheet_index):
        """Locate and open the worksheet part for a sheet.

        Returns (path, filehandle).  The relationship target is tried first,
        then the conventional part names, then (for the first sheet only) the
        worksheet part named in the content types.  Raises
        SheetNotFoundException when none of them exists in the archive.
        """
        candidates = []

        # using sheet relation information
        relation_id = sheet_info.get('relation_id')
        if relation_id is not None:
            relationship = self.workbook.relationships.relationships.get(relation_id)
            target = relationship and relationship.get('target')
            if target:
                if target.startswith("/"):
                    # target is already relative to the package root
                    candidates.append(target)
                else:
                    candidates.append("/xl/" + target)

        candidates.append("/xl/worksheets/sheet%i.xml" % sheet_index)
        candidates.append("/xl/worksheets/worksheet%i.xml" % sheet_index)
        if sheet_index == 1 and self.content_types.types["worksheet"]:
            candidates.append(self.content_types.types["worksheet"])

        for sheet_path in candidates:
            sheet_file = self._filehandle(sheet_path)
            if sheet_file is not None:
                return sheet_path, sheet_file
        raise SheetNotFoundException("Sheet %i not found" % sheet_index)

    def _convert(self, sheet_index, outfile):
        """Write the sheet with 1-based index *sheet_index* to *outfile* (path or filehandle).

        Returns 1 (after printing a message) when the workbook has no such
        sheet; raises SheetNotFoundException when its part is missing.
        """
        closefile = False
        if isinstance(outfile, str):
            if sys.version_info[0] == 2:
                outfile = open(outfile, 'wb+')
            elif sys.version_info[0] == 3:
                outfile = open(outfile, 'w+', encoding=self.options['outputencoding'], newline="")
            else:
                sys.stderr.write("error: version of your python is not supported: " + str(sys.version_info) + "\n")
                sys.exit(1)
            closefile = True
        try:
            writer = csv.writer(outfile, quoting=self.options['quoting'], delimiter=self.options['delimiter'],
                                lineterminator=self.options['lineterminator'])

            sheets_filtered = [s for s in self.workbook.sheets if s['index'] == sheet_index]
            if len(sheets_filtered) == 0:
                eprint("Sheet with index %i not found or can't be handled" % sheet_index)
                return 1

            sheet_path, sheet_file = self._open_sheet(sheets_filtered[0], sheet_index)
            sheet = Sheet(self.workbook, self.shared_strings, self.styles, sheet_file)
            try:
                # Zip member names always use "/", so build the path by hand.
                relationships_path = os.path.dirname(sheet_path) + "/_rels/" + \
                    os.path.basename(sheet_path) + ".rels"
                sheet.relationships = self._parse(Relationships, relationships_path)

                sheet.set_dateformat(self.options['dateformat'])
                sheet.set_timeformat(self.options['timeformat'])
                sheet.set_floatformat(self.options['floatformat'])
                sheet.set_skip_empty_lines(self.options['skip_empty_lines'])
                sheet.set_skip_trailing_columns(self.options['skip_trailing_columns'])
                sheet.set_include_hyperlinks(self.options['hyperlinks'])
                sheet.set_merge_cells(self.options['merge_cells'])
                sheet.set_scifloat(self.options['scifloat'])
                sheet.set_ignore_formats(self.options['ignore_formats'])
                # filedata is only populated when hyperlinks/merged cells
                # forced the whole worksheet to be read up front.
                if self.options['escape_strings'] and sheet.filedata:
                    sheet.filedata = _escape_cell_values(sheet.filedata)
                sheet.to_csv(writer)
            finally:
                sheet_file.close()
                sheet.close()
        finally:
            if closefile:
                outfile.close()

    def _filehandle(self, filename):
        """Open archive member *filename* (leading "/" stripped, case-insensitive); None if absent."""
        if not filename:
            return None
        wanted = filename.lower()[1:]
        for name in self.ziphandle.namelist():
            if name.lower() == wanted:
                # python2.4 fix
                if not hasattr(self.ziphandle, "open"):
                    return StringIO(self.ziphandle.read(name))
                return self.ziphandle.open(name, "r")
        return None

    def _parse(self, klass, filename):
        """Instantiate *klass* and feed it archive member *filename* when that member exists."""
        instance = klass()
        filehandle = self._filehandle(filename)
        if filehandle:
            try:
                instance.parse(filehandle)
            finally:
                filehandle.close()
        return instance


class Workbook:
    """Sheet list and date system read from the workbook part."""

    def __init__(self):
        self.sheets = list()
        self.date1904 = False

    def parse(self, filehandle):
        root = minidom.parseString(filehandle.read()).documentElement
        namespace = root.namespaceURI

        self.appName = DEFAULT_APP_PATH
        file_versions = _find_elements(root, namespace, "fileVersion")
        if len(file_versions) > 0 and file_versions[0].hasAttribute("appName"):
            self.appName = file_versions[0].getAttribute("appName")

        workbook_prs = _find_elements(root, namespace, "workbookPr")
        if len(workbook_prs) > 0 and workbook_prs[0].hasAttribute("date1904"):
            # XML booleans may be spelled "false" or "0".
            flag = workbook_prs[0].getAttribute("date1904").lower().strip()
            self.date1904 = flag not in ("false", "0")

        sheets_elements = _find_elements(root, namespace, "sheets")
        if len(sheets_elements) == 0:
            return
        sheet_nodes = _find_elements(sheets_elements[0], namespace, "sheet")
        for i, sheet_node in enumerate(sheet_nodes):
            name = sheet_node.getAttribute("name")
            relation_id = None
            if sheet_node.hasAttribute('r:id'):
                relation_id = sheet_node.getAttribute('r:id')
            self.sheets.append({'name': name, 'relation_id': relation_id, 'index': i + 1,
                                'id': i + 1})  # remove id starting 0.8.0 version


class ContentTypes:
    """Part names declared in [Content_Types].xml, keyed by the names in CONTENT_TYPES."""

    _OVERRIDE_KEYS = {
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml": "workbook",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml": "styles",
        # BUG preserved only last sheet
        "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml": "worksheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml": "shared_strings",
    }
    _RELATIONSHIPS_TYPE = "application/vnd.openxmlformats-package.relationships+xml"

    def __init__(self):
        self.types = dict.fromkeys(CONTENT_TYPES)

    def parse(self, filehandle):
        types = minidom.parseString(filehandle.read()).documentElement
        if types is None:
            return
        for override in _find_elements(types, types.namespaceURI, "Override"):
            content_type = override.getAttribute('ContentType')
            name = override.getAttribute('PartName')
            if content_type == self._RELATIONSHIPS_TYPE:
                if self.types["relationships"] is None:
                    self.types["relationships"] = list()
                self.types["relationships"].append(name)
            elif content_type in self._OVERRIDE_KEYS:
                self.types[self._OVERRIDE_KEYS[content_type]] = name

        if self.types["workbook"] is None:
            self.types["workbook"] = DEFAULT_WORKBOOK_PATH

        if self.types["relationships"] is None:
            self.types["relationships"] = [os.path.dirname(self.types["workbook"]) + "/_rels/" +
                                           os.path.basename(self.types["workbook"]) + ".rels"]


class Relationships:
    """Relationship id -> {"type", "target"} mapping read from a .rels part."""

    def __init__(self):
        self.relationships = {}

    def parse(self, filehandle):
        doc = minidom.parseString(filehandle.read())
        relationships = _find_elements(doc, doc.namespaceURI, "Relationships")
        if not relationships:
            return
        for rel in _find_elements(relationships[0], doc.namespaceURI, "Relationship"):
            rel_id = rel.getAttribute('Id')
            if not rel_id:
                continue
            vtype = rel.getAttribute('Type')
            target = rel.getAttribute('Target')
            self.relationships[str(rel_id)] = {
                "type": vtype and str(vtype) or None,
                "target": target and str(target) or None
            }


class Styles:
    """Custom number formats and the per-style number format ids from the styles part."""

    def __init__(self):
        self.numFmts = {}
        self.cellXfs = []

    def parse(self, filehandle):
        styles = minidom.parseString(filehandle.read()).documentElement
        namespace = styles.namespaceURI

        # numFmts
        num_fmts_elements = _find_elements(styles, namespace, "numFmts")
        if len(num_fmts_elements) == 1:
            for num_fmt in num_fmts_elements[0].childNodes:
                if num_fmt.nodeType == minidom.Node.ELEMENT_NODE:
                    num_fmt_id = int(num_fmt.getAttribute('numFmtId'))
                    format_code = num_fmt.getAttribute('formatCode').lower().replace('\\', '')
                    self.numFmts[num_fmt_id] = format_code

        cell_xfs_elements = _find_elements(styles, namespace, "cellXfs")
        if len(cell_xfs_elements) == 1:
            for xf in cell_xfs_elements[0].childNodes:
                if xf.nodeType != minidom.Node.ELEMENT_NODE or not (
                        xf.nodeName == "xf" or xf.nodeName.endswith(":xf")):
                    continue
                if not xf.hasAttribute('numFmtId'):
                    self.cellXfs.append(None)
                    continue
                num_fmt_id = int(xf.getAttribute('numFmtId'))
                # When Unknown Numformat ID assign applyNumberFormat
                if self.chk_exists(num_fmt_id) is None and xf.hasAttribute('applyNumberFormat'):
                    try:
                        num_fmt_id = int(xf.getAttribute('applyNumberFormat'))
                    except ValueError:
                        # e.g. "true"/"false": keep the declared id
                        pass
                self.cellXfs.append(num_fmt_id)

    def chk_exists(self, numFmtId):
        """Return the format code for *numFmtId* (custom first, then built-in), or None."""
        if numFmtId in self.numFmts:
            return self.numFmts[numFmtId]
        return STANDARD_FORMATS.get(numFmtId)


class SharedStrings:
    """Shared string table, parsed incrementally with expat."""

    def __init__(self):
        self.parser = None
        self.strings = []
        self.si = False
        self.t = False
        self.rPh = False
        self.value = ""

    def parse(self, filehandle):
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.CharacterDataHandler = self.handleCharData
        self.parser.StartElementHandler = self.handleStartElement
        self.parser.EndElementHandler = self.handleEndElement
        self.parser.ParseFile(filehandle)

    def escape_strings(self):
        """Replace CR, LF and TAB in every string by the text \\r, \\n and \\t."""
        self.strings = [s.replace("\r", "\\r").replace("\n", "\\n").replace("\t", "\\t")
                        for s in self.strings]

    def handleCharData(self, data):
        if self.t:
            self.value += data

    def handleStartElement(self, name, attrs):
        # ignore namespace
        i = name.find(":")
        if i >= 0:
            name = name[i + 1:]

        if name == 'si':
            self.si = True
            self.value = ""
        elif name == 't' and self.rPh:
            # text inside a phonetic run is not part of the string
            self.t = False
        elif name == 't' and self.si:
            self.t = True
        elif name == 'rPh':
            self.rPh = True

    def handleEndElement(self, name):
        # ignore namespace
        i = name.find(":")
        if i >= 0:
            name = name[i + 1:]

        if name == 'si':
            self.si = False
            self.strings.append(self.value)
        elif name == 't':
            self.t = False
        elif name == 'rPh':
            self.rPh = False


class Sheet:
    """Streams one worksheet part into a csv writer."""

    def __init__(self, workbook, sharedString, styles, filehandle):
        self.py3 = sys.version_info[0] == 3
        self.parser = None
        self.writer = None
        self.sharedString = None
        self.styles = None
        self.relationships = None
        self.columns_count = -1

        self.in_sheet = False
        self.in_row = False
        self.in_cell = False
        self.in_cell_value = False

        self.columns = {}
        self.lastRowNum = 0
        self.rowNum = None
        self.colType = None
        self.cellId = None
        self.s_attr = None
        self.data = None
        self.max_columns = -1
        # Initialised here so the handlers never hit a missing attribute.
        self.collected_string = ""
        self.spans = None
        self.scifloat = False

        self.dateformat = None
        self.timeformat = "%H:%M"  # default time format
        self.floatformat = None
        self.skip_empty_lines = False
        self.skip_trailing_columns = False

        self.filedata = None
        self.filehandle = filehandle
        self.workbook = workbook
        self.sharedStrings = sharedString.strings
        self.styles = styles

        self.hyperlinks = {}
        self.mergeCells = {}
        self.ignore_formats = []

        self.colIndex = 0
        self.colNum = ""

    def close(self):
        # Make sure Worksheet is closed, parsers lib does not have a close() function, so simply delete it
        self.parser = None

    def set_dateformat(self, dateformat):
        self.dateformat = dateformat

    def set_timeformat(self, timeformat):
        if timeformat:
            self.timeformat = timeformat

    def set_floatformat(self, floatformat):
        self.floatformat = floatformat

    def set_skip_empty_lines(self, skip):
        self.skip_empty_lines = skip

    def set_skip_trailing_columns(self, skip):
        self.skip_trailing_columns = skip

    def set_ignore_formats(self, ignore_formats):
        self.ignore_formats = ignore_formats

    def set_scifloat(self, scifloat):
        self.scifloat = scifloat

    def _filedata_text(self):
        """Read the whole worksheet (once, into self.filedata) and return it as text."""
        if not self.filedata:
            self.filedata = self.filehandle.read()
        data = self.filedata
        if not isinstance(data, str):
            # python3: zip members are read as bytes
            data = data.decode("utf-8", "replace")
        return data

    def _parse_section(self, open_tag, close_tag):
        """Parse the *open_tag* ... *close_tag* part of the worksheet.

        The section is wrapped in the worksheet's own start tag so namespace
        declarations stay valid.  Returns the wrapping element, or None when
        the worksheet has no such section.
        """
        data = self._filedata_text()
        # find worksheet tag, we need namespaces from it
        start = data.find("<worksheet")
        if start < 0:
            return None
        end = data.find(">", start)
        worksheet = data[start: end + 1]

        start = data.find(open_tag)
        if start < 0:
            return None
        end = data.find(close_tag, start)
        if end < 0:
            return None
        section = data[start: end + len(close_tag)]
        return minidom.parseString(worksheet + section + "</worksheet>").documentElement

    def set_merge_cells(self, mergecells):
        """When enabled, record for every cell of a merged range the cell it copies from."""
        if not mergecells:
            return
        doc = self._parse_section("<mergeCells", "</mergeCells>")
        if doc is None:
            # merged cells not found
            return
        for merge_cell in _find_elements(doc, doc.namespaceURI, "mergeCell"):
            if not merge_cell.hasAttribute('ref'):
                continue
            rangeStr = merge_cell.getAttribute('ref')
            rng = rangeStr.split(":")
            if len(rng) > 1:
                for cell in self._range(rangeStr):
                    self.mergeCells[cell] = {}
                    self.mergeCells[cell]['copyFrom'] = rng[0]

    def set_include_hyperlinks(self, hyperlinks):
        """When enabled, map every hyperlinked cell to its relationship target."""
        if not hyperlinks or not self.relationships or not self.relationships.relationships:
            return
        # we must read file first to get hyperlinks, but we don't want to parse whole file
        doc = self._parse_section("<hyperlinks>", "</hyperlinks>")
        if doc is None:
            # hyperlinks not found
            return
        for hlink in _find_elements(doc, doc.namespaceURI, "hyperlink"):
            ref = rId = None
            for k in hlink.attributes.keys():
                if k == "ref":
                    ref = str(hlink.getAttribute(k))
                if k.endswith(":id"):
                    rId = str(hlink.getAttribute(k))
            if not ref or not rId:
                continue
            rel = self.relationships.relationships.get(rId)
            if not rel:
                continue
            target = rel.get('target')
            for cell in self._range(ref):
                self.hyperlinks[cell] = target

    def to_csv(self, writer):
        """Parse the worksheet and write its rows to *writer*."""
        self.writer = writer
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.buffer_text = True
        self.parser.CharacterDataHandler = self.handleCharData
        self.parser.StartElementHandler = self.handleStartElement
        self.parser.EndElementHandler = self.handleEndElement
        if self.filedata:
            self.parser.Parse(self.filedata)
        else:
            self.parser.ParseFile(self.filehandle)

    def handleCharData(self, data):
        if not self.in_cell_value:
            return
        self.collected_string += data
        self.data = self.collected_string
        if self.colType == "s":  # shared string
            self.data = self.sharedStrings[int(self.data)]
        elif self.colType == "b":  # boolean
            self.data = (int(self.data) == 1 and "TRUE") or (int(self.data) == 0 and "FALSE") or self.data
        elif self.colType == "str" or self.colType == "inlineStr":
            # keep the whole collected text, not just the last chunk expat delivered
            pass
        elif self.s_attr:
            s = int(self.s_attr)

            # get cell format
            format_str = "general"
            xfs_numfmt = None
            if 0 <= s < len(self.styles.cellXfs):
                xfs_numfmt = self.styles.cellXfs[s]
            if xfs_numfmt in self.styles.numFmts:
                format_str = self.styles.numFmts[xfs_numfmt]
            elif xfs_numfmt in STANDARD_FORMATS:
                format_str = STANDARD_FORMATS[xfs_numfmt]

            # get format type
            if not format_str:
                eprint("unknown format %s at %d" % (format_str, xfs_numfmt))
                return
            format_type = None
            if format_str in FORMATS:
                format_type = FORMATS[format_str]
            elif re.match(r"^\d+(\.\d+)?$", self.data) and re.match(".*[hsmdyY]", format_str) and not re.match(
                    r'.*\[.*[dmhys].*\]', format_str):
                # it must be date format
                if float(self.data) < 1:
                    format_type = "time"
                else:
                    format_type = "date"
            # NOTE(review): the "." before the fraction digits is unescaped and
            # so accepts any character; kept as-is so output stays compatible.
            elif re.match(r"^-?\d+(.\d+)?$", self.data) or (
                    self.scifloat and re.match(r"^-?\d+(.\d+)?([eE]-?\d+)?$", self.data)):
                format_type = "float"
            if format_type == 'date' and self.dateformat == 'float':
                format_type = "float"
            if format_type and not format_type in self.ignore_formats:
                self._apply_format(format_type, format_str)

    def _apply_format(self, format_type, format_str):
        """Rewrite self.data according to *format_type* ('date', 'time' or 'float')."""
        try:
            if format_type == 'date':  # date/time
                if self.workbook.date1904:
                    date = datetime.datetime(1904, 1, 1) + datetime.timedelta(float(self.data))
                else:
                    date = datetime.datetime(1899, 12, 30) + datetime.timedelta(float(self.data))
                if self.dateformat:
                    # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782
                    self.data = date.strftime(str(self.dateformat))
                else:
                    # ignore ";@", don't know what does it mean right now
                    # ignore "[$-409], [$-f409], [$-16001]" and similar format codes
                    dateformat = re.sub(r"\[\$\-[A-z0-9]*\]", "", format_str, count=1) \
                        .replace(";@", "").replace("yyyy", "%Y").replace("yy", "%y") \
                        .replace("hh:mm", "%H:%M").replace("h", "%I").replace("%H%H", "%H") \
                        .replace("ss", "%S").replace("dddd", "d").replace("dd", "d").replace("d", "%d") \
                        .replace("am/pm", "%p").replace("mmmm", "%B").replace("mmm", "%b") \
                        .replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m")
                    self.data = date.strftime(str(dateformat)).strip()
            elif format_type == 'time':  # time
                t = int(round((float(self.data) % 1) * 24 * 60 * 60, 6))  # it should be in seconds
                d = datetime.time(int((t // 3600) % 24), int((t // 60) % 60), int(t % 60))
                self.data = d.strftime(self.timeformat)
            elif format_type == 'float' and ('E' in self.data or 'e' in self.data):
                self.data = str(self.floatformat or '%f') % float(self.data)
            # if cell is general, be aggressive about stripping any trailing 0s, decimal points, etc.
            elif format_type == 'float' and format_str == 'general':
                self.data = ("%f" % (float(self.data))).rstrip('0').rstrip('.')
            elif format_type == 'float' and format_str[0:3] == '0.0':
                if self.floatformat:
                    self.data = str(self.floatformat) % float(self.data)
                else:
                    L = len(format_str.split(".")[1])
                    if '%' in format_str:
                        L += 1
                    self.data = ("%." + str(L) + "f") % float(self.data)
            elif format_type == 'float':
                # unsupported float formatting
                self.data = ("%f" % (float(self.data))).rstrip('0').rstrip('.')
        except (ValueError, OverflowError):
            # this catch must be removed, it's hiding potential problems
            eprint("Error: potential invalid date format.")
            # invalid date format
            pass

    def handleStartElement(self, name, attrs):
        has_namespace = name.find(":") > 0
        if self.in_row and (name == 'c' or (has_namespace and name.endswith(':c'))):
            self.colType = attrs.get("t")
            self.s_attr = attrs.get("s")
            self.cellId = attrs.get("r")
            if self.cellId:
                # cell reference minus the row number leaves the column letters
                self.colNum = self.cellId[:len(self.cellId) - len(self.rowNum)]
                self.colIndex = 0
            else:
                self.colIndex += 1
            self.data = ""
            self.in_cell = True
        elif self.in_cell and (
                (name == 'v' or name == 'is') or (has_namespace and (name.endswith(':v') or name.endswith(':is')))):
            self.in_cell_value = True
            self.collected_string = ""
        elif self.in_sheet and (name == 'row' or (has_namespace and name.endswith(':row'))) and ('r' in attrs):
            self.rowNum = attrs['r']
            self.in_row = True
            self.colIndex = 0
            self.colNum = ""
            self.columns = {}
            self.spans = None
            if 'spans' in attrs:
                self.spans = [int(i) for i in attrs['spans'].split(" ")[-1].split(":")]
        elif name == 't':
            # reset collected string
            self.collected_string = ""
        elif name == 'sheetData' or (has_namespace and name.endswith(':sheetData')):
            self.in_sheet = True
        elif name == 'dimension':
            rng = (attrs.get("ref") or "").split(":")
            if len(rng) > 1:
                start = re.match(r"^([A-Z]+)(\d+)$", rng[0])
                end = re.match(r"^([A-Z]+)(\d+)$", rng[1])
                if start and end:
                    startCol = start.group(1)
                    endCol = end.group(1)
                    # number of columns spanned by the declared dimension
                    self.columns_count = 0
                    for cell in self._range(startCol + "1:" + endCol + "1"):
                        self.columns_count += 1

    def handleEndElement(self, name):
        has_namespace = name.find(":") > 0
        if self.in_cell and ((name == 'v' or name == 'is' or name == 't') or (
                has_namespace and (name.endswith(':v') or name.endswith(':is')))):
            self.in_cell_value = False
        elif self.in_cell and (name == 'c' or (has_namespace and name.endswith(':c'))):
            # column letters -> 1-based column number
            t = 0
            for i in self.colNum:
                t = t * 26 + ord(i) - 64
            d = self.data
            if self.hyperlinks:
                hyperlink = self.hyperlinks.get(self.cellId)
                if hyperlink:
                    d = "<a href='" + hyperlink + "'>" + d + "</a>"
            cell_key = self.colNum + self.rowNum
            if cell_key in self.mergeCells:
                origin = self.mergeCells[cell_key].get('copyFrom')
                if origin == cell_key:
                    self.mergeCells[cell_key]['value'] = d
                else:
                    # the origin cell may be absent from the sheet or empty
                    d = self.mergeCells.get(origin, {}).get('value', "")
            self.columns[t - 1 + self.colIndex] = d

        if self.in_row and (name == 'row' or (has_namespace and name.endswith(':row'))):
            if len(self.columns.keys()) > 0:
                d = [""] * (max(self.columns.keys()) + 1)
                for k in self.columns.keys():
                    val = self.columns[k]
                    if not self.py3:
                        val = val.encode("utf-8")
                    d[k] = val
                if self.spans:
                    l = self.spans[1]
                    if len(d) < l:
                        d += (l - len(d)) * ['']

                # write empty lines
                if not self.skip_empty_lines:
                    for i in range(self.lastRowNum, int(self.rowNum) - 1):
                        self.writer.writerow([])
                    self.lastRowNum = int(self.rowNum)

                # write line to csv
                if not self.skip_empty_lines or d.count('') != len(d):
                    while len(d) < self.columns_count:
                        d.append("")

                    if self.skip_trailing_columns:
                        if self.max_columns < 0:
                            # first written row fixes the width for all later rows
                            self.max_columns = len(d)
                            while len(d) > 0 and d[-1] == "":
                                d = d[0:-1]
                                self.max_columns = self.max_columns - 1
                        elif self.max_columns > 0:
                            d = d[0:self.max_columns]
                    self.writer.writerow(d)
            self.in_row = False
        elif self.in_sheet and (name == 'sheetData' or (has_namespace and name.endswith(':sheetData'))):
            self.in_sheet = False

    # rangeStr: "A3:C12" or "D5"
    # example: for cell in _range("A1:Z12"): print cell
    def _range(self, rangeStr):
        """Yield every cell reference in *rangeStr*, column by column."""
        rng = rangeStr.split(":")
        if len(rng) == 1:
            yield rangeStr
        else:
            start = re.match(r"^([A-Z]+)(\d+)$", rng[0])
            end = re.match(r"^([A-Z]+)(\d+)$", rng[1])
            if not start or not end:
                return
            startCol = start.group(1)
            startRow = int(start.group(2))
            endCol = end.group(1)
            endRow = int(end.group(2))
            col = startCol
            while True:
                for row in range(startRow, endRow + 1):
                    yield col + str(row)
                if col == endCol:
                    break
                # advance to the next column name (A..Z, AA, AB, ...)
                t = 0
                for i in col:
                    t = t * 26 + ord(i) - 64
                col = ""
                while t >= 0:
                    col = chr(t % 26 + 65) + col
                    t = t // 26 - 1


def convert_recursive(path, sheetid, outfile, kwargs):
    """Convert every file below directory *path*.

    With an empty (or None) *outfile*, each ``.xlsx`` file is written next to
    itself with a ``.csv`` extension and other files are skipped, since no
    output name can be derived for them.  Otherwise every file is converted
    to *outfile*.  Files that are not valid zip archives are reported and
    skipped instead of aborting the whole run.
    """
    for name in os.listdir(path):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            convert_recursive(fullpath, sheetid, outfile, kwargs)
            continue

        outfilepath = outfile
        if not outfilepath:
            if not fullpath.lower().endswith(".xlsx"):
                continue
            outfilepath = fullpath[:-4] + 'csv'

        print("Converting %s to %s" % (fullpath, outfilepath))
        try:
            Xlsx2csv(fullpath, **kwargs).convert(outfilepath, sheetid)
        except (zipfile.BadZipfile, InvalidXlsxFileException):
            print("File %s is not a zip file" % fullpath)


def _char_from_code(value):
    """Return the character for an 'xNN' option (NN = decimal code), or None when malformed."""
    if not value.startswith('x'):
        return None
    try:
        return chr(int(value[1:]))
    except (ValueError, OverflowError):
        return None


def _fail(message):
    """Write *message* to stderr and exit with status 1."""
    sys.stderr.write(message)
    sys.exit(1)


def main():
    """Command line entry point."""
    try:
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
        signal.signal(signal.SIGINT, signal.SIG_DFL)
    except AttributeError:
        # SIGPIPE is not available on every platform
        pass

    if "ArgumentParser" in globals():
        parser = ArgumentParser(description="xlsx to csv converter")
        parser.add_argument('infile', metavar='xlsxfile', help="xlsx file path")
        parser.add_argument('outfile', metavar='outfile', nargs='?', help="output csv file path")
        parser.add_argument('-v', '--version', action='version', version=__version__)
        nargs_plus = "+"
        argparser = True
    else:
        parser = OptionParser(usage="%prog [options] infile [outfile]", version=__version__)
        parser.add_argument = parser.add_option
        nargs_plus = 1
        argparser = False

    if sys.version_info[0] == 2 and sys.version_info[1] < 5:
        inttype = "int"
    else:
        inttype = int
    parser.add_argument("-a", "--all", dest="all", default=False, action="store_true",
                        help="export all sheets")
    parser.add_argument("-c", "--outputencoding", dest="outputencoding", default="utf-8", action="store",
                        help="encoding of output csv ** Python 3 only ** (default: utf-8)")
    parser.add_argument("-d", "--delimiter", dest="delimiter", default=",",
                        help="delimiter - columns delimiter in csv, 'tab' or 'x09' for a tab (default: comma ',')")
    parser.add_argument("--hyperlinks", dest="hyperlinks", action="store_true", default=False,
                        help="include hyperlinks")
    parser.add_argument("-e", "--escape", dest='escape_strings', default=False, action="store_true",
                        help="Escape \\r\\n\\t characters")
    parser.add_argument("-E", "--exclude_sheet_pattern", nargs=nargs_plus, dest="exclude_sheet_pattern", default="",
                        help="exclude sheets named matching given pattern, only effects when -a option is enabled.")
    parser.add_argument("-f", "--dateformat", dest="dateformat",
                        help="override date/time format (ex. %%Y/%%m/%%d)")
    parser.add_argument("-t", "--timeformat", dest="timeformat",
                        help="override time format (ex. %%H/%%M/%%S)")
    parser.add_argument("--floatformat", dest="floatformat",
                        help="override float format (ex. %%.15f)")
    parser.add_argument("--sci-float", dest="scifloat", default=False, action="store_true",
                        help="force scientific notation to float")
    parser.add_argument("-I", "--include_sheet_pattern", nargs=nargs_plus, dest="include_sheet_pattern",
                        default="^.*$",
                        help="only include sheets named matching given pattern, only effects when -a option is enabled.")
    parser.add_argument("--ignore-formats", nargs=nargs_plus, type=str, dest="ignore_formats", default=[''],
                        help="Ignores format for specific data types.")
    parser.add_argument("-l", "--lineterminator", dest="lineterminator", default="\n",
                        help="line terminator - lines terminator in csv, '\\n' '\\r\\n' or '\\r' (default: \\n)")
    parser.add_argument("-m", "--merge-cells", dest="merge_cells", default=False, action="store_true",
                        help="merge cells")
    parser.add_argument("-n", "--sheetname", dest="sheetname", default=None,
                        help="sheet name to convert")
    parser.add_argument("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true",
                        help="skip empty lines")
    parser.add_argument("--skipemptycolumns", dest="skip_trailing_columns", default=False, action="store_true",
                        help="skip trailing empty columns")
    parser.add_argument("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------",
                        help="sheet delimiter used to separate sheets, pass '' if you do not need delimiter, or 'x07' "
                             "or '\\f' for form feed (default: '--------')")
    parser.add_argument("-q", "--quoting", dest="quoting", default="minimal",
                        help="quoting - fields quoting in csv, 'none' 'minimal' 'nonnumeric' or 'all' (default: minimal)")
    parser.add_argument("-s", "--sheet", dest="sheetid", default=1, type=inttype,
                        help="sheet number to convert")

    if argparser:
        options = parser.parse_args()
    else:
        (options, args) = parser.parse_args()
        if len(args) < 1:
            parser.print_usage()
            sys.stderr.write("error: too few arguments" + os.linesep)
            sys.exit(1)
        options.infile = args[0]
        options.outfile = len(args) > 1 and args[1] or None

    if len(options.delimiter) == 1:
        pass
    elif options.delimiter == 'tab' or options.delimiter == '\\t':
        options.delimiter = '\t'
    elif options.delimiter == 'comma':
        options.delimiter = ','
    else:
        options.delimiter = _char_from_code(options.delimiter)
        if options.delimiter is None:
            _fail("error: invalid delimiter\n")

    quotings = {
        'none': csv.QUOTE_NONE,
        'minimal': csv.QUOTE_MINIMAL,
        'nonnumeric': csv.QUOTE_NONNUMERIC,
        'all': csv.QUOTE_ALL,
    }
    if options.quoting not in quotings:
        _fail("error: invalid quoting\n")
    options.quoting = quotings[options.quoting]

    lineterminators = {
        '\n': '\n',
        '\\n': '\n',
        '\\r': '\r',
        '\\r\\n': '\r\n',
    }
    if options.lineterminator not in lineterminators:
        _fail("error: invalid line terminator\n")
    options.lineterminator = lineterminators[options.lineterminator]

    if options.sheetdelimiter == '--------':
        pass
    elif options.sheetdelimiter == '':
        pass
    elif options.sheetdelimiter == '\\f':
        options.sheetdelimiter = '\f'
    else:
        options.sheetdelimiter = _char_from_code(options.sheetdelimiter)
        if options.sheetdelimiter is None:
            _fail("error: invalid sheet delimiter\n")

    kwargs = {
        'delimiter': options.delimiter,
        'quoting': options.quoting,
        'sheetdelimiter': options.sheetdelimiter,
        'dateformat': options.dateformat,
        'timeformat': options.timeformat,
        'floatformat': options.floatformat,
        'scifloat': options.scifloat,
        'skip_empty_lines': options.skip_empty_lines,
        'skip_trailing_columns': options.skip_trailing_columns,
        'escape_strings': options.escape_strings,
        'hyperlinks': options.hyperlinks,
        'include_sheet_pattern': options.include_sheet_pattern,
        'exclude_sheet_pattern': options.exclude_sheet_pattern,
        'merge_cells': options.merge_cells,
        'outputencoding': options.outputencoding,
        'lineterminator': options.lineterminator,
        'ignore_formats': options.ignore_formats
    }
    sheetid = options.sheetid
    if options.all:
        sheetid = 0

    outfile = options.outfile or sys.stdout
    try:
        if os.path.isdir(options.infile):
            # Without an explicit outfile each workbook is written next to
            # itself, so pass "" rather than the stdout handle.
            convert_recursive(options.infile, sheetid, options.outfile or "", kwargs)
        else:
            xlsx2csv = Xlsx2csv(options.infile, **kwargs)
            if options.sheetname:
                sheetid = xlsx2csv.getSheetIdByName(options.sheetname)
                if not sheetid:
                    raise XlsxException("Sheet '%s' not found" % options.sheetname)
            xlsx2csv.convert(outfile, sheetid)
    except XlsxException:
        _, e, _ = sys.exc_info()
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)


if __name__ == "__main__":
    main()