It’s no secret. Excel is abused and misused in so many different ways and the most rough abuse is disregarding Excel’s inherent 2D table structure and using it to collect data as an excel based form.
Recently had a task where I faced two hundred or so such forms, from where needed to pull the data. Turned to my favourite hammer, Python.
And with xlrd, this was as easy as cutting a cake. Here’s three functions that did the job.
GetFilenames collects all the filenames including path given a top level folder. Getdata collects the required data from the given row, col and writeOut dumps all the collected data into a csv file as a 2D table.
from os import walk from os.path import join def GetFilenames(mypath): return (join(dirpath,f) for (dirpath, dirnames, filenames) in walk(mypath) for f in filenames if f.endswith(('.xls','.xlsx')))
import xlrd def getdata(fname,listx,listy): """ Reads data from excel sheet fname excel file name listx=[2,4,6,8,10,12,14,17,19,21,27,29,2,6,8,10,4] # Rows index listy=[1,1,1,1,1,1,1,1,1,1,1,1,4,4,4,4,4] # Column index from which data has to be extracted returns dataList containing the extracted info from excel. """ dataList=[] # initial an empty list book=xlrd.open_workbook(fname) # open workbook sheet=book.sheet_by_index(0) # Get first sheet dataList.append(fname) for x,y in zip(listx,listy): # walk to all the row,col combination and collect data if sheet.cell_type(x,y) <> 3: # if its not a data data dataList.append(sheet.cell_value(x,y)) else: # if its a date data datetup =xlrd.xldate_as_tuple(sheet.cell_value(x,y),book.datemode) dataList.append(datetup[0]) dataList.append(datetup[1]) dataList.append(datetup[2]) return dataList
import csv def writeOut(fname,files): f = open(fname,"wb") writer=csv.writer(f) for f in files: writer.writerow(getdata(f)) print fname,"written !!" return None