#Author: Keir Vaughan-taylor #MonFeb142011 #Program klaatu6.py Converts XML to mysql database #Repeated data items within a tag are written with key to data table import MySQLdb import xml.dom.minidom import sys, os import re from unidecode import unidecode def filterUnicode(vi): # Input string vi may contain unicode characters, replace each unicode string with an interpretation cleaned='' for i in range(0,len(vi)): #Replace unicode chars with something close if ord(vi[i]) > 127: cleaned+=unidecode(vi[i]) else: cleaned+=vi[i] return cleaned def dataClean(chl): # Each value in childList is to be added to SQL table however must clean up field data chl=chl.strip() chl=filterUnicode(chl) #Translate unicode characters to something recognisable if chl=='': # Test is data field is empty, if so replace with keyword None chl='None' if chl[-1]==chr(92): #If last chr of data is backslash, stop escape of following apostrophe chl+=chr(92) # replace \' with single ' then all single ' with a \' val= "'" + chl.replace(chr(92)+"'","'").replace("'",chr(92)+"\'") + "'" return val def addAttributes(subnode,fieldList,childList): # Any attribute data to be turned into table fields, there is only one attribute per tag # This bit should perhaps handle all attribute in the heirarchy but so far only the immediate parent tname=subnode.nodeName #try: # if subnode.parentNode.hasAttributes(): # modFldName=subnode.parentNode.nodeName+subnode.parentNode.attributes.keys()[0] # childList[modFldName]=subnode.parentNode.attributes.values()[0].value # dataType[modFldName]='attribute' #except: # pass if subnode.hasAttributes(): # Turn attribute into field. Ensure not already a name of a field subats=subnode.attributes attribName=subats.keys()[0] # Get the name of the attribute turn it into a field name fld=attribName if fld in childList: fld='attr:'+attribName fld=tname+"_"+fld childList[fld]=subnode.attributes.values()[0].value dataType[fld]='attribute' def emitSQL(tname,fieldList,inscolNames,colItems,valItems): global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId # tname is current table name if tname not in createCmds: createCmds[tname]=colItems # Data names associated with table name flddef='' for field in colItems: flddef+="`"+field+'` text, ' # Construct part of create command #if field in fieldList: del fieldList[field] flddef=flddef[0:-2] flddef="create table `"+tname+"`"+ "(" + "dirPath varchar(255), fulltext(dirPath), xmlFile varchar(64), fulltext(xmlFile),xmlPath varchar(255), fulltext(xmlPath), tabNo int, " + flddef + ');' try: scocur.execute(flddef) except: errorLog.write("Error creating table:"+tname+"\n") errorLog.write(flddef+"\n") else: # Table is already created and may need to be altered for field in colItems: if field not in createCmds[tname]: # New field may have been found in this xml tag if dataType[field] != 'repeat': createCmds[tname].append(field) altcmd="alter table `%s` add column `%s` text;" % (tname,field) try: scocur.execute(altcmd) except: errorLog.write("Error altering table:"+tname+"\n") errorLog.write(altcmd+"\n") #if field in fieldList: del fieldList[field] valItems=filterUnicode(valItems[0:-2] + ')') #Replace pesky unicode chars with ascii equivalent inscolNames=inscolNames[0:-2] + ')' sql="insert into `" + tname + "`" + inscolNames + " values " + valItems + ';' #print sql try: scocur.execute(sql) # Each table record get an integer Id tableId+=1 fieldList={} except MySQLdb.Error, e: errorLog.write("Error with inserting into table:"+tname+"\n") errorLog.write("Err:%d %s\n" % (e.args[0],e.args[1])) errorLog.write(sql + "\n") def sameNameField(subnode,tableEchalon,dirPath,xmlFile,field,dupFieldCount,val): # Data items table for repeated or list data # Each time a field with the same name repeats write field to dataItems table indexed by table Id # dupFieldCount is the unique table id. global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId tname=subnode.nodeName if field in dupFieldCount and dupFieldCount[field] > 1 or subnode.hasAttributes(): #Field items may have an attribute attached. In this case the data item is put in the # data table with the the extra attribute field name and value attName="None" attVal="None" if subnode.hasAttributes(): attName=subnode.attributes.keys()[0] attVal=subnode.attributes.values()[0].value # repeat data table cmd="insert into %s (dirPath, xmlFile, xmlPath, tabNo, name , val, attName,attValue) values \ ('%s','%s','%s',%s, '%s', %s,'%s','%s');" % \ (dataTableName,dirPath,xmlFile,tableEchalon,str(tableId),field,val,attName,attVal) #print cmd try: # print cmd scocur.execute(cmd) # For repeat items, lists, add the item to the dataItem table except: errorLog.write("Error inserting repeat item into table:"+dataTableName+"\n") errorLog.write(cmd+"\n\n") return True else: return False def make_SQL(nodelist,parentName,dirPath,xmlFile): # Open database connection # Traverse XML tree build a list of tables to create and each tables field name entries # tableDict contains the name of a table and any field names contained in that table # The table name maps to a tag found in the XML global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId # Pre scan of nodes to identify repeat fields. These fields will be put in a data table dupFieldCount={} for subnode in nodelist: if (subnode.nodeType == subnode.ELEMENT_NODE): tname=subnode.nodeName dupFieldCount[tname]=dupFieldCount.get(tname,-1)+1 fieldList={} for subnode in nodelist: tname=subnode.nodeName if (subnode.nodeType == subnode.ELEMENT_NODE): #Test if on a leaf if parentName == '': tableEchalon=filterUnicode(tname) else: tableEchalon=parentName + '_' + filterUnicode(tname) # Assume each tag name is representative of only one table # If tag name has not previously been found then add the name to the list of tables if tableDict.get(tname,None) == None: tableDict[tname]=[] # Collect the names of the members of this node for return to upper call if len(subnode.childNodes) > 0 and tname not in dataType: dataType[tname]='table' # ------------------------------------------------------------------------------------------------------------------ # recursive call childList=make_SQL(subnode.childNodes,tableEchalon,dirPath,xmlFile) #Gets child names and values as a dictionary # Childlist will be empty unless on a leaf node or a table that owns a set of data items. if len(childList) > 0: if dataType[tname]!='data': addAttributes(subnode,fieldList,childList) # XML attribute data converts to fields # format of the first fields in each table containing data is; # XML heirachial path the keys representing the directory heirarchy # path name of the XML source file inscolNames="(dirPath, xmlFile, xmlPath, tabNo, " # Sets up path name fields # initialise path name field's values # TableId is an integer so no quotes valItems="('%s','%s','%s',%s," % (dirPath,xmlFile,tableEchalon,tableId) colItems=[] # Here, if subnode is a table containing data items or is a data item for field in childList: # Accumulate a list data names and values if field in dataType and dataType[field] != 'repeat': colItems.append(field) # Accumulate field names for insert inscolNames+="`"+field + '`, ' # Construct field names part of insert command # Data cleaning # Each value in childList is to be added to SQL table however must clean up field data val=dataClean(childList[field]) if dataType[tname]!='table': if not sameNameField(subnode,tableEchalon,dirPath,xmlFile,field,dupFieldCount,val): fieldList[field]=childList[field] # associating field names and returned values else: fieldList[field]=field + "s in " + dataTableName # Indicate repeat field in data table valItems+=val+", " # If not a repeat dataItem add as a field in this table # Accumulate all field names that are used in any table if dataType[tname]=='table': # At a table with data items in it emitSQL(tname,fieldList,inscolNames,colItems,valItems) else: # At this point the procedure call is on one of the leaves of the XML tree # Test if nodeValue is blank, this is not a good item to store ws=1 for i in subnode.nodeValue: if i>' ': ws=0 if ws == 0: # Item non blank fieldList[subnode.parentNode.nodeName]=subnode.nodeValue dataType[subnode.parentNode.nodeName]='data' return fieldList #Main main MAIN global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId paramfile="klaatuParams.txt" if os.path.exists(paramfile): if os.path.isfile(paramfile): pass else: print paramfile+ " is not a text file." sys.exit('Incorrect file input') else: sys.exit("Parameter file does not exist") paramfd=open(paramfile,"r") paradict={} for dictentry in paramfd.readlines(): keywords=dictentry.split() paradict[keywords[0]]=keywords[1] paramfd.close() #Each table with a set of data items will be keyed with the directory structure and file name basedir=paradict['basedir'] if not os.path.isdir(basedir): sys.exit("XML source directory "+basedir+" not found") if not paradict.has_key('admin'): sys.exit("Paramater file requires admin user") if not paradict.has_key('adminpw'): sys.exit("Paramater file requires admin password") passw=paradict['password'] dbUser=paradict['databaseUser'] dbName=paradict['databaseName'] adminUser=paradict['admin'] adminpw=paradict['adminpw'] dataTableName='datTable' dataKeyNameType='' createCmds={} dataType={} tableId=0 tableDict={} errorLog=open("errorlog.txt","w") proglog=open("progresLog","w") #The XML file location is split ito its directory name path parts that it may be used # as a key to access any data item set db = MySQLdb.connect("localhost",adminUser,passwd = adminpw) cur=db.cursor() cur.execute('create database if not exists ' + dbName) cur = db.cursor() cmd='GRANT ALL ON %s.* TO %s@localhost IDENTIFIED BY "%s"' % (dbName,dbUser,passw) #print cmd cur.execute(cmd) db.close() scodb = MySQLdb.connect (host = "localhost", user = dbUser, passwd = passw, db = dbName) scocur=scodb.cursor() # Generic data item storage for multiple dataitems with the same XML tag name cmd='create table if not exists '+ dataTableName+ ' (dirPath varchar(255), fulltext(dirPath), xmlFile varchar(64), fulltext(xmlFile), xmlPath varchar(255), fulltext(xmlPath), tabNo int, index(tabNo), name varchar(128), val varchar(256),attName varchar(128),attValue varchar(128));' scocur.execute(cmd) for root, dirs, files in os.walk(basedir): if root.count('/')==1: proglog.write(root+"\n") proglog.flush() if len(files) > 0: keyparts=root.split('/') if root[0]=='/': keyparts.pop(0) knmbr=len(keyparts) dataPathKey='' for keynum in range(0, knmbr): #Construct path name as part of sql key field dataPathKey+=keyparts[keynum]+':' dataPathKey=dataPathKey[:-1] # remove trailing colon for xmlFile in files: if os.path.splitext(xmlFile)[1]=='.xml': # only process files with xml extention # Each table contains its directory path with each part of the path as a field dirPath=root+'/' targetFile=dirPath+xmlFile # preliminary parse through file removing anything undesirable srcXMLfd = open(targetFile,"r") srcText=srcXMLfd.read() # xmlText=re.sub(".*?","",srcText) # Mid data item modifier to be removed xmlText=re.sub("(?:.*?|.)*?","",srcText) # Two possible layers of recursion srcXMLfd.close() print targetFile dc = xml.dom.minidom.parseString(xmlText) make_SQL(dc.childNodes,"",dirPath,xmlFile) scodb.close() errorLog.close() proglog.close() print "klaatu6 SQL build complete"