#Author: Keir Vaughan-taylor
#MonFeb142011
#Program klaatu6.py Converts XML to mysql database
#Repeated data items within a tag are written with key to data table
import MySQLdb
import xml.dom.minidom
import sys, os
import re
from unidecode import unidecode
def filterUnicode(vi):
# Input string vi may contain unicode characters, replace each unicode string with an interpretation
cleaned=''
for i in range(0,len(vi)): #Replace unicode chars with something close
if ord(vi[i]) > 127: cleaned+=unidecode(vi[i])
else: cleaned+=vi[i]
return cleaned
def dataClean(chl):
# Each value in childList is to be added to SQL table however must clean up field data
chl=chl.strip()
chl=filterUnicode(chl) #Translate unicode characters to something recognisable
if chl=='': # Test is data field is empty, if so replace with keyword None
chl='None'
if chl[-1]==chr(92): #If last chr of data is backslash, stop escape of following apostrophe
chl+=chr(92)
# replace \' with single ' then all single ' with a \'
val= "'" + chl.replace(chr(92)+"'","'").replace("'",chr(92)+"\'") + "'"
return val
def addAttributes(subnode,fieldList,childList):
# Any attribute data to be turned into table fields, there is only one attribute per tag
# This bit should perhaps handle all attribute in the heirarchy but so far only the immediate parent
tname=subnode.nodeName
#try:
# if subnode.parentNode.hasAttributes():
# modFldName=subnode.parentNode.nodeName+subnode.parentNode.attributes.keys()[0]
# childList[modFldName]=subnode.parentNode.attributes.values()[0].value
# dataType[modFldName]='attribute'
#except:
# pass
if subnode.hasAttributes(): # Turn attribute into field. Ensure not already a name of a field
subats=subnode.attributes
attribName=subats.keys()[0] # Get the name of the attribute turn it into a field name
fld=attribName
if fld in childList:
fld='attr:'+attribName
fld=tname+"_"+fld
childList[fld]=subnode.attributes.values()[0].value
dataType[fld]='attribute'
def emitSQL(tname,fieldList,inscolNames,colItems,valItems):
global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId
# tname is current table name
if tname not in createCmds:
createCmds[tname]=colItems # Data names associated with table name
flddef=''
for field in colItems:
flddef+="`"+field+'` text, ' # Construct part of create command
#if field in fieldList: del fieldList[field]
flddef=flddef[0:-2]
flddef="create table `"+tname+"`"+ "(" + "dirPath varchar(255), fulltext(dirPath), xmlFile varchar(64), fulltext(xmlFile),xmlPath varchar(255), fulltext(xmlPath), tabNo int, " + flddef + ');'
try:
scocur.execute(flddef)
except:
errorLog.write("Error creating table:"+tname+"\n")
errorLog.write(flddef+"\n")
else: # Table is already created and may need to be altered
for field in colItems:
if field not in createCmds[tname]: # New field may have been found in this xml tag
if dataType[field] != 'repeat':
createCmds[tname].append(field)
altcmd="alter table `%s` add column `%s` text;" % (tname,field)
try:
scocur.execute(altcmd)
except:
errorLog.write("Error altering table:"+tname+"\n")
errorLog.write(altcmd+"\n")
#if field in fieldList: del fieldList[field]
valItems=filterUnicode(valItems[0:-2] + ')') #Replace pesky unicode chars with ascii equivalent
inscolNames=inscolNames[0:-2] + ')'
sql="insert into `" + tname + "`" + inscolNames + " values " + valItems + ';'
#print sql
try:
scocur.execute(sql)
# Each table record get an integer Id
tableId+=1
fieldList={}
except MySQLdb.Error, e:
errorLog.write("Error with inserting into table:"+tname+"\n")
errorLog.write("Err:%d %s\n" % (e.args[0],e.args[1]))
errorLog.write(sql + "\n")
def sameNameField(subnode,tableEchalon,dirPath,xmlFile,field,dupFieldCount,val):
# Data items table for repeated or list data
# Each time a field with the same name repeats write field to dataItems table indexed by table Id
# dupFieldCount is the unique table id.
global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId
tname=subnode.nodeName
if field in dupFieldCount and dupFieldCount[field] > 1 or subnode.hasAttributes():
#Field items may have an attribute attached. In this case the data item is put in the
# data table with the the extra attribute field name and value
attName="None"
attVal="None"
if subnode.hasAttributes():
attName=subnode.attributes.keys()[0]
attVal=subnode.attributes.values()[0].value
# repeat data table
cmd="insert into %s (dirPath, xmlFile, xmlPath, tabNo, name , val, attName,attValue) values \
('%s','%s','%s',%s, '%s', %s,'%s','%s');" % \
(dataTableName,dirPath,xmlFile,tableEchalon,str(tableId),field,val,attName,attVal)
#print cmd
try:
# print cmd
scocur.execute(cmd) # For repeat items, lists, add the item to the dataItem table
except:
errorLog.write("Error inserting repeat item into table:"+dataTableName+"\n")
errorLog.write(cmd+"\n\n")
return True
else:
return False
def make_SQL(nodelist,parentName,dirPath,xmlFile):
# Open database connection
# Traverse XML tree build a list of tables to create and each tables field name entries
# tableDict contains the name of a table and any field names contained in that table
# The table name maps to a tag found in the XML
global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId
# Pre scan of nodes to identify repeat fields. These fields will be put in a data table
dupFieldCount={}
for subnode in nodelist:
if (subnode.nodeType == subnode.ELEMENT_NODE):
tname=subnode.nodeName
dupFieldCount[tname]=dupFieldCount.get(tname,-1)+1
fieldList={}
for subnode in nodelist:
tname=subnode.nodeName
if (subnode.nodeType == subnode.ELEMENT_NODE): #Test if on a leaf
if parentName == '': tableEchalon=filterUnicode(tname)
else:
tableEchalon=parentName + '_' + filterUnicode(tname)
# Assume each tag name is representative of only one table
# If tag name has not previously been found then add the name to the list of tables
if tableDict.get(tname,None) == None: tableDict[tname]=[]
# Collect the names of the members of this node for return to upper call
if len(subnode.childNodes) > 0 and tname not in dataType: dataType[tname]='table'
# ------------------------------------------------------------------------------------------------------------------
# recursive call
childList=make_SQL(subnode.childNodes,tableEchalon,dirPath,xmlFile) #Gets child names and values as a dictionary
# Childlist will be empty unless on a leaf node or a table that owns a set of data items.
if len(childList) > 0:
if dataType[tname]!='data':
addAttributes(subnode,fieldList,childList) # XML attribute data converts to fields
# format of the first fields in each table containing data is;
# XML heirachial path the keys representing the directory heirarchy
# path name of the XML source file
inscolNames="(dirPath, xmlFile, xmlPath, tabNo, " # Sets up path name fields
# initialise path name field's values
# TableId is an integer so no quotes
valItems="('%s','%s','%s',%s," % (dirPath,xmlFile,tableEchalon,tableId)
colItems=[]
# Here, if subnode is a table containing data items or is a data item
for field in childList: # Accumulate a list data names and values
if field in dataType and dataType[field] != 'repeat':
colItems.append(field) # Accumulate field names for insert
inscolNames+="`"+field + '`, ' # Construct field names part of insert command
# Data cleaning
# Each value in childList is to be added to SQL table however must clean up field data
val=dataClean(childList[field])
if dataType[tname]!='table':
if not sameNameField(subnode,tableEchalon,dirPath,xmlFile,field,dupFieldCount,val):
fieldList[field]=childList[field] # associating field names and returned values
else:
fieldList[field]=field + "s in " + dataTableName # Indicate repeat field in data table
valItems+=val+", " # If not a repeat dataItem add as a field in this table
# Accumulate all field names that are used in any table
if dataType[tname]=='table': # At a table with data items in it
emitSQL(tname,fieldList,inscolNames,colItems,valItems)
else:
# At this point the procedure call is on one of the leaves of the XML tree
# Test if nodeValue is blank, this is not a good item to store
ws=1
for i in subnode.nodeValue:
if i>' ': ws=0
if ws == 0: # Item non blank
fieldList[subnode.parentNode.nodeName]=subnode.nodeValue
dataType[subnode.parentNode.nodeName]='data'
return fieldList
#Main main MAIN
global tableDict,dataType,tablesCreated,createCmds,scocur,errorLog,dataTableName,tableId
paramfile="klaatuParams.txt"
if os.path.exists(paramfile):
if os.path.isfile(paramfile):
pass
else:
print paramfile+ " is not a text file."
sys.exit('Incorrect file input')
else:
sys.exit("Parameter file does not exist")
paramfd=open(paramfile,"r")
paradict={}
for dictentry in paramfd.readlines():
keywords=dictentry.split()
paradict[keywords[0]]=keywords[1]
paramfd.close()
#Each table with a set of data items will be keyed with the directory structure and file name
basedir=paradict['basedir']
if not os.path.isdir(basedir):
sys.exit("XML source directory "+basedir+" not found")
if not paradict.has_key('admin'):
sys.exit("Paramater file requires admin user")
if not paradict.has_key('adminpw'):
sys.exit("Paramater file requires admin password")
passw=paradict['password']
dbUser=paradict['databaseUser']
dbName=paradict['databaseName']
adminUser=paradict['admin']
adminpw=paradict['adminpw']
dataTableName='datTable'
dataKeyNameType=''
createCmds={}
dataType={}
tableId=0
tableDict={}
errorLog=open("errorlog.txt","w")
proglog=open("progresLog","w")
#The XML file location is split ito its directory name path parts that it may be used
# as a key to access any data item set
db = MySQLdb.connect("localhost",adminUser,passwd = adminpw)
cur=db.cursor()
cur.execute('create database if not exists ' + dbName)
cur = db.cursor()
cmd='GRANT ALL ON %s.* TO %s@localhost IDENTIFIED BY "%s"' % (dbName,dbUser,passw)
#print cmd
cur.execute(cmd)
db.close()
scodb = MySQLdb.connect (host = "localhost",
user = dbUser,
passwd = passw,
db = dbName)
scocur=scodb.cursor()
# Generic data item storage for multiple dataitems with the same XML tag name
cmd='create table if not exists '+ dataTableName+ ' (dirPath varchar(255), fulltext(dirPath), xmlFile varchar(64), fulltext(xmlFile), xmlPath varchar(255), fulltext(xmlPath), tabNo int, index(tabNo), name varchar(128), val varchar(256),attName varchar(128),attValue varchar(128));'
scocur.execute(cmd)
for root, dirs, files in os.walk(basedir):
if root.count('/')==1:
proglog.write(root+"\n")
proglog.flush()
if len(files) > 0:
keyparts=root.split('/')
if root[0]=='/':
keyparts.pop(0)
knmbr=len(keyparts)
dataPathKey=''
for keynum in range(0, knmbr): #Construct path name as part of sql key field
dataPathKey+=keyparts[keynum]+':'
dataPathKey=dataPathKey[:-1] # remove trailing colon
for xmlFile in files:
if os.path.splitext(xmlFile)[1]=='.xml': # only process files with xml extention
# Each table contains its directory path with each part of the path as a field
dirPath=root+'/'
targetFile=dirPath+xmlFile
# preliminary parse through file removing anything undesirable
srcXMLfd = open(targetFile,"r")
srcText=srcXMLfd.read()
# xmlText=re.sub(".*?","",srcText) # Mid data item modifier to be removed
xmlText=re.sub("(?:.*?|.)*?","",srcText) # Two possible layers of recursion
srcXMLfd.close()
print targetFile
dc = xml.dom.minidom.parseString(xmlText)
make_SQL(dc.childNodes,"",dirPath,xmlFile)
scodb.close()
errorLog.close()
proglog.close()
print "klaatu6 SQL build complete"