Part one: taking the first full backup. At present, this code is still under development and should not be used on a production machine; I am posting it here for reference.
Eventually, this code is going to be included in a backup client I am developing that will interface with glusterfs and Amazon S3 storage.
This code is currently tested against Python 2.7.4 on a Fedora 18 machine. With all three Python files in place, and any number of properly defined job XML files in the jobs.d/ directory, these scripts are functional.
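Nothing is packaged yet, so I am assuming everything simply lives in one working directory; start_seeds.py looks for jobs.d/ relative to wherever it is run from. A minimal layout would look roughly like this (the directory name backup/ is just an example):

backup/
    start_seeds.py
    seed_files.py
    seed_functions.py
    jobs.d/
        job103.xml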
seed_files.py
This is the controller file for taking the first backup.
#!/usr/bin/python
# Create first full backup
import os, stat, time, seed_functions

def printHello():
    print "hello";

#--- This part of the script does the heavy lifting.
def seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles):
    #--- execute the first backup for the job.
    seed_functions.findFiles(myFullTempPath,myFindPath,myExcludeFiles);
    seed_functions.storeMetaData(myFullTempPath,myTargetMetaPath);
    seed_functions.mkTarFile(myFullTempPath,myTargetTarFilePath);
    print "Job completed successfully";

#--- If this is being run as a script, set temporary variables
if __name__ == '__main__':
    myFindPath = '/home/myuser/findfiles';
    myJobId = str('106');
    myTempPath = '/tmp/';
    myTempFileList = 'files.tmp';
    myFullTempPath = os.path.join(myTempPath,myTempFileList);
    myTargetPath = '/home/target1/';
    myTargetMeta = 'job'+myJobId+'.meta';
    myTargetTarFile = 'job'+myJobId+'.tar';
    myTargetMetaPath = myTargetPath+myTargetMeta;
    myTargetTarFilePath = myTargetPath+myTargetTarFile;
    #--- Path names below should be absolute path names (start with / ) and should not end with '/'
    myExcludeFiles = list();
    myExcludeFiles.append('/home/myuser/findfiles/dontbackup');
    myExcludeFiles.append('/home/myuser/findfiles/somebigfiles');
    myExcludeFiles.append('*.adf');
    myExcludeFiles.append('badfilez*');
    seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles);
    #myJdate = seed_functions.getJulianDate();
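If the hard-coded paths in the __main__ block above exist on your machine, the controller can be run directly as a quick test, something like:

python seed_files.py

With the example values shown, that should write the file list to /tmp/files.tmp, the metadata to /home/target1/job106.meta, and the archive to /home/target1/job106.tar.lzo (mkTarFile() in seed_functions.py appends the .lzo extension). Normally, though, seedMain() is called once per job by start_seeds.py further down.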
seed_functions.py
This is the file that does the actual work.
#!/usr/bin/python
# Shared Functions and Classes
import os, sys, stat, time, glob, re, hashlib, tarfile
from datetime import datetime

def getJulianDate():
    #--- This function returns an integer value of today's Julian Date
    #--- preceded by the two digit year. IE: 01JAN2013 -> 13001
    nowtime = str(datetime.now());
    (year, month, day) = nowtime.split('-');
    day = int(day[:2]);
    month = int(month);
    year = int(year);
    t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0));
    jdate = (year % 2000) * 1000;
    jdate = jdate + time.gmtime(t)[7];
    return jdate;

def findFiles(fileListPath,myFindPath,excludeFiles):
    #--- Variables: fileListPath = string, myFindPath = string, excludeFiles = list() of strings
    #--- This function writes/overwrites the file @ 'fileListPath', which should be an absolute path.
    #--- The file @ fileListPath is a list of files found within myFindPath.
    #--- myFindPath is designed to be an absolute directory path.
    #--- excludeFiles is a list() of absolute paths which should be excluded during the find operation.
    with open(fileListPath, 'w') as ftemp:
        #--- Generate list of files that shouldn't be added
        #removeExcludedFiles();
        for dirname, dirnames, filenames in os.walk(myFindPath):
            #--- Remove excluded directories (iterate over a copy so removal is safe)
            for badfile in dirnames[:]:
                if os.path.join(myFindPath,dirname,badfile) in excludeFiles:
                    dirnames.remove(badfile);
            #--- Gather the other directories
            for subdirname in dirnames:
                #--- we just want to add empty directories
                if os.path.islink(os.path.join(dirname, subdirname)) or not os.listdir(os.path.join(dirname, subdirname)):
                    ftemp.write(os.path.join(dirname, subdirname)+'\n');
            #--- Add files to the list.
            for filename in filenames:
                #--- check to see if the file is a regular file or a link:
                if os.path.islink(os.path.join(dirname, filename)) or os.path.isfile(os.path.join(dirname, filename)):
                    #if not filename in myWildList:
                    ftemp.write(os.path.join(dirname, filename)+'\n');

def storeMetaData(fileListPath, fileMetaPath):
    #--- This function creates an output file with
    #--- filename and absolute path of fileMetaPath
    #--- File format is:
    #--- /path/to/file ::: modified datetime ::: seconds since 1970 ::: md5 hash
    with open(fileMetaPath, 'w') as fmeta:
        with open(fileListPath, 'r') as myFileList:
            for filez in myFileList:
                #--- create a list to append information.
                myFileMeta = list();
                #--- Append absolute path and file name as first item
                myFileMeta.append(os.path.abspath(filez.strip()));
                #--- Get file meta information from os.stat()
                myStat = os.stat(filez.strip());
                #--- Append human readable date/time stamp.
                myFileMeta.append(time.ctime(myStat.st_mtime));
                #--- Append unix timestamp for easy comparison in the future.
                myFileMeta.append(myStat.st_mtime);
                if not os.path.isdir(filez.strip()):
                    myHash = md5(filez.strip());
                else:
                    myHash = "---None: directory";
                myFileMeta.append(myHash);
                metaDataString = str(myFileMeta[0]+":::"+myFileMeta[1]+":::"+str(myFileMeta[2])+":::"+str(myFileMeta[3]));
                fmeta.write(metaDataString+'\n');

def md5(filename):
    ''' function to get md5 of file '''
    d = hashlib.md5();
    try:
        d.update(open(filename, 'rb').read());
    except Exception,e:
        print e;
    else:
        return d.hexdigest();

def mkTarFile(fileList, tarOutPath):
    thisTarOut = tarOutPath+".lzo"
    thisFileList = "-T "+fileList
    os.system("tar {options} {tarfile} {filex} &> /dev/null".format(options="cpvfa", tarfile=thisTarOut, filex=thisFileList));

#--------
#--- Not yet implemented functions below:
#--------
def findFiles2(fileListPath,myFindPath,excludeFiles):
    #--- This function is for testing purposes only
    with open(fileListPath, 'w') as ftemp:
        for dirname, dirnames, filenames in os.walk(myFindPath):
            for subdirname in dirnames:
                #--- we just want to add empty directories
                if not os.listdir(os.path.join(dirname, subdirname)):
                    ftemp.write(os.path.join(dirname, subdirname)+'\n');
            for filename in filenames:
                #--- check to see if the file is a regular file or a link:
                if os.path.islink(os.path.join(dirname, filename)) or os.path.isfile(os.path.join(dirname, filename)):
                    ftemp.write(os.path.join(dirname, filename)+'\n');

def removeExcludedFiles():
    #--- Not implemented yet.
    myWildList = list();
    wildMatch = re.compile("^\*");
    wildMatch2 = re.compile(".*\*");
    print "Excluding the following"
    for badfile in excludeFiles:
        print badfile;
        result = wildMatch.match(badfile);
        if not result:
            result2 = wildMatch2.match(badfile);
            print result2;
        if result or result2:
            myWildList.append(badfile);
    print myWildList;
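One note on mkTarFile(): the tarfile module is imported above, but the function currently shells out to the system tar command and relies on the 'a' option to pick lzo compression from the .lzo file name. If I ever want to stay in pure Python, a minimal sketch could look like the one below. This is only an assumption of how it might be done, not what the script does today, and since the standard tarfile module does not support lzo, the sketch falls back to gzip:

def mkTarFilePy(fileList, tarOutPath):
    ''' Hypothetical pure-Python variant of mkTarFile(), gzip instead of lzo. '''
    thisTarOut = tarOutPath + ".gz";
    tar = tarfile.open(thisTarOut, 'w:gz');
    try:
        with open(fileList, 'r') as flist:
            for line in flist:
                #--- each line in the temp file is an absolute path written by findFiles()
                tar.add(line.strip(), recursive=False);
    finally:
        tar.close();

Using recursive=False keeps the archive limited to exactly the paths in the file list, the same way the -T option does for the command-line tar.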
start_seeds.py
This is near completion; it parses the jobs in the jobs.d/ directory, verifies them, and runs them.
#!/usr/bin/python
import os, re
import xml.etree.ElementTree as ET
import seed_files

def readConfFile():
    #--- Future feature to read a specified jobs.d from a config file.
    jobdir = 'jobs.d';
    return jobdir;

def findJobs(jobdir):
    myJobList = list();
    confMatch = re.compile(".*\.xml$");
    for dirname, dirnames, filenames in os.walk(jobdir):
        for jobid in filenames:
            result = confMatch.match(jobid);
            if result:
                myJobList.append(os.path.join(jobdir,jobid));
    return myJobList;

def checkPath(pathText):
    confMatch = re.compile("^\/");
    result = confMatch.match(pathText);
    confMatch2 = re.compile("^\/$");
    result2 = confMatch2.match(pathText);
    if result2:
        exit('Path cannot be / ');
    if not result:
        exit('Directory path must be absolute path: '+pathText);
    if os.path.isdir(pathText) or os.path.ismount(pathText):
        print 'Path seems valid: ',pathText;
    else:
        exit('Invalid path: '+pathText);

def parseJobs(myFoundJobs):
    print "Found the following config files: ",myFoundJobs;
    print "------------------------------------------------";
    myJobList = list();
    for myJob in myFoundJobs:
        mySubList = list();
        print "Parsing and testing: ",myJob;
        tree = ET.parse(myJob);
        root = tree.getroot();
        for child in root:
            #--- Validate that the backup path is valid.
            #--- future feature: master excludes in config file
            if child.tag == 'backupdir':
                print "Checking Backup Directory";
                checkPath(child.text);
            if child.tag == 'backuptarget':
                print "Checking Backup Target"
                checkPath(child.text);
            #--- Create another sublist for excluded directories.
            if child.tag == 'exclude':
                myExcludeList = list();
                for subchild in child:
                    myExcludeList.append(subchild.text);
                mySubList.append(myExcludeList);
            #--- Since it's not a sublist, we append directly
            elif not child.tag == 'exclude':
                mySubList.append(child.text);
        print "------------------------------------------------";
        myJobList.append(mySubList);
    #--- Ensure some jobs were actually found.
    if myJobList.__len__() == 0:
        exit('Exit on Error: No Jobs Found!');
    #--- If we didn't exit above, return the parsed job list
    return myJobList;

def performBackup(myJobList):
    for job in myJobList:
        #--- job[0]: JobID
        #--- job[1]: Backup Path
        #--- job[2]: Backup Target
        #--- job[3]: Temp directory
        #--- job[4]: Excluded directories
        myJobId = job[0];
        myFindPath = job[1];
        myTarget = job[2];
        myTempPath = job[3];
        myExcludes = job[4];
        myTargetPath = os.path.join(myTarget,str('job'+myJobId),'master');
        myTempFileList = 'backup_job'+myJobId+'.tmp';
        myFullTempPath = os.path.join(myTargetPath,myTempFileList);
        myTargetMeta = 'job'+myJobId+'_master.meta';
        myTargetTarFile = 'job'+myJobId+'_master_seed.tar';
        myTargetMetaPath = os.path.join(myTargetPath,myTargetMeta);
        myTargetTarFilePath = os.path.join(myTargetPath,myTargetTarFile);
        if os.path.exists(os.path.join(myTarget,str('job'+myJobId))):
            print os.path.join(myFindPath,str('job'+myJobId));
            exit('Critical Error on JobID: '+myJobId+'\n This job directory already exists! Exiting to preserve data!');
        else:
            os.makedirs(os.path.join(myTarget,str('job'+myJobId)));
        if os.path.exists(myTargetPath):
            exit('Critical Error on JobID: '+myJobId+'\n This job directory already exists! Exiting to preserve data!');
        else:
            os.makedirs(myTargetPath);
        myExcludeFiles = list();
        for excludes in myExcludes:
            myExcludeFiles.append(excludes);
        myExcludeFiles.append(myTarget);
        print "------------------------------------------------";
        print "Starting Job: ",myJobId;
        seed_files.seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles);

#--- Execute the script.
myJobList = list();
jobdir = readConfFile();
myFoundJobs = findJobs(jobdir);
myJobList = parseJobs(myFoundJobs);
print "Number of jobs: ",myJobList.__len__();
#promptContinue()  #--- Let user review backup jobs, prompt for continue.
performBackup(myJobList);
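To make the job[0] through job[4] indexing in performBackup() concrete: for the job103.xml file shown below, parseJobs() should return a list with a single entry that looks roughly like this (the order simply follows the element order in the XML file):

#--- Expected shape of one parsed job (from job103.xml below):
# ['103',                                # job[0]: JobID
#  '/home/myuser/files',                 # job[1]: Backup Path
#  '/offsitenfs/client1/target2',        # job[2]: Backup Target
#  '/tmp',                               # job[3]: Temp directory
#  ['/home/myuser/files/badfolder1',     # job[4]: Excluded directories
#   '/home/myuser/files/music/badfolder2']]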
job103.xml
An example job file in the jobs.d/ directory.
<?xml version="1.0"?>
<data>
    <jobid>103</jobid>
    <backupdir>/home/myuser/files</backupdir>
    <backuptarget>/offsitenfs/client1/target2</backuptarget>
    <temppath>/tmp</temppath>
    <exclude>
        <directory>/home/myuser/files/badfolder1</directory>
        <directory>/home/myuser/files/music/badfolder2</directory>
    </exclude>
</data>
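Based on the path handling in performBackup() and the .lzo extension added by mkTarFile(), a successful run of this job should leave the backup target looking roughly like this:

/offsitenfs/client1/target2/
    job103/
        master/
            backup_job103.tmp            (file list written by findFiles)
            job103_master.meta           (metadata written by storeMetaData)
            job103_master_seed.tar.lzo   (archive written by mkTarFile)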