Part one: taking the first full backup. At present, this code is still under development and should not be used on a production machine; I am posting it here for reference.
Eventually, this code is going to be included in a backup client I am developing that will interface with glusterfs and Amazon S3 storage.
Currently, this code is tested to run on Python v2.7.4 on a Fedora 18 machine. With all three python files, and any number of properly defined job xml files in the jobs.d/ directory, these scripts are currently functional.
seed_files.py
This is the controller file for taking the first backup.
#!/usr/bin/python
# Create first full backup
import os, stat, time, seed_functions
def printHello():
    """Print a greeting; simple smoke-test helper."""
    print("hello")
#--- This part of the script does the heavy lifting.
def seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles):
    """Run the first (seed) backup for one job.

    Three stages run in order: build the file list, record per-file
    metadata, then archive everything into a tar file.

    myFindPath          -- directory tree to back up
    myJobId             -- job identifier (not used directly here)
    myFullTempPath      -- temp file that receives the file list
    myTargetMetaPath    -- output path for the metadata file
    myTargetTarFilePath -- output path for the tar archive
    myExcludeFiles      -- paths/patterns to skip while finding files
    """
    #--- Stage 1: enumerate the files to back up.
    seed_functions.findFiles(myFullTempPath, myFindPath, myExcludeFiles)
    #--- Stage 2: record mtime/hash metadata for each listed file.
    seed_functions.storeMetaData(myFullTempPath, myTargetMetaPath)
    #--- Stage 3: archive the listed files.
    seed_functions.mkTarFile(myFullTempPath, myTargetTarFilePath)
    print("Job completed successfully")
#--- If this is being run as a script, set temporary variables
if __name__ == '__main__':
    myFindPath = '/home/myuser/findfiles'
    myJobId = '106'
    myTempPath = '/tmp/'
    myTempFileList = 'files.tmp'
    myFullTempPath = os.path.join(myTempPath, myTempFileList)
    myTargetPath = '/home/target1/'
    myTargetMetaPath = myTargetPath + 'job' + myJobId + '.meta'
    myTargetTarFilePath = myTargetPath + 'job' + myJobId + '.tar'
    #--- Path names below should be absolute path names (start with / ) and should not end with '/'
    # NOTE: the list was previously built under the misspelled name
    # 'myExlcudeFiles' while 'myExcludeFiles' was passed to seedMain(),
    # which raised NameError before any backup started.  Fixed here.
    myExcludeFiles = [
        '/home/myuser/findfiles/dontbackup',
        '/home/myuser/findfiles/somebigfiles',
        '*.adf',
        'badfilez*',
    ]
    seedMain(myFindPath, myJobId, myFullTempPath, myTargetMetaPath,
             myTargetTarFilePath, myExcludeFiles)
    #myJdate = seed_functions.getJulianDate()
seed_functions.py
This is the module that does the actual work.
#!/usr/bin/python
#Shared Functions and Classes
import os, sys, stat, time, glob
from datetime import datetime
def getJulianDate():
    """Return today's Julian date as an integer: the two-digit year times
    1000, plus the 1-based day of the year.  E.g. 01JAN2013 -> 13001.

    The original parsed str(datetime.now()) by hand and round-tripped the
    date through time.mktime()/time.gmtime(); because mktime() interprets
    local time and gmtime() reports UTC, local midnight in timezones east
    of Greenwich landed on the *previous* UTC day, yielding an off-by-one
    day-of-year.  Reading tm_yday from the local timetuple avoids the
    conversion entirely.
    """
    now = datetime.now()
    # tm_yday is the 1-based day of the year for the local calendar date.
    return (now.year % 2000) * 1000 + now.timetuple().tm_yday
import re
def findFiles(filepath,myFindPath,excludeFiles):
    """Write (overwrite) the file at 'filepath' with a newline-separated
    list of files found under 'myFindPath'.

    filepath     -- absolute path of the list file to create
    myFindPath   -- absolute directory path to walk
    excludeFiles -- list of absolute directory paths pruned from the walk

    Empty directories and symlinked directories are listed themselves (so
    the archive step preserves them); regular files and symlinks are
    listed individually.  Other node types (sockets, fifos) are skipped.
    """
    with open(filepath, 'w') as ftemp:
        for dirname, dirnames, filenames in os.walk(myFindPath):
            #--- Prune excluded directories.  The original removed entries
            #--- while iterating the same list, which skips the element
            #--- after each removal (adjacent excluded dirs escaped the
            #--- prune).  Slice-assignment rebuilds the list in place so
            #--- os.walk() also honours the pruning.
            dirnames[:] = [d for d in dirnames
                           if os.path.join(myFindPath, dirname, d) not in excludeFiles]
            #--- Record directories that are empty or are symlinks; a
            #--- non-empty real directory is implied by its contents.
            for subdirname in dirnames:
                subpath = os.path.join(dirname, subdirname)
                if os.path.islink(subpath) or not os.listdir(subpath):
                    ftemp.write(subpath + '\n')
            #--- Record regular files and symlinks.
            for filename in filenames:
                fullpath = os.path.join(dirname, filename)
                if os.path.islink(fullpath) or os.path.isfile(fullpath):
                    ftemp.write(fullpath + '\n')
    # 'with' closes the file; the original's bare 'ftemp.closed' was a
    # no-op attribute read, not a close() call.
def storeMetaData(fileListPath, fileMetaPath):
    """Read the file list at 'fileListPath' and write one metadata line
    per entry to 'fileMetaPath'.

    File format is:
        /path/to/file ::: modified datetime ::: seconds since 1970 ::: md5 hash

    Directories get the literal placeholder "---None: directory" in the
    hash column.  Raises OSError (via os.stat) if a listed path vanished
    between listing and this pass.
    """
    # Both files are managed by 'with' so they are closed even on error;
    # the original never closed the input list ('.closed' is an attribute
    # read, not a call to close()).
    with open(fileMetaPath, 'w') as fmeta:
        with open(fileListPath, 'r') as myFileList:
            for filez in myFileList:
                path = filez.strip()
                #--- Gather stat info once per entry.
                myStat = os.stat(path)
                if not os.path.isdir(path):
                    myHash = md5(path)
                else:
                    #--- Directories have no content to hash.
                    myHash = "---None: directory"
                # str(st_mtime) keeps the original float formatting for
                # the unix-timestamp column.
                metaDataString = (os.path.abspath(path) + ":::" +
                                  time.ctime(myStat.st_mtime) + ":::" +
                                  str(myStat.st_mtime) + ":::" + str(myHash))
                fmeta.write(metaDataString + '\n')
import hashlib,os
def md5(filename):
    ''' Return the hex MD5 digest of the file at 'filename', or None if
    the file cannot be read (the error is printed, preserving the
    original best-effort behaviour).

    The file is opened in binary mode -- text mode would corrupt the
    digest of binary files on platforms that translate line endings --
    and read in 128 KiB chunks so large files are never loaded into
    memory whole (the original slurped the entire file in text mode and
    never explicitly closed it).
    '''
    d = hashlib.md5()
    try:
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(131072), b''):
                d.update(chunk)
    except Exception as e:
        print(e)
    else:
        return d.hexdigest()
import tarfile
def mkTarFile(fileList, tarOutPath):
    """Create a compressed tar archive at tarOutPath + '.lzo' containing
    the files named (one per line) in the list file 'fileList'.

    The external 'tar' binary is used because the stdlib tarfile module
    does not support lzo compression (tar's 'a' flag chooses the
    compressor from the '.lzo' suffix).  The command is run through
    subprocess with an argument list -- not os.system() with an
    interpolated string -- so paths containing spaces or shell
    metacharacters can neither break the command line nor inject shell
    code.  The original's '&>' redirect was also a bashism that plain
    /bin/sh does not understand; output is discarded explicitly instead.
    """
    import subprocess  # local import: module-level import list unchanged
    thisTarOut = tarOutPath + ".lzo"
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(["tar", "cpvfa", thisTarOut, "-T", fileList],
                        stdout=devnull, stderr=devnull)
#--------
#--- Not yet implemented functions below:
#--------
def findFiles2(fileListPath,myFindPath,excludeFiles):
    """Test-only variant of findFiles(): walk 'myFindPath' and write
    every empty directory, regular file, and symlink under it to
    'fileListPath', one per line.  'excludeFiles' is accepted for
    signature parity but is not consulted.
    """
    with open(fileListPath, 'w') as out:
        for root, dirs, files in os.walk(myFindPath):
            #--- Empty directories are listed explicitly so they survive
            #--- the archive step.
            for path in (os.path.join(root, d) for d in dirs
                         if not os.listdir(os.path.join(root, d))):
                out.write(path + '\n')
            #--- Regular files and symlinks only; other node types are
            #--- skipped.
            for name in files:
                path = os.path.join(root, name)
                if os.path.islink(path) or os.path.isfile(path):
                    out.write(path + '\n')
def removeExcludedFiles(excludeFiles=()):
    """Collect wildcard patterns from 'excludeFiles'.  (Still marked "not
    implemented" upstream: nothing consumes the result yet.)

    excludeFiles -- iterable of exclusion entries; entries containing a
                    '*' are treated as wildcard patterns.

    Returns the list of entries recognised as wildcard patterns.  The
    original read an undefined global 'excludeFiles' and raised NameError
    when called; it is now an explicit parameter (default: empty).
    """
    myWildList = []
    wildMatch = re.compile(r"^\*")    # pattern starts with '*'
    wildMatch2 = re.compile(r".*\*")  # '*' anywhere in the pattern
    print("Excluding the following")
    for badfile in excludeFiles:
        print(badfile)
        result = wildMatch.match(badfile)
        result2 = None
        if not result:
            result2 = wildMatch2.match(badfile)
            print(result2)
        if result or result2:
            myWildList.append(badfile)
    print(myWildList)
    return myWildList
start_seeds.py
This is near completion; it parses the jobs in jobs.d/, verifies them, and runs them.
#!/usr/bin/python
import os, re
import xml.etree.ElementTree as ET
import seed_files
def readConfFile():
    """Return the directory holding the per-job XML config files.

    Hard-coded to 'jobs.d' for now; reading the location from a main
    configuration file is a planned feature.
    """
    return 'jobs.d'
def findJobs(jobdir):
    """Walk 'jobdir' recursively and return the paths of all '*.xml' job
    files found.

    The original joined every match against 'jobdir' instead of the
    directory os.walk() was currently visiting, so XML files inside
    subdirectories of jobs.d came back with wrong (non-existent) paths.
    str.endswith() also replaces the regex match.
    """
    myJobList = []
    for dirname, dirnames, filenames in os.walk(jobdir):
        for jobid in filenames:
            if jobid.endswith('.xml'):
                # Join with the directory being visited, not the walk root.
                myJobList.append(os.path.join(dirname, jobid))
    return myJobList
def checkPath(pathText):
    """Validate that 'pathText' is an absolute, existing directory (or
    mount point) and is not the filesystem root.

    Terminates the program via SystemExit with a diagnostic message on
    any violation; prints a confirmation and returns None when the path
    looks valid.

    The original compiled two regexes ('^\\/' and '^\\/$') to answer what
    are plain string questions; '==' and startswith() say the same thing
    directly.
    """
    #--- Backing up to (or from) '/' itself is never sane.
    if pathText == '/':
        exit('Path cannot be / ')
    #--- Paths must be absolute so jobs behave the same from any cwd.
    if not pathText.startswith('/'):
        exit('Directory path must be absolute path: ' + pathText)
    if os.path.isdir(pathText) or os.path.ismount(pathText):
        # Two spaces preserved from the original's comma-print output.
        print('Path seems valid:  ' + pathText)
    else:
        exit('Invalid path: ' + pathText)
def parseJobs(myFoundJobs):
    """Parse and validate each job XML file in 'myFoundJobs'.

    For every file, each top-level element is processed in document
    order:
      - <backupdir> / <backuptarget> text is validated with checkPath()
        (which exits the program on failure);
      - the children of <exclude> are collected into one nested sublist;
      - every other element's text is appended to the job list directly.

    Returns a list with one sublist per job; exits via SystemExit if no
    jobs were parsed at all.
    """
    print("Found the following config files: %s" % (myFoundJobs,))
    print("------------------------------------------------")
    myJobList = []
    for myJob in myFoundJobs:
        mySubList = []
        print("Parsing and testing: %s" % myJob)
        root = ET.parse(myJob).getroot()
        for child in root:
            #--- Validate that the backup source/target paths are usable;
            #--- checkPath() exits the program if they are not.
            #--- future feature: master excludes in config file
            if child.tag == 'backupdir':
                print("Checking Backup Directory")
                checkPath(child.text)
            elif child.tag == 'backuptarget':
                print("Checking Backup Target")
                checkPath(child.text)
            #--- Excluded directories become one nested sublist; plain
            #--- elements (jobid, temppath, ...) are appended directly.
            #--- (The original's 'elif not ... == exclude' was just the
            #--- else branch of this test.)
            if child.tag == 'exclude':
                mySubList.append([subchild.text for subchild in child])
            else:
                mySubList.append(child.text)
        print("------------------------------------------------")
        myJobList.append(mySubList)
    #--- Ensure some jobs were actually found (len() over __len__()).
    if len(myJobList) == 0:
        exit('Exit on Error: No Jobs Found!')
    return myJobList
def performBackup(myJobList):
    """Run the initial (seed) backup for every parsed job.

    Each job entry is a list laid out by parseJobs():
      job[0] -- job id              job[1] -- backup source path
      job[2] -- backup target dir   job[3] -- temp directory
      job[4] -- list of excluded directories

    Exits via SystemExit if the job's target directory already exists
    (never overwrite a previous backup); otherwise creates the target
    tree and delegates the heavy lifting to seed_files.seedMain().
    """
    for job in myJobList:
        myJobId = job[0]
        myFindPath = job[1]
        myTarget = job[2]
        myTempPath = job[3]  # currently unused; kept to document the layout
        myExcludes = job[4]
        #--- Build all output paths once (the redundant str() wrappers
        #--- around already-string concatenations are gone).
        jobRoot = os.path.join(myTarget, 'job' + myJobId)
        myTargetPath = os.path.join(jobRoot, 'master')
        myFullTempPath = os.path.join(myTargetPath, 'backup_job' + myJobId + '.tmp')
        myTargetMetaPath = os.path.join(myTargetPath, 'job' + myJobId + '_master.meta')
        myTargetTarFilePath = os.path.join(myTargetPath, 'job' + myJobId + '_master_seed.tar')
        #--- Refuse to touch an existing job directory: that would
        #--- clobber a previous backup.
        if os.path.exists(jobRoot):
            print(os.path.join(myFindPath, 'job' + myJobId))
            exit('Critical Error on JobID: ' + myJobId +
                 '\n This job directory already exists! Exiting to preserve data!')
        os.makedirs(jobRoot)
        if os.path.exists(myTargetPath):
            exit('Critical Error on JobID: ' + myJobId +
                 '\n This job directory already exists! Exiting to preserve data!')
        os.makedirs(myTargetPath)
        #--- Copy the per-job excludes (list() over a manual append loop)
        #--- and always exclude the target itself so a backup can never
        #--- recurse into its own output.
        myExcludeFiles = list(myExcludes)
        myExcludeFiles.append(myTarget)
        print("------------------------------------------------")
        print("Starting Job: %s" % myJobId)
        seed_files.seedMain(myFindPath, myJobId, myFullTempPath,
                            myTargetMetaPath, myTargetTarFilePath, myExcludeFiles)
#--- Execute the script: discover, parse, and run all configured jobs.
#--- (The original's 'myJobList = list()' was dead -- immediately
#--- overwritten by parseJobs() -- and '__len__()' is spelled len().)
jobdir = readConfFile()
myFoundJobs = findJobs(jobdir)
myJobList = parseJobs(myFoundJobs)
print("Number of jobs: %s" % len(myJobList))
#promptContinue() #--- Let user review backup jobs, prompt for continue.
performBackup(myJobList)
job103.xml
Job file in jobs.d/ directory
<?xml version="1.0"?>
<data>
<jobid>103</jobid>
<backupdir>/home/myuser/files</backupdir>
<backuptarget>/offsitenfs/client1/target2</backuptarget>
<temppath>/tmp</temppath>
<exclude>
<directory>/home/myuser/files/badfolder1</directory>
<directory>/home/myuser/files/music/badfolder2</directory>
</exclude>
</data>