diff --git a/README b/README new file mode 100644 index 0000000000000000000000000000000000000000..e99b032e89f309956e335d68983443d707e2eb52 --- /dev/null +++ b/README @@ -0,0 +1,121 @@ + _ __ _ + _____ (_)___ ___________/ /_ (_) _____ _____ + / ___/ / / __ `/ ___/ ___/ __ \/ / | / / _ \/ ___/ + / /__ / / /_/ / / / /__/ / / / /| |/ / __/ / + \___/_/ /\__,_/_/ \___/_/ /_/_/ |___/\___/_/ + /___/ + + +DESCRIPTION + +cjarchiver is a Python script that can be used to compress +a directory including all its files and subdirectories. + +----------------------------------------------------------- + +PREREQUISITES + +In order to use cjarchiver in sciCORE clusters we need to +load the cjarchiver module: + +ml cjarchiver + +This will load the default version. If you need a specific +version you can search for it with: + +ml spider cjarchiver + +To archive a directory, our current working directory should +be at the same level as the target directory. Additionally, +it is mandatory that the target directory contains a metadata +file named ARCHIVE_METADATA.json in JSON format. The user should create this file +following this format: + +{ + "name": "NAME OF INVESTIGATOR", + "email": "EMAIL OF INVESTIGATOR", + "pi_name": "NAME OF PI", + "pi_email": "EMAIL OF PI", + "project": "INSERT PROJECT NAME HERE", + "project_start_date": "YYYY-MM-DD", + "project_end_date": "YYYY-MM-DD", + "description": "INSERT PROJECT DESCRIPTION HERE MULTILINE IS NOT OK", + "collaborators":[ + { "name": "COLLABORATOR NAME", + "email": "COLLABORATOR EMAIL" + }, + { "name": "COLLABORATOR NAME", + "email": "COLLABORATOR EMAIL" + } + ], + "comments": "ADDITIONAL COMMENTS (E.G. 
LEGAL REQUIREMENTS REGARDING DURATION OF DATA PRESERVATION, ETC...)" +} + +----------------------------------------------------------- + +USAGE + +To execute cjarchiver: + +cjarchiver <target_directory> [options] + +If successful, this will generate four files with the name +format <username>_YYYYMMDDThhmmss_<targetfoldername> and the +following extensions: + +.log - with the outputs of the script. +.manifest - with the full list of archived files including + permissions, ownership, size, date, and path. +.md5sum - with the full list of archived files and their + corresponding path and MD5 checksum. +.tar.bz2 - compressed archive of the target directory. + +The manifest and md5sum files are also automatically copied +inside of the target directory and, therefore, included in +the .tar.bz2 file. + +After the creation of these files, cjarchiver renames the +target directory to <targetdirectory>.toberemoved/ and +automatically moves the .tar.bz2 file to the default archiving +directory: /scicore/archive/<group>/<username>/ + +As its name indicates, <targetdirectory>.toberemoved/ can be +deleted, but prior to that, we strongly recommend checking that +the .tar.bz2 file has been correctly moved to the archiving +directory. + +----------------------------------------------------------- + +OPTIONS + +-h, --help: Shows a help message and exits. +-x <subdirectory>, +--exclude <subdirectory>: The user can specify subdirectories + to be excluded from archiving (only + first-level subdirectory names, not + full path). It can be repeated for + additional subdirectories. 
#!/usr/bin/env python
"""cjarchiver: compress a directory including all its files and subdirectories.

Run it from the directory that contains the target directory.  The target
directory must hold a metadata file named ARCHIVE_METADATA.json in JSON
format.  The run produces four files named
<username>_YYYYMMDDThhmmss_<targetdirectory> with the extensions .log,
.manifest, .md5sum and .tar.bz2, renames the target directory to
<targetdirectory>.toberemoved and moves the .tar.bz2 file to
/scicore/archive/<group>/<username>/.

Examples
--------
Archive directory "old_data":
    cjarchiver old_data

Archive directory "old_data" but exclude "old_data/bad_exp":
    cjarchiver old_data -x bad_exp

Archive directory "old_data" but exclude "old_data/bad_exp" and
"old_data/bad_data":
    cjarchiver old_data -x bad_exp -x bad_data

Known issues
------------
cjarchiver uses the find command to create the manifest and the md5sum
files.  It is known that find might fail when used through NFS to access
remote directories.  We recommend using cjarchiver locally (i.e. directly
where the target data is located).

The source is written so that it runs unchanged on Python 2.7 and Python 3.
"""
import getpass
import grp
import json
import os
import subprocess
import sys
import textwrap
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from datetime import datetime

try:
    from shlex import quote as _quote
except ImportError:  # Python 2: the same helper lives in ``pipes``
    from pipes import quote as _quote

#*************************************************************
#Constants
#*************************************************************
METADATA_FILENAME = 'ARCHIVE_METADATA.json'
ARCHIVE_ROOT = '/scicore/archive'
WRONG_DIRECTORY = ("Stopping: Wrong directory name. Present PATH should be "
                   "at the same level of the target directory")
WRONG_SUBDIRECTORY = ("Stopping: Wrong subdirectory name %s. It should be "
                      "maximum one level depth from the target directory")
# Names that contain no '/' but still do not designate a child directory.
NOT_A_CHILD = ('', '.', '..')

USAGE = "python %(prog)s [options] directory"
DESCRIPTION = '''
description:
cjarchiver archives a folder and all its contents in a compressed file
with the name USER_YYYYMMDDThhmmss_DIRECTORY.tar.bz2. Requires that the
target folder contains a metadata file named ARCHIVE_METADATA.json in
JSON format (see below an example).
After the archive has been created it is moved to
/scicore/archive/<group>/<user>/


Developed by Ruben M. Cabezon - sciCORE (University of Basel)
ruben.cabezon@unibas.ch
'''
EPILOG = '''

JSON format example for the ARCHIVE_METADATA.json file:
{
    "name": "NAME OF INVESTIGATOR",
    "email": "EMAIL OF INVESTIGATOR",
    "pi_name": "NAME OF PI",
    "pi_email": "EMAIL OF PI",
    "project": "INSERT PROJECT NAME HERE",
    "project_start_date": "YYYY-MM-DD",
    "project_end_date": "YYYY-MM-DD",
    "description": "INSERT PROJECT DESCRIPTION HERE MULTILINE IS NOT OK",
    "collaborators":[
        { "name": "COLLABORATOR NAME",
          "email": "COLLABORATOR EMAIL"
        },
        { "name": "COLLABORATOR NAME",
          "email": "COLLABORATOR EMAIL"
        }
    ],
    "comments": "ADDITIONAL COMMENTS (E.G. LEGAL REQUIREMENTS REGARDING DURATION OF DATA PRESERVATION, ETC...)"
}
'''

# Log file of the current run.  main() opens it; it stays None when the
# functions below are imported and called without going through main().
logfile = None


#*************************************************************
#Logging and process helpers
#*************************************************************
def _log(message):
    """Write *message* to the run log (if one is open) and flush it."""
    if logfile is not None and not logfile.closed:
        logfile.write(message)
        logfile.flush()


def _say(message):
    """Print *message* and record the same text, newline-terminated, in the log."""
    print(message)
    _log(message + "\n")


def _close_log():
    """Close the run log if it is open; safe to call more than once."""
    if logfile is not None and not logfile.closed:
        logfile.close()


def _stop(reason):
    """Log 'Stopping: <reason>', close the log and exit with that message."""
    _log("Stopping: %s\n" % reason)
    _close_log()
    sys.exit("Stopping: " + reason)


def _describe(argv, stdout_path=None):
    """Return a shell-like rendering of *argv* for the log (never executed)."""
    text = ' '.join(_quote(arg) for arg in argv)
    if stdout_path is not None:
        text += ' > ' + _quote(stdout_path)
    return text


def _run(argv, stdout_path=None):
    """Run *argv* without a shell and return its exit status.

    When *stdout_path* is given the command's standard output is written to
    that file.  Returns 127 if the program or the output file cannot be
    opened, so the caller can treat it like any other failure.
    """
    try:
        if stdout_path is None:
            return subprocess.call(argv)
        with open(stdout_path, 'wb') as sink:
            return subprocess.call(argv, stdout=sink)
    except (OSError, IOError) as exc:
        _say("Cannot run %s: %s" % (_describe(argv, stdout_path), exc))
        return 127


def _run_pipeline(producer, consumer, stdout_path):
    """Run ``producer | consumer > stdout_path`` and return a combined status.

    A shell pipeline only reports the status of its last command; here both
    processes are waited for and the result is non-zero if either failed.
    """
    try:
        with open(stdout_path, 'wb') as sink:
            first = subprocess.Popen(producer, stdout=subprocess.PIPE)
            try:
                second = subprocess.Popen(consumer, stdin=first.stdout,
                                          stdout=sink)
            except OSError:
                # The consumer could not start: do not leave the producer
                # blocked on a pipe nobody reads.
                first.kill()
                first.stdout.close()
                first.wait()
                raise
            # Drop our copy of the read end so the producer receives SIGPIPE
            # if the consumer exits early.
            first.stdout.close()
            second_status = second.wait()
            first_status = first.wait()
    except (OSError, IOError) as exc:
        _say("Cannot run %s: %s" % (_describe(producer), exc))
        return 127
    return first_status or second_status


def _find_command(directory, excluded, action):
    """Build the ``find`` argument list for *directory*.

    Each name in *excluded* becomes a ``( -path ./dir/name -prune ) -o``
    clause placed before *action* (itself a list of find arguments).
    """
    argv = ['find', './' + directory]
    for sub in excluded:
        argv += ['(', '-path', './' + directory + '/' + sub, '-prune', ')', '-o']
    return argv + list(action)


def _list_and_copy(directory, outfile, excluded, action, announce, logline, msg):
    """Run find with *action* into *outfile*, then copy *outfile* into *directory*."""
    print(announce)
    argv = _find_command(directory, excluded, action)
    _log(logline + "\n")
    _log("Executing command: %s\n" % _describe(argv, outfile))
    check_shellcommand(_run(argv, outfile), msg)
    # The listing is documented as being part of the archive, so a failed
    # copy is treated as a failure of the whole step.
    copy = ['cp', outfile, directory]
    _log("Executing command: %s\n" % _describe(copy))
    check_shellcommand(_run(copy), "Copying %s into %s" % (outfile, directory))


#*************************************************************
#Functions
#*************************************************************
def check_shellcommand(command_run, msg):
    """Stop the run when an external command failed.

    command_run -- exit status of the command; anything but 0 is a failure.
    msg         -- short description of the step, used in the messages.

    On failure the log gets '<msg> unsuccessful (Stopping):', the log is
    closed and SystemExit is raised with 'Stopping: <msg> unsuccessful'.
    """
    if command_run != 0:
        _log(msg + " unsuccessful (Stopping):\n")
        _close_log()
        sys.exit("Stopping: " + msg + " unsuccessful")


def checkcorrectname(directory):
    """Return *directory* without trailing slashes.

    Raises SystemExit when the name still contains a '/' (the current
    working directory must be the parent of the target) or when nothing
    usable is left ('', '/', '.' or '..').
    """
    name = directory.rstrip('/')
    if '/' in name or name in NOT_A_CHILD:
        sys.exit(WRONG_DIRECTORY)
    return name


def checkcorrectnameexcluded(excluded):
    """Normalise the excluded subdirectory names in place and return the list.

    Trailing slashes are removed.  Raises SystemExit for a name that contains
    a '/' or that is empty, '.' or '..', because only first-level
    subdirectories of the target can be excluded.
    """
    for position, given in enumerate(excluded):
        name = given.rstrip('/')
        if '/' in name or name in NOT_A_CHILD:
            sys.exit(WRONG_SUBDIRECTORY % (name))
        excluded[position] = name
    return excluded


def checkdirectory(directory,out):
    """Stop unless *directory* exists below the current working directory.

    out -- 0 for the target directory, anything else for an excluded
           subdirectory; it only selects the wording of the message.
    """
    # os.getcwd() instead of $PWD: the variable may be unset or stale.
    pathfile = os.path.join(os.getcwd(), directory)
    _say("Searching %s" % (pathfile))
    if not os.path.isdir(pathfile):
        _stop("directory not found")
    if out == 0:
        _say("Directory %s found!" % (directory))
    else:
        _say("Directory %s found and excluded!" % (directory))


def checkmetadata(directory):
    """Stop unless *directory* holds a well-formed ARCHIVE_METADATA.json."""
    pathfile = os.path.join(os.getcwd(), directory, METADATA_FILENAME)
    _say("Searching metadata %s" % (pathfile))
    if not os.path.isfile(pathfile):
        _stop("metadata not found")
    _say("Metadata found!")
    if not is_json(pathfile):
        # Valid metadata is mandatory: report and stop instead of archiving.
        print("Metadata format is NOT correct. Use -h or --help for an example of JSON format")
        _log("Metadata format is NOT correct\n")
        _stop("metadata format is not correct")
    _say("Metadata format is correct")


def is_json(jsonfile):
    """Return True when the file *jsonfile* contains valid JSON, else False."""
    with open(jsonfile, 'r') as handle:
        try:
            # read() is inside the try block on purpose: on Python 3 an
            # undecodable file raises UnicodeDecodeError, a ValueError.
            json.loads(handle.read())
        except ValueError:
            return False
    return True


def createlist(directory,nameman,excluded):
    """Write the ``find -ls`` manifest to *nameman* and copy it into *directory*."""
    _list_and_copy(directory, nameman, excluded, ['-ls'],
                   "Listing files included in the backup",
                   "Listing files included in the backup",
                   "Listing of files")


def createlist_md5sum(directory,namemd5,excluded):
    """Write the md5sum of every regular file to *namemd5* and copy it into *directory*."""
    _list_and_copy(directory, namemd5, excluded,
                   ['-type', 'f', '-exec', 'md5sum', '{}', ';'],
                   "creating md5sum for files included in the backup",
                   "Listing md5sum of files included in the backup",
                   "Creating md5sum of files")


def createarchive(directory,archivefile,excluded):
    """Create *archivefile* as ``tar | lbzip2 -n 4`` of *directory*.

    Stops the run when either tar or lbzip2 reports a failure.
    """
    msg = "Creating archive"
    print("%s %s" % (msg, archivefile))
    # NOTE(review): GNU tar exclusion patterns are unanchored by default, so
    # a nested path ending in <directory>/<sub> would presumably be skipped
    # too although find still lists it - confirm whether --anchored is wanted.
    tar = ['tar']
    tar += ['--exclude=' + directory + '/' + sub for sub in excluded]
    tar += ['-cf', '-', directory]
    lbzip2 = ['lbzip2', '-n', '4']
    _log("Creating archive %s\n" % (archivefile))
    _log("Executing command: %s | %s\n"
         % (_describe(tar), _describe(lbzip2, archivefile)))
    check_shellcommand(_run_pipeline(tar, lbzip2, archivefile), msg)


def checkintegrity(archivefile):
    """Stop the run unless ``lbzip2 -tv`` accepts *archivefile*."""
    msg = "Checking integrity"
    print(msg)
    command = ['lbzip2', '-tv', archivefile]
    _log("Archive %s created! Checking integrity\n" % (archivefile))
    _log("Executing command: %s\n" % _describe(command))
    check_shellcommand(_run(command), msg)


def renameoriginal(directory):
    """Rename *directory* to '<directory>.toberemoved' with mv."""
    msg = "Renaming original directory"
    print(msg)
    command = ['mv', directory, directory + ".toberemoved"]
    _log("Renaming original directory\n")
    _log("Executing command: %s\n" % _describe(command))
    check_shellcommand(_run(command), msg)


def store(archivefile,groupname,username):
    """Move *archivefile* into /scicore/archive/<groupname>/<username>/."""
    msg = "Moving archive %s to %s/%s/%s" % (archivefile, ARCHIVE_ROOT,
                                             groupname, username)
    print(msg)
    # The trailing slash makes mv fail when the destination directory is
    # missing instead of renaming the archive to that path.
    command = ['mv', archivefile,
               ARCHIVE_ROOT + '/' + groupname + '/' + username + '/']
    _log(msg + "\n")
    _log("Executing command: %s\n" % _describe(command))
    check_shellcommand(_run(command), msg)


#*************************************************************
#Parsing arguments and options from command line
#*************************************************************
def _build_parser():
    """Return the command-line parser (positional directory, repeatable -x)."""
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
                            usage=USAGE,
                            description=textwrap.dedent(DESCRIPTION),
                            epilog=textwrap.dedent(EPILOG))
    parser.add_argument("directory",
                        help="specify directory, from current level, to be archived")
    parser.add_argument("-x", "--exclude", action='append', metavar='subdirectory',
                        help="specify subdirectories to be excluded from archiving (only first level subdirectories names, not full path) Can be repeated for additional subdirectories.")
    return parser


#*************************************************************
#Main code
#*************************************************************
def main(argv=None):
    """Archive the directory named on the command line.

    argv -- argument list without the program name; None means sys.argv[1:].
    """
    global logfile
    args = _build_parser().parse_args(argv)

    directory = checkcorrectname(args.directory)
    excluded = checkcorrectnameexcluded(args.exclude) if args.exclude else []
    username = getpass.getuser()
    groupname = grp.getgrgid(os.getgid()).gr_name
    name = username + '_' + datetime.now().strftime('%Y%m%dT%H%M%S') + '_' + directory
    nameman = name + '.manifest'
    namemd5 = name + '.md5sum'
    archivefile = name + '.tar.bz2'

    logfile = open(name + '.log', 'a')
    try:
        checkdirectory(directory, 0)
        checkmetadata(directory)
        for sub in excluded:
            checkdirectory(directory + '/' + sub, 1)
        createlist(directory, nameman, excluded)
        createlist_md5sum(directory, namemd5, excluded)
        createarchive(directory, archivefile, excluded)
        checkintegrity(archivefile)
        renameoriginal(directory)
        store(archivefile, groupname, username)
    finally:
        # Also reached through SystemExit, so the log is always closed.
        _close_log()


if __name__ == "__main__":
    main()