Skip to content
Snippets Groups Projects
Commit 3f4f8b0d authored by Ruben's avatar Ruben
Browse files

Added readme + path to env in script

parent e410408c
No related branches found
No related tags found
No related merge requests found
README 0 → 100644
_ __ _
_____ (_)___ ___________/ /_ (_) _____ _____
/ ___/ / / __ `/ ___/ ___/ __ \/ / | / / _ \/ ___/
/ /__ / / /_/ / / / /__/ / / / /| |/ / __/ /
\___/_/ /\__,_/_/ \___/_/ /_/_/ |___/\___/_/
/___/
DESCRIPTION
cjarchiver is a Python script that can be used to compress
a directory including all its files and subdirectories.
-----------------------------------------------------------
PREREQUISITES
In order to use cjarchiver in sciCORE clusters we need to
load the cjarchiver module:
ml cjarchiver
This will load the default version. If you need a specific
version you can search for it with:
ml spider cjarchiver
To archive a directory, our current working directory should
be at the same level as the target directory (i.e. the directory
that contains it). Additionally, it is mandatory that the target
directory contains a metadata file named ARCHIVE_METADATA.json
in JSON format. The user should create this file following
this format:
{
"name": "NAME OF INVESTIGATOR",
"email": "EMAIL OF INVESTIGATOR",
"pi_name": "NAME OF PI",
"pi_email": "EMAIL OF PI",
"project": "INSERT PROJECT NAME HERE",
"project_start_date": "YYYY-MM-DD",
"project_end_date": "YYYY-MM-DD",
"description": "INSERT PROJECT DESCRIPTION HERE MULTILINE IS NOT OK",
"collaborators":[
{ "name": "COLLABORATOR NAME",
"email": "COLLABORATOR EMAIL"
},
{ "name": "COLLABORATOR NAME",
"email": "COLLABORATOR EMAIL"
}
],
"comments": "ADDITIONAL COMMENTS (E.G. LEGAL REQUIREMENTS REGARDING DURATION OF DATA PRESERVATION, ETC...)"
}
-----------------------------------------------------------
USAGE
To execute cjarchiver:
cjarchiver <target_directory> [options]
If successful, this will generate four files with the name
format <username>_YYYYMMDDThhmmss_<targetfoldername> and the
following extensions:
.log - with the outputs of the script.
.manifest - with the full list of archived files including
permissions, ownership, size, date, and path.
.md5sum - with the full list of archived files and their
corresponding path and MD5 checksum.
.tar.bz2 - compressed archive of the target directory.
The manifest and md5sum files are also automatically copied
inside of the target directory and, therefore, included in
the .tar.bz2 file.
After the creation of these files, cjarchiver renames the
target directory as <targetdirectory>.toberemoved/ and
automatically moves the .tar.bz2 file to the default archiving
directory: /scicore/archive/<group>/<username>/
As its name indicates, <targetdirectory>.toberemoved/ can be
deleted, but prior to that, we strongly recommend checking that
the .tar.bz2 file has been correctly moved to the archiving
directory.
-----------------------------------------------------------
OPTIONS
-h, --help: Shows a help message and exits.
-x <subdirectory>,
--exclude <subdirectory>: The user can specify subdirectories
to be excluded from archiving (only
first level subdirectories names, not
full path). It can be repeated for
additional subdirectories.
-----------------------------------------------------------
EXAMPLES
Archive directory "old_data":
cjarchiver old_data
Archive directory "old_data" but exclude "old_data/bad_exp":
cjarchiver old_data -x bad_exp
Archive directory "old_data" but exclude "old_data/bad_exp"
and "old_data/bad_data":
cjarchiver old_data -x bad_exp -x bad_data
-----------------------------------------------------------
KNOWN ISSUES
cjarchiver uses the find command to create the manifest and
the md5sum files. It is known that find might fail when used
through NFS to access remote directories. We recommend using
cjarchiver locally (i.e. directly where the target data is
located).
\ No newline at end of file
#!/usr/bin/env python
import getpass
import grp
import json
import os
import sys
import textwrap
from argparse import *
from datetime import datetime
from os.path import *

try:
    from shlex import quote as shell_quote  # Python 3.3+
except ImportError:
    from pipes import quote as shell_quote  # Python 2 fallback
#*************************************************************
#Functions
#*************************************************************
#Shell command output checker
def check_shellcommand(command_run,msg):
    """Abort the script when a shell command reported failure.

    command_run -- exit status of the command (callers pass the value
                   returned by os.system()).
    msg         -- short description of the step, used in the error text.

    A zero status returns silently.  Any other status is written to the
    global logfile, the logfile is closed, and the script exits.
    """
    if command_run == 0:
        return
    failure = msg + " unsuccessful"
    logfile.write(failure + " (Stopping):\n")
    logfile.close()
    sys.exit("Stopping: " + failure)
#Remove slash character if given with directory
#Checks that we are in the same level as the target directory
def checkcorrectname(directory):
    """Normalise and validate the target directory name.

    directory -- directory name as typed on the command line.

    Returns the name with one trailing '/' removed.  Exits the script when
    the name is empty (before or after removing the slash) or when it still
    contains a '/', because the script has to be run from the directory
    that contains the target.
    """
    # endswith() is safe on an empty string, unlike indexing the last char.
    if directory.endswith('/'):
        directory = directory[:-1]
    if not directory:
        # Covers "" and "/": an empty name would make the later path and
        # find/tar commands refer to the current working directory itself.
        sys.exit("Stopping: Wrong directory name. Directory name is empty")
    if '/' in directory:
        sys.exit("Stopping: Wrong directory name. Present PATH should be at the same level of the target directory")
    return directory
#Checks that subdirectories do not contain / character
def checkcorrectnameexcluded(excluded):
    """Normalise and validate the names of the excluded subdirectories.

    excluded -- list of subdirectory names given with -x/--exclude.

    Each name is echoed to stdout, stripped of one trailing '/', and
    written back into the list, which is modified in place and returned.
    Exits the script when a name is empty (before or after removing the
    slash) or still contains a '/'.
    """
    for i, f in enumerate(excluded):
        print(f)
        # endswith() is safe on an empty string, unlike indexing the last char.
        if f.endswith('/'):
            f = f[:-1]
        if not f:
            # Covers "" and "/": an empty name would turn into the pattern
            # "<directory>/" in the later find/tar commands.
            sys.exit("Stopping: Wrong subdirectory name. Subdirectory name is empty")
        if '/' in f:
            sys.exit("Stopping: Wrong subdirectory name %s. It should be maximum one level depth from the target directory" %(f))
        excluded[i] = f
    return excluded
#Checks if directory exists
def checkdirectory(directory,out):
    """Verify that a directory exists below $PWD, or stop the script.

    directory -- path relative to the directory named by $PWD.
    out       -- 0 reports the directory as "found"; any other value
                 reports it as "found and excluded".

    Progress is printed and written to the global logfile.  When the
    directory is missing the logfile is closed and the script exits.
    """
    target = '/'.join([os.environ['PWD'], directory])
    is_present = os.path.isdir(target)
    searching = "Searching %s" % target
    print(searching)
    logfile.write(searching + "\n")
    if not is_present:
        logfile.write("Stopping: directory not found\n")
        logfile.close()
        sys.exit("Stopping: directory not found")
    if out == 0:
        outcome = "Directory %s found!" % directory
    else:
        outcome = "Directory %s found and excluded!" % directory
    print(outcome)
    logfile.write(outcome + "\n")
    logfile.flush()
#Checks if metadata exists and has the correct JSON format
def checkmetadata(directory):
    """Look for ARCHIVE_METADATA.json in the target and validate it.

    directory -- target directory name, relative to $PWD.

    A missing metadata file closes the global logfile and stops the
    script.  The result of the JSON validation is printed and logged.
    NOTE(review): an invalid JSON file is only reported; the script keeps
    going afterwards -- confirm whether it should stop instead.
    """
    metadata_path = "%s/%s/ARCHIVE_METADATA.json" % (os.environ['PWD'], directory)
    is_present = os.path.isfile(metadata_path)
    print("Searching metadata %s" % metadata_path)
    logfile.write("Searching metadata %s\n" % metadata_path)
    if not is_present:
        logfile.write("Stopping: metadata not found\n")
        logfile.close()
        sys.exit("Stopping: metadata not found")
    print("Metadata found!")
    logfile.write("Metadata found!\n")
    if is_json(metadata_path):
        print("Metadata format is correct")
        logfile.write("Metadata format is correct\n")
    else:
        print("Metadata format is NOT correct. Use -h or --help for an example of JSON format")
        logfile.write("Metadata format is NOT correct\n")
    logfile.flush()
#JSON validator: Stolen from StackExchange
def is_json(jsonfile):
    """Return True when the file at jsonfile contains parseable JSON.

    jsonfile -- path of the file to validate.

    Returns False when json.loads() rejects the content with a ValueError.
    Errors raised while opening or reading the file are not caught here.
    """
    # The with-block closes the handle even if read() raises.
    with open(jsonfile, 'r') as handle:
        content = handle.read()
    try:
        json.loads(content)
    except ValueError:
        return False
    return True
#Create list of files (manifest)
def createlist(directory,nameman,excluded):
    """Write the manifest (a `find -ls` listing) of the target directory.

    directory -- name of the target directory, relative to the cwd.
    nameman   -- path of the manifest file to create.
    excluded  -- names of first-level subdirectories to prune (may be empty).

    The listing is redirected into nameman and the manifest is then copied
    into the target directory.  A failing find stops the script through
    check_shellcommand(); a failing copy is reported as a warning only.
    """
    print("Listing files included in the backup")
    root = './' + directory
    # Every name is shell-quoted so that spaces or shell metacharacters in a
    # directory name cannot split or alter the command line.
    prune_clauses = ''.join(
        r'\( -path ' + shell_quote(root + '/' + f) + r' -prune \) -o '
        for f in excluded)
    command = 'find ' + shell_quote(root)
    if prune_clauses:
        command = command + ' ' + prune_clauses
    command = command + ' -ls > ' + shell_quote(nameman)
    logfile.write("Listing files included in the backup\n")
    logfile.write("Executing command: %s\n" %(command))
    logfile.flush()
    command_run = os.system(command)
    msg = "Listing of files "
    check_shellcommand(command_run, msg)
    # The copy used to be fire-and-forget; surface a failure instead of
    # silently producing an archive without its manifest.
    copy_run = os.system('cp ' + shell_quote(nameman) + ' ' + shell_quote(directory))
    if copy_run != 0:
        warning = "Warning: could not copy %s into %s" % (nameman, directory)
        print(warning)
        logfile.write(warning + "\n")
    logfile.flush()
#Create list of files md5sum
def createlist_md5sum(directory,namemd5,excluded):
    """Write the MD5 checksum list of every regular file in the target.

    directory -- name of the target directory, relative to the cwd.
    namemd5   -- path of the checksum file to create.
    excluded  -- names of first-level subdirectories to prune (may be empty).

    find runs md5sum on each regular file and the output is redirected into
    namemd5, which is then copied into the target directory.  A failing find
    stops the script through check_shellcommand(); a failing copy is
    reported as a warning only.
    """
    print("creating md5sum for files included in the backup")
    root = './' + directory
    # Every name is shell-quoted so that spaces or shell metacharacters in a
    # directory name cannot split or alter the command line.
    prune_clauses = ''.join(
        r'\( -path ' + shell_quote(root + '/' + f) + r' -prune \) -o '
        for f in excluded)
    command = 'find ' + shell_quote(root)
    if prune_clauses:
        command = command + ' ' + prune_clauses
    command = command + r' -type f -exec md5sum {} \; > ' + shell_quote(namemd5)
    logfile.write("Listing md5sum of files included in the backup\n")
    logfile.write("Executing command: %s\n" %(command))
    logfile.flush()
    command_run = os.system(command)
    msg = "Creating md5sum of files "
    check_shellcommand(command_run, msg)
    # The copy used to be fire-and-forget; surface a failure instead of
    # silently producing an archive without its checksum list.
    copy_run = os.system('cp ' + shell_quote(namemd5) + ' ' + shell_quote(directory))
    if copy_run != 0:
        warning = "Warning: could not copy %s into %s" % (namemd5, directory)
        print(warning)
        logfile.write(warning + "\n")
    logfile.flush()
#Create archive file tar
def createarchive(directory,archivefile,excluded):
    """Pack the target directory with tar and compress it with lbzip2.

    directory   -- name of the target directory, relative to the cwd.
    archivefile -- path of the compressed archive to create.
    excluded    -- names of first-level subdirectories passed to tar as
                   --exclude patterns (may be empty).

    A non-zero status from the command stops the script through
    check_shellcommand().
    NOTE(review): the command is a shell pipeline, so the status seen here
    is normally that of the last stage (lbzip2); a tar failure may go
    unnoticed -- confirm and consider checking tar separately.
    """
    msg = "Creating archive "
    print(msg + archivefile)
    # Names are shell-quoted so that spaces, quotes or other shell
    # metacharacters cannot split or alter the command line.
    target = shell_quote(directory)
    if excluded:
        exclusions = ''.join(
            '--exclude=' + shell_quote(directory + '/' + f) + ' '
            for f in excluded)
        command = 'tar ' + exclusions + '-cf - ' + target
    else:
        command = 'tar cf - ' + target
    command = command + ' | lbzip2 -n 4 > ' + shell_quote(archivefile)
    logfile.write("Creating archive %s\n" %(archivefile))
    logfile.write("Executing command: %s\n" %(command))
    # Flush first so the log already shows the command while it is running.
    logfile.flush()
    command_run = os.system(command)
    check_shellcommand(command_run, msg)
    logfile.flush()
#Check compressed file integrity
def checkintegrity(archivefile):
    """Test the compressed archive with `lbzip2 -tv`.

    archivefile -- path of the .tar.bz2 file to test.

    A non-zero status from lbzip2 stops the script through
    check_shellcommand().
    """
    msg = "Checking integrity"
    print(msg)
    # Quote the name so spaces or shell metacharacters stay part of the path.
    command = 'lbzip2 -tv ' + shell_quote(archivefile)
    logfile.write("Archive %s created! Checking integrity\n" %(archivefile))
    logfile.write("Executing command: %s\n" %(command))
    command_run = os.system(command)
    check_shellcommand(command_run, msg)
    logfile.flush()
#Rename original folder
def renameoriginal(directory):
    """Rename the archived directory to <directory>.toberemoved.

    directory -- name of the target directory, relative to the cwd.

    A non-zero status from mv stops the script through
    check_shellcommand().
    NOTE(review): if <directory>.toberemoved already exists as a directory,
    mv presumably moves the target inside it rather than failing -- confirm
    whether that case needs an explicit check.
    """
    msg = "Renaming original directory "
    print(msg)
    # Quote both names so spaces or shell metacharacters stay part of the path.
    command = 'mv ' + shell_quote(directory) + " " + shell_quote(directory + ".toberemoved")
    logfile.write("Renaming original directory\n")
    logfile.write("Executing command: %s\n" %(command))
    command_run = os.system(command)
    check_shellcommand(command_run, msg)
    logfile.flush()
#Store compressed file in the corresponding (groupPI) directory
def store(archivefile,groupname,username):
    """Move the archive into /scicore/archive/<groupname>/<username>/.

    archivefile -- path of the compressed archive to move.
    groupname   -- group name used as the first directory level.
    username    -- user name used as the second directory level.

    A non-zero status from mv stops the script through
    check_shellcommand().
    """
    # msg carries no trailing newline: it is also embedded in the failure
    # text built by check_shellcommand(), which a newline would split.
    msg = "Moving archive %s to /scicore/archive/%s/%s" %(archivefile, groupname, username)
    print(msg)
    destination = '/scicore/archive/' + groupname + '/' + username + '/'
    # Quote both paths so spaces or shell metacharacters stay part of the path.
    command = 'mv ' + shell_quote(archivefile) + ' ' + shell_quote(destination)
    logfile.write(msg + "\n")
    logfile.write("Executing command: %s\n" %(command))
    command_run = os.system(command)
    check_shellcommand(command_run, msg)
    logfile.flush()
#*************************************************************
#Parsing arguments and options from command line
#*************************************************************
# Usage line shown by -h/--help; %(prog)s is expanded by argparse.
usage = "python %(prog)s [options] directory"
# Text printed before the option list.  The archive name pattern matches
# the timestamp format used when the output names are built
# (strftime '%Y%m%dT%H%M%S', i.e. with a literal 'T').
description = '''
description:
cjarchive.py archives a folder and all its contents in a compressed file
with the name USER_YYYYMMDDThhmmss_DIRECTORY.tar.bz2. Requires that the
target folder contains a metadata file named ARCHIVE_METADATA.json in
JSON format (see below an example).
After the archive has been created it is moved to
/scicore/archive/<group>/<user>/
Developed by Ruben M. Cabezon - sciCORE (University of Basel)
ruben.cabezon@unibas.ch
'''
# Text printed after the option list: a template for the metadata file.
epilog='''
JSON format example for the ARCHIVE_METADATA.json file:
{
"name": "NAME OF INVESTIGATOR",
"email": "EMAIL OF INVESTIGATOR",
"pi_name": "NAME OF PI",
"pi_email": "EMAIL OF PI",
"project": "INSERT PROJECT NAME HERE",
"project_start_date": "YYYY-MM-DD",
"project_end_date": "YYYY-MM-DD",
"description": "INSERT PROJECT DESCRIPTION HERE MULTILINE IS NOT OK",
"collaborators":[
{ "name": "COLLABORATOR NAME",
"email": "COLLABORATOR EMAIL"
},
{ "name": "COLLABORATOR NAME",
"email": "COLLABORATOR EMAIL"
}
],
"comments": "ADDITIONAL COMMENTS (E.G. LEGAL REQUIREMENTS REGARDING DURATION OF DATA PRESERVATION, ETC...)"
}
'''
# RawDescriptionHelpFormatter keeps the line breaks of the two texts above.
parser = ArgumentParser(
    formatter_class=RawDescriptionHelpFormatter,
    usage=usage,
    description=textwrap.dedent(description),
    epilog=textwrap.dedent(epilog),
)
parser.add_argument(
    "directory",
    help="specify directory, from current level, to be archived",
)
# action='append' collects every -x occurrence into a list (None if absent).
parser.add_argument(
    "-x", "--exclude",
    action='append',
    metavar='subdirectory',
    help="specify subdirectories to be excluded from archiving (only first level subdirectories names, not full path) Can be repeated for additional subdirectories.",
)
args = parser.parse_args()
#*************************************************************
#Initialization
#*************************************************************
# Validate the names given on the command line.
directory = checkcorrectname(args.directory)
excluded = checkcorrectnameexcluded(args.exclude) if args.exclude else []

# Timestamp, login name and group name of the current process.
now = datetime.now()
username = getpass.getuser()
groupname = grp.getgrgid(os.getgid()).gr_name

# Common stem <user>_<YYYYMMDDThhmmss>_<directory> shared by all outputs.
name = '_'.join([username, now.strftime('%Y%m%dT%H%M%S'), directory])
namelog, namemd5 = name + '.log', name + '.md5sum'
nameman, archivefile = name + '.manifest', name + '.tar.bz2'
#*************************************************************
#Open logfile
#*************************************************************
# The log is opened in append mode and shared, as a global, by every
# function in this script.
logfile=open(namelog,'a')
#*************************************************************
#Main code
#*************************************************************
# Pre-flight checks: target directory, its metadata file, and every
# excluded subdirectory must exist.  'excluded' is an empty list when no
# -x option was given, so the loop is then a no-op.
checkdirectory(directory,0)
checkmetadata(directory)
for f in excluded:
    checkdirectory(directory+'/'+f,1)
# Manifest and checksums first, so their copies end up inside the archive.
createlist(directory,nameman,excluded)
createlist_md5sum(directory,namemd5,excluded)
createarchive(directory,archivefile,excluded)
checkintegrity(archivefile)
renameoriginal(directory)
store(archivefile,groupname,username)
# Every failure path closes the log before exiting; do the same on success.
logfile.close()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment