# Contributors:
# Christopher P. Barnes <senrabc@gmail.com>
# Andrei Sura: github.com/indera
# Mohan Das Katragadda <mohan.das142@gmail.com>
# Philip Chase <philipbchase@gmail.com>
# Ruchi Vivek Desai <ruchivdesai@gmail.com>
# Taeber Rapczak <taeber@ufl.edu>
# Nicholas Rejack <nrejack@ufl.edu>
# Josh Hanna <josh@hanna.io>
# Copyright (c) 2015, University of Florida
# All rights reserved.
#
# Distributed under the BSD 3-Clause License
# For full text of the BSD 3-Clause License see http://opensource.org/licenses/BSD-3-Clause
"""
This module is used to connect to an sftp server
and retrieve the raw EMR file to be used as input for RED-I.
"""
import os
import csv
from xml.sax import saxutils
import logging
import pysftp
from csv2xml import openio, Writer
from paramiko.ssh_exception import SSHException, BadAuthenticationType
import sys
import ast
import copy
# Module-level logger named after this module; its own threshold is set to
# INFO here, independently of whatever the application configures elsewhere.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class EmrFileAccessDetails(object):
    """
    Encapsulate the settings used to retrieve the EMR
    source file using an SFTP connection

    Attributes set by the constructor:
      sftp_project_name - remote directory prefix for the files to download
      download_list     - dict mapping remote file name -> local file name
      host, username, password, port, private_key, private_key_pass
                        - connection settings (port is converted to int)

    @see redi#_run()
    """

    def __init__(self,
                 emr_sftp_project_name,
                 emr_download_list,
                 emr_host,
                 emr_username,
                 emr_password,
                 emr_port,
                 emr_private_key,
                 emr_private_key_pass):
        """
        :emr_download_list either a dict, the text of a dict literal
            ("{'remote.csv': 'local.txt'}") or, for older configurations,
            a single remote file name
        :emr_port anything accepted by int(); raises ValueError otherwise
        """
        self.sftp_project_name = emr_sftp_project_name
        self.download_list = self._parse_download_list(emr_download_list)
        self.host = emr_host
        self.username = emr_username
        self.password = emr_password
        self.port = int(emr_port)
        self.private_key = emr_private_key
        self.private_key_pass = emr_private_key_pass

    @staticmethod
    def _parse_download_list(raw_value):
        """
        Return the download list as a dict {remote name: local name}.

        Maintains backwards compatibility with existing config repos:
        whenever the value is not a dictionary literal it is treated as a
        single remote file name which gets saved locally as "raw.txt".
        """
        if isinstance(raw_value, dict):
            return raw_value
        try:
            parsed = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError):
            # Plain file names are usually not valid Python literals:
            # "dir/raw.csv" is rejected with ValueError, while names such as
            # "my file.csv" do not even parse and raise SyntaxError.
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        # Covers both unparsable values and literals of another type
        # (e.g. a file literally named "2015" evaluates to an int).
        return {str(raw_value): "raw.txt"}
#============================
# Module level functions
#============================
def download_files(destination, access_details):
    """
    Download a file from the sftp server

    :destination local path under which the downloaded file is saved
    :access_details holds info for accessing the source file over sftp.
        Its `download_list` attribute is passed to sftp.get() as the remote
        path; every attribute other than `download_list` and
        `sftp_project_name` is forwarded to pysftp.Connection as a
        keyword argument.

    On a connection, authentication or I/O failure the error is logged and
    the process is terminated with a non-zero exit status (SystemExit).

    @see get_emr_data()
    """
    # keep only the elements of the dictionary which describe the connection
    not_connection_settings = ('download_list', 'sftp_project_name')
    connection_info = dict(
        (name, value)
        for name, value in access_details.__dict__.items()
        if name not in not_connection_settings)

    # check for errors during authentication with EMR server
    try:
        with pysftp.Connection(**connection_info) as sftp:
            logger.info("User %s connected to sftp server %s",
                        connection_info['username'], connection_info['host'])
            sftp.get(access_details.download_list, destination)
    except IOError as e:
        # NOTE(review): the transfer itself can presumably raise IOError too
        # (e.g. missing remote file), so the hint below is only the most
        # likely cause - check the logged traceback.
        logger.error("Please verify that the private key file mentioned in "
                     "settings.ini exists.")
        logger.exception(e)
        # a bare sys.exit() would report success (status 0) to the caller
        sys.exit(1)
    except (BadAuthenticationType, SSHException) as e:
        # BadAuthenticationType is a kind of SSHException in paramiko; both
        # are listed to make the handled failure modes explicit.
        logger.error("Please verify that the EMR server connection details "
                     "under section emr_data in settings.ini are correct")
        logger.exception(e)
        sys.exit(1)
def data_preprocessing(input_filename, output_filename):
    """
    Copy input_filename to output_filename, XML-escaping every line

    The characters &, > and < are replaced with &amp;, &gt; and &lt;

    :input_filename path of the raw text file to read
    :output_filename path of the escaped file to write (overwritten)
    """
    with open(input_filename, 'r') as source:
        with open(output_filename, 'w') as target:
            target.writelines(saxutils.escape(text) for text in source)
def generate_xml(input_filename, output_filename):
    """
    Convert the CSV file input_filename into the XML file output_filename

    generate_xml replicates the functionality from the "main" code block
    of csv2xml.py. This allows us to use it like another module in our
    project without having to call os.system().

    :input_filename path of the CSV file, opened with the cp1252 encoding;
        its first row is consumed as the header
    :output_filename path of the XML file, opened with the utf8 encoding

    Side effect: (re)registers the csv dialect named 'custom'.
    """
    class Arguments:
        """Plain attribute holder standing in for parsed command line args"""
        pass

    args = Arguments()

    # Set the properties which we used to pass as command line arguments
    args.iencoding = 'cp1252'
    args.oencoding = 'utf8'
    # A plain boolean: a stray trailing comma used to turn this into the
    # tuple (True,), which only worked because a non-empty tuple is truthy.
    args.header = True
    args.delimiter = ','
    args.declaration = True
    args.root_elem = 'study'
    args.record_elem = 'subject'
    args.ofile = output_filename
    args.ifile = input_filename

    # Now configure the defaults that would've been set if we were to execute
    # csv2xml.py from the command line.
    args.linebreak = u'\n'
    args.escapechar = None
    args.indent = u' '
    args.quoting = csv.QUOTE_MINIMAL
    args.skipinitialspace = False
    args.field_elem = u'field'
    args.flat_fields = False
    args.doublequote = True
    args.quotechar = '"'
    args.newline_elem = None

    # WARNING! The rest of this function is copied verbatim from csv2xml.py.
    # There should be no differences between these blocks of code whatsoever.
    # TODO: Replace csv2xml.py entirely?
    csv.register_dialect('custom',
                         delimiter=args.delimiter,
                         doublequote=args.doublequote,
                         escapechar=args.escapechar,
                         quotechar=args.quotechar,
                         quoting=args.quoting,
                         skipinitialspace=args.skipinitialspace)

    with openio(args.ifile, mode='r', encoding=args.iencoding,
                newline='') as ifile:
        csvreader = csv.reader(ifile, dialect='custom')

        if args.header:
            # from here on args.header holds the column names of the first row
            args.header = next(csvreader)

        with openio(args.ofile, 'w', args.oencoding) as ofile:
            writer = Writer(ofile, args)
            writer.write_file(csvreader)
def cleanup(file_to_delete):
    """
    Remove a file from the local file system

    :file_to_delete path of the file to remove; os.remove() raises OSError
        when the path can not be removed
    """
    os.remove(file_to_delete)
def get_emr_data(conf_dir, connection_details):
    """
    Download every file listed in connection_details.download_list

    :conf_dir configuration directory name; each file is saved inside it
        under the local name given in the download list
    :connection_details EmrFileAccessDetails object whose download_list is a
        dict mapping remote file name -> local file name. The remote names
        are looked up below connection_details.sftp_project_name.

    @see download_files()
    """
    # Remote SFTP paths always use forward slashes, whatever the local
    # platform is, so they must not be built with os.path.join.
    import posixpath

    download_list = connection_details.download_list
    number_of_files = len(download_list)

    for counter, (remote_name, local_name) in enumerate(
            download_list.items(), 1):
        logger.info("Now downloading %i of %i file(s)",
                    counter, number_of_files)

        raw_txt_file = os.path.join(conf_dir, local_name)

        # download_files() expects `download_list` to hold the single remote
        # path to fetch, so hand it a copy with that attribute replaced. A
        # shallow copy is enough: the attribute is rebound, never mutated.
        temp_connection_details = copy.copy(connection_details)
        temp_connection_details.download_list = posixpath.join(
            connection_details.sftp_project_name, remote_name)

        logger.info("Downloading remote file file: %s",
                    temp_connection_details.download_list)
        logger.info("Saving to local file name: %s", raw_txt_file)
        download_files(raw_txt_file, temp_connection_details)