# Source code for redi.utils.csv2xml

#!/usr/bin/env python

from __future__ import print_function, unicode_literals
from io import open
import sys
# Re-import sys, then force the process-wide default string encoding to UTF-8.
# NOTE(review): reload() as a bare builtin and sys.setdefaultencoding() look
# like Python 2-only facilities, and the reload is presumably what makes
# setdefaultencoding available again -- confirm before porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# ==============================================================================
# Conversion from CSV to XML
# ==================
#
# Legal
# ------------------
#
# This software was written by Jerzy Jalocha N <jjalocha@gmail.com>. It is
# distributed "as is" without warranty of any kind. Use at your own risk!
# The author puts no restrictions on the user of this software, except
# attribution. You are free to share, remix, and re-license it, as long as
# the original author is credited.
#
#
# Synopsis
# ------------------
#
# csv2xml.py [options] ifile
#
#
# General Options
# ------------------
#
# ifile             Input CSV file path. Mandatory argument. If a hyphen '-' is
#                   given, the script reads from STDIN.
#
# ofile             Output XML file path. If the option is absent, the script
#                   writes to STDOUT.
#
# iencoding         Input file encoding. Defaults to UTF-8.
#
# oencoding         Output file encoding. Defaults to UTF-8.
#
#
# Input Options
# ------------------
#
# delimiter         The one-character string that is used to separate the
#                   fields. It defaults to a comma ','.
#                   eg: Specifying '\t', instructs the script to read tabulator-
#                       separated files:
#                         A1[TAB]B1[TAB]C1
#
# doublequote       Controls how instances of quotechar appearing inside a field
#                   should themselves be quoted. When True, the character is
#                   doubled. When False, the escapechar is used as a prefix to
#                   the quotechar. It defaults to True.
#
# escapechar        The escapechar removes any special meaning from the
#                   following character. It defaults to None, which disables
#                   escaping.
#                   eg: Using '!' for escaping the delimiter character:
#                         A1,B!,2,C3
#
#(lineterminator)   The reader is hard-coded to recognise either '\r' or '\n' as
#                   end-of-line, and ignores lineterminator. This behavior may
#                   change in the future.
#
# quotechar         A one-character string used to quote fields containing
#                   special characters, such as the delimiter or quotechar, or
#                   which contain new-line characters. It defaults to '"'.
#                   eg: When set to a single-quote, you can easily use the
#                       delimiter inside fields:
#                         A1,'bee,2',C3
#
# quoting           Controls when quotes should be recognised by the reader. It
#                   can take on any of the QUOTE_* constants below:
#     QUOTE_MINIMAL Default. Instructs writer objects to only quote those fields
#                   which contain special characters such as delimiter,
#                   quotechar or any of the characters in lineterminator.
#         QUOTE_ALL Instructs writer objects to quote all fields.
#  QUOTE_NONNUMERIC Instructs the reader to convert all non-quoted fields to
#                   type float.
#        QUOTE_NONE Instructs reader to perform no special processing of quote
#                   characters.
#
# skipinitialspace  When True, whitespace immediately following the delimiter is
#                   ignored. The default is False.
#                   eg: When False, 'A, B, C' is read as
#                         <field>A</field> <field> B</field> <field> C</field>.
#                       When True, it is read as
#                         <field>A</field> <field>B</field> <field>C</field>.
#
# header            This option instructs the script to read the field
#                   names from the first file line. It defaults to False.
#                   eg: When True, it reads the following CSV input:
#                         'colA,colB,colC
#                          A1,B1,C1'
#                       And uses the first line as field element names:
#                         <colA>A1</colA> <colB>B1</colB> <colC>C1</colC>
#
#
# Output Options
# ------------------
#
# xml-declaration   This option instructs the script to write an XML
#                   declaration. It defaults to False.
#                   eg: If this option is set, the first line in the output
#                       is <?xml version="1.0" encoding="UTF-8"?>.
#
# root_elem         These three options define the element names in the output
# record_elem       XML document. They default to <root>, <record>, and
# field_elem        <field>.
#                   eg: Specifying 'table', 'row', and 'cell', the output
#                       elements become <table>, <row>, and <cell>.
#
# newline_elem      Name for the newline element. It is disabled (None) by
#                   default.
#                   eg: Specifying 'br' will output a <br/> element for each
#                       newline in a field.
#
# flat_fields       This option disables the numbering of the field elements in
#                   the XML output. It is False by default.
#                   eg: When True, a field element is output as a <field>
#                       element, instead of <field0>, <field1>, etc.
#
# indent            XML file indentation. Defaults to four spaces '    '.
#                   eg: Specifying '\t' uses a tabulator, and using ''
#                       disables indentation altogether.
#
# linebreak         This option defines what character is used at the end of
#                   each line in the XML file. It defaults to '\n', printing
#                   a new line after each element.
#                   eg: Specifying '' instructs the script to print the whole
#                       XML document as one single line.
#
# ==============================================================================

import csv
import sys
from optparse import OptionParser, OptionGroup
from xml.sax.saxutils import escape

# Replace s by r in text.


def replace(text, s, r):
    """Return *text* with every occurrence of *s* replaced by *r*.

    The text is split on *s* and the pieces are re-joined with *r*, so an
    empty *s* raises ``ValueError`` (from ``str.split``) instead of inserting
    *r* between every character.

    :param text: string to process.
    :param s: non-empty substring to search for.
    :param r: replacement string.
    :return: the resulting string.
    """
    pieces = text.split(s)
    return r.join(pieces)

# NOTE: If you modify this script, and the need arises to re-use sys.stdin
#       or sys.stdout, uncomment the following.
#
# Never close STDIN and STDOUT.
# def do_not_close(exc_type, exc_value, traceback):
#    pass
#sys.stdin.__exit__ = do_not_close
#sys.stdout.__exit__ = do_not_close

# Open a file or standard input/output from a unified interface.


def openio(filename, mode, encoding, newline=None):
    """Open *filename*, or return a standard stream when no file is named.

    :param filename: path of the file to open. A hyphen ``'-'`` or any falsy
        value (``None``, ``''``) selects the standard streams instead.
    :param mode: ``'r'`` or ``'w'``. For a real file it is passed to
        ``open()`` unchanged; for the standard streams ``'r'`` selects
        ``sys.stdin`` and ``'w'`` selects ``sys.stdout``.
    :param encoding: text encoding used when opening a real file. It is not
        applied to the standard streams.
    :param newline: newline handling passed to ``open()`` for a real file. It
        is not applied to the standard streams.
    :return: a file object, ``sys.stdin`` or ``sys.stdout``.
    :raises ValueError: if a standard stream is requested with a mode other
        than ``'r'`` or ``'w'``.
    """
    # Hyphen is commonly used to designate stdin/stdout.
    if filename and filename != '-':
        return open(filename, mode=mode, encoding=encoding, newline=newline)
    if mode == 'r':
        return sys.stdin
    if mode == 'w':
        return sys.stdout
    raise ValueError("mode not recognized")

# Sometimes we need to print linebreak elements to the output document, in place
# of the real linebreaks in the input document. Sometimes we just print out the
# unmodified field content.


def field_subst_factory(newline):
    """Build the function that post-processes field text before output.

    :param newline: name of the XML element that should stand in for a line
        break, or a falsy value (``None``, ``''``) to leave fields untouched.
    :return: a one-argument callable. When *newline* is set, it returns its
        argument with every ``'\\n'`` replaced by ``<newline/>``; otherwise it
        returns its argument unchanged.
    """
    if not newline:
        def text_keep(field):
            return field
        return text_keep

    # Only build the tag when substitution is actually enabled.
    newline_tag = '<{0}/>'.format(newline)

    def text_replace(field):
        return field.replace('\n', newline_tag)
    return text_replace

# This class handles all the creating of the XML file.


class Writer:
    """Serialise an iterable of records (sequences of fields) as XML.

    :param ofile: writable text file object that receives the document.
    :param args: options object. The attributes read are ``newline_elem``,
        ``header``, ``flat_fields``, ``declaration``, ``oencoding``,
        ``root_elem``, ``record_elem``, ``field_elem``, ``indent`` and
        ``linebreak``.

    Field element names are picked once, at construction time:

    * if ``args.header`` is truthy it is indexed by field position;
    * otherwise, if ``args.flat_fields`` is truthy, every field uses
      ``args.field_elem``;
    * otherwise the field position is appended to ``args.field_elem``.
    """

    def __init__(self, ofile, args):
        self.file = ofile
        self.args = args
        self.newline_subst = field_subst_factory(args.newline_elem)
        if args.header:
            self.fieldname = self.__fieldname_header
        elif args.flat_fields:
            self.fieldname = self.__fieldname_flat
        else:
            self.fieldname = self.__fieldname_indexed

    def write_file(self, data):
        """Write the whole document: optional declaration, root, records.

        :param data: iterable of records; each record is an iterable of
            fields. It is consumed exactly once.
        """
        if self.args.declaration:
            declaration = ('<?xml version="1.0" encoding="{0}"?>'.
                           format(self.args.oencoding))
            self.write(declaration)
        self.write("<{0}>".format(self.args.root_elem))
        for record in data:
            self.write_record(record)
        self.write("</{0}>".format(self.args.root_elem))

    def write_record(self, record):
        """Write one record element, indented one level, with its fields."""
        self.write("{0}<{1}>".
                   format(self.args.indent, self.args.record_elem))
        for index, field in enumerate(record):
            self.write_field(field, index)
        self.write("{0}</{1}>".
                   format(self.args.indent, self.args.record_elem))

    def write_field(self, field, index):
        """Write one field element, indented two levels.

        :param field: field value; it is converted to text before output.
        :param index: zero-based position of the field within its record,
            used to choose the element name.
        """
        # Escape '&', '<' and '>' so that field content cannot break the
        # well-formedness of the document. This must happen before newline
        # substitution, otherwise the inserted <.../> tags would be escaped
        # as well.
        text = escape('{0}'.format(field))
        self.write("{0}{0}<{1}>{2}</{1}>".
                   format(self.args.indent, self.fieldname(index),
                          self.newline_subst(text)))

    def write(self, text):
        """Write *text* followed by the configured line break."""
        print(text, file=self.file, end=self.args.linebreak)

    def __fieldname_header(self, index):
        return self.args.header[index]

    def __fieldname_flat(self, index):
        return self.args.field_elem

    def __fieldname_indexed(self, index):
        return self.args.field_elem + str(index)

# Custom callback function for the command-line parser.
# Store tabs and newlines as "real" tabs and newlines.


def cleanup_callback(option, opt, value, parser):
    """optparse callback that stores *value* with escapes expanded.

    The two-character sequences ``\\n`` and ``\\t`` typed on the command line
    are turned into a real newline and a real tab before the value is stored
    on ``parser.values`` under ``option.dest``.

    :param option: the option being processed; only ``dest`` is read.
    :param opt: option string as seen on the command line (unused).
    :param value: raw option argument.
    :param parser: the parser; its ``values`` object receives the result.
    """
    result = value.replace('\\n', '\n').replace('\\t', '\t')
    setattr(parser.values, option.dest, result)

# Parse the huge amount of command-line options.


def parse_cmdline(argv=None):
    """Parse the command line and return the resulting options object.

    :param argv: list of argument strings to parse. ``None`` (the default)
        lets ``optparse`` read ``sys.argv[1:]``.
    :return: the optparse values object, with the single positional argument
        stored as ``ifile``.

    Exactly one positional argument (the input file) is required; any other
    count is reported through ``parser.error``.
    """
    usage = "usage: %prog [options] IFILE"
    parser = OptionParser(usage)
    parser.set_defaults(iencoding='UTF-8',
                        oencoding='UTF-8',
                        delimiter=b',',
                        doublequote=True,
                        quotechar=b'"',
                        quoting=csv.QUOTE_MINIMAL,
                        skipinitialspace=False,
                        header=False,
                        declaration=False,
                        root_elem='root',
                        record_elem='record',
                        field_elem='field',
                        flat_fields=False,
                        indent='    ',
                        linebreak='\n')
    parser.add_option('-o', '--output-file', dest='ofile',
                      help="save to file OFILE")
    parser.add_option('-c', '--input-encoding', dest='iencoding',
                      help="input file encoding")
    parser.add_option('-C', '--output-encoding', dest='oencoding',
                      help="output file encoding")

    # Options describing how the CSV input is to be read.
    igroup = OptionGroup(parser, "CSV Dialect Options")
    igroup.add_option('-d', '--delimiter', dest='delimiter', type='str',
                      action='callback', callback=cleanup_callback,
                      help="a one-character string used to separate fields")
    igroup.add_option('-b', '--no-doublequote', action='store_false',
                      dest='doublequote',
                      help="controls how instances of quotechar appearing "
                      "inside a field should be themselves be quoted")
    igroup.add_option('-e', '--escapechar',
                      help="the escapechar removes any special meaning from "
                      "the following character")
    igroup.add_option('-q', '--quotechar',
                      help="A one-character string used to quote fields "
                      "containing special characters")
    igroup.add_option('--quote-all', dest='quoting',
                      action='store_const', const=csv.QUOTE_ALL,
                      help="quote all field (READER?)")
    igroup.add_option('--quote-minimal', dest='quoting',
                      action='store_const', const=csv.QUOTE_MINIMAL,
                      help="quote only special characters (WRITER?)")
    igroup.add_option('--quote-nonnumeric', dest='quoting',
                      action='store_const', const=csv.QUOTE_NONNUMERIC,
                      help="convert all non-quoted fields to type float")
    igroup.add_option('--quote-none', dest='quoting',
                      action='store_const', const=csv.QUOTE_NONE,
                      help="perform no special processing of quote characters")
    igroup.add_option('-s', '--skipinitialspace', action='store_true',
                      help="if whitespace immediately following the delimiter "
                      "should be ignored")
    igroup.add_option('-a', '--header', action='store_true',
                      help="read field names from file")

    # Options describing how the XML output is to be written.
    ogroup = OptionGroup(parser, "XML Dialect Options")
    ogroup.add_option('-x', '--xml-declaration', dest='declaration',
                      action='store_true',
                      help="whether to output an XML declaration")
    ogroup.add_option('-t', '--root-element', dest='root_elem',
                      help="name of the root element")
    ogroup.add_option('-r', '--record-element', dest='record_elem',
                      help="name of the record-level element")
    ogroup.add_option('-f', '--field-element', dest='field_elem',
                      help="name of the field-level element")
    ogroup.add_option('-n', '--newline-element', dest='newline_elem',
                      help="name of the line break element")
    ogroup.add_option('-l', '--flat-fields', action='store_true',
                      help="disable field element numbering")
    ogroup.add_option('-i', '--indent', dest='indent', type='str',
                      action='callback', callback=cleanup_callback,
                      help="indentation")
    ogroup.add_option('-k', '--linebreak', dest='linebreak', type='str',
                      action='callback', callback=cleanup_callback,
                      help="line break character in output file")

    parser.add_option_group(igroup)
    parser.add_option_group(ogroup)
    options, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    options.ifile = args[0]
    return options

# The main processing code.
if __name__ == '__main__':
    # Script entry point: parse the options, register the CSV dialect they
    # describe, then stream the parsed rows straight into the XML writer.
    args = parse_cmdline()
    dialect_params = dict(delimiter=args.delimiter,
                          doublequote=args.doublequote,
                          escapechar=args.escapechar,
                          quotechar=args.quotechar,
                          quoting=args.quoting,
                          skipinitialspace=args.skipinitialspace)
    csv.register_dialect('custom', **dialect_params)
    with openio(args.ifile, mode='r', encoding=args.iencoding,
                newline='') as source:
        rows = csv.reader(source, dialect='custom')
        if args.header:
            # Swap the boolean flag for the column names from the first row.
            args.header = next(rows)
        # The output is opened only after the header row has been read.
        with openio(args.ofile, 'w', args.oencoding) as sink:
            Writer(sink, args).write_file(rows)