#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2022-07-31 09:52:44 +0300 (Sun, 31 Jul 2022) $
#$Revision: 9353 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.10.0/scripts/cif_values $
#------------------------------------------------------------------------------
#*
#* Read CIFs and print out requested tag values in a tabular form.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use File::Basename qw( basename );
use COD::CIF::Parser qw( parse_cif );
use COD::SOptions qw( getOptions get_value );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_parser_messages
                          report_message );
use COD::ToolsVersion qw( get_version_string );

my $die_on_error_level = {
    ERROR   => 1,
    WARNING => 0,
    NOTE    => 0
};

my $use_parser = "c";

my $header = 1;
my $separator = "\t";
my $value_separator = ",";

my @selected_tags;
my @uniquified_tags;
my $write_data_name = 1;

my $write_filename = 1;
my $write_basename = 2;
my $write_file;

my $replace_spaces = 1; # indicates whether spaces should be replaced
                        # to underscores
my $fix_errors = 0;

my $treat_dots_as_underscores = 1;

#* OPTIONS:
#*   -t, --tags _tag1,_tag2,_tag3
#*   --tags "_tag1 _tag2 _tag3"
#*                     Print out values of tags _tag1, _tag2, and _tag3;
#*                     the specified tag list becomes a new list of printed tags.
#*
#*   -a, --add-tag _tag4
#*                     Add tag _tag4 to the list of printed tags.
#*
#*   -c, --clear-tags
#*                     Clear the list of printed tags accumulated so far.
#*
#*   -h, --header
#*                     Print out header with the names of selected tags at the
#*                     beginning (default).
#*
#*   -h-, --no-header
#*                     Do not print out any header.
#*
#*   -r, --replace-spaces
#*                     Replace spaces into underscores (default).
#*
#*   -r-, --dont-replace-spaces, --no-replace-spaces
#*                     Do not replace spaces into underscores (works
#*                     satisfactory only when tabs are used as separators).
#*
#*   -s, --separator=", "
#*                     Use the specified string as a separator between tag
#*                     values on a line.
#*
#*   -u, --uniquify-tags _tag1,_tag2
#*                     Print only unique values of specified tags, not the whole
#*                     list of values.
#*
#*   --add-uniquified-tag _tag1
#*                     Add tag '_tag' to the uniquified tag list.
#*
#*   -u-, --dont-uniquify-tags, --no-uniquify-tags
#*                     Print all values of all tags, do not uniquify any tags
#*                     (default).
#*
#*   --tab-separator
#*                     Use TAB character as a separator between tag values on
#*                     a line (default).
#*
#*   --vseparator, --value-separator ','
#*                     Use the specified string to separate multiple values
#*                     of a given tag (default: ',').
#*
#*   --treat-dots-as-underscores
#*                     Convert all dots in tag names into underscores (default).
#*
#*   --dont-treat-dots-as-underscores, --no-treat-dots-as-underscores
#*                     Leave original tags as they are.
#*
#*   --filename
#*                     Print out full filename at the beginning of each line.
#*
#*   --fix-syntax-errors
#*                     Try to fix those syntax errors in input CIFs that can be
#*                     corrected unambiguously.
#*
#*   --no-fix-syntax-errors,
#*   --dont-fix-syntax-errors
#*                     Do not try to fix syntax errors in input CIFs (default).
#*
#*   --basename
#*                     Print out basename of each file at the beginning of each
#*                     line.
#*
#*   --no-filename
#*                     Do not print file names (default).
#*
#*   --dataname
#*                     Print out data block name at the beginning of each line
#*                     (default).
#*
#*   --no-dataname
#*                     Do not print data block names.
#*
#*   --use-perl-parser
#*   --use-c-parser
#*                     Specify parser to parse CIF files (default: C parser).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    "-t,--tags" => sub { @selected_tags = split( /\s|,/, get_value()) },
    "-a,--add-tag"    => \@selected_tags,
    "-c,--clear-tags" => sub { @selected_tags = () },

    "-h,--header"     => sub { $header = 1 },
    "-h-,--no-header" => sub { $header = 0 },

    "-r,--replace-spaces" => sub { $replace_spaces = 1 },
    "-r-,--no-replace-spaces" => sub { $replace_spaces = 0 },
    "--dont-replace-spaces" => sub { $replace_spaces = 0 },

    "-s,--separator"  => \$separator,
    "--tab-separator" => sub { $separator = "\t" },

    "-u,--uniquify-tags" => sub { @uniquified_tags =
                                      split( /\s|,/, get_value())},

    "--add-uniquified-tag" => \@uniquified_tags,

    "-u-,--no-uniquify-tags" => sub { @uniquified_tags = () },
    "--dont-uniquify-tags"   => sub { @uniquified_tags = () },

    "--treat-dots-as-underscores"
        => sub { $treat_dots_as_underscores = 1 },
    "--dont-treat-dots-as-underscores"
        => sub { $treat_dots_as_underscores = 0 },
    "--no-treat-dots-as-underscores"
        => sub { $treat_dots_as_underscores = 0 },

    "--vseparator"      => \$value_separator,
    "--value-separator" => \$value_separator,

    "--filename"      => sub { $write_file = $write_filename },
    "--basename"      => sub { $write_file = $write_basename },
    "--no-filename"   => sub { $write_file = 0 },

    "--fix-syntax-errors"       => sub { $fix_errors = 1; },
    "--dont-fix-syntax-errors"  => sub { $fix_errors = 0; },
    "--no-fix-syntax-errors"    => sub { $fix_errors = 0; },

    "--dataname"      => sub { $write_data_name = 1 },
    "--no-dataname"   => sub { $write_data_name = 0 },

    "--use-perl-parser" => sub { $use_parser = "perl" },
    "--use-c-parser"    => sub { $use_parser = "c" },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

sub sunique(@);

if( $treat_dots_as_underscores ) {
    @selected_tags = map { s/\./_/g; $_ } @selected_tags;
    @uniquified_tags = map { s/\./_/g; $_ } @uniquified_tags;
}

@selected_tags = map { lc($_) } @selected_tags;

@uniquified_tags = map { lc($_) } @uniquified_tags;

my %uniquified_tags = map { ($_,1) } @uniquified_tags;

if( $header ) {
    print "# ", join( $separator, @selected_tags ), "\n";
}

@ARGV = ( "-" ) unless @ARGV;

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

for my $filename (@ARGV) {

    my $options = { fix_errors => $fix_errors,
                    parser     => $use_parser,
                    no_print   => 1 };

    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );
    next if ( $err_count > 0 );

    if( !defined $data->[0] || !defined $data->[0]{name} ) {
        report_message( {
           'program'   => $0,
           'filename'  => $filename,
           'err_level' => 'WARNING',
           'message'   => 'file seems to be empty'
        }, $die_on_error_level->{WARNING} );
        next;
    }

    for my $dataset (@$data) {

        my @values = ();
        my $values = $dataset->{values};

        if( $treat_dots_as_underscores ) {
            for my $tag (keys %{$values}) {
                my $new_tag = $tag;
                $new_tag =~ s/\./_/g;
                if( $new_tag ne $tag ) {
                    $values->{$new_tag} = $values->{$tag};
                }
            }
        }

        if( $write_file ) {
            if( $write_file == $write_filename ) {
                push( @values, $filename );
            } else {
                push( @values, basename( $filename ));
            }
        }

        if( $write_data_name ) {
            push( @values, $dataset->{name} ? $dataset->{name} : "?" );
        }

        for my $tag (@selected_tags) {
            if( exists $values->{$tag} ) {
                my @tag_vals;
                if( $replace_spaces ) {
                    @tag_vals = map { s/\n$//; s/[\n\s]/_/g; $_ }
                        @{$values->{$tag}};
                } else {
                    @tag_vals = map { s/\n$//; s/[\n\s]/ /g; $_ }
                        @{$values->{$tag}};
                }
                if( $uniquified_tags{$tag} ) {
                    push( @values, join( $value_separator,
                                         sunique( @tag_vals )));
                } else {
                    push( @values, join( $value_separator, @tag_vals ));
                }
            } else {
                push( @values, "?" );
            }
        }

        print join( $separator, @values ), "\n";
    }
}

sub sunique(@)
{
    my @sorted = sort { $a cmp $b } @_;
    my @unique = ();

    for my $value (@sorted) {
        if( int(@unique) == 0 || $unique[-1] ne $value ) {
            push( @unique, $value );
        }
    }
    return @unique;
}
