#!/usr/bin/perl -w

use FindBin;
use lib "$FindBin::Bin/../../perl_lib";

#-d:DProf

######################################################################
#
#  __COPYRIGHT__
#
# Copyright 2000-2008 University of Southampton. All Rights Reserved.
# 
#  __LICENSE__
#
######################################################################

=head1 NAME

import_rand_data - import random eprint data records

=head1 SYNOPSIS

import_rand_data [--user=<userid>] [--dump] [--scigen] [--quiet] [--verbose] <repoid> [dataset] [count]

=cut

use EPrints;
use Getopt::Long;
use Pod::Usage;
use Text::Lorem;

use strict;
use utf8;
use encoding "utf8";

# Command-line state. All flags start out false; $userid defaults to 1 and
# is stored in the "userid" field of every generated eprint.
my $version = 0;
my $verbose = 0;
my $quiet = 0;
my $help = 0;
my $man = 0;
my $userid = 1;
my $dump = 0;
my $scigen = 0;

# "permute" allows options and positional arguments to be mixed freely.
Getopt::Long::Configure("permute");

GetOptions(
	'user=s' => \$userid,
	'help|?' => \$help,
	'man' => \$man,
	'version' => \$version,
	'verbose+' => \$verbose,
	'silent' => \$quiet,
	'quiet' => \$quiet,
	'dump' => \$dump,
	'scigen' => \$scigen,
) || pod2usage( 2 );
# Report the version under this script's own name, as given in the POD NAME
# section (the previous string named a different script).
EPrints::Utils::cmd_version( "import_rand_data" ) if $version;
pod2usage( 1 ) if $help;
pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;
# The repository id is the only mandatory positional argument.
pod2usage( 2 ) if( scalar @ARGV < 1 );

# Noise level passed to the session: 1 by default, 0 with --quiet/--silent,
# and 1 + the number of --verbose flags when --verbose is given (which
# overrides --quiet).
my $noise = 1;
$noise = 0 if( $quiet );
$noise = 1+$verbose if( $verbose );

# Optional Text::Scigen generator; left undefined unless --scigen was given.
our $SCIGEN;
if( $scigen )
{
	# Load the module at run time so it is only required when requested.
	eval {
		require Text::Scigen;
		Text::Scigen->import();
	};
	if( $@ )
	{
		die "scigen argument requires Text::Scigen module";
	}
	$SCIGEN = Text::Scigen->new( filename => "scirules.in" );
}

srand(1); # Reproduce the exact same set of data

# Lookup table from the 26 upper-case (code points 65..90) and 26 lower-case
# (97..122) Latin letters to the code points starting at U+0410 and U+0430
# respectively. read_names() uses it to derive Cyrillic-script names.
our %CYRILLIC_MAP;
foreach my $offset (0 .. 25)
{
	$CYRILLIC_MAP{ chr(65+$offset) } = chr(0x410 + $offset);
	$CYRILLIC_MAP{ chr(97+$offset) } = chr(0x430 + $offset);
}

# Shared Text::Lorem generator used by rand_title(), rand_abstract() and the
# publication/publisher/institution pools.
our $lorem = Text::Lorem->new();

# Extra method added to Text::Lorem: returns one generated word with its
# first character upper-cased.
sub Text::Lorem::name
{
	my $self = shift;

	return ucfirst( $self->words(1) );
}

# Pool of parsed names that rand_names() draws from.
my @NAMES = read_names();

# Start from the generator's standard word list, then append every
# whitespace-separated word of the UTF-8 text in the __DATA__ section.
$lorem->{wordlist} = $lorem->generate_wordlist();
binmode(\*DATA, ":utf8");
while( defined( my $line = <DATA> ) )
{
	push @{$lorem->{wordlist}}, split( /\s+/, $line );
}

# nb. This syntax is subject to change in future versions.
# Positional arguments: repository id, optional dataset id, optional count.
my( $archiveid, $datasetid, $total ) = @ARGV;

# Plain class-method call; the indirect-object form ("new Class(...)") is
# ambiguous to the parser and discouraged by perlobj.
my $session = EPrints::Session->new( 1 , $archiveid, $noise );
exit( 1 ) unless( defined $session );

# Defaults: the "archive" dataset and 1000 records.
$datasetid = "archive" unless defined $datasetid;
$total ||= 1000;

# Basic data
# Eprint types configured in the repository, and the ids of the depositable
# subjects found below the "subjects" node.
our @TYPES = @{$session->get_repository->{types}->{eprint}};
our @SUBJECTS = read_subjects( $session, "subjects" );

# Fixed pools of generated sentences that the rand_* helpers pick from.
# The generation order (publications, publishers, institutions) is kept so
# the seeded random sequence is consumed in the same order as before.
our @PUBLICATIONS;
push @PUBLICATIONS, $lorem->sentences(1) for 1..100;

our @PUBLISHERS;
push @PUBLISHERS, $lorem->sentences(1) for 1..10;

our @INSTITUTIONS;
push @INSTITUTIONS, $lorem->sentences(1) for 1..10;

# Directory holding the sample files, relative to this script.
my $datapath = "$FindBin::Bin/../data";

# Sample documents that rand_documents() attaches to generated eprints.
# Each row below is [ format, file name, file size ]; every document is
# English, public, and consists of a single file that is also its main file.
our @DOCUMENTS = map {
	my( $format, $filename, $filesize ) = @{$_};
	+{
		format => $format,
		language => "en",
		security => "public",
		main => $filename,
		files => [ {
			filename => $filename,
			filesize => $filesize,
			url => "file:$datapath/$filename",
		} ],
	};
} (
	[ "application/pdf", "paper2.pdf",  "128765" ],
	[ "application/pdf", "paper.pdf",   "12174" ],
	[ "image/jpeg",      "picture.jpg", "7927" ],
	[ "text/plain",      "text.txt",    "35" ],
);

# Dataset the generated records are created in.
my $ds = $session->dataset( $datasetid );
if( !defined $ds )
{
	# Report the problem here instead of failing later with a method call
	# on an undefined value inside the generation loop.
	print STDERR "Unknown dataset: $datasetid\n";
	$session->terminate;
	exit( 1 );
}

# Switch on the repository's file and web import settings for this run;
# presumably needed so the file: URLs in @DOCUMENTS can be imported.
$session->get_repository->{config}->{enable_file_imports} = 1;
$session->get_repository->{config}->{enable_web_imports} = 1;

# The export plugin is only used in --dump mode; look it up once instead of
# on every iteration.
my $xml_plugin;
if( $dump )
{
	$xml_plugin = $session->plugin( "Export::XML" );
	print <<"EOX";
<?xml version='1.0'?>
<eprints>
EOX
}
for(1..$total)
{
	# NB: the rand_* calls are evaluated in the order they appear in the
	# hash below; reordering them changes which values of the seeded random
	# sequence each field receives.
	my $type = rand_type();
	my $data = {
		eprint_status => $datasetid,
		userid => $userid,
		type => $type,
		title => rand_title(),
		creators => rand_names(),
		editors => ($type =~ /^book/ ? rand_names() : []),
		ispublished => "pub",
		subjects => [ rand_subjects() ],
		full_text_status => "public",
		abstract => rand_abstract(),
		date => rand_date(),
		date_type => "published",
		refereed => "TRUE",
		publication => rand_publication(),
		book_title => ($type eq "book_section" ? rand_title() : undef),
		publisher => ($type =~ /^book/ ? rand_publisher() : undef),
		thesis_type => ($type eq "thesis" ? "phd" : undef),
		institution => rand_institution(),
		documents => rand_documents(),
	};
	if( $dump )
	{
		# --dump: print the record as XML instead of its id.
		print $xml_plugin->output_dataobj(
			$ds->create_object( $session, $data ),
			omit_declaration => 1,
			omit_root => 1,
		);
	}
	else
	{
		# One transaction per record; print the new eprint's id.
		$session->get_database->begin;
		my $eprint = $ds->create_object( $session, $data );
		$session->get_database->commit;
		# Guard against an undefined result so the failure is reported
		# clearly rather than as a method call on undef.
		die "Failed to create eprint in dataset $datasetid\n"
			unless defined $eprint;
		print $eprint->get_id, "\n";
	}
}
if( $dump )
{
	print "\n</eprints>\n";
}

$session->terminate;

exit;

# Load the name pool from data/names.gz (one name per line).
#
# A Cyrillic-script copy of roughly the first half of the names, produced
# with %CYRILLIC_MAP, is appended to the list. Every entry is then parsed
# into a hash with the keys "given", "family" and "lineage": the text after
# a comma becomes the lineage, the last whitespace-separated part before it
# the family name. Lines that do not match are used whole as the family
# name. Whitespace is stripped from family names.
#
# Returns the list of name hashes; dies if the file cannot be opened.
sub read_names
{
	my $path = "$FindBin::Bin/../data/names.gz";
	open(my $fh, "<:gzip", $path)
		or die "Error opening names.gz: $!";
	chomp(my @names = <$fh>);
	close($fh);

	# The range is fixed before the loop starts, so the entries pushed
	# inside the loop are not themselves transliterated.
	my $last_source = int($#names/2);
	foreach my $idx (0..$last_source)
	{
		(my $cyrillic = $names[$idx]) =~ s/(.)/$CYRILLIC_MAP{$1}||$1/eg;
		push @names, $cyrillic;
	}

	my @parsed;
	foreach my $line (@names)
	{
		my( $given, $family, $lineage ) =
			$line =~ /^([^,]+)\s+([^,]+)(?:,\s*(.+))?$/;
		unless( defined $family )
		{
			( $given, $family ) = ( undef, $line );
		}
		$family =~ s/\s+//g;
		push @parsed, {
			given => $given,
			family => $family,
			lineage => $lineage,
		};
	}
	return @parsed;
}

# Pick between one and six random entries from @NAMES (repeats possible).
#
# Returns an array reference of hashes with the keys "name" (the name hash
# from @NAMES) and "id" (that entry's index in @NAMES).
sub rand_names
{
	my $count = int(rand(6))+1;
	my @picked;
	for(1..$count)
	{
		my $index = int(rand(scalar(@NAMES)));
		push @picked, { name => $NAMES[$index], id => $index };
	}
	return \@picked;
}

# Return one randomly chosen entry of @TYPES.
sub rand_type
{
	my $index = int(rand(scalar @TYPES));
	return $TYPES[$index];
}

# Return a list of one to four randomly chosen entries of @SUBJECTS
# (repeats possible).
sub rand_subjects
{
	my $count = int(rand(4))+1;
	return map { $SUBJECTS[int(rand(scalar @SUBJECTS))] } 1..$count;
}

# Return a random title: a trimmed "SCI_TITLE" from the scigen generator
# when --scigen is active, otherwise one lorem sentence without its final
# full stop.
sub rand_title
{
	if( $scigen )
	{
		return trim($SCIGEN->generate( "SCI_TITLE" ));
	}

	(my $title = $lorem->sentences( 1 )) =~ s/\.$//;
	return $title;
}

# Return a random abstract: trimmed scigen output for the three abstract
# rules when --scigen is active, otherwise two to four lorem paragraphs.
sub rand_abstract
{
	if( $scigen )
	{
		my @rules = qw(SCI_INTRO_A SCI_ABSTRACT_A SCI_INTRO_THESIS);
		return trim($SCIGEN->generate( \@rules ));
	}

	my $paragraph_count = int(rand(3))+2;
	return $lorem->paragraphs($paragraph_count);
}

# Return a random date between 1950 and 1999 at a random resolution:
# "YYYY", "YYYY-MM" or "YYYY-MM-DD", each equally likely.
#
# The resolution was previously drawn with int(rand(2)), which only yields
# 0 or 1, so the day component could never be produced. Days are limited to
# 1..28 so the result is a valid calendar date for every month and year.
sub rand_date
{
	# 0 = year only, 1 = year and month, 2 = year, month and day
	my $res = int(rand(3));
	my $date = 1950 + int(rand(50));
	if( $res > 0 )
	{
		$date .= sprintf("-%02d",1+int(rand(12)));
	}
	if( $res > 1 )
	{
		$date .= sprintf("-%02d",1+int(rand(28)));
	}
	return $date;
}

# Collect the ids of all depositable subjects below a given subject.
#
#   $session - session object used to look up the "subject" dataset
#   $id      - subject id matched against each subject's "ancestors" field
#
# Returns the list of ids of the matching subjects whose "depositable"
# value is "TRUE".
sub read_subjects
{
	my( $session, $id ) = @_;

	my $dataset = $session->dataset( "subject" );
	my @found;

	# The subject object is the third argument passed to the callback.
	my $collect = sub {
		my $subject = $_[2];

		return if $subject->value( "depositable" ) ne "TRUE";

		push @found, $subject->id;
	};

	my $results = $dataset->search(filters => [
		{ meta_fields => [qw( ancestors )], value => $id },
	]);
	$results->map( $collect );

	return @found;
}

# Return a random publication name: a trimmed "CITE_JOURNAL" from the scigen
# generator when --scigen is active, otherwise a random entry of
# @PUBLICATIONS.
sub rand_publication
{
	if( $scigen )
	{
		return trim($SCIGEN->generate( "CITE_JOURNAL" ));
	}

	my $index = int(rand(scalar @PUBLICATIONS));
	return $PUBLICATIONS[$index];
}

# Return an array reference holding a random, non-empty selection of the
# entries of @DOCUMENTS in random order (the entries themselves are shared,
# not copied). Returns an empty array reference when @DOCUMENTS is empty.
#
# The selection size was previously drawn with rand($#DOCUMENTS), so the
# full set could never be chosen; it now ranges over 1..scalar(@DOCUMENTS).
sub rand_documents
{
	# Without this guard the shuffle below would index an empty array with
	# a negative subscript and die.
	return [] unless @DOCUMENTS;

	# Fisher-Yates shuffle of the index list.
	my @docs = (0..$#DOCUMENTS);
	for(my $i = $#docs; $i > 0; $i--)
	{
		my $j = int rand ($i+1);
		next if $i == $j;
		@docs[$i,$j] = @docs[$j,$i];
	}

	# Keep a random-length prefix of the shuffled indices.
	my $count = 1 + int(rand(scalar @DOCUMENTS));
	@docs = @docs[0..$count-1];

	return [@DOCUMENTS[@docs]];
}

# Return one randomly chosen entry of @PUBLISHERS.
#
# Uses a scalar element access instead of a one-element array slice, and
# draws the index from the full array length; rand($#PUBLISHERS) could never
# select the last entry.
sub rand_publisher
{
	return $PUBLISHERS[int(rand(scalar @PUBLISHERS))];
}

# Return a random institution name: a trimmed "CITE_INST" from the scigen
# generator when --scigen is active, otherwise a random entry of
# @INSTITUTIONS.
#
# Uses a scalar element access instead of a one-element array slice, and
# draws the index from the full array length; rand($#INSTITUTIONS) could
# never select the last entry.
sub rand_institution
{
	return trim($SCIGEN->generate( "CITE_INST" )) if $scigen;
	return $INSTITUTIONS[int(rand(scalar @INSTITUTIONS))];
}

# Normalise whitespace in a string: strip leading and trailing whitespace
# and collapse every run of two or more spaces into a single space.
#
#   $str - the string to clean up
#
# Returns the cleaned string; an undefined argument is returned unchanged.
sub trim
{
	my( $str ) = @_;
	return $str unless defined $str;
	$str =~ s/^\s+//;
	$str =~ s/\s+$//;
	# /g is required here: without it only the first run was collapsed.
	$str =~ s/  +/ /g;
	return $str;
}

=pod

=cut

__DATA__
Πιο κι άγχος συνηθίζουν χαρακτηριστικό μια μιας σελίδων περιμένουν τι αρέσει
γνωρίζει καθυστερούσε τι ροή Μη πως ίδιο θυμάμαι επιτυχία μου μη χαρτιού
γνωρίζουμε τελικών αντιλήφθηκαν νέα μη Βγήκε κεντρικό κακόκεφος ως μην το
μου ποια προσλάμβανες προσπαθήσεις Όλα βάζοντας εργασίας σημαίνει μα την τη
νόμιζες βρίσκονται Ώρα ας πάρεις φακέλους περιμένουν όχι ως χώρου αρχεία
τοπικές σας γεγονός κακόκεφους με
खयालात परिवहन संपुर्ण प्राण कलइस अपने कार्य हुएआदि प्राधिकरन खयालात वर्णित मुश्किल पेदा देखने औषधिक दुनिया
आंतरजाल करके(विशेष ।क नवंबर नयेलिए दिनांक बारे उन्हे विषय ऎसाजीस शारिरिक बीसबतेबोध बाजार क्षमता। भेदनक्षमता
मार्गदर्शन करते क्षमता। विश्लेषण केन्द्रित बाधा मुक्त
Об том яйце расположенной ушёл всего остальные до или Всех отнимет проходят
за эти так ну адрес человек покидали об лет может команда Нее даже сказать
остальных во ну был фирме колёса задаче заведено качества работать две то
Нет кровью проектировать программировать во опа ну опять сайта конечного
Люсита количества программистом нет ну те уже случае работал отнимет
стратегические ты бог Чем голос процессорах не вы малой вообще поработать
нью уже во совсем раннего предназначенная
Rout ugedon d'Vullen nei an ké d'Loft Scholl fergiess den ston d'Hierz
Hemecht wat jo Jo wuel aremt Poufank rei och Noper päift iw'rem wa rou jo
onser Scholl rëschten Mier spilt blénken an bei iech Mier d'Beem un rei am
Hunn Minutt Freiesch den Ze Frot laacht d'Lëtzebuerger rei mä iwerall
d'Kanner schéinen och Hir dé koum päift Scholl zwé wait Säiten Fuesent as do
gin d'Hierz schaddreg De hin Gart Kënnt onser Ierd Halm Gaas do och
