######################################################################
#
# ParaTools::Intl
#
######################################################################
#
#  This file is part of ParaCite Tools 
#
#  Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
#
#  ParaTools is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  ParaTools is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with ParaTools; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
######################################################################

package ParaTools::Intl;

use strict;
require Exporter;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAG $CHAR_MATCHES %CHAR_GRAMMAR %CHAR_TRANSFORMS %LIGATURES );

use utf8;
use Carp;
use Encode;

# Debug switch: when true, the generated accent tables are printed while the
# module is being loaded.
use constant DEBUG => 0;

# Nothing is exported by default; normalise_multichars is available on request.
@ISA       = ('Exporter');
@EXPORT_OK = ('&normalise_multichars');

=pod

=head1 NAME

B<ParaTools::Intl> - utility module for handling international characters

=head1 DESCRIPTION

ParaTools::Intl provides some utility functions for handling international
characters.

=head1 SYNOPSIS

	use ParaTools::Intl qw( normalise_multichars );

	print normalise_multichars( $str );

=head1 METHODS

=over 4

=item $str = normalise_multichars( $str )

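Converts multi-character sequences (an accent mark followed by a base
letter) into the single precomposed character, e.g.:

	¨o => ö

Such sequences appear in pdftotext output from PDFs generated by pdflatex.
Ligature characters (ff, fi, fl, ffi), the en dash and the right single
quotation mark are also replaced by their plain ASCII spellings.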

=cut

# Accent grammar.
#
# Outer key: code point of a stand-alone accent mark.
# Inner hash: base letter => code point of the precomposed character.
#
# Two accent marks can share the same set of letters (an ASCII character and
# its Latin-1 spacing accent), so those sets are written once and copied into
# a separate hash for each mark.
my @diaeresis_letters = (
	'a' => 0xe4,	# U+00E4 a with diaeresis
	'e' => 0xeb,	# U+00EB e with diaeresis
	'o' => 0xf6,	# U+00F6 o with diaeresis
	'u' => 0xfc,	# U+00FC u with diaeresis
);
my @acute_letters = (
	'a' => 0xe1,	# U+00E1 a with acute
	'e' => 0xe9,	# U+00E9 e with acute
	'o' => 0xf3,	# U+00F3 o with acute
	'u' => 0xfa,	# U+00FA u with acute
	'n' => 0x144,	# U+0144 n with acute
	'z' => 0x17a,	# U+017A z with acute
);

%CHAR_GRAMMAR = (
	# U+0022 QUOTATION MARK (")
	0x22 => { @diaeresis_letters },
	# U+0027 APOSTROPHE (')
	0x27 => { @acute_letters },
	# U+005E CIRCUMFLEX ACCENT (^)
	0x5e => {
		'a' => 0xe2,
		'e' => 0xea,
		'o' => 0xf4,
		'u' => 0xfb,
	},
	# U+0060 GRAVE ACCENT (`)
	0x60 => {
		'a' => 0xe0,
		'e' => 0xe8,
		'o' => 0xf2,
		'u' => 0xf9,
	},
	# U+00A8 DIAERESIS
	0xa8 => { @diaeresis_letters },
	# U+00B4 ACUTE ACCENT
	0xb4 => { @acute_letters },
	# U+007E TILDE (~)
	0x7e => {
		'n' => 0xf1,
	},
	# U+02D9 DOT ABOVE
	0x2d9 => {
		'z' => 0x17c,
	},
);
# Plain-ASCII spellings for single characters that normalise_multichars
# replaces outright: the Latin f-ligatures plus two punctuation marks.
# NOTE(review): U+FB04 (ffl) has no entry here and is not in the character
# class used by normalise_multichars - confirm whether that is intended.
%LIGATURES = (
	"\x{fb00}" => 'ff',	# U+FB00 LATIN SMALL LIGATURE FF
	"\x{fb01}" => 'fi',	# U+FB01 LATIN SMALL LIGATURE FI
	"\x{fb02}" => 'fl',	# U+FB02 LATIN SMALL LIGATURE FL
	"\x{fb03}" => 'ffi',	# U+FB03 LATIN SMALL LIGATURE FFI
	"\x{2013}" => '-',	# U+2013 EN DASH
	"\x{2019}" => "'",	# U+2019 RIGHT SINGLE QUOTATION MARK
);

# Derive the matching pattern and the replacement table from %CHAR_GRAMMAR.
#
# Every accent/letter pair contributes
#   - one regex alternative "\<accent> ?<letter>": the backslash-escaped
#     accent, an optional single space, then the letter;
#   - two %CHAR_TRANSFORMS keys, with and without the space, both mapping to
#     the precomposed character.
# $CHAR_MATCHES is the '|'-joined list of alternatives (no surrounding group).
{
	my @alternatives;
	for my $accent_code (keys %CHAR_GRAMMAR) {
		my $accent  = pack("U", $accent_code);
		my $letters = $CHAR_GRAMMAR{$accent_code};
		for my $letter (keys %$letters) {
			my $composed = pack("U", $letters->{$letter});
			push @alternatives, "\\" . $accent . ' ?' . $letter;
			$CHAR_TRANSFORMS{$accent . $letter}       = $composed;
			$CHAR_TRANSFORMS{$accent . ' ' . $letter} = $composed;
		}
	}
	$CHAR_MATCHES = join('|', @alternatives);
}

# Load-time self-check; runs only when DEBUG is true.
# All diagnostics go to STDERR so that enabling DEBUG never mixes debugging
# text into whatever the calling program writes to STDOUT.
if(DEBUG) {
#	binmode(STDERR,":utf8");
	# The generated alternation, rendered by charcodes().
	print STDERR charcodes($CHAR_MATCHES), "\n";
	# Every accent sequence and the character that replaces it.
	for my $sequence (sort keys %CHAR_TRANSFORMS) {
		print STDERR "'$sequence' => '$CHAR_TRANSFORMS{$sequence}'\n";
	}
	# Sample run: "H", U+00B4, "enon" pushed through normalise_multichars().
	my $str = "H".pack("U",0xb4)."enon";
	print STDERR "String test: '".charcodes($str)."' => '".normalise_multichars($str)."'\n";
}

# $str = normalise_multichars( $str )
#
# Returns a copy of $str in which
#   1. the characters U+2019, U+2013 and U+FB00..U+FB03 are replaced by their
#      %LIGATURES entries, and
#   2. every accent sequence matched by $CHAR_MATCHES that directly follows
#      an ASCII letter is replaced by its %CHAR_TRANSFORMS entry ('?' if the
#      entry is missing or false).
# A false argument (undef, '' or '0') is returned unchanged.
sub normalise_multichars {
	my ($text) = @_;
	return $text if !$text;

	# Pass 1: ligatures, en dash and right single quotation mark.
	# NOTE(review): because this pass runs first, an apostrophe produced from
	# U+2019 can then be consumed by the accent pass below - confirm that is
	# intended.
	$text =~ s{([\x{2019}\x{2013}\x{fb00}\x{fb01}\x{fb02}\x{fb03}])}{$LIGATURES{$1}}esog;

	# Pass 2: accent sequences. The look-behind keeps the preceding letter
	# out of the match; /o compiles $CHAR_MATCHES into the pattern only once.
	$text =~ s{(?<=[A-Za-z])($CHAR_MATCHES)}{$CHAR_TRANSFORMS{$1}||'?'}esog;

	return $text;
}

# Debugging aid: render a string as a run of "<char>(<hex code>)" pairs.
# `use bytes` is in effect for the whole sub, so the string is taken apart
# and numbered with byte semantics.
sub charcodes {
	use bytes;
	return join '', map { sprintf("%s(%x)", $_, ord($_)) } split(//, $_[0]);
}

# Accessor for the accent grammar.
# In list context this yields %CHAR_GRAMMAR flattened to key/value pairs:
# accent code point => reference to that accent's letter table. The
# references are the ones stored in %CHAR_GRAMMAR itself, not copies.
sub grammar {
	return %CHAR_GRAMMAR;
}

1;

__END__

=pod

=back

=head1 AUTHOR

Tim Brody <tdb01r@ecs.soton.ac.uk>

=cut
