perl5.git.perl.org Git - perl5.git/blame

Commit	Line	Data
3ef515df	1	#!./perl
67d7b5ef	2	BEGIN {
a999c27c JH	3	# @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
	4	# with $ENV{PERL_CORE} set
	5	# In case we need it in future...
18cfa0e3	6	require Config; Config->import();
cee96d52	7	pop @INC if $INC[-1] eq '.';
67d7b5ef JH	8	}
67d7b5ef JH	9	use strict;
b536bf57	10	use warnings;
67d7b5ef	11	use Getopt::Std;
f49b0b20	12	use Config;
67d7b5ef	13	my @orig_ARGV = @ARGV;
18cfa0e3	14	our $VERSION = do { my @r = (q$Revision: 2.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef JH	15
	16	# These may get re-ordered.
	17	# RAW is a do_now as inserted by &enter
0a225b3c	18	# AGG is an aggregated do_now, as built up by &process
a999c27c	19
67d7b5ef JH	20	use constant {
	21	RAW_NEXT => 0,
	22	RAW_IN_LEN => 1,
	23	RAW_OUT_BYTES => 2,
	24	RAW_FALLBACK => 3,
	25
	26	AGG_MIN_IN => 0,
	27	AGG_MAX_IN => 1,
	28	AGG_OUT_BYTES => 2,
	29	AGG_NEXT => 3,
	30	AGG_IN_LEN => 4,
	31	AGG_OUT_LEN => 5,
	32	AGG_FALLBACK => 6,
	33	};
a999c27c	34
67d7b5ef JH	35	# (See the algorithm in encengine.c - we're building structures for it)
	36
	37	# There are two sorts of structures.
	38	# "do_now" (an array, two variants of what needs storing) is whatever we need
	39	# to do now we've read an input byte.
	40	# It's housed in a "do_next" (which is how we got to it), and in turn points
	41	# to a "do_next" which contains all the "do_now"s for the next input byte.
	42
	43	# There will be a "do_next" which is the start state.
	44	# For a single byte encoding it's the only "do_next" - each "do_now" points
	45	# back to it, and each "do_now" will cause bytes. There is no state.
	46
	47	# For a multi-byte encoding where all characters in the input are the same
	48	# length, then there will be a tree of "do_now"->"do_next"->"do_now"
	49	# branching out from the start state, one step for each input byte.
	50	# The leaf "do_now"s will all be at the same distance from the start state,
	51	# only the leaf "do_now"s cause output bytes, and they in turn point back to
	52	# the start state.
	53
0a225b3c	54	# For an encoding where there are variable length input byte sequences, you
67d7b5ef JH	55	# will encounter a leaf "do_now" sooner for the shorter input sequences, but
	56	# as before the leaves will point back to the start state.
	57
	58	# The system will cope with escape encodings (imagine them as a mostly
	59	# self-contained tree for each escape state, and cross links between trees
	60	# at the state-switching characters) but so far no input format defines these.
	61
	62	# The system will also cope with having output "leaves" in the middle of
	63	# the bifurcating branches, not just at the extremities, but again no
	64	# input format does this yet.
	65
	66	# There are two variants of the "do_now" structure. The first, smaller variant
	67	# is generated by &enter as the input file is read. There is one structure
	68	# for each input byte. Say we are mapping a single byte encoding to a
	69	# single byte encoding, with "ABCD" going "abcd". There will be
	70	# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
	71
0a225b3c	72	# &process then walks the tree, building aggregate "do_now" structures for
67d7b5ef JH	73	# adjacent bytes where possible. The aggregate is for a contiguous range of
	74	# bytes which each produce the same length of output, each move to the
	75	# same next state, and each have the same fallback flag.
	76	# So our 4 RAW "do_now"s above become replaced by a single structure
	77	# containing:
	78	# ["A", "D", "abcd", 1, ...]
	79	# ie, for an input byte $_ in "A".."D", output 1 byte, found as
	80	# substr ("abcd", (ord $_ - ord "A") * 1, 1)
	81	# which maps very nicely into pointer arithmetic in C for encengine.c
	82
	83	sub encode_U
	84	{
	85	# UTF-8 encode long hand - only covers part of perl's range
	86	## my $uv = shift;
	87	# chr() works in native space so convert value from table
	88	# into that space before using chr().
	89	my $ch = chr(utf8::unicode_to_native($_[0]));
	90	# Now get core perl to encode that the way it likes.
	91	utf8::encode($ch);
	92	return $ch;
	93	}
	94
	95	sub encode_S
	96	{
	97	# encode single byte
	98	## my ($ch,$page) = @_; return chr($ch);
	99	return chr $_[0];
	100	}
	101
	102	sub encode_D
	103	{
	104	# encode double byte MS byte first
	105	## my ($ch,$page) = @_; return chr($page).chr($ch);
	106	return chr ($_[1]) . chr $_[0];
	107	}
	108
	109	sub encode_M
	110	{
	111	# encode Multi-byte - single for 0..255 otherwise double
	112	## my ($ch,$page) = @_;
	113	## return &encode_D if $page;
	114	## return &encode_S;
	115	return chr ($_[1]) . chr $_[0] if $_[1];
	116	return chr $_[0];
	117	}
	118
	119	my %encode_types = (U => \&encode_U,
	120	S => \&encode_S,
	121	D => \&encode_D,
	122	M => \&encode_M,
	123	);
	124
	125	# Win32 does not expand globs on command line
15f5e486 SH	126	if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) {
	127	eval "\@ARGV = map(glob(\$_),\@ARGV)";
	128	@ARGV = @orig_ARGV unless @ARGV;
	129	}
67d7b5ef JH	130
	131	my %opt;
	132	# I think these are:
	133	# -Q to disable the duplicate codepoint test
	134	# -S make mapping errors fatal
	135	# -q to remove comments written to output files
	136	# -O to enable the (brute force) substring optimiser
	137	# -o <output> to specify the output file name (else it's the first arg)
	138	# -f <inlist> to give a file with a list of input files (else use the args)
	139	# -n <name> to name the encoding (else use the basename of the input file.
15f5e486 SH	140	#Getopt::Long::Configure("bundling");
15f5e486 SH	141	#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v));
127a7155	142	getopts('CM:SQqOo:f:n:v',\%opt);
67d7b5ef JH	143
67d7b5ef JH	144	$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb	145	$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
127a7155	146	$opt{v} \|\|= $ENV{ENC2XS_VERBOSE};
b3cc7c67	147	$opt{q} \|\|= $ENV{ENC2XS_NO_COMMENTS};
127a7155 SH	148
	149	sub verbose {
	150	print STDERR @_ if $opt{v};
	151	}
	152	sub verbosef {
	153	printf STDERR @_ if $opt{v};
	154	}
67d7b5ef	155
e67e8a4a DM	156
	157	# ($cpp, $static, $sized) = compiler_info($declaration)
	158	#
	159	# return some information about the compiler and compile options we're using:
	160	#
	161	# $declaration - true if we're doing a declaration rather than a definition.
	162	#
	163	# $cpp - we're using C++
	164	# $static - ok to declare the arrays as static
	165	# $sized - the array declarations should be sized
	166
	167	sub compiler_info {
	168	my ($declaration) = @_;
	169
	170	my $ccflags = $Config{ccflags};
	171	if (defined $Config{ccwarnflags}) {
	172	$ccflags .= " " . $Config{ccwarnflags};
	173	}
	174	my $compat = $ccflags =~ /\Q-Wc++-compat/;
	175	my $pedantic = $ccflags =~ /-pedantic/;
	176
	177	my $cpp = ($Config{d_cplusplus} \|\| '') eq 'define';
	178
	179	# The encpage_t tables contain recursive and mutually recursive
	180	# references. To allow them to compile under C++ and some restrictive
	181	# cc options, it may be necessary to make the tables non-static/const
	182	# (thus moving them from the text to the data segment) and/or not
	183	# include the size in the declaration.
	184
	185	my $static = !(
	186	$cpp
	187	\|\| ($compat && $pedantic)
	188	\|\| ($^O eq 'MacOS' && $declaration)
	189	);
	190
	191	# -Wc++-compat on its own warns if the array declaration is sized.
	192	# The easiest way to avoid this warning is simply not to include
	193	# the size in the declaration.
	194	# With -pedantic as well, the issue doesn't arise because $static
	195	# above becomes false.
	196	my $sized = $declaration && !($compat && !$pedantic);
	197
	198	return ($cpp, $static, $sized);
	199	}
	200
	201
67d7b5ef JH	202	# This really should go first, else the die here causes empty (non-erroneous)
	203	# output files to be written.
	204	my @encfiles;
15f5e486	205	if (exists $opt{f}) {
67d7b5ef	206	# -F is followed by name of file containing list of filenames
15f5e486	207	my $flist = $opt{f};
67d7b5ef JH	208	open(FLIST,$flist) \|\| die "Cannot open $flist:$!";
	209	chomp(@encfiles = <FLIST>);
	210	close(FLIST);
	211	} else {
	212	@encfiles = @ARGV;
	213	}
	214
15f5e486	215	my $cname = $opt{o} ? $opt{o} : shift(@ARGV);
18cfa0e3	216	unless ($cname) { # Debugging a Win32 nmake error-only; works via cmdline.
15f5e486 SH	217	print "\nARGV:";
	218	print "$_ " for @ARGV;
	219	print "\nopt:";
	220	print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt;
	221	}
67d7b5ef	222	chmod(0666,$cname) if -f $cname && !-w $cname;
15f5e486	223	open(C,">", $cname) \|\| die "Cannot open $cname:$!";
67d7b5ef JH	224
	225	my $dname = $cname;
	226	my $hname = $cname;
	227
	228	my ($doC,$doEnc,$doUcm,$doPet);
	229
0e4142c9	230	if ($cname =~ /\.(c\|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
67d7b5ef JH	231	{
67d7b5ef JH	232	$doC = 1;
e7cbefb8	233	$dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef	234	chmod(0666,$dname) if -f $cname && !-w $dname;
15f5e486	235	open(D,">", $dname) \|\| die "Cannot open $dname:$!";
67d7b5ef JH	236	$hname =~ s/(\.[^\.]*)?$/.h/;
67d7b5ef JH	237	chmod(0666,$hname) if -f $cname && !-w $hname;
15f5e486	238	open(H,">", $hname) \|\| die "Cannot open $hname:$!";
67d7b5ef JH	239
	240	foreach my $fh (\C,\D,\*H)
	241	{
	242	print $fh <<"END" unless $opt{'q'};
	243	/*
	244	!!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	245	This file was autogenerated by:
	246	$^X $0 @orig_ARGV
f49b0b20	247	enc2xs VERSION $VERSION
67d7b5ef JH	248	*/
	249	END
	250	}
	251
ab5cf2ea YO	252	if ($cname =~ /\.c$/i && $Config{ccname} eq "gcc")
	253	{
	254	print C qq(#pragma GCC diagnostic ignored "-Wc++-compat"\n);
	255	}
	256
	257	if ($cname =~ /\.xs$/i)
67d7b5ef	258	{
c9955564	259	print C "#define PERL_NO_GET_CONTEXT\n";
67d7b5ef JH	260	print C "#include <EXTERN.h>\n";
	261	print C "#include <perl.h>\n";
	262	print C "#include <XSUB.h>\n";
67d7b5ef	263	}
b536bf57	264	print C "#include \"encode.h\"\n\n";
67d7b5ef JH	265
67d7b5ef JH	266	}
ab5cf2ea	267	elsif ($cname =~ /\.enc$/i)
67d7b5ef JH	268	{
	269	$doEnc = 1;
	270	}
ab5cf2ea	271	elsif ($cname =~ /\.ucm$/i)
67d7b5ef JH	272	{
	273	$doUcm = 1;
	274	}
ab5cf2ea	275	elsif ($cname =~ /\.pet$/i)
67d7b5ef JH	276	{
	277	$doPet = 1;
	278	}
	279
	280	my %encoding;
	281	my %strings;
b536bf57 DK	282	my $string_acc;
	283	my %strings_in_acc;
	284
67d7b5ef JH	285	my $saved = 0;
	286	my $subsave = 0;
	287	my $strings = 0;
	288
	289	sub cmp_name
	290	{
	291	if ($a =~ /^.*-(\d+)/)
	292	{
	293	my $an = $1;
	294	if ($b =~ /^.*-(\d+)/)
	295	{
	296	my $r = $an <=> $1;
	297	return $r if $r;
	298	}
	299	}
	300	return $a cmp $b;
	301	}
	302
	303
	304	foreach my $enc (sort cmp_name @encfiles)
	305	{
	306	my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc\|ucm)$/;
	307	$name = $opt{'n'} if exists $opt{'n'};
	308	if (open(E,$enc))
	309	{
	310	if ($sfx eq 'enc')
	311	{
	312	compile_enc(\*E,lc($name));
	313	}
	314	else
	315	{
	316	compile_ucm(\*E,lc($name));
	317	}
	318	}
	319	else
	320	{
	321	warn "Cannot open $enc for $name:$!";
	322	}
	323	}
	324
	325	if ($doC)
	326	{
127a7155	327	verbose "Writing compiled form\n";
67d7b5ef JH	328	foreach my $name (sort cmp_name keys %encoding)
	329	{
	330	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
b536bf57 DK	331	process($name.'_utf8',$e2u);
	332	addstrings(\*C,$e2u);
	333
	334	process('utf8_'.$name,$u2e);
	335	addstrings(\*C,$u2e);
	336	}
	337	outbigstring(\*C,"enctable");
	338	foreach my $name (sort cmp_name keys %encoding)
	339	{
	340	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	341	outtable(\*C,$e2u, "enctable");
	342	outtable(\*C,$u2e, "enctable");
	343
b2704119	344	# push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef	345	}
e67e8a4a	346	my ($cpp) = compiler_info(0);
127a7155 SH	347	my $ext = $cpp ? 'extern "C"' : "extern";
	348	my $exta = $cpp ? 'extern "C"' : "static";
	349	my $extb = $cpp ? 'extern "C"' : "";
67d7b5ef JH	350	foreach my $enc (sort cmp_name keys %encoding)
67d7b5ef JH	351	{
b2704119 JH	352	# my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
	353	my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
	354	#my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
	355	my $replen = 0;
	356	$replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
67d7b5ef JH	357	my $sym = "${enc}_encoding";
67d7b5ef JH	358	$sym =~ s/\W+/_/g;
7dd03145	359	my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
d1256cb1	360	$min_el,$max_el);
905bf499 RGS	361	print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
	362	print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
	363	print C "${extb} const encode_t $sym = \n";
f0a41339 DK	364	# This is to make null encoding work -- dankogai
f0a41339 DK	365	for (my $i = (scalar @info) - 1; $i >= 0; --$i){
d1256cb1	366	$info[$i] \|\|= 1;
f0a41339 DK	367	}
f0a41339 DK	368	# end of null tweak -- dankogai
7dd03145	369	print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
67d7b5ef JH	370	}
	371
	372	foreach my $enc (sort cmp_name keys %encoding)
	373	{
	374	my $sym = "${enc}_encoding";
	375	$sym =~ s/\W+/_/g;
127a7155	376	print H "${ext} encode_t $sym;\n";
67d7b5ef JH	377	print D " Encode_XSEncoding(aTHX_ &$sym);\n";
	378	}
	379
	380	if ($cname =~ /(\w+)\.xs$/)
	381	{
	382	my $mod = $1;
	383	print C <<'END';
	384
	385	static void
	386	Encode_XSEncoding(pTHX_ encode_t *enc)
	387	{
	388	dSP;
	389	HV *stash = gv_stashpv("Encode::XS", TRUE);
c7981a06 SH	390	SV *iv = newSViv(PTR2IV(enc));
c7981a06 SH	391	SV *sv = sv_bless(newRV_noinc(iv),stash);
67d7b5ef	392	int i = 0;
89c2544c DM	393	/* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
	394	constness, in the hope that perl won't mess with it. */
	395	assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
c7981a06	396	SvFLAGS(iv) \|= SVp_POK;
89c2544c	397	SvPVX(iv) = (char*) enc->name[0];
67d7b5ef JH	398	PUSHMARK(sp);
	399	XPUSHs(sv);
	400	while (enc->name[i])
	401	{
	402	const char *name = enc->name[i++];
	403	XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
	404	}
	405	PUTBACK;
	406	call_pv("Encode::define_encoding",G_DISCARD);
	407	SvREFCNT_dec(sv);
	408	}
	409
	410	END
	411
	412	print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
	413	print C "BOOT:\n{\n";
	414	print C "#include \"$dname\"\n";
	415	print C "}\n";
	416	}
	417	# Close in void context is bad, m'kay
	418	close(D) or warn "Error closing '$dname': $!";
	419	close(H) or warn "Error closing '$hname': $!";
	420
cf9f87ce RGS	421	my $perc_saved = $saved/($strings + $saved) * 100;
cf9f87ce RGS	422	my $perc_subsaved = $subsave/($strings + $subsave) * 100;
127a7155 SH	423	verbosef "%d bytes in string tables\n",$strings;
127a7155 SH	424	verbosef "%d bytes (%.3g%%) saved spotting duplicates\n",
67d7b5ef	425	$saved, $perc_saved if $saved;
127a7155	426	verbosef "%d bytes (%.3g%%) saved using substrings\n",
67d7b5ef JH	427	$subsave, $perc_subsaved if $subsave;
	428	}
	429	elsif ($doEnc)
	430	{
	431	foreach my $name (sort cmp_name keys %encoding)
	432	{
	433	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	434	output_enc(\*C,$name,$e2u);
	435	}
	436	}
	437	elsif ($doUcm)
	438	{
	439	foreach my $name (sort cmp_name keys %encoding)
	440	{
	441	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	442	output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
	443	}
	444	}
	445
	446	# writing half meg files and then not checking to see if you just filled the
	447	# disk is bad, m'kay
	448	close(C) or die "Error closing '$cname': $!";
	449
	450	# End of the main program.
	451
	452	sub compile_ucm
	453	{
	454	my ($fh,$name) = @_;
	455	my $e2u = {};
	456	my $u2e = {};
	457	my $cs;
	458	my %attr;
	459	while (<$fh>)
	460	{
	461	s/#.*$//;
	462	last if /^\sCHARMAP\s$/i;
	463	if (/^\s<(\w+)>\s+"?([^"])"?\s*$/i) # " # Grrr
	464	{
	465	$attr{$1} = $2;
	466	}
	467	}
	468	if (!defined($cs = $attr{'code_set_name'}))
	469	{
	470	warn "No <code_set_name> in $name\n";
	471	}
	472	else
	473	{
	474	$name = $cs unless exists $opt{'n'};
	475	}
	476	my $erep;
	477	my $urep;
	478	my $max_el;
	479	my $min_el;
	480	if (exists $attr{'subchar'})
	481	{
b2704119 JH	482	#my @byte;
	483	#$attr{'subchar'} =~ /^\s*/cg;
	484	#push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
	485	#$erep = join('',map(chr(hex($_)),@byte));
	486	$erep = $attr{'subchar'};
	487	$erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef	488	}
15f5e486 SH	489	print "Reading $name ($cs)\n"
	490	unless defined $ENV{MAKEFLAGS}
	491	and $ENV{MAKEFLAGS} =~ /\b(s\|silent\|quiet)\b/;
67d7b5ef JH	492	my $nfb = 0;
	493	my $hfb = 0;
	494	while (<$fh>)
	495	{
	496	s/#.*$//;
	497	last if /^\sEND\s+CHARMAP\s$/i;
	498	next if /^\s*$/;
a999c27c JH	499	my (@uni, @byte) = ();
	500	my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
	501	or die "Bad line: $_";
	502	while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
	503	push @uni, map { substr($_, 1) } split(/\+/, $1);
	504	}
	505	while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
	506	push @byte, $1;
	507	}
	508	if (@uni)
67d7b5ef	509	{
a999c27c	510	my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef JH	511	my $ech = join('',map(chr(hex($_)),@byte));
	512	my $el = length($ech);
	513	$max_el = $el if (!defined($max_el) \|\| $el > $max_el);
	514	$min_el = $el if (!defined($min_el) \|\| $el < $min_el);
	515	if (length($fb))
	516	{
	517	$fb = substr($fb,1);
	518	$hfb++;
	519	}
	520	else
	521	{
	522	$nfb++;
	523	$fb = '0';
	524	}
	525	# $fb is fallback flag
	526	# 0 - round trip safe
	527	# 1 - fallback for unicode -> enc
	528	# 2 - skip sub-char mapping
	529	# 3 - fallback enc -> unicode
	530	enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
	531	enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
	532	}
	533	else
	534	{
	535	warn $_;
	536	}
	537	}
	538	if ($nfb && $hfb)
	539	{
	540	die "$nfb entries without fallback, $hfb entries with\n";
	541	}
	542	$encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
	543	}
	544
	545
	546
	547	sub compile_enc
	548	{
	549	my ($fh,$name) = @_;
	550	my $e2u = {};
	551	my $u2e = {};
	552
	553	my $type;
	554	while ($type = <$fh>)
	555	{
	556	last if $type !~ /^\s*#/;
	557	}
	558	chomp($type);
	559	return if $type eq 'E';
	560	# Do the hash lookup once, rather than once per function call. 4% speedup.
	561	my $type_func = $encode_types{$type};
	562	my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
	563	warn "$type encoded $name\n";
	564	my $rep = '';
	565	# Save a defined test by setting these to defined values.
	566	my $min_el = ~0; # A very big integer
	567	my $max_el = 0; # Anything must be longer than 0
	568	{
	569	my $v = hex($def);
	570	$rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
	571	}
	572	my $errors;
	573	my $seen;
	574	# use -Q to silence the seen test. Makefile.PL uses this by default.
575	$seen = {} unless $opt{Q};
576	do
577	{
578	my $line = <$fh>;
579	chomp($line);
580	my $page = hex($line);
581	my $ch = 0;
582	my $i = 16;
583	do
584	{
585	# So why is it 1% faster to leave the my here?
586	my $line = <$fh>;
587	$line =~ s/\r\n$/\n/;
588	die "$.:${line}Line should be exactly 65 characters long including
589	newline (".length($line).")" unless length ($line) == 65;
590	# Split line into groups of 4 hex digits, convert groups to ints
591	# This takes 65.35
592	# map {hex $_} $line =~ /(....)/g
593	# This takes 63.75 (2.5% less time)
594	# unpack "n", pack "H", $line
595	# There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
596	# Doing it as while ($line =~ /(....)/g) took 74.63
597	foreach my $val (unpack "n", pack "H", $line)
598	{
599	next if $val == 0xFFFD;
600	my $ech = &$type_func($ch,$page);
601	if ($val \|\| (!$ch && !$page))
602	{
603	my $el = length($ech);
604	$max_el = $el if $el > $max_el;
605	$min_el = $el if $el < $min_el;
606	my $uch = encode_U($val);
607	if ($seen) {
608	# We're doing the test.
609	# We don't need to read this quickly, so storing it as a scalar,
610	# rather than 3 (anon array, plus the 2 scalars it holds) saves
611	# RAM and may make us faster on low RAM systems. [see __END__]
612	if (exists $seen->{$uch})
613	{
614	warn sprintf("U%04X is %02X%02X and %04X\n",
615	$val,$page,$ch,$seen->{$uch});
616	$errors++;
617	}
618	else
619	{
620	$seen->{$uch} = $page << 8 \| $ch;
621	}
622	}
623	# Passing 2 extra args each time is 3.6% slower!
624	# Even with having to add $fallback \|\|= 0 later
625	enter_fb0($e2u,$ech,$uch);
626	enter_fb0($u2e,$uch,$ech);
627	}
628	else
629	{
630	# No character at this position
631	# enter($e2u,$ech,undef,$e2u);
632	}
633	$ch++;
634	}
635	} while --$i;
636	} while --$pages;
637	die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
638	if $min_el > $max_el;
639	die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
640	$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
641	}
642
643	# my ($a,$s,$d,$t,$fb) = @_;
644	sub enter {
645	my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
646	# state we shift to after this (multibyte) input character defaults to same
647	# as current state.
648	$next \|\|= $current;
649	# Making sure it is defined seems to be faster than {no warnings;} in
0a225b3c	650	# &process, or passing it in as 0 explicitly.
67d7b5ef JH	651	# XXX $fallback \|\|= 0;
	652
	653	# Start at the beginning and work forwards through the string to zero.
	654	# effectively we are removing 1 character from the front each time
	655	# but we don't actually edit the string. [this alone seems to be 14% speedup]
	656	# Hence -$pos is the length of the remaining string.
	657	my $pos = -length $inbytes;
	658	while (1) {
	659	my $byte = substr $inbytes, $pos, 1;
	660	# RAW_NEXT => 0,
	661	# RAW_IN_LEN => 1,
	662	# RAW_OUT_BYTES => 2,
	663	# RAW_FALLBACK => 3,
	664	# to unicode an array would seem to be better, because the pages are dense.
	665	# from unicode can be very sparse, favouring a hash.
	666	# hash using the bytes (all length 1) as keys rather than ord value,
	667	# as it's easier to sort these in &process.
	668
	669	# It's faster to always add $fallback even if it's undef, rather than
	670	# choosing between 3 and 4 element array. (hence why we set it defined
	671	# above)
	672	my $do_now = $current->{Raw}{$byte} \|\|= [{},-$pos,'',$fallback];
	673	# When $pos was -1 we were at the last input character.
	674	unless (++$pos) {
	675	$do_now->[RAW_OUT_BYTES] = $outbytes;
	676	$do_now->[RAW_NEXT] = $next;
	677	return;
	678	}
0a225b3c	679	# Tail recursion. The intermediate state may not have a name yet.
67d7b5ef JH	680	$current = $do_now->[RAW_NEXT];
	681	}
	682	}
	683
0a225b3c	684	# This is purely for optimisation. It's just &enter hard coded for $fallback
67d7b5ef JH	685	# of 0, using only a 3 entry array ref to save memory for every entry.
	686	sub enter_fb0 {
	687	my ($current,$inbytes,$outbytes,$next) = @_;
	688	$next \|\|= $current;
	689
	690	my $pos = -length $inbytes;
	691	while (1) {
	692	my $byte = substr $inbytes, $pos, 1;
	693	my $do_now = $current->{Raw}{$byte} \|\|= [{},-$pos,''];
	694	unless (++$pos) {
	695	$do_now->[RAW_OUT_BYTES] = $outbytes;
	696	$do_now->[RAW_NEXT] = $next;
	697	return;
	698	}
	699	$current = $do_now->[RAW_NEXT];
	700	}
	701	}
	702
67d7b5ef JH	703	sub process
	704	{
	705	my ($name,$a) = @_;
	706	$name =~ s/\W+/_/g;
	707	$a->{Cname} = $name;
	708	my $raw = $a->{Raw};
	709	my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
	710	my @ent;
	711	$agg_max_in = 0;
	712	foreach my $key (sort keys %$raw) {
	713	# RAW_NEXT => 0,
	714	# RAW_IN_LEN => 1,
	715	# RAW_OUT_BYTES => 2,
	716	# RAW_FALLBACK => 3,
	717	my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
	718	# Now we are converting from raw to aggregate, switch from 1 byte strings
	719	# to numbers
	720	my $b = ord $key;
	721	$fallback \|\|= 0;
	722	if ($l &&
	723	# If this == fails, we're going to reset $agg_max_in below anyway.
	724	$b == ++$agg_max_in &&
	725	# References in numeric context give the pointer as an int.
	726	$agg_next == $next &&
	727	$agg_in_len == $in_len &&
	728	$agg_out_len == length $out_bytes &&
	729	$agg_fallback == $fallback
	730	# && length($l->[AGG_OUT_BYTES]) < 16
	731	) {
	732	# my $i = ord($b)-ord($l->[AGG_MIN_IN]);
	733	# we can aggregate this byte onto the end.
	734	$l->[AGG_MAX_IN] = $b;
	735	$l->[AGG_OUT_BYTES] .= $out_bytes;
	736	} else {
	737	# AGG_MIN_IN => 0,
	738	# AGG_MAX_IN => 1,
	739	# AGG_OUT_BYTES => 2,
	740	# AGG_NEXT => 3,
	741	# AGG_IN_LEN => 4,
	742	# AGG_OUT_LEN => 5,
	743	# AGG_FALLBACK => 6,
	744	# Reset the last thing we saw, plus set 5 lexicals to save some derefs.
	745	# (only gains .6% on euc-jp -- is it worth it?)
	746	push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
	747	$agg_in_len = $in_len, $agg_out_len = length $out_bytes,
	748	$agg_fallback = $fallback];
	749	}
	750	if (exists $next->{Cname}) {
	751	$next->{'Forward'} = 1 if $next != $a;
	752	} else {
	753	process(sprintf("%s_%02x",$name,$b),$next);
	754	}
	755	}
	756	# encengine.c rules say that last entry must be for 255
	757	if ($agg_max_in < 255) {
	758	push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
	759	}
	760	$a->{'Entries'} = \@ent;
	761	}
	762
b536bf57 DK	763
b536bf57 DK	764	sub addstrings
67d7b5ef JH	765	{
	766	my ($fh,$a) = @_;
	767	my $name = $a->{'Cname'};
	768	# String tables
	769	foreach my $b (@{$a->{'Entries'}})
	770	{
	771	next unless $b->[AGG_OUT_LEN];
b536bf57	772	$strings{$b->[AGG_OUT_BYTES]} = undef;
67d7b5ef JH	773	}
	774	if ($a->{'Forward'})
	775	{
e67e8a4a	776	my ($cpp, $static, $sized) = compiler_info(1);
e67e8a4a	777	my $count = $sized ? scalar(@{$a->{'Entries'}}) : '';
15f5e486 SH	778	if ($static) {
	779	# we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline
	780	print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
	781	print $fh "extern encpage_t $name\[$count];\n";
	782	print $fh "#else\n";
	783	print $fh "static const encpage_t $name\[$count];\n";
	784	print $fh "#endif\n";
	785	} else {
	786	print $fh "extern encpage_t $name\[$count];\n";
	787	}
b536bf57 DK	788	}
	789	$a->{'DoneStrings'} = 1;
	790	foreach my $b (@{$a->{'Entries'}})
	791	{
	792	my ($s,$e,$out,$t,$end,$l) = @$b;
	793	addstrings($fh,$t) unless $t->{'DoneStrings'};
67d7b5ef	794	}
b536bf57 DK	795	}
	796
	797	sub outbigstring
	798	{
	799	my ($fh,$name) = @_;
	800
	801	$string_acc = '';
	802
	803	# Make the big string in the string accumulator. Longest first, on the hope
	804	# that this makes it more likely that we find the short strings later on.
0a225b3c	805	# Not sure if it helps sorting strings of the same length lexically.
b536bf57 DK	806	foreach my $s (sort {length $b <=> length $a \|\| $a cmp $b} keys %strings) {
	807	my $index = index $string_acc, $s;
	808	if ($index >= 0) {
	809	$saved += length($s);
	810	$strings_in_acc{$s} = $index;
	811	} else {
	812	OPTIMISER: {
d1256cb1 RGS	813	if ($opt{'O'}) {
	814	my $sublength = length $s;
	815	while (--$sublength > 0) {
	816	# progressively lop characters off the end, to see if the start of
	817	# the new string overlaps the end of the accumulator.
	818	if (substr ($string_acc, -$sublength)
	819	eq substr ($s, 0, $sublength)) {
	820	$subsave += $sublength;
	821	$strings_in_acc{$s} = length ($string_acc) - $sublength;
	822	# append the last bit on the end.
	823	$string_acc .= substr ($s, $sublength);
	824	last OPTIMISER;
	825	}
	826	# or if the end of the new string overlaps the start of the
	827	# accumulator
	828	next unless substr ($string_acc, 0, $sublength)
	829	eq substr ($s, -$sublength);
	830	# well, the last $sublength characters of the accumulator match.
	831	# so as we're prepending to the accumulator, need to shift all our
	832	# existing offsets forwards
	833	$_ += $sublength foreach values %strings_in_acc;
	834	$subsave += $sublength;
	835	$strings_in_acc{$s} = 0;
	836	# append the first bit on the start.
	837	$string_acc = substr ($s, 0, -$sublength) . $string_acc;
	838	last OPTIMISER;
	839	}
	840	}
	841	# Optimiser (if it ran) found nothing, so just going have to tack the
	842	# whole thing on the end.
	843	$strings_in_acc{$s} = length $string_acc;
	844	$string_acc .= $s;
b536bf57 DK	845	};
	846	}
	847	}
	848
	849	$strings = length $string_acc;
e67e8a4a	850	my ($cpp) = compiler_info(0);
f49b0b20 JH	851	my $var = $cpp ? '' : 'static';
f49b0b20 JH	852	my $definition = "\n$var const U8 $name\[$strings] = { " .
b536bf57 DK	853	join(',',unpack "C*",$string_acc);
	854	# We have a single long line. Split it at convenient commas.
	855	print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
	856	print $fh substr ($definition, pos $definition), " };\n";
	857	}
	858
	859	sub findstring {
	860	my ($name,$s) = @_;
	861	my $offset = $strings_in_acc{$s};
	862	die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
	863	unless defined $offset;
	864	"$name + $offset";
	865	}
	866
	867	sub outtable
	868	{
	869	my ($fh,$a,$bigname) = @_;
	870	my $name = $a->{'Cname'};
67d7b5ef JH	871	$a->{'Done'} = 1;
	872	foreach my $b (@{$a->{'Entries'}})
	873	{
	874	my ($s,$e,$out,$t,$end,$l) = @$b;
b536bf57	875	outtable($fh,$t,$bigname) unless $t->{'Done'};
67d7b5ef	876	}
e67e8a4a	877	my ($cpp, $static) = compiler_info(0);
15f5e486 SH	878	my $count = scalar(@{$a->{'Entries'}});
	879	if ($static) {
	880	print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
	881	print $fh "encpage_t $name\[$count] = {\n";
	882	print $fh "#else\n";
	883	print $fh "static const encpage_t $name\[$count] = {\n";
	884	print $fh "#endif\n";
	885	} else {
	886	print $fh "\nencpage_t $name\[$count] = {\n";
	887	}
67d7b5ef JH	888	foreach my $b (@{$a->{'Entries'}})
	889	{
	890	my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51	891	# $end \|= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef JH	892	print $fh "{";
	893	if ($l)
	894	{
b536bf57	895	printf $fh findstring($bigname,$out);
67d7b5ef JH	896	}
	897	else
	898	{
	899	print $fh "0";
	900	}
	901	print $fh ",",$t->{Cname};
	902	printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
	903	}
	904	print $fh "};\n";
	905	}
	906
67d7b5ef JH	907	sub output_enc
	908	{
	909	my ($fh,$name,$a) = @_;
	910	die "Changed - fix me for new structure";
	911	foreach my $b (sort keys %$a)
	912	{
	913	my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
	914	}
	915	}
	916
	917	sub decode_U
	918	{
	919	my $s = shift;
	920	}
	921
	922	my @uname;
8ced1423	923	sub char_names{} # cf. https://rt.cpan.org/Ticket/Display.html?id=132471
67d7b5ef JH	924
	925	sub output_ucm_page
	926	{
	927	my ($cmap,$a,$t,$pre) = @_;
	928	# warn sprintf("Page %x\n",$pre);
	929	my $raw = $t->{Raw};
	930	foreach my $key (sort keys %$raw) {
	931	# RAW_NEXT => 0,
	932	# RAW_IN_LEN => 1,
	933	# RAW_OUT_BYTES => 2,
	934	# RAW_FALLBACK => 3,
	935	my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
	936	my $u = ord $key;
	937	$fallback \|\|= 0;
	938
	939	if ($next != $a && $next != $t) {
	940	output_ucm_page($cmap,$a,$next,(($pre\|($u &0x3F)) << 6)&0xFFFF);
	941	} elsif (length $out_bytes) {
	942	if ($pre) {
	943	$u = $pre\|($u &0x3f);
	944	}
	945	my $s = sprintf "<U%04X> ",$u;
	946	#foreach my $c (split(//,$out_bytes)) {
	947	# $s .= sprintf "\\x%02X",ord($c);
	948	#}
	949	# 9.5% faster changing that loop to this:
	950	$s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
	951	$s .= sprintf " \|%d # %s\n",($fallback ? 1 : 0),$uname[$u];
	952	push(@$cmap,$s);
	953	} else {
	954	warn join(',',$u, @{$raw->{$key}},$a,$t);
	955	}
	956	}
	957	}
	958
	959	sub output_ucm
	960	{
	961	my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
	962	print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
	963	print $fh "<code_set_name> \"$name\"\n";
	964	char_names();
	965	if (defined $min_el)
	966	{
	967	print $fh "<mb_cur_min> $min_el\n";
	968	}
	969	if (defined $max_el)
	970	{
	971	print $fh "<mb_cur_max> $max_el\n";
	972	}
	973	if (defined $rep)
	974	{
	975	print $fh "<subchar> ";
	976	foreach my $c (split(//,$rep))
	977	{
	978	printf $fh "\\x%02X",ord($c);
	979	}
	980	print $fh "\n";
	981	}
	982	my @cmap;
	983	output_ucm_page(\@cmap,$h,$h,0);
	984	print $fh "#\nCHARMAP\n";
	985	foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
	986	{
	987	print $fh $line;
988	}
989	print $fh "END CHARMAP\n";
990	}
991
3ef515df JH	992	use vars qw(
	993	$_Enc2xs
	994	$_Version
	995	$_Inc
b2704119	996	$_E2X
3ef515df JH	997	$_Name
	998	$_TableFiles
	999	$_Now
	1000	);
	1001
b2704119	1002	sub find_e2x{
b536bf57	1003	eval { require File::Find; };
b2704119 JH	1004	my (@inc, %e2x_dir);
b2704119 JH	1005	for my $inc (@INC){
d1256cb1	1006	push @inc, $inc unless $inc eq '.'; #skip current dir
b2704119 JH	1007	}
b2704119 JH	1008	File::Find::find(
d1256cb1 RGS	1009	sub {
	1010	my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	1011	$atime,$mtime,$ctime,$blksize,$blocks)
	1012	= lstat($_) or return;
	1013	-f _ or return;
	1014	if (/^.*\.e2x$/o){
	1015	no warnings 'once';
	1016	$e2x_dir{$File::Find::dir} \|\|= $mtime;
	1017	}
	1018	return;
	1019	}, @inc);
b2704119 JH	1020	warn join("\n", keys %e2x_dir), "\n";
b2704119 JH	1021	for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
d1256cb1 RGS	1022	$_E2X = $d;
	1023	# warn "$_E2X => ", scalar localtime($e2x_dir{$d});
	1024	return $_E2X;
b2704119 JH	1025	}
	1026	}
	1027
67d7b5ef JH	1028	sub make_makefile_pl
67d7b5ef JH	1029	{
3f60a930	1030	eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n";
0a225b3c	1031	# our used for variable expansion
3ef515df JH	1032	$_Enc2xs = $0;
3ef515df JH	1033	$_Version = $VERSION;
b2704119	1034	$_E2X = find_e2x();
3ef515df JH	1035	$_Name = shift;
	1036	$_TableFiles = join(",", map {qq('$_')} @_);
	1037	$_Now = scalar localtime();
b2704119	1038
aae85ceb	1039	eval { require File::Spec; };
b2704119	1040	_print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119	1041	_print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119	1042	_print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119	1043	_print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119	1044	_print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df JH	1045	exit;
	1046	}
	1047
aae85ceb	1048	use vars qw(
d1256cb1 RGS	1049	$_ModLines
	1050	$_LocalVer
	1051	);
aae85ceb	1052
656ebd29	1053	sub make_configlocal_pm {
3f60a930	1054	eval { require Encode } or die "Unable to require Encode: $@\n";
aae85ceb	1055	eval { require File::Spec; };
656ebd29	1056
0a225b3c	1057	# our used for variable expantion
656ebd29 RGS	1058	my %in_core = map { $_ => 1 } (
	1059	'ascii', 'iso-8859-1', 'utf8',
	1060	'ascii-ctrl', 'null', 'utf-8-strict'
	1061	);
aae85ceb	1062	my %LocalMod = ();
656ebd29 RGS	1063	# check @enc;
	1064	use File::Find ();
	1065	my $wanted = sub{
	1066	-f $_ or return;
	1067	$File::Find::name =~ /\A\./ and return;
	1068	$File::Find::name =~ /\.pm\z/ or return;
	1069	$File::Find::name =~ m/\bEncode\b/ or return;
	1070	my $mod = $File::Find::name;
	1071	$mod =~ s/.*\bEncode\b/Encode/o;
	1072	$mod =~ s/\.pm\z//o;
	1073	$mod =~ s,/,::,og;
3f60a930	1074	eval qq{ require $mod; } or return;
e8cc621e	1075	warn qq{ require $mod;\n};
656ebd29 RGS	1076	for my $enc ( Encode->encodings() ) {
	1077	no warnings;
	1078	$in_core{$enc} and next;
	1079	$Encode::Config::ExtModule{$enc} and next;
	1080	$LocalMod{$enc} \|\|= $mod;
	1081	}
	1082	};
	1083	File::Find::find({wanted => $wanted}, @INC);
aae85ceb	1084	$_ModLines = "";
656ebd29 RGS	1085	for my $enc ( sort keys %LocalMod ) {
	1086	$_ModLines .=
	1087	qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
aae85ceb	1088	}
e8cc621e	1089	warn $_ModLines if $_ModLines;
aae85ceb	1090	$_LocalVer = _mkversion();
656ebd29 RGS	1091	$_E2X = find_e2x();
	1092	$_Inc = $INC{"Encode.pm"};
	1093	$_Inc =~ s/\.pm$//o;
	1094	_print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
	1095	File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
aae85ceb DK	1096	exit;
	1097	}
	1098
	1099	sub _mkversion{
656ebd29 RGS	1100	# v-string is now depreciated; use time() instead;
	1101	#my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
	1102	#$yyyy += 1900, $mo +=1;
	1103	#return sprintf("v%04d.%04d.%04d", $yyyy, $mo100+$dd, $hh100+$mm);
	1104	return time();
aae85ceb DK	1105	}
aae85ceb DK	1106
3ef515df	1107	sub _print_expand{
3f60a930	1108	eval { require File::Basename } or die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
67d7b5ef	1109	File::Basename->import();
621b0f8d DK	1110	my ($src, $dst, $clobber) = @_;
621b0f8d DK	1111	if (!$clobber and -e $dst){
d1256cb1 RGS	1112	warn "$dst exists. skipping\n";
d1256cb1 RGS	1113	return;
621b0f8d DK	1114	}
621b0f8d DK	1115	warn "Generating $dst...\n";
3ef515df JH	1116	open my $in, $src or die "$src : $!";
3ef515df JH	1117	if ((my $d = dirname($dst)) ne '.'){
d1256cb1	1118	-d $d or mkdir $d, 0755 or die "mkdir $d : $!";
3ef515df	1119	}
15f5e486	1120	open my $out, ">", $dst or die "$!";
3ef515df JH	1121	my $asis = 0;
3ef515df JH	1122	while (<$in>){
d1256cb1 RGS	1123	if (/^#### END_OF_HEADER/){
	1124	$asis = 1; next;
	1125	}
	1126	s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
	1127	print $out $_;
67d7b5ef	1128	}
67d7b5ef	1129	}
67d7b5ef JH	1130	__END__
	1131
	1132	=head1 NAME
	1133
	1134	enc2xs -- Perl Encode Module Generator
	1135
	1136	=head1 SYNOPSIS
	1137
67d7b5ef	1138	enc2xs -[options]
aae85ceb DK	1139	enc2xs -M ModName mapfiles...
aae85ceb DK	1140	enc2xs -C
67d7b5ef JH	1141
	1142	=head1 DESCRIPTION
	1143
	1144	F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e JH	1145	Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
	1146	Besides being used internally during the build process of the Encode
	1147	module, you can use F<enc2xs> to add your own encoding to perl.
	1148	No knowledge of XS is necessary.
67d7b5ef JH	1149
	1150	=head1 Quick Guide
	1151
0ab8f81e	1152	If you want to know as little about Perl as possible but need to
67d7b5ef JH	1153	add a new encoding, just read this chapter and forget the rest.
	1154
	1155	=over 4
	1156
3cd3edd2	1157	=item 0.Z<>
67d7b5ef	1158
0ab8f81e JH	1159	Have a .ucm file ready. You can get it from somewhere or you can write
	1160	your own from scratch or you can grab one from the Encode distribution
	1161	and customize it. For the UCM format, see the next Chapter. In the
	1162	example below, I'll call my theoretical encoding myascii, defined
	1163	in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef JH	1164
	1165	$ ls -F
	1166	my.ucm
	1167
3cd3edd2	1168	=item 1.Z<>
67d7b5ef JH	1169
	1170	Issue a command as follows;
	1171
	1172	$ enc2xs -M My my.ucm
3ef515df JH	1173	generating Makefile.PL
	1174	generating My.pm
	1175	generating README
	1176	generating Changes
67d7b5ef JH	1177
	1178	Now take a look at your current directory. It should look like this.
	1179
	1180	$ ls -F
	1181	Makefile.PL My.pm my.ucm t/
	1182
0ab8f81e	1183	The following files were created.
67d7b5ef	1184
0ab8f81e JH	1185	Makefile.PL - MakeMaker script
	1186	My.pm - Encode submodule
	1187	t/My.t - test file
	1188
	1189	=over 4
67d7b5ef	1190
3cd3edd2	1191	=item 1.1.Z<>
037b88d6 JH	1192
	1193	If you want *.ucm installed together with the modules, do as follows;
	1194
	1195	$ mkdir Encode
	1196	$ mv *.ucm Encode
	1197	$ enc2xs -M My Encode/*ucm
	1198
0ab8f81e JH	1199	=back
0ab8f81e JH	1200
3cd3edd2	1201	=item 2.Z<>
67d7b5ef JH	1202
	1203	Edit the files generated. You don't have to if you have no time AND no
	1204	intention to give it to someone else. But it is a good idea to edit
0ab8f81e	1205	the pod and to add more tests.
67d7b5ef	1206
3cd3edd2	1207	=item 3.Z<>
67d7b5ef	1208
0ab8f81e	1209	Now issue a command all Perl Mongers love:
67d7b5ef	1210
9160fdbd	1211	$ perl Makefile.PL
67d7b5ef JH	1212	Writing Makefile for Encode::My
67d7b5ef JH	1213
3cd3edd2	1214	=item 4.Z<>
67d7b5ef JH	1215
	1216	Now all you have to do is make.
	1217
	1218	$ make
	1219	cp My.pm blib/lib/Encode/My.pm
	1220	/usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
	1221	-o encode_t.c -f encode_t.fnm
	1222	Reading myascii (myascii)
	1223	Writing compiled form
	1224	128 bytes in string tables
cf9f87ce RGS	1225	384 bytes (75%) saved spotting duplicates
cf9f87ce RGS	1226	1 bytes (0.775%) saved using substrings
67d7b5ef JH	1227	....
	1228	chmod 644 blib/arch/auto/Encode/My/My.bs
	1229	$
	1230
0ab8f81e JH	1231	The time it takes varies depending on how fast your machine is and
	1232	how large your encoding is. Unless you are working on something big
	1233	like euc-tw, it won't take too long.
67d7b5ef	1234
3cd3edd2	1235	=item 5.Z<>
67d7b5ef JH	1236
	1237	You can "make install" already but you should test first.
	1238
	1239	$ make test
	1240	PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
	1241	-e 'use Test::Harness qw(&runtests $verbose); \
	1242	$verbose=0; runtests @ARGV;' t/*.t
	1243	t/My....ok
	1244	All tests successful.
	1245	Files=1, Tests=2, 0 wallclock secs
	1246	( 0.09 cusr + 0.01 csys = 0.09 CPU)
	1247
3cd3edd2	1248	=item 6.Z<>
67d7b5ef JH	1249
	1250	If you are content with the test result, just "make install"
	1251
3cd3edd2	1252	=item 7.Z<>
aae85ceb	1253
0ab8f81e	1254	If you want to add your encoding to Encode's demand-loading list
aae85ceb DK	1255	(so you don't have to "use Encode::YourEncoding"), run
	1256
	1257	enc2xs -C
	1258
	1259	to update Encode::ConfigLocal, a module that controls local settings.
	1260	After that, "use Encode;" is enough to load your encodings on demand.
	1261
67d7b5ef JH	1262	=back
	1263
	1264	=head1 The Unicode Character Map
	1265
0ab8f81e JH	1266	Encode uses the Unicode Character Map (UCM) format for source character
	1267	mappings. This format is used by IBM's ICU package and was adopted
	1268	by Nick Ing-Simmons for use with the Encode module. Since UCM is
	1269	more flexible than Tcl's Encoding Map and far more user-friendly,
863b2ca0	1270	this is the recommended format for Encode now.
67d7b5ef	1271
0ab8f81e	1272	A UCM file looks like this.
67d7b5ef JH	1273
	1274	#
	1275	# Comments
	1276	#
	1277	<code_set_name> "US-ascii" # Required
	1278	<code_set_alias> "ascii" # Optional
	1279	<mb_cur_min> 1 # Required; usually 1
	1280	<mb_cur_max> 1 # Max. # of bytes/char
	1281	<subchar> \x3F # Substitution char
	1282	#
	1283	CHARMAP
	1284	<U0000> \x00 \|0 # <control>
	1285	<U0001> \x01 \|0 # <control>
	1286	<U0002> \x02 \|0 # <control>
	1287	....
	1288	<U007C> \x7C \|0 # VERTICAL LINE
	1289	<U007D> \x7D \|0 # RIGHT CURLY BRACKET
	1290	<U007E> \x7E \|0 # TILDE
	1291	<U007F> \x7F \|0 # <control>
	1292	END CHARMAP
	1293
	1294	=over 4
	1295
	1296	=item *
	1297
0ab8f81e	1298	Anything that follows C<#> is treated as a comment.
67d7b5ef JH	1299
	1300	=item *
	1301
0ab8f81e JH	1302	The header section continues until a line containing the word
	1303	CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
	1304	pair per line. Strings used as values must be quoted. Barewords are
	1305	treated as numbers. I<\xXX> represents a byte.
67d7b5ef JH	1306
	1307	Most of the keywords are self-explanatory. I<subchar> means
	1308	substitution character, not subcharacter. When you decode a Unicode
	1309	sequence to this encoding but no matching character is found, the byte
	1310	sequence defined here will be used. For most cases, the value here is
0ab8f81e	1311	\x3F; in ASCII, this is a question mark.
67d7b5ef JH	1312
	1313	=item *
	1314
	1315	CHARMAP starts the character map section. Each line has a form as
0ab8f81e	1316	follows:
67d7b5ef JH	1317
	1318	<UXXXX> \xXX.. \|0 # comment
	1319	^ ^ ^
	1320	\| \| +- Fallback flag
	1321	\| +-------- Encoded byte sequence
	1322	+-------------- Unicode Character ID in hex
	1323
0ab8f81e JH	1324	The format is roughly the same as a header section except for the
	1325	fallback flag: \| followed by 0..3. The meaning of the possible
	1326	values is as follows:
67d7b5ef	1327
0ab8f81e	1328	=over 4
67d7b5ef JH	1329
	1330	=item \|0
	1331
0ab8f81e JH	1332	Round trip safe. A character decoded to Unicode encodes back to the
0ab8f81e JH	1333	same byte sequence. Most characters have this flag.
67d7b5ef JH	1334
	1335	=item \|1
	1336
	1337	Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e	1338	character for the encode map only.
67d7b5ef JH	1339
	1340	=item \|2
	1341
	1342	Skip sub-char mapping should there be no code point.
	1343
	1344	=item \|3
	1345
	1346	Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e	1347	character for the decode map only.
67d7b5ef JH	1348
	1349	=back
	1350
	1351	=item *
	1352
	1353	And finally, END OF CHARMAP ends the section.
	1354
	1355	=back
	1356
6d1c0808	1357	When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e JH	1358	or an existing encoding which is close to yours, rather than write
0ab8f81e JH	1359	your own from scratch.
67d7b5ef JH	1360
67d7b5ef JH	1361	When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e	1362	is, unless your environment is EBCDIC.
67d7b5ef JH	1363
	1364	B<CAVEAT>: not all features in UCM are implemented. For example,
	1365	icu:state is not used. Because of that, you need to write a perl
0ab8f81e JH	1366	module if you want to support algorithmical encodings, notably
0ab8f81e JH	1367	the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef JH	1368	L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
67d7b5ef JH	1369
6d1c0808 JH	1370	=head2 Coping with duplicate mappings
	1371
	1372	When you create a map, you SHOULD make your mappings round-trip safe.
	1373	That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
	1374	$data> stands for all characters that are marked as C<\|0>. Here is
0ab8f81e	1375	how to make sure:
6d1c0808	1376
0ab8f81e	1377	=over 4
6d1c0808 JH	1378
	1379	=item *
	1380
	1381	Sort your map in Unicode order.
	1382
	1383	=item *
	1384
	1385	When you have a duplicate entry, mark either one with '\|1' or '\|3'.
	1386
	1387	=item *
	1388
0ab8f81e	1389	And make sure the '\|1' or '\|3' entry FOLLOWS the '\|0' entry.
6d1c0808 JH	1390
	1391	=back
	1392
	1393	Here is an example from big5-eten.
	1394
	1395	<U2550> \xF9\xF9 \|0
	1396	<U2550> \xA2\xA4 \|3
	1397
	1398	Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
	1399	this;
	1400
	1401	E to U U to E
	1402	--------------------------------------
	1403	\xF9\xF9 => U2550 U2550 => \xF9\xF9
	1404	\xA2\xA4 => U2550
	1405
	1406	So it is round-trip safe for \xF9\xF9. But if the line above is upside
	1407	down, here is what happens.
	1408
	1409	E to U U to E
	1410	--------------------------------------
	1411	\xA2\xA4 => U2550 U2550 => \xF9\xF9
	1412	(\xF9\xF9 => U2550 is now overwritten!)
	1413
	1414	The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e JH	1415	utility to check the integrity of a UCM file. Check under the
0ab8f81e JH	1416	Encode/bin directory for this.
cf9f87ce RGS	1417
	1418	When in doubt, you can use F<ucmsort>, yet another utility under
	1419	Encode/bin directory.
6d1c0808	1420
67d7b5ef JH	1421	=head1 Bookmarks
67d7b5ef JH	1422
0ab8f81e JH	1423	=over 4
	1424
	1425	=item *
	1426
67d7b5ef	1427	ICU Home Page
64bc6d54	1428	L<http://www.icu-project.org/>
67d7b5ef	1429
0ab8f81e JH	1430	=item *
0ab8f81e JH	1431
67d7b5ef	1432	ICU Character Mapping Tables
b2deda17	1433	L<http://site.icu-project.org/charts/charset>
67d7b5ef	1434
0ab8f81e JH	1435	=item *
0ab8f81e JH	1436
67d7b5ef	1437	ICU:Conversion Data
64bc6d54	1438	L<http://www.icu-project.org/userguide/conversion-data.html>
67d7b5ef	1439
0ab8f81e JH	1440	=back
0ab8f81e JH	1441
67d7b5ef JH	1442	=head1 SEE ALSO
	1443
	1444	L<Encode>,
	1445	L<perlmod>,
	1446	L<perlpod>
	1447
	1448	=cut
	1449
	1450	# -Q to disable the duplicate codepoint test
	1451	# -S make mapping errors fatal
	1452	# -q to remove comments written to output files
	1453	# -O to enable the (brute force) substring optimiser
	1454	# -o <output> to specify the output file name (else it's the first arg)
	1455	# -f <inlist> to give a file with a list of input files (else use the args)
	1456	# -n <name> to name the encoding (else use the basename of the input file.
	1457
	1458	With %seen holding array refs:
	1459
	1460	865.66 real 28.80 user 8.79 sys
	1461	7904 maximum resident set size
	1462	1356 average shared memory size
	1463	18566 average unshared data size
	1464	229 average unshared stack size
	1465	46080 page reclaims
	1466	33373 page faults
	1467
	1468	With %seen holding simple scalars:
	1469
	1470	342.16 real 27.11 user 3.54 sys
	1471	8388 maximum resident set size
	1472	1394 average shared memory size
	1473	14969 average unshared data size
	1474	236 average unshared stack size
	1475	28159 page reclaims
	1476	9839 page faults
	1477
	1478	Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
	1479	how %seen is storing things its seen. So it is pathalogically bad on a 16M
	1480	RAM machine, but it's going to help even on modern machines.
	1481	Swapping is bad, m'kay :-)