This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(perl #127834) remove . from the end of @INC if complex modules are loaded
[perl5.git] / cpan / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c
JH
3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
cee96d52 7 pop @INC if $INC[-1] eq '.';
67d7b5ef
JH
8}
9use strict;
b536bf57 10use warnings;
67d7b5ef 11use Getopt::Std;
f49b0b20 12use Config;
67d7b5ef 13my @orig_ARGV = @ARGV;
e67e8a4a 14our $VERSION = do { my @r = (q$Revision: 2.18 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef
JH
15
16# These may get re-ordered.
17# RAW is a do_now as inserted by &enter
0a225b3c 18# AGG is an aggregated do_now, as built up by &process
a999c27c 19
67d7b5ef
JH
20use constant {
21 RAW_NEXT => 0,
22 RAW_IN_LEN => 1,
23 RAW_OUT_BYTES => 2,
24 RAW_FALLBACK => 3,
25
26 AGG_MIN_IN => 0,
27 AGG_MAX_IN => 1,
28 AGG_OUT_BYTES => 2,
29 AGG_NEXT => 3,
30 AGG_IN_LEN => 4,
31 AGG_OUT_LEN => 5,
32 AGG_FALLBACK => 6,
33};
a999c27c 34
67d7b5ef
JH
35# (See the algorithm in encengine.c - we're building structures for it)
36
37# There are two sorts of structures.
38# "do_now" (an array, two variants of what needs storing) is whatever we need
39# to do now we've read an input byte.
40# It's housed in a "do_next" (which is how we got to it), and in turn points
41# to a "do_next" which contains all the "do_now"s for the next input byte.
42
43# There will be a "do_next" which is the start state.
44# For a single byte encoding it's the only "do_next" - each "do_now" points
45# back to it, and each "do_now" will cause bytes. There is no state.
46
47# For a multi-byte encoding where all characters in the input are the same
48# length, then there will be a tree of "do_now"->"do_next"->"do_now"
49# branching out from the start state, one step for each input byte.
50# The leaf "do_now"s will all be at the same distance from the start state,
51# only the leaf "do_now"s cause output bytes, and they in turn point back to
52# the start state.
53
0a225b3c 54# For an encoding where there are variable length input byte sequences, you
67d7b5ef
JH
55# will encounter a leaf "do_now" sooner for the shorter input sequences, but
56# as before the leaves will point back to the start state.
57
58# The system will cope with escape encodings (imagine them as a mostly
59# self-contained tree for each escape state, and cross links between trees
60# at the state-switching characters) but so far no input format defines these.
61
62# The system will also cope with having output "leaves" in the middle of
63# the bifurcating branches, not just at the extremities, but again no
64# input format does this yet.
65
66# There are two variants of the "do_now" structure. The first, smaller variant
67# is generated by &enter as the input file is read. There is one structure
68# for each input byte. Say we are mapping a single byte encoding to a
69# single byte encoding, with "ABCD" going "abcd". There will be
70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
0a225b3c 72# &process then walks the tree, building aggregate "do_now" structures for
67d7b5ef
JH
73# adjacent bytes where possible. The aggregate is for a contiguous range of
74# bytes which each produce the same length of output, each move to the
75# same next state, and each have the same fallback flag.
76# So our 4 RAW "do_now"s above become replaced by a single structure
77# containing:
78# ["A", "D", "abcd", 1, ...]
79# ie, for an input byte $_ in "A".."D", output 1 byte, found as
80# substr ("abcd", (ord $_ - ord "A") * 1, 1)
81# which maps very nicely into pointer arithmetic in C for encengine.c
82
83sub encode_U
84{
85 # UTF-8 encode long hand - only covers part of perl's range
86 ## my $uv = shift;
87 # chr() works in native space so convert value from table
88 # into that space before using chr().
89 my $ch = chr(utf8::unicode_to_native($_[0]));
90 # Now get core perl to encode that the way it likes.
91 utf8::encode($ch);
92 return $ch;
93}
94
95sub encode_S
96{
97 # encode single byte
98 ## my ($ch,$page) = @_; return chr($ch);
99 return chr $_[0];
100}
101
102sub encode_D
103{
104 # encode double byte MS byte first
105 ## my ($ch,$page) = @_; return chr($page).chr($ch);
106 return chr ($_[1]) . chr $_[0];
107}
108
109sub encode_M
110{
111 # encode Multi-byte - single for 0..255 otherwise double
112 ## my ($ch,$page) = @_;
113 ## return &encode_D if $page;
114 ## return &encode_S;
115 return chr ($_[1]) . chr $_[0] if $_[1];
116 return chr $_[0];
117}
118
119my %encode_types = (U => \&encode_U,
120 S => \&encode_S,
121 D => \&encode_D,
122 M => \&encode_M,
123 );
124
125# Win32 does not expand globs on command line
126eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127
128my %opt;
129# I think these are:
130# -Q to disable the duplicate codepoint test
131# -S make mapping errors fatal
132# -q to remove comments written to output files
133# -O to enable the (brute force) substring optimiser
134# -o <output> to specify the output file name (else it's the first arg)
135# -f <inlist> to give a file with a list of input files (else use the args)
136# -n <name> to name the encoding (else use the basename of the input file.
127a7155 137getopts('CM:SQqOo:f:n:v',\%opt);
67d7b5ef
JH
138
139$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 140$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
127a7155
SH
141$opt{v} ||= $ENV{ENC2XS_VERBOSE};
142
143sub verbose {
144 print STDERR @_ if $opt{v};
145}
146sub verbosef {
147 printf STDERR @_ if $opt{v};
148}
67d7b5ef 149
e67e8a4a
DM
150
151# ($cpp, $static, $sized) = compiler_info($declaration)
152#
153# return some information about the compiler and compile options we're using:
154#
155# $declaration - true if we're doing a declaration rather than a definition.
156#
157# $cpp - we're using C++
158# $static - ok to declare the arrays as static
159# $sized - the array declarations should be sized
160
161sub compiler_info {
162 my ($declaration) = @_;
163
164 my $ccflags = $Config{ccflags};
165 if (defined $Config{ccwarnflags}) {
166 $ccflags .= " " . $Config{ccwarnflags};
167 }
168 my $compat = $ccflags =~ /\Q-Wc++-compat/;
169 my $pedantic = $ccflags =~ /-pedantic/;
170
171 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
172
173 # The encpage_t tables contain recursive and mutually recursive
174 # references. To allow them to compile under C++ and some restrictive
175 # cc options, it may be necessary to make the tables non-static/const
176 # (thus moving them from the text to the data segment) and/or not
177 # include the size in the declaration.
178
179 my $static = !(
180 $cpp
181 || ($compat && $pedantic)
182 || ($^O eq 'MacOS' && $declaration)
183 );
184
185 # -Wc++-compat on its own warns if the array declaration is sized.
186 # The easiest way to avoid this warning is simply not to include
187 # the size in the declaration.
188 # With -pedantic as well, the issue doesn't arise because $static
189 # above becomes false.
190 my $sized = $declaration && !($compat && !$pedantic);
191
192 return ($cpp, $static, $sized);
193}
194
195
67d7b5ef
JH
196# This really should go first, else the die here causes empty (non-erroneous)
197# output files to be written.
198my @encfiles;
199if (exists $opt{'f'}) {
200 # -F is followed by name of file containing list of filenames
201 my $flist = $opt{'f'};
202 open(FLIST,$flist) || die "Cannot open $flist:$!";
203 chomp(@encfiles = <FLIST>);
204 close(FLIST);
205} else {
206 @encfiles = @ARGV;
207}
208
209my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
210chmod(0666,$cname) if -f $cname && !-w $cname;
211open(C,">$cname") || die "Cannot open $cname:$!";
212
213my $dname = $cname;
214my $hname = $cname;
215
216my ($doC,$doEnc,$doUcm,$doPet);
217
0e4142c9 218if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
67d7b5ef
JH
219 {
220 $doC = 1;
e7cbefb8 221 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef
JH
222 chmod(0666,$dname) if -f $cname && !-w $dname;
223 open(D,">$dname") || die "Cannot open $dname:$!";
224 $hname =~ s/(\.[^\.]*)?$/.h/;
225 chmod(0666,$hname) if -f $cname && !-w $hname;
226 open(H,">$hname") || die "Cannot open $hname:$!";
227
228 foreach my $fh (\*C,\*D,\*H)
229 {
230 print $fh <<"END" unless $opt{'q'};
231/*
232 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
233 This file was autogenerated by:
234 $^X $0 @orig_ARGV
f49b0b20 235 enc2xs VERSION $VERSION
67d7b5ef
JH
236*/
237END
238 }
239
240 if ($cname =~ /(\w+)\.xs$/)
241 {
c9955564 242 print C "#define PERL_NO_GET_CONTEXT\n";
67d7b5ef
JH
243 print C "#include <EXTERN.h>\n";
244 print C "#include <perl.h>\n";
245 print C "#include <XSUB.h>\n";
67d7b5ef 246 }
b536bf57 247 print C "#include \"encode.h\"\n\n";
67d7b5ef
JH
248
249 }
250elsif ($cname =~ /\.enc$/)
251 {
252 $doEnc = 1;
253 }
254elsif ($cname =~ /\.ucm$/)
255 {
256 $doUcm = 1;
257 }
258elsif ($cname =~ /\.pet$/)
259 {
260 $doPet = 1;
261 }
262
263my %encoding;
264my %strings;
b536bf57
DK
265my $string_acc;
266my %strings_in_acc;
267
67d7b5ef
JH
268my $saved = 0;
269my $subsave = 0;
270my $strings = 0;
271
272sub cmp_name
273{
274 if ($a =~ /^.*-(\d+)/)
275 {
276 my $an = $1;
277 if ($b =~ /^.*-(\d+)/)
278 {
279 my $r = $an <=> $1;
280 return $r if $r;
281 }
282 }
283 return $a cmp $b;
284}
285
286
287foreach my $enc (sort cmp_name @encfiles)
288 {
289 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
290 $name = $opt{'n'} if exists $opt{'n'};
291 if (open(E,$enc))
292 {
293 if ($sfx eq 'enc')
294 {
295 compile_enc(\*E,lc($name));
296 }
297 else
298 {
299 compile_ucm(\*E,lc($name));
300 }
301 }
302 else
303 {
304 warn "Cannot open $enc for $name:$!";
305 }
306 }
307
308if ($doC)
309 {
127a7155 310 verbose "Writing compiled form\n";
67d7b5ef
JH
311 foreach my $name (sort cmp_name keys %encoding)
312 {
313 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
b536bf57
DK
314 process($name.'_utf8',$e2u);
315 addstrings(\*C,$e2u);
316
317 process('utf8_'.$name,$u2e);
318 addstrings(\*C,$u2e);
319 }
320 outbigstring(\*C,"enctable");
321 foreach my $name (sort cmp_name keys %encoding)
322 {
323 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
324 outtable(\*C,$e2u, "enctable");
325 outtable(\*C,$u2e, "enctable");
326
b2704119 327 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 328 }
e67e8a4a 329 my ($cpp) = compiler_info(0);
127a7155
SH
330 my $ext = $cpp ? 'extern "C"' : "extern";
331 my $exta = $cpp ? 'extern "C"' : "static";
332 my $extb = $cpp ? 'extern "C"' : "";
67d7b5ef
JH
333 foreach my $enc (sort cmp_name keys %encoding)
334 {
b2704119
JH
335 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
336 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
337 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
338 my $replen = 0;
339 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
67d7b5ef
JH
340 my $sym = "${enc}_encoding";
341 $sym =~ s/\W+/_/g;
7dd03145 342 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
d1256cb1 343 $min_el,$max_el);
905bf499
RGS
344 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
345 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
346 print C "${extb} const encode_t $sym = \n";
f0a41339
DK
347 # This is to make null encoding work -- dankogai
348 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
d1256cb1 349 $info[$i] ||= 1;
f0a41339
DK
350 }
351 # end of null tweak -- dankogai
7dd03145 352 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
67d7b5ef
JH
353 }
354
355 foreach my $enc (sort cmp_name keys %encoding)
356 {
357 my $sym = "${enc}_encoding";
358 $sym =~ s/\W+/_/g;
127a7155 359 print H "${ext} encode_t $sym;\n";
67d7b5ef
JH
360 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
361 }
362
363 if ($cname =~ /(\w+)\.xs$/)
364 {
365 my $mod = $1;
366 print C <<'END';
367
368static void
369Encode_XSEncoding(pTHX_ encode_t *enc)
370{
371 dSP;
372 HV *stash = gv_stashpv("Encode::XS", TRUE);
c7981a06
SH
373 SV *iv = newSViv(PTR2IV(enc));
374 SV *sv = sv_bless(newRV_noinc(iv),stash);
67d7b5ef 375 int i = 0;
89c2544c
DM
376 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
377 constness, in the hope that perl won't mess with it. */
378 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
c7981a06 379 SvFLAGS(iv) |= SVp_POK;
89c2544c 380 SvPVX(iv) = (char*) enc->name[0];
67d7b5ef
JH
381 PUSHMARK(sp);
382 XPUSHs(sv);
383 while (enc->name[i])
384 {
385 const char *name = enc->name[i++];
386 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
387 }
388 PUTBACK;
389 call_pv("Encode::define_encoding",G_DISCARD);
390 SvREFCNT_dec(sv);
391}
392
393END
394
395 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
396 print C "BOOT:\n{\n";
397 print C "#include \"$dname\"\n";
398 print C "}\n";
399 }
400 # Close in void context is bad, m'kay
401 close(D) or warn "Error closing '$dname': $!";
402 close(H) or warn "Error closing '$hname': $!";
403
cf9f87ce
RGS
404 my $perc_saved = $saved/($strings + $saved) * 100;
405 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
127a7155
SH
406 verbosef "%d bytes in string tables\n",$strings;
407 verbosef "%d bytes (%.3g%%) saved spotting duplicates\n",
67d7b5ef 408 $saved, $perc_saved if $saved;
127a7155 409 verbosef "%d bytes (%.3g%%) saved using substrings\n",
67d7b5ef
JH
410 $subsave, $perc_subsaved if $subsave;
411 }
412elsif ($doEnc)
413 {
414 foreach my $name (sort cmp_name keys %encoding)
415 {
416 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
417 output_enc(\*C,$name,$e2u);
418 }
419 }
420elsif ($doUcm)
421 {
422 foreach my $name (sort cmp_name keys %encoding)
423 {
424 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
425 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
426 }
427 }
428
429# writing half meg files and then not checking to see if you just filled the
430# disk is bad, m'kay
431close(C) or die "Error closing '$cname': $!";
432
433# End of the main program.
434
435sub compile_ucm
436{
437 my ($fh,$name) = @_;
438 my $e2u = {};
439 my $u2e = {};
440 my $cs;
441 my %attr;
442 while (<$fh>)
443 {
444 s/#.*$//;
445 last if /^\s*CHARMAP\s*$/i;
446 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
447 {
448 $attr{$1} = $2;
449 }
450 }
451 if (!defined($cs = $attr{'code_set_name'}))
452 {
453 warn "No <code_set_name> in $name\n";
454 }
455 else
456 {
457 $name = $cs unless exists $opt{'n'};
458 }
459 my $erep;
460 my $urep;
461 my $max_el;
462 my $min_el;
463 if (exists $attr{'subchar'})
464 {
b2704119
JH
465 #my @byte;
466 #$attr{'subchar'} =~ /^\s*/cg;
467 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
468 #$erep = join('',map(chr(hex($_)),@byte));
469 $erep = $attr{'subchar'};
470 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef
JH
471 }
472 print "Reading $name ($cs)\n";
473 my $nfb = 0;
474 my $hfb = 0;
475 while (<$fh>)
476 {
477 s/#.*$//;
478 last if /^\s*END\s+CHARMAP\s*$/i;
479 next if /^\s*$/;
a999c27c
JH
480 my (@uni, @byte) = ();
481 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
482 or die "Bad line: $_";
483 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
484 push @uni, map { substr($_, 1) } split(/\+/, $1);
485 }
486 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
487 push @byte, $1;
488 }
489 if (@uni)
67d7b5ef 490 {
a999c27c 491 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef
JH
492 my $ech = join('',map(chr(hex($_)),@byte));
493 my $el = length($ech);
494 $max_el = $el if (!defined($max_el) || $el > $max_el);
495 $min_el = $el if (!defined($min_el) || $el < $min_el);
496 if (length($fb))
497 {
498 $fb = substr($fb,1);
499 $hfb++;
500 }
501 else
502 {
503 $nfb++;
504 $fb = '0';
505 }
506 # $fb is fallback flag
507 # 0 - round trip safe
508 # 1 - fallback for unicode -> enc
509 # 2 - skip sub-char mapping
510 # 3 - fallback enc -> unicode
511 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
512 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
513 }
514 else
515 {
516 warn $_;
517 }
518 }
519 if ($nfb && $hfb)
520 {
521 die "$nfb entries without fallback, $hfb entries with\n";
522 }
523 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
524}
525
526
527
528sub compile_enc
529{
530 my ($fh,$name) = @_;
531 my $e2u = {};
532 my $u2e = {};
533
534 my $type;
535 while ($type = <$fh>)
536 {
537 last if $type !~ /^\s*#/;
538 }
539 chomp($type);
540 return if $type eq 'E';
541 # Do the hash lookup once, rather than once per function call. 4% speedup.
542 my $type_func = $encode_types{$type};
543 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
544 warn "$type encoded $name\n";
545 my $rep = '';
546 # Save a defined test by setting these to defined values.
547 my $min_el = ~0; # A very big integer
548 my $max_el = 0; # Anything must be longer than 0
549 {
550 my $v = hex($def);
551 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
552 }
553 my $errors;
554 my $seen;
555 # use -Q to silence the seen test. Makefile.PL uses this by default.
556 $seen = {} unless $opt{Q};
557 do
558 {
559 my $line = <$fh>;
560 chomp($line);
561 my $page = hex($line);
562 my $ch = 0;
563 my $i = 16;
564 do
565 {
566 # So why is it 1% faster to leave the my here?
567 my $line = <$fh>;
568 $line =~ s/\r\n$/\n/;
569 die "$.:${line}Line should be exactly 65 characters long including
570 newline (".length($line).")" unless length ($line) == 65;
571 # Split line into groups of 4 hex digits, convert groups to ints
572 # This takes 65.35
573 # map {hex $_} $line =~ /(....)/g
574 # This takes 63.75 (2.5% less time)
575 # unpack "n*", pack "H*", $line
576 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
577 # Doing it as while ($line =~ /(....)/g) took 74.63
578 foreach my $val (unpack "n*", pack "H*", $line)
579 {
580 next if $val == 0xFFFD;
581 my $ech = &$type_func($ch,$page);
582 if ($val || (!$ch && !$page))
583 {
584 my $el = length($ech);
585 $max_el = $el if $el > $max_el;
586 $min_el = $el if $el < $min_el;
587 my $uch = encode_U($val);
588 if ($seen) {
589 # We're doing the test.
590 # We don't need to read this quickly, so storing it as a scalar,
591 # rather than 3 (anon array, plus the 2 scalars it holds) saves
592 # RAM and may make us faster on low RAM systems. [see __END__]
593 if (exists $seen->{$uch})
594 {
595 warn sprintf("U%04X is %02X%02X and %04X\n",
596 $val,$page,$ch,$seen->{$uch});
597 $errors++;
598 }
599 else
600 {
601 $seen->{$uch} = $page << 8 | $ch;
602 }
603 }
604 # Passing 2 extra args each time is 3.6% slower!
605 # Even with having to add $fallback ||= 0 later
606 enter_fb0($e2u,$ech,$uch);
607 enter_fb0($u2e,$uch,$ech);
608 }
609 else
610 {
611 # No character at this position
612 # enter($e2u,$ech,undef,$e2u);
613 }
614 $ch++;
615 }
616 } while --$i;
617 } while --$pages;
618 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
619 if $min_el > $max_el;
620 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
621 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
622}
623
624# my ($a,$s,$d,$t,$fb) = @_;
625sub enter {
626 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
627 # state we shift to after this (multibyte) input character defaults to same
628 # as current state.
629 $next ||= $current;
630 # Making sure it is defined seems to be faster than {no warnings;} in
0a225b3c 631 # &process, or passing it in as 0 explicitly.
67d7b5ef
JH
632 # XXX $fallback ||= 0;
633
634 # Start at the beginning and work forwards through the string to zero.
635 # effectively we are removing 1 character from the front each time
636 # but we don't actually edit the string. [this alone seems to be 14% speedup]
637 # Hence -$pos is the length of the remaining string.
638 my $pos = -length $inbytes;
639 while (1) {
640 my $byte = substr $inbytes, $pos, 1;
641 # RAW_NEXT => 0,
642 # RAW_IN_LEN => 1,
643 # RAW_OUT_BYTES => 2,
644 # RAW_FALLBACK => 3,
645 # to unicode an array would seem to be better, because the pages are dense.
646 # from unicode can be very sparse, favouring a hash.
647 # hash using the bytes (all length 1) as keys rather than ord value,
648 # as it's easier to sort these in &process.
649
650 # It's faster to always add $fallback even if it's undef, rather than
651 # choosing between 3 and 4 element array. (hence why we set it defined
652 # above)
653 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
654 # When $pos was -1 we were at the last input character.
655 unless (++$pos) {
656 $do_now->[RAW_OUT_BYTES] = $outbytes;
657 $do_now->[RAW_NEXT] = $next;
658 return;
659 }
0a225b3c 660 # Tail recursion. The intermediate state may not have a name yet.
67d7b5ef
JH
661 $current = $do_now->[RAW_NEXT];
662 }
663}
664
0a225b3c 665# This is purely for optimisation. It's just &enter hard coded for $fallback
67d7b5ef
JH
666# of 0, using only a 3 entry array ref to save memory for every entry.
667sub enter_fb0 {
668 my ($current,$inbytes,$outbytes,$next) = @_;
669 $next ||= $current;
670
671 my $pos = -length $inbytes;
672 while (1) {
673 my $byte = substr $inbytes, $pos, 1;
674 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
675 unless (++$pos) {
676 $do_now->[RAW_OUT_BYTES] = $outbytes;
677 $do_now->[RAW_NEXT] = $next;
678 return;
679 }
680 $current = $do_now->[RAW_NEXT];
681 }
682}
683
67d7b5ef
JH
684sub process
685{
686 my ($name,$a) = @_;
687 $name =~ s/\W+/_/g;
688 $a->{Cname} = $name;
689 my $raw = $a->{Raw};
690 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
691 my @ent;
692 $agg_max_in = 0;
693 foreach my $key (sort keys %$raw) {
694 # RAW_NEXT => 0,
695 # RAW_IN_LEN => 1,
696 # RAW_OUT_BYTES => 2,
697 # RAW_FALLBACK => 3,
698 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
699 # Now we are converting from raw to aggregate, switch from 1 byte strings
700 # to numbers
701 my $b = ord $key;
702 $fallback ||= 0;
703 if ($l &&
704 # If this == fails, we're going to reset $agg_max_in below anyway.
705 $b == ++$agg_max_in &&
706 # References in numeric context give the pointer as an int.
707 $agg_next == $next &&
708 $agg_in_len == $in_len &&
709 $agg_out_len == length $out_bytes &&
710 $agg_fallback == $fallback
711 # && length($l->[AGG_OUT_BYTES]) < 16
712 ) {
713 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
714 # we can aggregate this byte onto the end.
715 $l->[AGG_MAX_IN] = $b;
716 $l->[AGG_OUT_BYTES] .= $out_bytes;
717 } else {
718 # AGG_MIN_IN => 0,
719 # AGG_MAX_IN => 1,
720 # AGG_OUT_BYTES => 2,
721 # AGG_NEXT => 3,
722 # AGG_IN_LEN => 4,
723 # AGG_OUT_LEN => 5,
724 # AGG_FALLBACK => 6,
725 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
726 # (only gains .6% on euc-jp -- is it worth it?)
727 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
728 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
729 $agg_fallback = $fallback];
730 }
731 if (exists $next->{Cname}) {
732 $next->{'Forward'} = 1 if $next != $a;
733 } else {
734 process(sprintf("%s_%02x",$name,$b),$next);
735 }
736 }
737 # encengine.c rules say that last entry must be for 255
738 if ($agg_max_in < 255) {
739 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
740 }
741 $a->{'Entries'} = \@ent;
742}
743
b536bf57
DK
744
745sub addstrings
67d7b5ef
JH
746{
747 my ($fh,$a) = @_;
748 my $name = $a->{'Cname'};
749 # String tables
750 foreach my $b (@{$a->{'Entries'}})
751 {
752 next unless $b->[AGG_OUT_LEN];
b536bf57 753 $strings{$b->[AGG_OUT_BYTES]} = undef;
67d7b5ef
JH
754 }
755 if ($a->{'Forward'})
756 {
e67e8a4a
DM
757 my ($cpp, $static, $sized) = compiler_info(1);
758 my $var = $static ? 'static const' : 'extern';
759 my $count = $sized ? scalar(@{$a->{'Entries'}}) : '';
760 print $fh "$var encpage_t $name\[$count];\n";
b536bf57
DK
761 }
762 $a->{'DoneStrings'} = 1;
763 foreach my $b (@{$a->{'Entries'}})
764 {
765 my ($s,$e,$out,$t,$end,$l) = @$b;
766 addstrings($fh,$t) unless $t->{'DoneStrings'};
67d7b5ef 767 }
b536bf57
DK
768}
769
770sub outbigstring
771{
772 my ($fh,$name) = @_;
773
774 $string_acc = '';
775
776 # Make the big string in the string accumulator. Longest first, on the hope
777 # that this makes it more likely that we find the short strings later on.
0a225b3c 778 # Not sure if it helps sorting strings of the same length lexically.
b536bf57
DK
779 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
780 my $index = index $string_acc, $s;
781 if ($index >= 0) {
782 $saved += length($s);
783 $strings_in_acc{$s} = $index;
784 } else {
785 OPTIMISER: {
d1256cb1
RGS
786 if ($opt{'O'}) {
787 my $sublength = length $s;
788 while (--$sublength > 0) {
789 # progressively lop characters off the end, to see if the start of
790 # the new string overlaps the end of the accumulator.
791 if (substr ($string_acc, -$sublength)
792 eq substr ($s, 0, $sublength)) {
793 $subsave += $sublength;
794 $strings_in_acc{$s} = length ($string_acc) - $sublength;
795 # append the last bit on the end.
796 $string_acc .= substr ($s, $sublength);
797 last OPTIMISER;
798 }
799 # or if the end of the new string overlaps the start of the
800 # accumulator
801 next unless substr ($string_acc, 0, $sublength)
802 eq substr ($s, -$sublength);
803 # well, the last $sublength characters of the accumulator match.
804 # so as we're prepending to the accumulator, need to shift all our
805 # existing offsets forwards
806 $_ += $sublength foreach values %strings_in_acc;
807 $subsave += $sublength;
808 $strings_in_acc{$s} = 0;
809 # append the first bit on the start.
810 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
811 last OPTIMISER;
812 }
813 }
814 # Optimiser (if it ran) found nothing, so just going have to tack the
815 # whole thing on the end.
816 $strings_in_acc{$s} = length $string_acc;
817 $string_acc .= $s;
b536bf57
DK
818 };
819 }
820 }
821
822 $strings = length $string_acc;
e67e8a4a 823 my ($cpp) = compiler_info(0);
f49b0b20
JH
824 my $var = $cpp ? '' : 'static';
825 my $definition = "\n$var const U8 $name\[$strings] = { " .
b536bf57
DK
826 join(',',unpack "C*",$string_acc);
827 # We have a single long line. Split it at convenient commas.
828 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
829 print $fh substr ($definition, pos $definition), " };\n";
830}
831
832sub findstring {
833 my ($name,$s) = @_;
834 my $offset = $strings_in_acc{$s};
835 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
836 unless defined $offset;
837 "$name + $offset";
838}
839
840sub outtable
841{
842 my ($fh,$a,$bigname) = @_;
843 my $name = $a->{'Cname'};
67d7b5ef
JH
844 $a->{'Done'} = 1;
845 foreach my $b (@{$a->{'Entries'}})
846 {
847 my ($s,$e,$out,$t,$end,$l) = @$b;
b536bf57 848 outtable($fh,$t,$bigname) unless $t->{'Done'};
67d7b5ef 849 }
e67e8a4a
DM
850 my ($cpp, $static) = compiler_info(0);
851 my $var = $static ? 'static const ' : '';
852 print $fh "\n${var}encpage_t $name\[",
0629a5b3 853 scalar(@{$a->{'Entries'}}), "] = {\n";
67d7b5ef
JH
854 foreach my $b (@{$a->{'Entries'}})
855 {
856 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51 857 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef
JH
858 print $fh "{";
859 if ($l)
860 {
b536bf57 861 printf $fh findstring($bigname,$out);
67d7b5ef
JH
862 }
863 else
864 {
865 print $fh "0";
866 }
867 print $fh ",",$t->{Cname};
868 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
869 }
870 print $fh "};\n";
871}
872
67d7b5ef
JH
873sub output_enc
874{
875 my ($fh,$name,$a) = @_;
876 die "Changed - fix me for new structure";
877 foreach my $b (sort keys %$a)
878 {
879 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
880 }
881}
882
883sub decode_U
884{
885 my $s = shift;
886}
887
888my @uname;
889sub char_names
890{
891 my $s = do "unicore/Name.pl";
892 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
893 pos($s) = 0;
894 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
895 {
896 my $name = $3;
897 my $s = hex($1);
898 last if $s >= 0x10000;
899 my $e = length($2) ? hex($2) : $s;
900 for (my $i = $s; $i <= $e; $i++)
901 {
902 $uname[$i] = $name;
903# print sprintf("U%04X $name\n",$i);
904 }
905 }
906}
907
908sub output_ucm_page
909{
910 my ($cmap,$a,$t,$pre) = @_;
911 # warn sprintf("Page %x\n",$pre);
912 my $raw = $t->{Raw};
913 foreach my $key (sort keys %$raw) {
914 # RAW_NEXT => 0,
915 # RAW_IN_LEN => 1,
916 # RAW_OUT_BYTES => 2,
917 # RAW_FALLBACK => 3,
918 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
919 my $u = ord $key;
920 $fallback ||= 0;
921
922 if ($next != $a && $next != $t) {
923 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
924 } elsif (length $out_bytes) {
925 if ($pre) {
926 $u = $pre|($u &0x3f);
927 }
928 my $s = sprintf "<U%04X> ",$u;
929 #foreach my $c (split(//,$out_bytes)) {
930 # $s .= sprintf "\\x%02X",ord($c);
931 #}
932 # 9.5% faster changing that loop to this:
933 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
934 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
935 push(@$cmap,$s);
936 } else {
937 warn join(',',$u, @{$raw->{$key}},$a,$t);
938 }
939 }
940}
941
942sub output_ucm
943{
944 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
945 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
946 print $fh "<code_set_name> \"$name\"\n";
947 char_names();
948 if (defined $min_el)
949 {
950 print $fh "<mb_cur_min> $min_el\n";
951 }
952 if (defined $max_el)
953 {
954 print $fh "<mb_cur_max> $max_el\n";
955 }
956 if (defined $rep)
957 {
958 print $fh "<subchar> ";
959 foreach my $c (split(//,$rep))
960 {
961 printf $fh "\\x%02X",ord($c);
962 }
963 print $fh "\n";
964 }
965 my @cmap;
966 output_ucm_page(\@cmap,$h,$h,0);
967 print $fh "#\nCHARMAP\n";
968 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
969 {
970 print $fh $line;
971 }
972 print $fh "END CHARMAP\n";
973}
974
3ef515df
JH
975use vars qw(
976 $_Enc2xs
977 $_Version
978 $_Inc
b2704119 979 $_E2X
3ef515df
JH
980 $_Name
981 $_TableFiles
982 $_Now
983);
984
b2704119 985sub find_e2x{
b536bf57 986 eval { require File::Find; };
b2704119
JH
987 my (@inc, %e2x_dir);
988 for my $inc (@INC){
d1256cb1 989 push @inc, $inc unless $inc eq '.'; #skip current dir
b2704119
JH
990 }
991 File::Find::find(
d1256cb1
RGS
992 sub {
993 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
994 $atime,$mtime,$ctime,$blksize,$blocks)
995 = lstat($_) or return;
996 -f _ or return;
997 if (/^.*\.e2x$/o){
998 no warnings 'once';
999 $e2x_dir{$File::Find::dir} ||= $mtime;
1000 }
1001 return;
1002 }, @inc);
b2704119
JH
1003 warn join("\n", keys %e2x_dir), "\n";
1004 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
d1256cb1
RGS
1005 $_E2X = $d;
1006 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
1007 return $_E2X;
b2704119
JH
1008 }
1009}
1010
67d7b5ef
JH
1011sub make_makefile_pl
1012{
1013 eval { require Encode; };
1014 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
0a225b3c 1015 # our used for variable expansion
3ef515df
JH
1016 $_Enc2xs = $0;
1017 $_Version = $VERSION;
b2704119 1018 $_E2X = find_e2x();
3ef515df
JH
1019 $_Name = shift;
1020 $_TableFiles = join(",", map {qq('$_')} @_);
1021 $_Now = scalar localtime();
b2704119 1022
aae85ceb 1023 eval { require File::Spec; };
b2704119 1024 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119 1025 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119 1026 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119 1027 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119 1028 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df
JH
1029 exit;
1030}
1031
aae85ceb 1032use vars qw(
d1256cb1
RGS
1033 $_ModLines
1034 $_LocalVer
1035 );
aae85ceb 1036
656ebd29 1037sub make_configlocal_pm {
aae85ceb
DK
1038 eval { require Encode; };
1039 $@ and die "Unable to require Encode: $@\n";
1040 eval { require File::Spec; };
656ebd29 1041
0a225b3c 1042 # our used for variable expantion
656ebd29
RGS
1043 my %in_core = map { $_ => 1 } (
1044 'ascii', 'iso-8859-1', 'utf8',
1045 'ascii-ctrl', 'null', 'utf-8-strict'
1046 );
aae85ceb 1047 my %LocalMod = ();
656ebd29
RGS
1048 # check @enc;
1049 use File::Find ();
1050 my $wanted = sub{
1051 -f $_ or return;
1052 $File::Find::name =~ /\A\./ and return;
1053 $File::Find::name =~ /\.pm\z/ or return;
1054 $File::Find::name =~ m/\bEncode\b/ or return;
1055 my $mod = $File::Find::name;
1056 $mod =~ s/.*\bEncode\b/Encode/o;
1057 $mod =~ s/\.pm\z//o;
1058 $mod =~ s,/,::,og;
656ebd29 1059 eval qq{ require $mod; };
e8cc621e
CBW
1060 return if $@;
1061 warn qq{ require $mod;\n};
656ebd29
RGS
1062 for my $enc ( Encode->encodings() ) {
1063 no warnings;
1064 $in_core{$enc} and next;
1065 $Encode::Config::ExtModule{$enc} and next;
1066 $LocalMod{$enc} ||= $mod;
1067 }
1068 };
1069 File::Find::find({wanted => $wanted}, @INC);
aae85ceb 1070 $_ModLines = "";
656ebd29
RGS
1071 for my $enc ( sort keys %LocalMod ) {
1072 $_ModLines .=
1073 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
aae85ceb 1074 }
e8cc621e 1075 warn $_ModLines if $_ModLines;
aae85ceb 1076 $_LocalVer = _mkversion();
656ebd29
RGS
1077 $_E2X = find_e2x();
1078 $_Inc = $INC{"Encode.pm"};
1079 $_Inc =~ s/\.pm$//o;
1080 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1081 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
aae85ceb
DK
1082 exit;
1083}
1084
1085sub _mkversion{
656ebd29
RGS
1086 # v-string is now depreciated; use time() instead;
1087 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1088 #$yyyy += 1900, $mo +=1;
1089 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1090 return time();
aae85ceb
DK
1091}
1092
3ef515df 1093sub _print_expand{
67d7b5ef
JH
1094 eval { require File::Basename; };
1095 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1096 File::Basename->import();
621b0f8d
DK
1097 my ($src, $dst, $clobber) = @_;
1098 if (!$clobber and -e $dst){
d1256cb1
RGS
1099 warn "$dst exists. skipping\n";
1100 return;
621b0f8d
DK
1101 }
1102 warn "Generating $dst...\n";
3ef515df
JH
1103 open my $in, $src or die "$src : $!";
1104 if ((my $d = dirname($dst)) ne '.'){
d1256cb1 1105 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
3ef515df
JH
1106 }
1107 open my $out, ">$dst" or die "$!";
1108 my $asis = 0;
1109 while (<$in>){
d1256cb1
RGS
1110 if (/^#### END_OF_HEADER/){
1111 $asis = 1; next;
1112 }
1113 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1114 print $out $_;
67d7b5ef 1115 }
67d7b5ef 1116}
67d7b5ef
JH
1117__END__
1118
1119=head1 NAME
1120
1121enc2xs -- Perl Encode Module Generator
1122
1123=head1 SYNOPSIS
1124
67d7b5ef 1125 enc2xs -[options]
aae85ceb
DK
1126 enc2xs -M ModName mapfiles...
1127 enc2xs -C
67d7b5ef
JH
1128
1129=head1 DESCRIPTION
1130
1131F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e
JH
1132Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1133Besides being used internally during the build process of the Encode
1134module, you can use F<enc2xs> to add your own encoding to perl.
1135No knowledge of XS is necessary.
67d7b5ef
JH
1136
1137=head1 Quick Guide
1138
0ab8f81e 1139If you want to know as little about Perl as possible but need to
67d7b5ef
JH
1140add a new encoding, just read this chapter and forget the rest.
1141
1142=over 4
1143
3cd3edd2 1144=item 0.Z<>
67d7b5ef 1145
0ab8f81e
JH
1146Have a .ucm file ready. You can get it from somewhere or you can write
1147your own from scratch or you can grab one from the Encode distribution
1148and customize it. For the UCM format, see the next Chapter. In the
1149example below, I'll call my theoretical encoding myascii, defined
1150in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef
JH
1151
1152 $ ls -F
1153 my.ucm
1154
3cd3edd2 1155=item 1.Z<>
67d7b5ef
JH
1156
1157Issue a command as follows;
1158
1159 $ enc2xs -M My my.ucm
3ef515df
JH
1160 generating Makefile.PL
1161 generating My.pm
1162 generating README
1163 generating Changes
67d7b5ef
JH
1164
1165Now take a look at your current directory. It should look like this.
1166
1167 $ ls -F
1168 Makefile.PL My.pm my.ucm t/
1169
0ab8f81e 1170The following files were created.
67d7b5ef 1171
0ab8f81e
JH
1172 Makefile.PL - MakeMaker script
1173 My.pm - Encode submodule
1174 t/My.t - test file
1175
1176=over 4
67d7b5ef 1177
3cd3edd2 1178=item 1.1.Z<>
037b88d6
JH
1179
1180If you want *.ucm installed together with the modules, do as follows;
1181
1182 $ mkdir Encode
1183 $ mv *.ucm Encode
1184 $ enc2xs -M My Encode/*ucm
1185
0ab8f81e
JH
1186=back
1187
3cd3edd2 1188=item 2.Z<>
67d7b5ef
JH
1189
1190Edit the files generated. You don't have to if you have no time AND no
1191intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1192the pod and to add more tests.
67d7b5ef 1193
3cd3edd2 1194=item 3.Z<>
67d7b5ef 1195
0ab8f81e 1196Now issue a command all Perl Mongers love:
67d7b5ef 1197
9160fdbd 1198 $ perl Makefile.PL
67d7b5ef
JH
1199 Writing Makefile for Encode::My
1200
3cd3edd2 1201=item 4.Z<>
67d7b5ef
JH
1202
1203Now all you have to do is make.
1204
1205 $ make
1206 cp My.pm blib/lib/Encode/My.pm
1207 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1208 -o encode_t.c -f encode_t.fnm
1209 Reading myascii (myascii)
1210 Writing compiled form
1211 128 bytes in string tables
cf9f87ce
RGS
1212 384 bytes (75%) saved spotting duplicates
1213 1 bytes (0.775%) saved using substrings
67d7b5ef
JH
1214 ....
1215 chmod 644 blib/arch/auto/Encode/My/My.bs
1216 $
1217
0ab8f81e
JH
1218The time it takes varies depending on how fast your machine is and
1219how large your encoding is. Unless you are working on something big
1220like euc-tw, it won't take too long.
67d7b5ef 1221
3cd3edd2 1222=item 5.Z<>
67d7b5ef
JH
1223
1224You can "make install" already but you should test first.
1225
1226 $ make test
1227 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1228 -e 'use Test::Harness qw(&runtests $verbose); \
1229 $verbose=0; runtests @ARGV;' t/*.t
1230 t/My....ok
1231 All tests successful.
1232 Files=1, Tests=2, 0 wallclock secs
1233 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1234
3cd3edd2 1235=item 6.Z<>
67d7b5ef
JH
1236
1237If you are content with the test result, just "make install"
1238
3cd3edd2 1239=item 7.Z<>
aae85ceb 1240
0ab8f81e 1241If you want to add your encoding to Encode's demand-loading list
aae85ceb
DK
1242(so you don't have to "use Encode::YourEncoding"), run
1243
1244 enc2xs -C
1245
1246to update Encode::ConfigLocal, a module that controls local settings.
1247After that, "use Encode;" is enough to load your encodings on demand.
1248
67d7b5ef
JH
1249=back
1250
1251=head1 The Unicode Character Map
1252
0ab8f81e
JH
1253Encode uses the Unicode Character Map (UCM) format for source character
1254mappings. This format is used by IBM's ICU package and was adopted
1255by Nick Ing-Simmons for use with the Encode module. Since UCM is
1256more flexible than Tcl's Encoding Map and far more user-friendly,
863b2ca0 1257this is the recommended format for Encode now.
67d7b5ef 1258
0ab8f81e 1259A UCM file looks like this.
67d7b5ef
JH
1260
1261 #
1262 # Comments
1263 #
1264 <code_set_name> "US-ascii" # Required
1265 <code_set_alias> "ascii" # Optional
1266 <mb_cur_min> 1 # Required; usually 1
1267 <mb_cur_max> 1 # Max. # of bytes/char
1268 <subchar> \x3F # Substitution char
1269 #
1270 CHARMAP
1271 <U0000> \x00 |0 # <control>
1272 <U0001> \x01 |0 # <control>
1273 <U0002> \x02 |0 # <control>
1274 ....
1275 <U007C> \x7C |0 # VERTICAL LINE
1276 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1277 <U007E> \x7E |0 # TILDE
1278 <U007F> \x7F |0 # <control>
1279 END CHARMAP
1280
1281=over 4
1282
1283=item *
1284
0ab8f81e 1285Anything that follows C<#> is treated as a comment.
67d7b5ef
JH
1286
1287=item *
1288
0ab8f81e
JH
1289The header section continues until a line containing the word
1290CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1291pair per line. Strings used as values must be quoted. Barewords are
1292treated as numbers. I<\xXX> represents a byte.
67d7b5ef
JH
1293
1294Most of the keywords are self-explanatory. I<subchar> means
1295substitution character, not subcharacter. When you decode a Unicode
1296sequence to this encoding but no matching character is found, the byte
1297sequence defined here will be used. For most cases, the value here is
0ab8f81e 1298\x3F; in ASCII, this is a question mark.
67d7b5ef
JH
1299
1300=item *
1301
1302CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1303follows:
67d7b5ef
JH
1304
1305 <UXXXX> \xXX.. |0 # comment
1306 ^ ^ ^
1307 | | +- Fallback flag
1308 | +-------- Encoded byte sequence
1309 +-------------- Unicode Character ID in hex
1310
0ab8f81e
JH
1311The format is roughly the same as a header section except for the
1312fallback flag: | followed by 0..3. The meaning of the possible
1313values is as follows:
67d7b5ef 1314
0ab8f81e 1315=over 4
67d7b5ef
JH
1316
1317=item |0
1318
0ab8f81e
JH
1319Round trip safe. A character decoded to Unicode encodes back to the
1320same byte sequence. Most characters have this flag.
67d7b5ef
JH
1321
1322=item |1
1323
1324Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1325character for the encode map only.
67d7b5ef
JH
1326
1327=item |2
1328
1329Skip sub-char mapping should there be no code point.
1330
1331=item |3
1332
1333Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1334character for the decode map only.
67d7b5ef
JH
1335
1336=back
1337
1338=item *
1339
1340And finally, END OF CHARMAP ends the section.
1341
1342=back
1343
6d1c0808 1344When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e
JH
1345or an existing encoding which is close to yours, rather than write
1346your own from scratch.
67d7b5ef
JH
1347
1348When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1349is, unless your environment is EBCDIC.
67d7b5ef
JH
1350
1351B<CAVEAT>: not all features in UCM are implemented. For example,
1352icu:state is not used. Because of that, you need to write a perl
0ab8f81e
JH
1353module if you want to support algorithmical encodings, notably
1354the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef
JH
1355L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1356
6d1c0808
JH
1357=head2 Coping with duplicate mappings
1358
1359When you create a map, you SHOULD make your mappings round-trip safe.
1360That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1361$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1362how to make sure:
6d1c0808 1363
0ab8f81e 1364=over 4
6d1c0808
JH
1365
1366=item *
1367
1368Sort your map in Unicode order.
1369
1370=item *
1371
1372When you have a duplicate entry, mark either one with '|1' or '|3'.
1373
1374=item *
1375
0ab8f81e 1376And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808
JH
1377
1378=back
1379
1380Here is an example from big5-eten.
1381
1382 <U2550> \xF9\xF9 |0
1383 <U2550> \xA2\xA4 |3
1384
1385Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1386this;
1387
1388 E to U U to E
1389 --------------------------------------
1390 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1391 \xA2\xA4 => U2550
1392
1393So it is round-trip safe for \xF9\xF9. But if the line above is upside
1394down, here is what happens.
1395
1396 E to U U to E
1397 --------------------------------------
1398 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1399 (\xF9\xF9 => U2550 is now overwritten!)
1400
1401The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e
JH
1402utility to check the integrity of a UCM file. Check under the
1403Encode/bin directory for this.
cf9f87ce
RGS
1404
1405When in doubt, you can use F<ucmsort>, yet another utility under
1406Encode/bin directory.
6d1c0808 1407
67d7b5ef
JH
1408=head1 Bookmarks
1409
0ab8f81e
JH
1410=over 4
1411
1412=item *
1413
67d7b5ef 1414ICU Home Page
64bc6d54 1415L<http://www.icu-project.org/>
67d7b5ef 1416
0ab8f81e
JH
1417=item *
1418
67d7b5ef 1419ICU Character Mapping Tables
b2deda17 1420L<http://site.icu-project.org/charts/charset>
67d7b5ef 1421
0ab8f81e
JH
1422=item *
1423
67d7b5ef 1424ICU:Conversion Data
64bc6d54 1425L<http://www.icu-project.org/userguide/conversion-data.html>
67d7b5ef 1426
0ab8f81e
JH
1427=back
1428
67d7b5ef
JH
1429=head1 SEE ALSO
1430
1431L<Encode>,
1432L<perlmod>,
1433L<perlpod>
1434
1435=cut
1436
1437# -Q to disable the duplicate codepoint test
1438# -S make mapping errors fatal
1439# -q to remove comments written to output files
1440# -O to enable the (brute force) substring optimiser
1441# -o <output> to specify the output file name (else it's the first arg)
1442# -f <inlist> to give a file with a list of input files (else use the args)
1443# -n <name> to name the encoding (else use the basename of the input file.
1444
1445With %seen holding array refs:
1446
1447 865.66 real 28.80 user 8.79 sys
1448 7904 maximum resident set size
1449 1356 average shared memory size
1450 18566 average unshared data size
1451 229 average unshared stack size
1452 46080 page reclaims
1453 33373 page faults
1454
1455With %seen holding simple scalars:
1456
1457 342.16 real 27.11 user 3.54 sys
1458 8388 maximum resident set size
1459 1394 average shared memory size
1460 14969 average unshared data size
1461 236 average unshared stack size
1462 28159 page reclaims
1463 9839 page faults
1464
1465Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1466how %seen is storing things its seen. So it is pathalogically bad on a 16M
1467RAM machine, but it's going to help even on modern machines.
1468Swapping is bad, m'kay :-)