This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
C++ compilation patch by Jarkko
[perl5.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c
JH
3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef
JH
7}
8use strict;
b536bf57 9use warnings;
67d7b5ef 10use Getopt::Std;
f49b0b20 11use Config;
67d7b5ef 12my @orig_ARGV = @ARGV;
f49b0b20 13our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef
JH
14
15# These may get re-ordered.
16# RAW is a do_now as inserted by &enter
17# AGG is an aggreagated do_now, as built up by &process
a999c27c 18
67d7b5ef
JH
19use constant {
20 RAW_NEXT => 0,
21 RAW_IN_LEN => 1,
22 RAW_OUT_BYTES => 2,
23 RAW_FALLBACK => 3,
24
25 AGG_MIN_IN => 0,
26 AGG_MAX_IN => 1,
27 AGG_OUT_BYTES => 2,
28 AGG_NEXT => 3,
29 AGG_IN_LEN => 4,
30 AGG_OUT_LEN => 5,
31 AGG_FALLBACK => 6,
32};
a999c27c 33
67d7b5ef
JH
34# (See the algorithm in encengine.c - we're building structures for it)
35
36# There are two sorts of structures.
37# "do_now" (an array, two variants of what needs storing) is whatever we need
38# to do now we've read an input byte.
39# It's housed in a "do_next" (which is how we got to it), and in turn points
40# to a "do_next" which contains all the "do_now"s for the next input byte.
41
42# There will be a "do_next" which is the start state.
43# For a single byte encoding it's the only "do_next" - each "do_now" points
44# back to it, and each "do_now" will cause bytes. There is no state.
45
46# For a multi-byte encoding where all characters in the input are the same
47# length, then there will be a tree of "do_now"->"do_next"->"do_now"
48# branching out from the start state, one step for each input byte.
49# The leaf "do_now"s will all be at the same distance from the start state,
50# only the leaf "do_now"s cause output bytes, and they in turn point back to
51# the start state.
52
53# For an encoding where there are varaible length input byte sequences, you
54# will encounter a leaf "do_now" sooner for the shorter input sequences, but
55# as before the leaves will point back to the start state.
56
57# The system will cope with escape encodings (imagine them as a mostly
58# self-contained tree for each escape state, and cross links between trees
59# at the state-switching characters) but so far no input format defines these.
60
61# The system will also cope with having output "leaves" in the middle of
62# the bifurcating branches, not just at the extremities, but again no
63# input format does this yet.
64
65# There are two variants of the "do_now" structure. The first, smaller variant
66# is generated by &enter as the input file is read. There is one structure
67# for each input byte. Say we are mapping a single byte encoding to a
68# single byte encoding, with "ABCD" going "abcd". There will be
69# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
70
71# &process then walks the tree, building aggregate "do_now" structres for
72# adjacent bytes where possible. The aggregate is for a contiguous range of
73# bytes which each produce the same length of output, each move to the
74# same next state, and each have the same fallback flag.
75# So our 4 RAW "do_now"s above become replaced by a single structure
76# containing:
77# ["A", "D", "abcd", 1, ...]
78# ie, for an input byte $_ in "A".."D", output 1 byte, found as
79# substr ("abcd", (ord $_ - ord "A") * 1, 1)
80# which maps very nicely into pointer arithmetic in C for encengine.c
81
82sub encode_U
83{
84 # UTF-8 encode long hand - only covers part of perl's range
85 ## my $uv = shift;
86 # chr() works in native space so convert value from table
87 # into that space before using chr().
88 my $ch = chr(utf8::unicode_to_native($_[0]));
89 # Now get core perl to encode that the way it likes.
90 utf8::encode($ch);
91 return $ch;
92}
93
94sub encode_S
95{
96 # encode single byte
97 ## my ($ch,$page) = @_; return chr($ch);
98 return chr $_[0];
99}
100
101sub encode_D
102{
103 # encode double byte MS byte first
104 ## my ($ch,$page) = @_; return chr($page).chr($ch);
105 return chr ($_[1]) . chr $_[0];
106}
107
108sub encode_M
109{
110 # encode Multi-byte - single for 0..255 otherwise double
111 ## my ($ch,$page) = @_;
112 ## return &encode_D if $page;
113 ## return &encode_S;
114 return chr ($_[1]) . chr $_[0] if $_[1];
115 return chr $_[0];
116}
117
118my %encode_types = (U => \&encode_U,
119 S => \&encode_S,
120 D => \&encode_D,
121 M => \&encode_M,
122 );
123
124# Win32 does not expand globs on command line
125eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
126
127my %opt;
128# I think these are:
129# -Q to disable the duplicate codepoint test
130# -S make mapping errors fatal
131# -q to remove comments written to output files
132# -O to enable the (brute force) substring optimiser
133# -o <output> to specify the output file name (else it's the first arg)
134# -f <inlist> to give a file with a list of input files (else use the args)
135# -n <name> to name the encoding (else use the basename of the input file.
aae85ceb 136getopts('CM:SQqOo:f:n:',\%opt);
67d7b5ef
JH
137
138$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 139$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
67d7b5ef
JH
140
141# This really should go first, else the die here causes empty (non-erroneous)
142# output files to be written.
143my @encfiles;
144if (exists $opt{'f'}) {
145 # -F is followed by name of file containing list of filenames
146 my $flist = $opt{'f'};
147 open(FLIST,$flist) || die "Cannot open $flist:$!";
148 chomp(@encfiles = <FLIST>);
149 close(FLIST);
150} else {
151 @encfiles = @ARGV;
152}
153
154my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
155chmod(0666,$cname) if -f $cname && !-w $cname;
156open(C,">$cname") || die "Cannot open $cname:$!";
157
158my $dname = $cname;
159my $hname = $cname;
160
161my ($doC,$doEnc,$doUcm,$doPet);
162
0e4142c9 163if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
67d7b5ef
JH
164 {
165 $doC = 1;
e7cbefb8 166 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef
JH
167 chmod(0666,$dname) if -f $cname && !-w $dname;
168 open(D,">$dname") || die "Cannot open $dname:$!";
169 $hname =~ s/(\.[^\.]*)?$/.h/;
170 chmod(0666,$hname) if -f $cname && !-w $hname;
171 open(H,">$hname") || die "Cannot open $hname:$!";
172
173 foreach my $fh (\*C,\*D,\*H)
174 {
175 print $fh <<"END" unless $opt{'q'};
176/*
177 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
178 This file was autogenerated by:
179 $^X $0 @orig_ARGV
f49b0b20 180 enc2xs VERSION $VERSION
67d7b5ef
JH
181*/
182END
183 }
184
185 if ($cname =~ /(\w+)\.xs$/)
186 {
187 print C "#include <EXTERN.h>\n";
188 print C "#include <perl.h>\n";
189 print C "#include <XSUB.h>\n";
190 print C "#define U8 U8\n";
191 }
b536bf57 192 print C "#include \"encode.h\"\n\n";
67d7b5ef
JH
193
194 }
195elsif ($cname =~ /\.enc$/)
196 {
197 $doEnc = 1;
198 }
199elsif ($cname =~ /\.ucm$/)
200 {
201 $doUcm = 1;
202 }
203elsif ($cname =~ /\.pet$/)
204 {
205 $doPet = 1;
206 }
207
208my %encoding;
209my %strings;
b536bf57
DK
210my $string_acc;
211my %strings_in_acc;
212
67d7b5ef
JH
213my $saved = 0;
214my $subsave = 0;
215my $strings = 0;
216
217sub cmp_name
218{
219 if ($a =~ /^.*-(\d+)/)
220 {
221 my $an = $1;
222 if ($b =~ /^.*-(\d+)/)
223 {
224 my $r = $an <=> $1;
225 return $r if $r;
226 }
227 }
228 return $a cmp $b;
229}
230
231
232foreach my $enc (sort cmp_name @encfiles)
233 {
234 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235 $name = $opt{'n'} if exists $opt{'n'};
236 if (open(E,$enc))
237 {
238 if ($sfx eq 'enc')
239 {
240 compile_enc(\*E,lc($name));
241 }
242 else
243 {
244 compile_ucm(\*E,lc($name));
245 }
246 }
247 else
248 {
249 warn "Cannot open $enc for $name:$!";
250 }
251 }
252
253if ($doC)
254 {
255 print STDERR "Writing compiled form\n";
256 foreach my $name (sort cmp_name keys %encoding)
257 {
258 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
b536bf57
DK
259 process($name.'_utf8',$e2u);
260 addstrings(\*C,$e2u);
261
262 process('utf8_'.$name,$u2e);
263 addstrings(\*C,$u2e);
264 }
265 outbigstring(\*C,"enctable");
266 foreach my $name (sort cmp_name keys %encoding)
267 {
268 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269 outtable(\*C,$e2u, "enctable");
270 outtable(\*C,$u2e, "enctable");
271
b2704119 272 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 273 }
905bf499
RGS
274 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
275 my $exta = $cpp ? 'extern "C" ' : "static";
276 my $extb = $cpp ? 'extern "C" ' : "";
67d7b5ef
JH
277 foreach my $enc (sort cmp_name keys %encoding)
278 {
b2704119
JH
279 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
280 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
281 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
282 my $replen = 0;
283 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
67d7b5ef
JH
284 my $sym = "${enc}_encoding";
285 $sym =~ s/\W+/_/g;
7dd03145 286 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
d1256cb1 287 $min_el,$max_el);
905bf499
RGS
288 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
289 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
290 print C "${extb} const encode_t $sym = \n";
f0a41339
DK
291 # This is to make null encoding work -- dankogai
292 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
d1256cb1 293 $info[$i] ||= 1;
f0a41339
DK
294 }
295 # end of null tweak -- dankogai
7dd03145 296 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
67d7b5ef
JH
297 }
298
299 foreach my $enc (sort cmp_name keys %encoding)
300 {
301 my $sym = "${enc}_encoding";
302 $sym =~ s/\W+/_/g;
303 print H "extern encode_t $sym;\n";
304 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
305 }
306
307 if ($cname =~ /(\w+)\.xs$/)
308 {
309 my $mod = $1;
310 print C <<'END';
311
312static void
313Encode_XSEncoding(pTHX_ encode_t *enc)
314{
315 dSP;
316 HV *stash = gv_stashpv("Encode::XS", TRUE);
317 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
318 int i = 0;
319 PUSHMARK(sp);
320 XPUSHs(sv);
321 while (enc->name[i])
322 {
323 const char *name = enc->name[i++];
324 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
325 }
326 PUTBACK;
327 call_pv("Encode::define_encoding",G_DISCARD);
328 SvREFCNT_dec(sv);
329}
330
331END
332
333 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
334 print C "BOOT:\n{\n";
335 print C "#include \"$dname\"\n";
336 print C "}\n";
337 }
338 # Close in void context is bad, m'kay
339 close(D) or warn "Error closing '$dname': $!";
340 close(H) or warn "Error closing '$hname': $!";
341
cf9f87ce
RGS
342 my $perc_saved = $saved/($strings + $saved) * 100;
343 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
67d7b5ef
JH
344 printf STDERR "%d bytes in string tables\n",$strings;
345 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
346 $saved, $perc_saved if $saved;
347 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
348 $subsave, $perc_subsaved if $subsave;
349 }
350elsif ($doEnc)
351 {
352 foreach my $name (sort cmp_name keys %encoding)
353 {
354 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
355 output_enc(\*C,$name,$e2u);
356 }
357 }
358elsif ($doUcm)
359 {
360 foreach my $name (sort cmp_name keys %encoding)
361 {
362 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
363 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
364 }
365 }
366
367# writing half meg files and then not checking to see if you just filled the
368# disk is bad, m'kay
369close(C) or die "Error closing '$cname': $!";
370
371# End of the main program.
372
373sub compile_ucm
374{
375 my ($fh,$name) = @_;
376 my $e2u = {};
377 my $u2e = {};
378 my $cs;
379 my %attr;
380 while (<$fh>)
381 {
382 s/#.*$//;
383 last if /^\s*CHARMAP\s*$/i;
384 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
385 {
386 $attr{$1} = $2;
387 }
388 }
389 if (!defined($cs = $attr{'code_set_name'}))
390 {
391 warn "No <code_set_name> in $name\n";
392 }
393 else
394 {
395 $name = $cs unless exists $opt{'n'};
396 }
397 my $erep;
398 my $urep;
399 my $max_el;
400 my $min_el;
401 if (exists $attr{'subchar'})
402 {
b2704119
JH
403 #my @byte;
404 #$attr{'subchar'} =~ /^\s*/cg;
405 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
406 #$erep = join('',map(chr(hex($_)),@byte));
407 $erep = $attr{'subchar'};
408 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef
JH
409 }
410 print "Reading $name ($cs)\n";
411 my $nfb = 0;
412 my $hfb = 0;
413 while (<$fh>)
414 {
415 s/#.*$//;
416 last if /^\s*END\s+CHARMAP\s*$/i;
417 next if /^\s*$/;
a999c27c
JH
418 my (@uni, @byte) = ();
419 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
420 or die "Bad line: $_";
421 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
422 push @uni, map { substr($_, 1) } split(/\+/, $1);
423 }
424 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
425 push @byte, $1;
426 }
427 if (@uni)
67d7b5ef 428 {
a999c27c 429 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef
JH
430 my $ech = join('',map(chr(hex($_)),@byte));
431 my $el = length($ech);
432 $max_el = $el if (!defined($max_el) || $el > $max_el);
433 $min_el = $el if (!defined($min_el) || $el < $min_el);
434 if (length($fb))
435 {
436 $fb = substr($fb,1);
437 $hfb++;
438 }
439 else
440 {
441 $nfb++;
442 $fb = '0';
443 }
444 # $fb is fallback flag
445 # 0 - round trip safe
446 # 1 - fallback for unicode -> enc
447 # 2 - skip sub-char mapping
448 # 3 - fallback enc -> unicode
449 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
450 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
451 }
452 else
453 {
454 warn $_;
455 }
456 }
457 if ($nfb && $hfb)
458 {
459 die "$nfb entries without fallback, $hfb entries with\n";
460 }
461 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
462}
463
464
465
466sub compile_enc
467{
468 my ($fh,$name) = @_;
469 my $e2u = {};
470 my $u2e = {};
471
472 my $type;
473 while ($type = <$fh>)
474 {
475 last if $type !~ /^\s*#/;
476 }
477 chomp($type);
478 return if $type eq 'E';
479 # Do the hash lookup once, rather than once per function call. 4% speedup.
480 my $type_func = $encode_types{$type};
481 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
482 warn "$type encoded $name\n";
483 my $rep = '';
484 # Save a defined test by setting these to defined values.
485 my $min_el = ~0; # A very big integer
486 my $max_el = 0; # Anything must be longer than 0
487 {
488 my $v = hex($def);
489 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
490 }
491 my $errors;
492 my $seen;
493 # use -Q to silence the seen test. Makefile.PL uses this by default.
494 $seen = {} unless $opt{Q};
495 do
496 {
497 my $line = <$fh>;
498 chomp($line);
499 my $page = hex($line);
500 my $ch = 0;
501 my $i = 16;
502 do
503 {
504 # So why is it 1% faster to leave the my here?
505 my $line = <$fh>;
506 $line =~ s/\r\n$/\n/;
507 die "$.:${line}Line should be exactly 65 characters long including
508 newline (".length($line).")" unless length ($line) == 65;
509 # Split line into groups of 4 hex digits, convert groups to ints
510 # This takes 65.35
511 # map {hex $_} $line =~ /(....)/g
512 # This takes 63.75 (2.5% less time)
513 # unpack "n*", pack "H*", $line
514 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
515 # Doing it as while ($line =~ /(....)/g) took 74.63
516 foreach my $val (unpack "n*", pack "H*", $line)
517 {
518 next if $val == 0xFFFD;
519 my $ech = &$type_func($ch,$page);
520 if ($val || (!$ch && !$page))
521 {
522 my $el = length($ech);
523 $max_el = $el if $el > $max_el;
524 $min_el = $el if $el < $min_el;
525 my $uch = encode_U($val);
526 if ($seen) {
527 # We're doing the test.
528 # We don't need to read this quickly, so storing it as a scalar,
529 # rather than 3 (anon array, plus the 2 scalars it holds) saves
530 # RAM and may make us faster on low RAM systems. [see __END__]
531 if (exists $seen->{$uch})
532 {
533 warn sprintf("U%04X is %02X%02X and %04X\n",
534 $val,$page,$ch,$seen->{$uch});
535 $errors++;
536 }
537 else
538 {
539 $seen->{$uch} = $page << 8 | $ch;
540 }
541 }
542 # Passing 2 extra args each time is 3.6% slower!
543 # Even with having to add $fallback ||= 0 later
544 enter_fb0($e2u,$ech,$uch);
545 enter_fb0($u2e,$uch,$ech);
546 }
547 else
548 {
549 # No character at this position
550 # enter($e2u,$ech,undef,$e2u);
551 }
552 $ch++;
553 }
554 } while --$i;
555 } while --$pages;
556 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
557 if $min_el > $max_el;
558 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
559 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
560}
561
562# my ($a,$s,$d,$t,$fb) = @_;
563sub enter {
564 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
565 # state we shift to after this (multibyte) input character defaults to same
566 # as current state.
567 $next ||= $current;
568 # Making sure it is defined seems to be faster than {no warnings;} in
569 # &process, or passing it in as 0 explicity.
570 # XXX $fallback ||= 0;
571
572 # Start at the beginning and work forwards through the string to zero.
573 # effectively we are removing 1 character from the front each time
574 # but we don't actually edit the string. [this alone seems to be 14% speedup]
575 # Hence -$pos is the length of the remaining string.
576 my $pos = -length $inbytes;
577 while (1) {
578 my $byte = substr $inbytes, $pos, 1;
579 # RAW_NEXT => 0,
580 # RAW_IN_LEN => 1,
581 # RAW_OUT_BYTES => 2,
582 # RAW_FALLBACK => 3,
583 # to unicode an array would seem to be better, because the pages are dense.
584 # from unicode can be very sparse, favouring a hash.
585 # hash using the bytes (all length 1) as keys rather than ord value,
586 # as it's easier to sort these in &process.
587
588 # It's faster to always add $fallback even if it's undef, rather than
589 # choosing between 3 and 4 element array. (hence why we set it defined
590 # above)
591 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
592 # When $pos was -1 we were at the last input character.
593 unless (++$pos) {
594 $do_now->[RAW_OUT_BYTES] = $outbytes;
595 $do_now->[RAW_NEXT] = $next;
596 return;
597 }
598 # Tail recursion. The intermdiate state may not have a name yet.
599 $current = $do_now->[RAW_NEXT];
600 }
601}
602
603# This is purely for optimistation. It's just &enter hard coded for $fallback
604# of 0, using only a 3 entry array ref to save memory for every entry.
605sub enter_fb0 {
606 my ($current,$inbytes,$outbytes,$next) = @_;
607 $next ||= $current;
608
609 my $pos = -length $inbytes;
610 while (1) {
611 my $byte = substr $inbytes, $pos, 1;
612 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
613 unless (++$pos) {
614 $do_now->[RAW_OUT_BYTES] = $outbytes;
615 $do_now->[RAW_NEXT] = $next;
616 return;
617 }
618 $current = $do_now->[RAW_NEXT];
619 }
620}
621
67d7b5ef
JH
622sub process
623{
624 my ($name,$a) = @_;
625 $name =~ s/\W+/_/g;
626 $a->{Cname} = $name;
627 my $raw = $a->{Raw};
628 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
629 my @ent;
630 $agg_max_in = 0;
631 foreach my $key (sort keys %$raw) {
632 # RAW_NEXT => 0,
633 # RAW_IN_LEN => 1,
634 # RAW_OUT_BYTES => 2,
635 # RAW_FALLBACK => 3,
636 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
637 # Now we are converting from raw to aggregate, switch from 1 byte strings
638 # to numbers
639 my $b = ord $key;
640 $fallback ||= 0;
641 if ($l &&
642 # If this == fails, we're going to reset $agg_max_in below anyway.
643 $b == ++$agg_max_in &&
644 # References in numeric context give the pointer as an int.
645 $agg_next == $next &&
646 $agg_in_len == $in_len &&
647 $agg_out_len == length $out_bytes &&
648 $agg_fallback == $fallback
649 # && length($l->[AGG_OUT_BYTES]) < 16
650 ) {
651 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
652 # we can aggregate this byte onto the end.
653 $l->[AGG_MAX_IN] = $b;
654 $l->[AGG_OUT_BYTES] .= $out_bytes;
655 } else {
656 # AGG_MIN_IN => 0,
657 # AGG_MAX_IN => 1,
658 # AGG_OUT_BYTES => 2,
659 # AGG_NEXT => 3,
660 # AGG_IN_LEN => 4,
661 # AGG_OUT_LEN => 5,
662 # AGG_FALLBACK => 6,
663 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
664 # (only gains .6% on euc-jp -- is it worth it?)
665 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
666 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
667 $agg_fallback = $fallback];
668 }
669 if (exists $next->{Cname}) {
670 $next->{'Forward'} = 1 if $next != $a;
671 } else {
672 process(sprintf("%s_%02x",$name,$b),$next);
673 }
674 }
675 # encengine.c rules say that last entry must be for 255
676 if ($agg_max_in < 255) {
677 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
678 }
679 $a->{'Entries'} = \@ent;
680}
681
b536bf57
DK
682
683sub addstrings
67d7b5ef
JH
684{
685 my ($fh,$a) = @_;
686 my $name = $a->{'Cname'};
687 # String tables
688 foreach my $b (@{$a->{'Entries'}})
689 {
690 next unless $b->[AGG_OUT_LEN];
b536bf57 691 $strings{$b->[AGG_OUT_BYTES]} = undef;
67d7b5ef
JH
692 }
693 if ($a->{'Forward'})
694 {
9f20e2cf 695 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20
JH
696 my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
697 my $const = $cpp ? '' : 'const';
698 print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
b536bf57
DK
699 }
700 $a->{'DoneStrings'} = 1;
701 foreach my $b (@{$a->{'Entries'}})
702 {
703 my ($s,$e,$out,$t,$end,$l) = @$b;
704 addstrings($fh,$t) unless $t->{'DoneStrings'};
67d7b5ef 705 }
b536bf57
DK
706}
707
708sub outbigstring
709{
710 my ($fh,$name) = @_;
711
712 $string_acc = '';
713
714 # Make the big string in the string accumulator. Longest first, on the hope
715 # that this makes it more likely that we find the short strings later on.
716 # Not sure if it helps sorting strings of the same length lexcically.
717 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
718 my $index = index $string_acc, $s;
719 if ($index >= 0) {
720 $saved += length($s);
721 $strings_in_acc{$s} = $index;
722 } else {
723 OPTIMISER: {
d1256cb1
RGS
724 if ($opt{'O'}) {
725 my $sublength = length $s;
726 while (--$sublength > 0) {
727 # progressively lop characters off the end, to see if the start of
728 # the new string overlaps the end of the accumulator.
729 if (substr ($string_acc, -$sublength)
730 eq substr ($s, 0, $sublength)) {
731 $subsave += $sublength;
732 $strings_in_acc{$s} = length ($string_acc) - $sublength;
733 # append the last bit on the end.
734 $string_acc .= substr ($s, $sublength);
735 last OPTIMISER;
736 }
737 # or if the end of the new string overlaps the start of the
738 # accumulator
739 next unless substr ($string_acc, 0, $sublength)
740 eq substr ($s, -$sublength);
741 # well, the last $sublength characters of the accumulator match.
742 # so as we're prepending to the accumulator, need to shift all our
743 # existing offsets forwards
744 $_ += $sublength foreach values %strings_in_acc;
745 $subsave += $sublength;
746 $strings_in_acc{$s} = 0;
747 # append the first bit on the start.
748 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
749 last OPTIMISER;
750 }
751 }
752 # Optimiser (if it ran) found nothing, so just going have to tack the
753 # whole thing on the end.
754 $strings_in_acc{$s} = length $string_acc;
755 $string_acc .= $s;
b536bf57
DK
756 };
757 }
758 }
759
760 $strings = length $string_acc;
9f20e2cf 761 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20
JH
762 my $var = $cpp ? '' : 'static';
763 my $definition = "\n$var const U8 $name\[$strings] = { " .
b536bf57
DK
764 join(',',unpack "C*",$string_acc);
765 # We have a single long line. Split it at convenient commas.
766 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
767 print $fh substr ($definition, pos $definition), " };\n";
768}
769
770sub findstring {
771 my ($name,$s) = @_;
772 my $offset = $strings_in_acc{$s};
773 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
774 unless defined $offset;
775 "$name + $offset";
776}
777
778sub outtable
779{
780 my ($fh,$a,$bigname) = @_;
781 my $name = $a->{'Cname'};
67d7b5ef
JH
782 $a->{'Done'} = 1;
783 foreach my $b (@{$a->{'Entries'}})
784 {
785 my ($s,$e,$out,$t,$end,$l) = @$b;
b536bf57 786 outtable($fh,$t,$bigname) unless $t->{'Done'};
67d7b5ef 787 }
9f20e2cf 788 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20
JH
789 my $var = $cpp ? '' : 'static';
790 my $const = $cpp ? '' : 'const';
791 print $fh "\n$var $const encpage_t $name\[",
0629a5b3 792 scalar(@{$a->{'Entries'}}), "] = {\n";
67d7b5ef
JH
793 foreach my $b (@{$a->{'Entries'}})
794 {
795 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51 796 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef
JH
797 print $fh "{";
798 if ($l)
799 {
b536bf57 800 printf $fh findstring($bigname,$out);
67d7b5ef
JH
801 }
802 else
803 {
804 print $fh "0";
805 }
806 print $fh ",",$t->{Cname};
807 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
808 }
809 print $fh "};\n";
810}
811
67d7b5ef
JH
812sub output_enc
813{
814 my ($fh,$name,$a) = @_;
815 die "Changed - fix me for new structure";
816 foreach my $b (sort keys %$a)
817 {
818 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
819 }
820}
821
822sub decode_U
823{
824 my $s = shift;
825}
826
827my @uname;
828sub char_names
829{
830 my $s = do "unicore/Name.pl";
831 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
832 pos($s) = 0;
833 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
834 {
835 my $name = $3;
836 my $s = hex($1);
837 last if $s >= 0x10000;
838 my $e = length($2) ? hex($2) : $s;
839 for (my $i = $s; $i <= $e; $i++)
840 {
841 $uname[$i] = $name;
842# print sprintf("U%04X $name\n",$i);
843 }
844 }
845}
846
847sub output_ucm_page
848{
849 my ($cmap,$a,$t,$pre) = @_;
850 # warn sprintf("Page %x\n",$pre);
851 my $raw = $t->{Raw};
852 foreach my $key (sort keys %$raw) {
853 # RAW_NEXT => 0,
854 # RAW_IN_LEN => 1,
855 # RAW_OUT_BYTES => 2,
856 # RAW_FALLBACK => 3,
857 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
858 my $u = ord $key;
859 $fallback ||= 0;
860
861 if ($next != $a && $next != $t) {
862 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
863 } elsif (length $out_bytes) {
864 if ($pre) {
865 $u = $pre|($u &0x3f);
866 }
867 my $s = sprintf "<U%04X> ",$u;
868 #foreach my $c (split(//,$out_bytes)) {
869 # $s .= sprintf "\\x%02X",ord($c);
870 #}
871 # 9.5% faster changing that loop to this:
872 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
873 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
874 push(@$cmap,$s);
875 } else {
876 warn join(',',$u, @{$raw->{$key}},$a,$t);
877 }
878 }
879}
880
881sub output_ucm
882{
883 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
884 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
885 print $fh "<code_set_name> \"$name\"\n";
886 char_names();
887 if (defined $min_el)
888 {
889 print $fh "<mb_cur_min> $min_el\n";
890 }
891 if (defined $max_el)
892 {
893 print $fh "<mb_cur_max> $max_el\n";
894 }
895 if (defined $rep)
896 {
897 print $fh "<subchar> ";
898 foreach my $c (split(//,$rep))
899 {
900 printf $fh "\\x%02X",ord($c);
901 }
902 print $fh "\n";
903 }
904 my @cmap;
905 output_ucm_page(\@cmap,$h,$h,0);
906 print $fh "#\nCHARMAP\n";
907 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
908 {
909 print $fh $line;
910 }
911 print $fh "END CHARMAP\n";
912}
913
3ef515df
JH
914use vars qw(
915 $_Enc2xs
916 $_Version
917 $_Inc
b2704119 918 $_E2X
3ef515df
JH
919 $_Name
920 $_TableFiles
921 $_Now
922);
923
b2704119 924sub find_e2x{
b536bf57 925 eval { require File::Find; };
b2704119
JH
926 my (@inc, %e2x_dir);
927 for my $inc (@INC){
d1256cb1 928 push @inc, $inc unless $inc eq '.'; #skip current dir
b2704119
JH
929 }
930 File::Find::find(
d1256cb1
RGS
931 sub {
932 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
933 $atime,$mtime,$ctime,$blksize,$blocks)
934 = lstat($_) or return;
935 -f _ or return;
936 if (/^.*\.e2x$/o){
937 no warnings 'once';
938 $e2x_dir{$File::Find::dir} ||= $mtime;
939 }
940 return;
941 }, @inc);
b2704119
JH
942 warn join("\n", keys %e2x_dir), "\n";
943 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
d1256cb1
RGS
944 $_E2X = $d;
945 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
946 return $_E2X;
b2704119
JH
947 }
948}
949
67d7b5ef
JH
950sub make_makefile_pl
951{
952 eval { require Encode; };
953 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df
JH
954 # our used for variable expanstion
955 $_Enc2xs = $0;
956 $_Version = $VERSION;
b2704119 957 $_E2X = find_e2x();
3ef515df
JH
958 $_Name = shift;
959 $_TableFiles = join(",", map {qq('$_')} @_);
960 $_Now = scalar localtime();
b2704119 961
aae85ceb 962 eval { require File::Spec; };
b2704119 963 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119 964 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119 965 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119 966 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119 967 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df
JH
968 exit;
969}
970
aae85ceb 971use vars qw(
d1256cb1
RGS
972 $_ModLines
973 $_LocalVer
974 );
aae85ceb 975
656ebd29 976sub make_configlocal_pm {
aae85ceb
DK
977 eval { require Encode; };
978 $@ and die "Unable to require Encode: $@\n";
979 eval { require File::Spec; };
656ebd29 980
aae85ceb 981 # our used for variable expanstion
656ebd29
RGS
982 my %in_core = map { $_ => 1 } (
983 'ascii', 'iso-8859-1', 'utf8',
984 'ascii-ctrl', 'null', 'utf-8-strict'
985 );
aae85ceb 986 my %LocalMod = ();
656ebd29
RGS
987 # check @enc;
988 use File::Find ();
989 my $wanted = sub{
990 -f $_ or return;
991 $File::Find::name =~ /\A\./ and return;
992 $File::Find::name =~ /\.pm\z/ or return;
993 $File::Find::name =~ m/\bEncode\b/ or return;
994 my $mod = $File::Find::name;
995 $mod =~ s/.*\bEncode\b/Encode/o;
996 $mod =~ s/\.pm\z//o;
997 $mod =~ s,/,::,og;
998 warn qq{ require $mod;\n};
999 eval qq{ require $mod; };
1000 $@ and die "Can't require $mod: $@\n";
1001 for my $enc ( Encode->encodings() ) {
1002 no warnings;
1003 $in_core{$enc} and next;
1004 $Encode::Config::ExtModule{$enc} and next;
1005 $LocalMod{$enc} ||= $mod;
1006 }
1007 };
1008 File::Find::find({wanted => $wanted}, @INC);
aae85ceb 1009 $_ModLines = "";
656ebd29
RGS
1010 for my $enc ( sort keys %LocalMod ) {
1011 $_ModLines .=
1012 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
aae85ceb 1013 }
b2704119 1014 warn $_ModLines;
aae85ceb 1015 $_LocalVer = _mkversion();
656ebd29
RGS
1016 $_E2X = find_e2x();
1017 $_Inc = $INC{"Encode.pm"};
1018 $_Inc =~ s/\.pm$//o;
1019 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1020 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
aae85ceb
DK
1021 exit;
1022}
1023
1024sub _mkversion{
656ebd29
RGS
1025 # v-string is now depreciated; use time() instead;
1026 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1027 #$yyyy += 1900, $mo +=1;
1028 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1029 return time();
aae85ceb
DK
1030}
1031
3ef515df 1032sub _print_expand{
67d7b5ef
JH
1033 eval { require File::Basename; };
1034 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1035 File::Basename->import();
621b0f8d
DK
1036 my ($src, $dst, $clobber) = @_;
1037 if (!$clobber and -e $dst){
d1256cb1
RGS
1038 warn "$dst exists. skipping\n";
1039 return;
621b0f8d
DK
1040 }
1041 warn "Generating $dst...\n";
3ef515df
JH
1042 open my $in, $src or die "$src : $!";
1043 if ((my $d = dirname($dst)) ne '.'){
d1256cb1 1044 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
3ef515df
JH
1045 }
1046 open my $out, ">$dst" or die "$!";
1047 my $asis = 0;
1048 while (<$in>){
d1256cb1
RGS
1049 if (/^#### END_OF_HEADER/){
1050 $asis = 1; next;
1051 }
1052 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1053 print $out $_;
67d7b5ef 1054 }
67d7b5ef 1055}
67d7b5ef
JH
1056__END__
1057
1058=head1 NAME
1059
1060enc2xs -- Perl Encode Module Generator
1061
1062=head1 SYNOPSIS
1063
67d7b5ef 1064 enc2xs -[options]
aae85ceb
DK
1065 enc2xs -M ModName mapfiles...
1066 enc2xs -C
67d7b5ef
JH
1067
1068=head1 DESCRIPTION
1069
1070F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e
JH
1071Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1072Besides being used internally during the build process of the Encode
1073module, you can use F<enc2xs> to add your own encoding to perl.
1074No knowledge of XS is necessary.
67d7b5ef
JH
1075
1076=head1 Quick Guide
1077
0ab8f81e 1078If you want to know as little about Perl as possible but need to
67d7b5ef
JH
1079add a new encoding, just read this chapter and forget the rest.
1080
1081=over 4
1082
1083=item 0.
1084
0ab8f81e
JH
1085Have a .ucm file ready. You can get it from somewhere or you can write
1086your own from scratch or you can grab one from the Encode distribution
1087and customize it. For the UCM format, see the next Chapter. In the
1088example below, I'll call my theoretical encoding myascii, defined
1089in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef
JH
1090
1091 $ ls -F
1092 my.ucm
1093
1094=item 1.
1095
1096Issue a command as follows;
1097
1098 $ enc2xs -M My my.ucm
3ef515df
JH
1099 generating Makefile.PL
1100 generating My.pm
1101 generating README
1102 generating Changes
67d7b5ef
JH
1103
1104Now take a look at your current directory. It should look like this.
1105
1106 $ ls -F
1107 Makefile.PL My.pm my.ucm t/
1108
0ab8f81e 1109The following files were created.
67d7b5ef 1110
0ab8f81e
JH
1111 Makefile.PL - MakeMaker script
1112 My.pm - Encode submodule
1113 t/My.t - test file
1114
1115=over 4
67d7b5ef 1116
037b88d6
JH
1117=item 1.1.
1118
1119If you want *.ucm installed together with the modules, do as follows;
1120
1121 $ mkdir Encode
1122 $ mv *.ucm Encode
1123 $ enc2xs -M My Encode/*ucm
1124
0ab8f81e
JH
1125=back
1126
67d7b5ef
JH
1127=item 2.
1128
1129Edit the files generated. You don't have to if you have no time AND no
1130intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1131the pod and to add more tests.
67d7b5ef
JH
1132
1133=item 3.
1134
0ab8f81e 1135Now issue a command all Perl Mongers love:
67d7b5ef 1136
9160fdbd 1137 $ perl Makefile.PL
67d7b5ef
JH
1138 Writing Makefile for Encode::My
1139
1140=item 4.
1141
1142Now all you have to do is make.
1143
1144 $ make
1145 cp My.pm blib/lib/Encode/My.pm
1146 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1147 -o encode_t.c -f encode_t.fnm
1148 Reading myascii (myascii)
1149 Writing compiled form
1150 128 bytes in string tables
cf9f87ce
RGS
1151 384 bytes (75%) saved spotting duplicates
1152 1 bytes (0.775%) saved using substrings
67d7b5ef
JH
1153 ....
1154 chmod 644 blib/arch/auto/Encode/My/My.bs
1155 $
1156
0ab8f81e
JH
1157The time it takes varies depending on how fast your machine is and
1158how large your encoding is. Unless you are working on something big
1159like euc-tw, it won't take too long.
67d7b5ef
JH
1160
1161=item 5.
1162
1163You can "make install" already but you should test first.
1164
1165 $ make test
1166 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1167 -e 'use Test::Harness qw(&runtests $verbose); \
1168 $verbose=0; runtests @ARGV;' t/*.t
1169 t/My....ok
1170 All tests successful.
1171 Files=1, Tests=2, 0 wallclock secs
1172 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1173
1174=item 6.
1175
1176If you are content with the test result, just "make install"
1177
aae85ceb
DK
1178=item 7.
1179
0ab8f81e 1180If you want to add your encoding to Encode's demand-loading list
aae85ceb
DK
1181(so you don't have to "use Encode::YourEncoding"), run
1182
1183 enc2xs -C
1184
1185to update Encode::ConfigLocal, a module that controls local settings.
1186After that, "use Encode;" is enough to load your encodings on demand.
1187
67d7b5ef
JH
1188=back
1189
1190=head1 The Unicode Character Map
1191
0ab8f81e
JH
1192Encode uses the Unicode Character Map (UCM) format for source character
1193mappings. This format is used by IBM's ICU package and was adopted
1194by Nick Ing-Simmons for use with the Encode module. Since UCM is
1195more flexible than Tcl's Encoding Map and far more user-friendly,
863b2ca0 1196this is the recommended format for Encode now.
67d7b5ef 1197
0ab8f81e 1198A UCM file looks like this.
67d7b5ef
JH
1199
1200 #
1201 # Comments
1202 #
1203 <code_set_name> "US-ascii" # Required
1204 <code_set_alias> "ascii" # Optional
1205 <mb_cur_min> 1 # Required; usually 1
1206 <mb_cur_max> 1 # Max. # of bytes/char
1207 <subchar> \x3F # Substitution char
1208 #
1209 CHARMAP
1210 <U0000> \x00 |0 # <control>
1211 <U0001> \x01 |0 # <control>
1212 <U0002> \x02 |0 # <control>
1213 ....
1214 <U007C> \x7C |0 # VERTICAL LINE
1215 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1216 <U007E> \x7E |0 # TILDE
1217 <U007F> \x7F |0 # <control>
1218 END CHARMAP
1219
1220=over 4
1221
1222=item *
1223
0ab8f81e 1224Anything that follows C<#> is treated as a comment.
67d7b5ef
JH
1225
1226=item *
1227
0ab8f81e
JH
1228The header section continues until a line containing the word
1229CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1230pair per line. Strings used as values must be quoted. Barewords are
1231treated as numbers. I<\xXX> represents a byte.
67d7b5ef
JH
1232
1233Most of the keywords are self-explanatory. I<subchar> means
1234substitution character, not subcharacter. When you decode a Unicode
1235sequence to this encoding but no matching character is found, the byte
1236sequence defined here will be used. For most cases, the value here is
0ab8f81e 1237\x3F; in ASCII, this is a question mark.
67d7b5ef
JH
1238
1239=item *
1240
1241CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1242follows:
67d7b5ef
JH
1243
1244 <UXXXX> \xXX.. |0 # comment
1245 ^ ^ ^
1246 | | +- Fallback flag
1247 | +-------- Encoded byte sequence
1248 +-------------- Unicode Character ID in hex
1249
0ab8f81e
JH
1250The format is roughly the same as a header section except for the
1251fallback flag: | followed by 0..3. The meaning of the possible
1252values is as follows:
67d7b5ef 1253
0ab8f81e 1254=over 4
67d7b5ef
JH
1255
1256=item |0
1257
0ab8f81e
JH
1258Round trip safe. A character decoded to Unicode encodes back to the
1259same byte sequence. Most characters have this flag.
67d7b5ef
JH
1260
1261=item |1
1262
1263Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1264character for the encode map only.
67d7b5ef
JH
1265
1266=item |2
1267
1268Skip sub-char mapping should there be no code point.
1269
1270=item |3
1271
1272Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1273character for the decode map only.
67d7b5ef
JH
1274
1275=back
1276
1277=item *
1278
1279And finally, END OF CHARMAP ends the section.
1280
1281=back
1282
6d1c0808 1283When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e
JH
1284or an existing encoding which is close to yours, rather than write
1285your own from scratch.
67d7b5ef
JH
1286
1287When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1288is, unless your environment is EBCDIC.
67d7b5ef
JH
1289
1290B<CAVEAT>: not all features in UCM are implemented. For example,
1291icu:state is not used. Because of that, you need to write a perl
0ab8f81e
JH
1292module if you want to support algorithmical encodings, notably
1293the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef
JH
1294L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1295
6d1c0808
JH
1296=head2 Coping with duplicate mappings
1297
1298When you create a map, you SHOULD make your mappings round-trip safe.
1299That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1300$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1301how to make sure:
6d1c0808 1302
0ab8f81e 1303=over 4
6d1c0808
JH
1304
1305=item *
1306
1307Sort your map in Unicode order.
1308
1309=item *
1310
1311When you have a duplicate entry, mark either one with '|1' or '|3'.
1312
1313=item *
1314
0ab8f81e 1315And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808
JH
1316
1317=back
1318
1319Here is an example from big5-eten.
1320
1321 <U2550> \xF9\xF9 |0
1322 <U2550> \xA2\xA4 |3
1323
1324Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1325this;
1326
1327 E to U U to E
1328 --------------------------------------
1329 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1330 \xA2\xA4 => U2550
1331
1332So it is round-trip safe for \xF9\xF9. But if the line above is upside
1333down, here is what happens.
1334
1335 E to U U to E
1336 --------------------------------------
1337 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1338 (\xF9\xF9 => U2550 is now overwritten!)
1339
1340The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e
JH
1341utility to check the integrity of a UCM file. Check under the
1342Encode/bin directory for this.
cf9f87ce
RGS
1343
1344When in doubt, you can use F<ucmsort>, yet another utility under
1345Encode/bin directory.
6d1c0808 1346
67d7b5ef
JH
1347=head1 Bookmarks
1348
0ab8f81e
JH
1349=over 4
1350
1351=item *
1352
67d7b5ef
JH
1353ICU Home Page
1354L<http://oss.software.ibm.com/icu/>
1355
0ab8f81e
JH
1356=item *
1357
67d7b5ef
JH
1358ICU Character Mapping Tables
1359L<http://oss.software.ibm.com/icu/charset/>
1360
0ab8f81e
JH
1361=item *
1362
67d7b5ef
JH
1363ICU:Conversion Data
1364L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1365
0ab8f81e
JH
1366=back
1367
67d7b5ef
JH
1368=head1 SEE ALSO
1369
1370L<Encode>,
1371L<perlmod>,
1372L<perlpod>
1373
1374=cut
1375
1376# -Q to disable the duplicate codepoint test
1377# -S make mapping errors fatal
1378# -q to remove comments written to output files
1379# -O to enable the (brute force) substring optimiser
1380# -o <output> to specify the output file name (else it's the first arg)
1381# -f <inlist> to give a file with a list of input files (else use the args)
1382# -n <name> to name the encoding (else use the basename of the input file.
1383
1384With %seen holding array refs:
1385
1386 865.66 real 28.80 user 8.79 sys
1387 7904 maximum resident set size
1388 1356 average shared memory size
1389 18566 average unshared data size
1390 229 average unshared stack size
1391 46080 page reclaims
1392 33373 page faults
1393
1394With %seen holding simple scalars:
1395
1396 342.16 real 27.11 user 3.54 sys
1397 8388 maximum resident set size
1398 1394 average shared memory size
1399 14969 average unshared data size
1400 236 average unshared stack size
1401 28159 page reclaims
1402 9839 page faults
1403
1404Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1405how %seen is storing things its seen. So it is pathalogically bad on a 16M
1406RAM machine, but it's going to help even on modern machines.
1407Swapping is bad, m'kay :-)