This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
new perldelta
[perl5.git] / cpan / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c
JH
3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
18cfa0e3 6 require Config; Config->import();
cee96d52 7 pop @INC if $INC[-1] eq '.';
67d7b5ef
JH
8}
9use strict;
b536bf57 10use warnings;
67d7b5ef 11use Getopt::Std;
f49b0b20 12use Config;
67d7b5ef 13my @orig_ARGV = @ARGV;
18cfa0e3 14our $VERSION = do { my @r = (q$Revision: 2.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef
JH
15
16# These may get re-ordered.
17# RAW is a do_now as inserted by &enter
0a225b3c 18# AGG is an aggregated do_now, as built up by &process
a999c27c 19
67d7b5ef
JH
20use constant {
21 RAW_NEXT => 0,
22 RAW_IN_LEN => 1,
23 RAW_OUT_BYTES => 2,
24 RAW_FALLBACK => 3,
25
26 AGG_MIN_IN => 0,
27 AGG_MAX_IN => 1,
28 AGG_OUT_BYTES => 2,
29 AGG_NEXT => 3,
30 AGG_IN_LEN => 4,
31 AGG_OUT_LEN => 5,
32 AGG_FALLBACK => 6,
33};
a999c27c 34
67d7b5ef
JH
35# (See the algorithm in encengine.c - we're building structures for it)
36
37# There are two sorts of structures.
38# "do_now" (an array, two variants of what needs storing) is whatever we need
39# to do now we've read an input byte.
40# It's housed in a "do_next" (which is how we got to it), and in turn points
41# to a "do_next" which contains all the "do_now"s for the next input byte.
42
43# There will be a "do_next" which is the start state.
44# For a single byte encoding it's the only "do_next" - each "do_now" points
45# back to it, and each "do_now" will cause bytes. There is no state.
46
47# For a multi-byte encoding where all characters in the input are the same
48# length, then there will be a tree of "do_now"->"do_next"->"do_now"
49# branching out from the start state, one step for each input byte.
50# The leaf "do_now"s will all be at the same distance from the start state,
51# only the leaf "do_now"s cause output bytes, and they in turn point back to
52# the start state.
53
0a225b3c 54# For an encoding where there are variable length input byte sequences, you
67d7b5ef
JH
55# will encounter a leaf "do_now" sooner for the shorter input sequences, but
56# as before the leaves will point back to the start state.
57
58# The system will cope with escape encodings (imagine them as a mostly
59# self-contained tree for each escape state, and cross links between trees
60# at the state-switching characters) but so far no input format defines these.
61
62# The system will also cope with having output "leaves" in the middle of
63# the bifurcating branches, not just at the extremities, but again no
64# input format does this yet.
65
66# There are two variants of the "do_now" structure. The first, smaller variant
67# is generated by &enter as the input file is read. There is one structure
68# for each input byte. Say we are mapping a single byte encoding to a
69# single byte encoding, with "ABCD" going "abcd". There will be
70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
0a225b3c 72# &process then walks the tree, building aggregate "do_now" structures for
67d7b5ef
JH
73# adjacent bytes where possible. The aggregate is for a contiguous range of
74# bytes which each produce the same length of output, each move to the
75# same next state, and each have the same fallback flag.
76# So our 4 RAW "do_now"s above become replaced by a single structure
77# containing:
78# ["A", "D", "abcd", 1, ...]
79# ie, for an input byte $_ in "A".."D", output 1 byte, found as
80# substr ("abcd", (ord $_ - ord "A") * 1, 1)
81# which maps very nicely into pointer arithmetic in C for encengine.c
82
83sub encode_U
84{
85 # UTF-8 encode long hand - only covers part of perl's range
86 ## my $uv = shift;
87 # chr() works in native space so convert value from table
88 # into that space before using chr().
89 my $ch = chr(utf8::unicode_to_native($_[0]));
90 # Now get core perl to encode that the way it likes.
91 utf8::encode($ch);
92 return $ch;
93}
94
95sub encode_S
96{
97 # encode single byte
98 ## my ($ch,$page) = @_; return chr($ch);
99 return chr $_[0];
100}
101
102sub encode_D
103{
104 # encode double byte MS byte first
105 ## my ($ch,$page) = @_; return chr($page).chr($ch);
106 return chr ($_[1]) . chr $_[0];
107}
108
109sub encode_M
110{
111 # encode Multi-byte - single for 0..255 otherwise double
112 ## my ($ch,$page) = @_;
113 ## return &encode_D if $page;
114 ## return &encode_S;
115 return chr ($_[1]) . chr $_[0] if $_[1];
116 return chr $_[0];
117}
118
119my %encode_types = (U => \&encode_U,
120 S => \&encode_S,
121 D => \&encode_D,
122 M => \&encode_M,
123 );
124
125# Win32 does not expand globs on command line
15f5e486
SH
126if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) {
127 eval "\@ARGV = map(glob(\$_),\@ARGV)";
128 @ARGV = @orig_ARGV unless @ARGV;
129}
67d7b5ef
JH
130
131my %opt;
132# I think these are:
133# -Q to disable the duplicate codepoint test
134# -S make mapping errors fatal
135# -q to remove comments written to output files
136# -O to enable the (brute force) substring optimiser
137# -o <output> to specify the output file name (else it's the first arg)
138# -f <inlist> to give a file with a list of input files (else use the args)
139# -n <name> to name the encoding (else use the basename of the input file.
15f5e486
SH
140#Getopt::Long::Configure("bundling");
141#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v));
127a7155 142getopts('CM:SQqOo:f:n:v',\%opt);
67d7b5ef
JH
143
144$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 145$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
127a7155 146$opt{v} ||= $ENV{ENC2XS_VERBOSE};
b3cc7c67 147$opt{q} ||= $ENV{ENC2XS_NO_COMMENTS};
127a7155
SH
148
149sub verbose {
150 print STDERR @_ if $opt{v};
151}
152sub verbosef {
153 printf STDERR @_ if $opt{v};
154}
67d7b5ef 155
e67e8a4a
DM
156
157# ($cpp, $static, $sized) = compiler_info($declaration)
158#
159# return some information about the compiler and compile options we're using:
160#
161# $declaration - true if we're doing a declaration rather than a definition.
162#
163# $cpp - we're using C++
164# $static - ok to declare the arrays as static
165# $sized - the array declarations should be sized
166
167sub compiler_info {
168 my ($declaration) = @_;
169
170 my $ccflags = $Config{ccflags};
171 if (defined $Config{ccwarnflags}) {
172 $ccflags .= " " . $Config{ccwarnflags};
173 }
174 my $compat = $ccflags =~ /\Q-Wc++-compat/;
175 my $pedantic = $ccflags =~ /-pedantic/;
176
177 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
178
179 # The encpage_t tables contain recursive and mutually recursive
180 # references. To allow them to compile under C++ and some restrictive
181 # cc options, it may be necessary to make the tables non-static/const
182 # (thus moving them from the text to the data segment) and/or not
183 # include the size in the declaration.
184
185 my $static = !(
186 $cpp
187 || ($compat && $pedantic)
188 || ($^O eq 'MacOS' && $declaration)
189 );
190
191 # -Wc++-compat on its own warns if the array declaration is sized.
192 # The easiest way to avoid this warning is simply not to include
193 # the size in the declaration.
194 # With -pedantic as well, the issue doesn't arise because $static
195 # above becomes false.
196 my $sized = $declaration && !($compat && !$pedantic);
197
198 return ($cpp, $static, $sized);
199}
200
201
67d7b5ef
JH
202# This really should go first, else the die here causes empty (non-erroneous)
203# output files to be written.
204my @encfiles;
15f5e486 205if (exists $opt{f}) {
67d7b5ef 206 # -F is followed by name of file containing list of filenames
15f5e486 207 my $flist = $opt{f};
67d7b5ef
JH
208 open(FLIST,$flist) || die "Cannot open $flist:$!";
209 chomp(@encfiles = <FLIST>);
210 close(FLIST);
211} else {
212 @encfiles = @ARGV;
213}
214
15f5e486 215my $cname = $opt{o} ? $opt{o} : shift(@ARGV);
18cfa0e3 216unless ($cname) { # Debugging a Win32 nmake error-only; works via cmdline.
15f5e486
SH
217 print "\nARGV:";
218 print "$_ " for @ARGV;
219 print "\nopt:";
220 print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt;
221}
67d7b5ef 222chmod(0666,$cname) if -f $cname && !-w $cname;
15f5e486 223open(C,">", $cname) || die "Cannot open $cname:$!";
67d7b5ef
JH
224
225my $dname = $cname;
226my $hname = $cname;
227
228my ($doC,$doEnc,$doUcm,$doPet);
229
0e4142c9 230if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
67d7b5ef
JH
231 {
232 $doC = 1;
e7cbefb8 233 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef 234 chmod(0666,$dname) if -f $cname && !-w $dname;
15f5e486 235 open(D,">", $dname) || die "Cannot open $dname:$!";
67d7b5ef
JH
236 $hname =~ s/(\.[^\.]*)?$/.h/;
237 chmod(0666,$hname) if -f $cname && !-w $hname;
15f5e486 238 open(H,">", $hname) || die "Cannot open $hname:$!";
67d7b5ef
JH
239
240 foreach my $fh (\*C,\*D,\*H)
241 {
242 print $fh <<"END" unless $opt{'q'};
243/*
244 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
245 This file was autogenerated by:
246 $^X $0 @orig_ARGV
f49b0b20 247 enc2xs VERSION $VERSION
67d7b5ef
JH
248*/
249END
250 }
251
ab5cf2ea
YO
252 if ($cname =~ /\.c$/i && $Config{ccname} eq "gcc")
253 {
254 print C qq(#pragma GCC diagnostic ignored "-Wc++-compat"\n);
255 }
256
257 if ($cname =~ /\.xs$/i)
67d7b5ef 258 {
c9955564 259 print C "#define PERL_NO_GET_CONTEXT\n";
67d7b5ef
JH
260 print C "#include <EXTERN.h>\n";
261 print C "#include <perl.h>\n";
262 print C "#include <XSUB.h>\n";
67d7b5ef 263 }
b536bf57 264 print C "#include \"encode.h\"\n\n";
67d7b5ef
JH
265
266 }
ab5cf2ea 267elsif ($cname =~ /\.enc$/i)
67d7b5ef
JH
268 {
269 $doEnc = 1;
270 }
ab5cf2ea 271elsif ($cname =~ /\.ucm$/i)
67d7b5ef
JH
272 {
273 $doUcm = 1;
274 }
ab5cf2ea 275elsif ($cname =~ /\.pet$/i)
67d7b5ef
JH
276 {
277 $doPet = 1;
278 }
279
280my %encoding;
281my %strings;
b536bf57
DK
282my $string_acc;
283my %strings_in_acc;
284
67d7b5ef
JH
285my $saved = 0;
286my $subsave = 0;
287my $strings = 0;
288
289sub cmp_name
290{
291 if ($a =~ /^.*-(\d+)/)
292 {
293 my $an = $1;
294 if ($b =~ /^.*-(\d+)/)
295 {
296 my $r = $an <=> $1;
297 return $r if $r;
298 }
299 }
300 return $a cmp $b;
301}
302
303
304foreach my $enc (sort cmp_name @encfiles)
305 {
306 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
307 $name = $opt{'n'} if exists $opt{'n'};
308 if (open(E,$enc))
309 {
310 if ($sfx eq 'enc')
311 {
312 compile_enc(\*E,lc($name));
313 }
314 else
315 {
316 compile_ucm(\*E,lc($name));
317 }
318 }
319 else
320 {
321 warn "Cannot open $enc for $name:$!";
322 }
323 }
324
325if ($doC)
326 {
127a7155 327 verbose "Writing compiled form\n";
67d7b5ef
JH
328 foreach my $name (sort cmp_name keys %encoding)
329 {
330 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
b536bf57
DK
331 process($name.'_utf8',$e2u);
332 addstrings(\*C,$e2u);
333
334 process('utf8_'.$name,$u2e);
335 addstrings(\*C,$u2e);
336 }
337 outbigstring(\*C,"enctable");
338 foreach my $name (sort cmp_name keys %encoding)
339 {
340 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
341 outtable(\*C,$e2u, "enctable");
342 outtable(\*C,$u2e, "enctable");
343
b2704119 344 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 345 }
e67e8a4a 346 my ($cpp) = compiler_info(0);
127a7155
SH
347 my $ext = $cpp ? 'extern "C"' : "extern";
348 my $exta = $cpp ? 'extern "C"' : "static";
349 my $extb = $cpp ? 'extern "C"' : "";
67d7b5ef
JH
350 foreach my $enc (sort cmp_name keys %encoding)
351 {
b2704119
JH
352 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
353 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
354 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
355 my $replen = 0;
356 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
67d7b5ef
JH
357 my $sym = "${enc}_encoding";
358 $sym =~ s/\W+/_/g;
7dd03145 359 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
d1256cb1 360 $min_el,$max_el);
905bf499
RGS
361 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
362 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
363 print C "${extb} const encode_t $sym = \n";
f0a41339
DK
364 # This is to make null encoding work -- dankogai
365 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
d1256cb1 366 $info[$i] ||= 1;
f0a41339
DK
367 }
368 # end of null tweak -- dankogai
7dd03145 369 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
67d7b5ef
JH
370 }
371
372 foreach my $enc (sort cmp_name keys %encoding)
373 {
374 my $sym = "${enc}_encoding";
375 $sym =~ s/\W+/_/g;
127a7155 376 print H "${ext} encode_t $sym;\n";
67d7b5ef
JH
377 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
378 }
379
380 if ($cname =~ /(\w+)\.xs$/)
381 {
382 my $mod = $1;
383 print C <<'END';
384
385static void
386Encode_XSEncoding(pTHX_ encode_t *enc)
387{
388 dSP;
389 HV *stash = gv_stashpv("Encode::XS", TRUE);
c7981a06
SH
390 SV *iv = newSViv(PTR2IV(enc));
391 SV *sv = sv_bless(newRV_noinc(iv),stash);
67d7b5ef 392 int i = 0;
89c2544c
DM
393 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
394 constness, in the hope that perl won't mess with it. */
395 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
c7981a06 396 SvFLAGS(iv) |= SVp_POK;
89c2544c 397 SvPVX(iv) = (char*) enc->name[0];
67d7b5ef
JH
398 PUSHMARK(sp);
399 XPUSHs(sv);
400 while (enc->name[i])
401 {
402 const char *name = enc->name[i++];
403 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
404 }
405 PUTBACK;
406 call_pv("Encode::define_encoding",G_DISCARD);
407 SvREFCNT_dec(sv);
408}
409
410END
411
412 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
413 print C "BOOT:\n{\n";
414 print C "#include \"$dname\"\n";
415 print C "}\n";
416 }
417 # Close in void context is bad, m'kay
418 close(D) or warn "Error closing '$dname': $!";
419 close(H) or warn "Error closing '$hname': $!";
420
cf9f87ce
RGS
421 my $perc_saved = $saved/($strings + $saved) * 100;
422 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
127a7155
SH
423 verbosef "%d bytes in string tables\n",$strings;
424 verbosef "%d bytes (%.3g%%) saved spotting duplicates\n",
67d7b5ef 425 $saved, $perc_saved if $saved;
127a7155 426 verbosef "%d bytes (%.3g%%) saved using substrings\n",
67d7b5ef
JH
427 $subsave, $perc_subsaved if $subsave;
428 }
429elsif ($doEnc)
430 {
431 foreach my $name (sort cmp_name keys %encoding)
432 {
433 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
434 output_enc(\*C,$name,$e2u);
435 }
436 }
437elsif ($doUcm)
438 {
439 foreach my $name (sort cmp_name keys %encoding)
440 {
441 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
442 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
443 }
444 }
445
446# writing half meg files and then not checking to see if you just filled the
447# disk is bad, m'kay
448close(C) or die "Error closing '$cname': $!";
449
450# End of the main program.
451
452sub compile_ucm
453{
454 my ($fh,$name) = @_;
455 my $e2u = {};
456 my $u2e = {};
457 my $cs;
458 my %attr;
459 while (<$fh>)
460 {
461 s/#.*$//;
462 last if /^\s*CHARMAP\s*$/i;
463 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
464 {
465 $attr{$1} = $2;
466 }
467 }
468 if (!defined($cs = $attr{'code_set_name'}))
469 {
470 warn "No <code_set_name> in $name\n";
471 }
472 else
473 {
474 $name = $cs unless exists $opt{'n'};
475 }
476 my $erep;
477 my $urep;
478 my $max_el;
479 my $min_el;
480 if (exists $attr{'subchar'})
481 {
b2704119
JH
482 #my @byte;
483 #$attr{'subchar'} =~ /^\s*/cg;
484 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
485 #$erep = join('',map(chr(hex($_)),@byte));
486 $erep = $attr{'subchar'};
487 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef 488 }
15f5e486
SH
489 print "Reading $name ($cs)\n"
490 unless defined $ENV{MAKEFLAGS}
491 and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/;
67d7b5ef
JH
492 my $nfb = 0;
493 my $hfb = 0;
494 while (<$fh>)
495 {
496 s/#.*$//;
497 last if /^\s*END\s+CHARMAP\s*$/i;
498 next if /^\s*$/;
a999c27c
JH
499 my (@uni, @byte) = ();
500 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
501 or die "Bad line: $_";
502 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
503 push @uni, map { substr($_, 1) } split(/\+/, $1);
504 }
505 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
506 push @byte, $1;
507 }
508 if (@uni)
67d7b5ef 509 {
a999c27c 510 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef
JH
511 my $ech = join('',map(chr(hex($_)),@byte));
512 my $el = length($ech);
513 $max_el = $el if (!defined($max_el) || $el > $max_el);
514 $min_el = $el if (!defined($min_el) || $el < $min_el);
515 if (length($fb))
516 {
517 $fb = substr($fb,1);
518 $hfb++;
519 }
520 else
521 {
522 $nfb++;
523 $fb = '0';
524 }
525 # $fb is fallback flag
526 # 0 - round trip safe
527 # 1 - fallback for unicode -> enc
528 # 2 - skip sub-char mapping
529 # 3 - fallback enc -> unicode
530 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
531 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
532 }
533 else
534 {
535 warn $_;
536 }
537 }
538 if ($nfb && $hfb)
539 {
540 die "$nfb entries without fallback, $hfb entries with\n";
541 }
542 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
543}
544
545
546
547sub compile_enc
548{
549 my ($fh,$name) = @_;
550 my $e2u = {};
551 my $u2e = {};
552
553 my $type;
554 while ($type = <$fh>)
555 {
556 last if $type !~ /^\s*#/;
557 }
558 chomp($type);
559 return if $type eq 'E';
560 # Do the hash lookup once, rather than once per function call. 4% speedup.
561 my $type_func = $encode_types{$type};
562 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
563 warn "$type encoded $name\n";
564 my $rep = '';
565 # Save a defined test by setting these to defined values.
566 my $min_el = ~0; # A very big integer
567 my $max_el = 0; # Anything must be longer than 0
568 {
569 my $v = hex($def);
570 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
571 }
572 my $errors;
573 my $seen;
574 # use -Q to silence the seen test. Makefile.PL uses this by default.
575 $seen = {} unless $opt{Q};
576 do
577 {
578 my $line = <$fh>;
579 chomp($line);
580 my $page = hex($line);
581 my $ch = 0;
582 my $i = 16;
583 do
584 {
585 # So why is it 1% faster to leave the my here?
586 my $line = <$fh>;
587 $line =~ s/\r\n$/\n/;
588 die "$.:${line}Line should be exactly 65 characters long including
589 newline (".length($line).")" unless length ($line) == 65;
590 # Split line into groups of 4 hex digits, convert groups to ints
591 # This takes 65.35
592 # map {hex $_} $line =~ /(....)/g
593 # This takes 63.75 (2.5% less time)
594 # unpack "n*", pack "H*", $line
595 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
596 # Doing it as while ($line =~ /(....)/g) took 74.63
597 foreach my $val (unpack "n*", pack "H*", $line)
598 {
599 next if $val == 0xFFFD;
600 my $ech = &$type_func($ch,$page);
601 if ($val || (!$ch && !$page))
602 {
603 my $el = length($ech);
604 $max_el = $el if $el > $max_el;
605 $min_el = $el if $el < $min_el;
606 my $uch = encode_U($val);
607 if ($seen) {
608 # We're doing the test.
609 # We don't need to read this quickly, so storing it as a scalar,
610 # rather than 3 (anon array, plus the 2 scalars it holds) saves
611 # RAM and may make us faster on low RAM systems. [see __END__]
612 if (exists $seen->{$uch})
613 {
614 warn sprintf("U%04X is %02X%02X and %04X\n",
615 $val,$page,$ch,$seen->{$uch});
616 $errors++;
617 }
618 else
619 {
620 $seen->{$uch} = $page << 8 | $ch;
621 }
622 }
623 # Passing 2 extra args each time is 3.6% slower!
624 # Even with having to add $fallback ||= 0 later
625 enter_fb0($e2u,$ech,$uch);
626 enter_fb0($u2e,$uch,$ech);
627 }
628 else
629 {
630 # No character at this position
631 # enter($e2u,$ech,undef,$e2u);
632 }
633 $ch++;
634 }
635 } while --$i;
636 } while --$pages;
637 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
638 if $min_el > $max_el;
639 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
640 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
641}
642
643# my ($a,$s,$d,$t,$fb) = @_;
644sub enter {
645 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
646 # state we shift to after this (multibyte) input character defaults to same
647 # as current state.
648 $next ||= $current;
649 # Making sure it is defined seems to be faster than {no warnings;} in
0a225b3c 650 # &process, or passing it in as 0 explicitly.
67d7b5ef
JH
651 # XXX $fallback ||= 0;
652
653 # Start at the beginning and work forwards through the string to zero.
654 # effectively we are removing 1 character from the front each time
655 # but we don't actually edit the string. [this alone seems to be 14% speedup]
656 # Hence -$pos is the length of the remaining string.
657 my $pos = -length $inbytes;
658 while (1) {
659 my $byte = substr $inbytes, $pos, 1;
660 # RAW_NEXT => 0,
661 # RAW_IN_LEN => 1,
662 # RAW_OUT_BYTES => 2,
663 # RAW_FALLBACK => 3,
664 # to unicode an array would seem to be better, because the pages are dense.
665 # from unicode can be very sparse, favouring a hash.
666 # hash using the bytes (all length 1) as keys rather than ord value,
667 # as it's easier to sort these in &process.
668
669 # It's faster to always add $fallback even if it's undef, rather than
670 # choosing between 3 and 4 element array. (hence why we set it defined
671 # above)
672 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
673 # When $pos was -1 we were at the last input character.
674 unless (++$pos) {
675 $do_now->[RAW_OUT_BYTES] = $outbytes;
676 $do_now->[RAW_NEXT] = $next;
677 return;
678 }
0a225b3c 679 # Tail recursion. The intermediate state may not have a name yet.
67d7b5ef
JH
680 $current = $do_now->[RAW_NEXT];
681 }
682}
683
0a225b3c 684# This is purely for optimisation. It's just &enter hard coded for $fallback
67d7b5ef
JH
685# of 0, using only a 3 entry array ref to save memory for every entry.
686sub enter_fb0 {
687 my ($current,$inbytes,$outbytes,$next) = @_;
688 $next ||= $current;
689
690 my $pos = -length $inbytes;
691 while (1) {
692 my $byte = substr $inbytes, $pos, 1;
693 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
694 unless (++$pos) {
695 $do_now->[RAW_OUT_BYTES] = $outbytes;
696 $do_now->[RAW_NEXT] = $next;
697 return;
698 }
699 $current = $do_now->[RAW_NEXT];
700 }
701}
702
67d7b5ef
JH
703sub process
704{
705 my ($name,$a) = @_;
706 $name =~ s/\W+/_/g;
707 $a->{Cname} = $name;
708 my $raw = $a->{Raw};
709 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
710 my @ent;
711 $agg_max_in = 0;
712 foreach my $key (sort keys %$raw) {
713 # RAW_NEXT => 0,
714 # RAW_IN_LEN => 1,
715 # RAW_OUT_BYTES => 2,
716 # RAW_FALLBACK => 3,
717 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
718 # Now we are converting from raw to aggregate, switch from 1 byte strings
719 # to numbers
720 my $b = ord $key;
721 $fallback ||= 0;
722 if ($l &&
723 # If this == fails, we're going to reset $agg_max_in below anyway.
724 $b == ++$agg_max_in &&
725 # References in numeric context give the pointer as an int.
726 $agg_next == $next &&
727 $agg_in_len == $in_len &&
728 $agg_out_len == length $out_bytes &&
729 $agg_fallback == $fallback
730 # && length($l->[AGG_OUT_BYTES]) < 16
731 ) {
732 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
733 # we can aggregate this byte onto the end.
734 $l->[AGG_MAX_IN] = $b;
735 $l->[AGG_OUT_BYTES] .= $out_bytes;
736 } else {
737 # AGG_MIN_IN => 0,
738 # AGG_MAX_IN => 1,
739 # AGG_OUT_BYTES => 2,
740 # AGG_NEXT => 3,
741 # AGG_IN_LEN => 4,
742 # AGG_OUT_LEN => 5,
743 # AGG_FALLBACK => 6,
744 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
745 # (only gains .6% on euc-jp -- is it worth it?)
746 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
747 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
748 $agg_fallback = $fallback];
749 }
750 if (exists $next->{Cname}) {
751 $next->{'Forward'} = 1 if $next != $a;
752 } else {
753 process(sprintf("%s_%02x",$name,$b),$next);
754 }
755 }
756 # encengine.c rules say that last entry must be for 255
757 if ($agg_max_in < 255) {
758 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
759 }
760 $a->{'Entries'} = \@ent;
761}
762
b536bf57
DK
763
764sub addstrings
67d7b5ef
JH
765{
766 my ($fh,$a) = @_;
767 my $name = $a->{'Cname'};
768 # String tables
769 foreach my $b (@{$a->{'Entries'}})
770 {
771 next unless $b->[AGG_OUT_LEN];
b536bf57 772 $strings{$b->[AGG_OUT_BYTES]} = undef;
67d7b5ef
JH
773 }
774 if ($a->{'Forward'})
775 {
e67e8a4a 776 my ($cpp, $static, $sized) = compiler_info(1);
e67e8a4a 777 my $count = $sized ? scalar(@{$a->{'Entries'}}) : '';
15f5e486
SH
778 if ($static) {
779 # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline
780 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
781 print $fh "extern encpage_t $name\[$count];\n";
782 print $fh "#else\n";
783 print $fh "static const encpage_t $name\[$count];\n";
784 print $fh "#endif\n";
785 } else {
786 print $fh "extern encpage_t $name\[$count];\n";
787 }
b536bf57
DK
788 }
789 $a->{'DoneStrings'} = 1;
790 foreach my $b (@{$a->{'Entries'}})
791 {
792 my ($s,$e,$out,$t,$end,$l) = @$b;
793 addstrings($fh,$t) unless $t->{'DoneStrings'};
67d7b5ef 794 }
b536bf57
DK
795}
796
797sub outbigstring
798{
799 my ($fh,$name) = @_;
800
801 $string_acc = '';
802
803 # Make the big string in the string accumulator. Longest first, on the hope
804 # that this makes it more likely that we find the short strings later on.
0a225b3c 805 # Not sure if it helps sorting strings of the same length lexically.
b536bf57
DK
806 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
807 my $index = index $string_acc, $s;
808 if ($index >= 0) {
809 $saved += length($s);
810 $strings_in_acc{$s} = $index;
811 } else {
812 OPTIMISER: {
d1256cb1
RGS
813 if ($opt{'O'}) {
814 my $sublength = length $s;
815 while (--$sublength > 0) {
816 # progressively lop characters off the end, to see if the start of
817 # the new string overlaps the end of the accumulator.
818 if (substr ($string_acc, -$sublength)
819 eq substr ($s, 0, $sublength)) {
820 $subsave += $sublength;
821 $strings_in_acc{$s} = length ($string_acc) - $sublength;
822 # append the last bit on the end.
823 $string_acc .= substr ($s, $sublength);
824 last OPTIMISER;
825 }
826 # or if the end of the new string overlaps the start of the
827 # accumulator
828 next unless substr ($string_acc, 0, $sublength)
829 eq substr ($s, -$sublength);
830 # well, the last $sublength characters of the accumulator match.
831 # so as we're prepending to the accumulator, need to shift all our
832 # existing offsets forwards
833 $_ += $sublength foreach values %strings_in_acc;
834 $subsave += $sublength;
835 $strings_in_acc{$s} = 0;
836 # append the first bit on the start.
837 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
838 last OPTIMISER;
839 }
840 }
841 # Optimiser (if it ran) found nothing, so just going have to tack the
842 # whole thing on the end.
843 $strings_in_acc{$s} = length $string_acc;
844 $string_acc .= $s;
b536bf57
DK
845 };
846 }
847 }
848
849 $strings = length $string_acc;
e67e8a4a 850 my ($cpp) = compiler_info(0);
f49b0b20
JH
851 my $var = $cpp ? '' : 'static';
852 my $definition = "\n$var const U8 $name\[$strings] = { " .
b536bf57
DK
853 join(',',unpack "C*",$string_acc);
854 # We have a single long line. Split it at convenient commas.
855 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
856 print $fh substr ($definition, pos $definition), " };\n";
857}
858
859sub findstring {
860 my ($name,$s) = @_;
861 my $offset = $strings_in_acc{$s};
862 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
863 unless defined $offset;
864 "$name + $offset";
865}
866
867sub outtable
868{
869 my ($fh,$a,$bigname) = @_;
870 my $name = $a->{'Cname'};
67d7b5ef
JH
871 $a->{'Done'} = 1;
872 foreach my $b (@{$a->{'Entries'}})
873 {
874 my ($s,$e,$out,$t,$end,$l) = @$b;
b536bf57 875 outtable($fh,$t,$bigname) unless $t->{'Done'};
67d7b5ef 876 }
e67e8a4a 877 my ($cpp, $static) = compiler_info(0);
15f5e486
SH
878 my $count = scalar(@{$a->{'Entries'}});
879 if ($static) {
880 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
881 print $fh "encpage_t $name\[$count] = {\n";
882 print $fh "#else\n";
883 print $fh "static const encpage_t $name\[$count] = {\n";
884 print $fh "#endif\n";
885 } else {
886 print $fh "\nencpage_t $name\[$count] = {\n";
887 }
67d7b5ef
JH
888 foreach my $b (@{$a->{'Entries'}})
889 {
890 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51 891 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef
JH
892 print $fh "{";
893 if ($l)
894 {
b536bf57 895 printf $fh findstring($bigname,$out);
67d7b5ef
JH
896 }
897 else
898 {
899 print $fh "0";
900 }
901 print $fh ",",$t->{Cname};
902 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
903 }
904 print $fh "};\n";
905}
906
67d7b5ef
JH
907sub output_enc
908{
909 my ($fh,$name,$a) = @_;
910 die "Changed - fix me for new structure";
911 foreach my $b (sort keys %$a)
912 {
913 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
914 }
915}
916
917sub decode_U
918{
919 my $s = shift;
920}
921
922my @uname;
8ced1423 923sub char_names{} # cf. https://rt.cpan.org/Ticket/Display.html?id=132471
67d7b5ef
JH
924
925sub output_ucm_page
926{
927 my ($cmap,$a,$t,$pre) = @_;
928 # warn sprintf("Page %x\n",$pre);
929 my $raw = $t->{Raw};
930 foreach my $key (sort keys %$raw) {
931 # RAW_NEXT => 0,
932 # RAW_IN_LEN => 1,
933 # RAW_OUT_BYTES => 2,
934 # RAW_FALLBACK => 3,
935 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
936 my $u = ord $key;
937 $fallback ||= 0;
938
939 if ($next != $a && $next != $t) {
940 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
941 } elsif (length $out_bytes) {
942 if ($pre) {
943 $u = $pre|($u &0x3f);
944 }
945 my $s = sprintf "<U%04X> ",$u;
946 #foreach my $c (split(//,$out_bytes)) {
947 # $s .= sprintf "\\x%02X",ord($c);
948 #}
949 # 9.5% faster changing that loop to this:
950 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
951 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
952 push(@$cmap,$s);
953 } else {
954 warn join(',',$u, @{$raw->{$key}},$a,$t);
955 }
956 }
957}
958
959sub output_ucm
960{
961 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
962 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
963 print $fh "<code_set_name> \"$name\"\n";
964 char_names();
965 if (defined $min_el)
966 {
967 print $fh "<mb_cur_min> $min_el\n";
968 }
969 if (defined $max_el)
970 {
971 print $fh "<mb_cur_max> $max_el\n";
972 }
973 if (defined $rep)
974 {
975 print $fh "<subchar> ";
976 foreach my $c (split(//,$rep))
977 {
978 printf $fh "\\x%02X",ord($c);
979 }
980 print $fh "\n";
981 }
982 my @cmap;
983 output_ucm_page(\@cmap,$h,$h,0);
984 print $fh "#\nCHARMAP\n";
985 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
986 {
987 print $fh $line;
988 }
989 print $fh "END CHARMAP\n";
990}
991
3ef515df
JH
992use vars qw(
993 $_Enc2xs
994 $_Version
995 $_Inc
b2704119 996 $_E2X
3ef515df
JH
997 $_Name
998 $_TableFiles
999 $_Now
1000);
1001
b2704119 1002sub find_e2x{
b536bf57 1003 eval { require File::Find; };
b2704119
JH
1004 my (@inc, %e2x_dir);
1005 for my $inc (@INC){
d1256cb1 1006 push @inc, $inc unless $inc eq '.'; #skip current dir
b2704119
JH
1007 }
1008 File::Find::find(
d1256cb1
RGS
1009 sub {
1010 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
1011 $atime,$mtime,$ctime,$blksize,$blocks)
1012 = lstat($_) or return;
1013 -f _ or return;
1014 if (/^.*\.e2x$/o){
1015 no warnings 'once';
1016 $e2x_dir{$File::Find::dir} ||= $mtime;
1017 }
1018 return;
1019 }, @inc);
b2704119
JH
1020 warn join("\n", keys %e2x_dir), "\n";
1021 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
d1256cb1
RGS
1022 $_E2X = $d;
1023 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
1024 return $_E2X;
b2704119
JH
1025 }
1026}
1027
67d7b5ef
JH
1028sub make_makefile_pl
1029{
3f60a930 1030 eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n";
0a225b3c 1031 # our used for variable expansion
3ef515df
JH
1032 $_Enc2xs = $0;
1033 $_Version = $VERSION;
b2704119 1034 $_E2X = find_e2x();
3ef515df
JH
1035 $_Name = shift;
1036 $_TableFiles = join(",", map {qq('$_')} @_);
1037 $_Now = scalar localtime();
b2704119 1038
aae85ceb 1039 eval { require File::Spec; };
b2704119 1040 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119 1041 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119 1042 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119 1043 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119 1044 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df
JH
1045 exit;
1046}
1047
aae85ceb 1048use vars qw(
d1256cb1
RGS
1049 $_ModLines
1050 $_LocalVer
1051 );
aae85ceb 1052
656ebd29 1053sub make_configlocal_pm {
3f60a930 1054 eval { require Encode } or die "Unable to require Encode: $@\n";
aae85ceb 1055 eval { require File::Spec; };
656ebd29 1056
0a225b3c 1057 # our used for variable expantion
656ebd29
RGS
1058 my %in_core = map { $_ => 1 } (
1059 'ascii', 'iso-8859-1', 'utf8',
1060 'ascii-ctrl', 'null', 'utf-8-strict'
1061 );
aae85ceb 1062 my %LocalMod = ();
656ebd29
RGS
1063 # check @enc;
1064 use File::Find ();
1065 my $wanted = sub{
1066 -f $_ or return;
1067 $File::Find::name =~ /\A\./ and return;
1068 $File::Find::name =~ /\.pm\z/ or return;
1069 $File::Find::name =~ m/\bEncode\b/ or return;
1070 my $mod = $File::Find::name;
1071 $mod =~ s/.*\bEncode\b/Encode/o;
1072 $mod =~ s/\.pm\z//o;
1073 $mod =~ s,/,::,og;
3f60a930 1074 eval qq{ require $mod; } or return;
e8cc621e 1075 warn qq{ require $mod;\n};
656ebd29
RGS
1076 for my $enc ( Encode->encodings() ) {
1077 no warnings;
1078 $in_core{$enc} and next;
1079 $Encode::Config::ExtModule{$enc} and next;
1080 $LocalMod{$enc} ||= $mod;
1081 }
1082 };
1083 File::Find::find({wanted => $wanted}, @INC);
aae85ceb 1084 $_ModLines = "";
656ebd29
RGS
1085 for my $enc ( sort keys %LocalMod ) {
1086 $_ModLines .=
1087 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
aae85ceb 1088 }
e8cc621e 1089 warn $_ModLines if $_ModLines;
aae85ceb 1090 $_LocalVer = _mkversion();
656ebd29
RGS
1091 $_E2X = find_e2x();
1092 $_Inc = $INC{"Encode.pm"};
1093 $_Inc =~ s/\.pm$//o;
1094 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1095 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
aae85ceb
DK
1096 exit;
1097}
1098
1099sub _mkversion{
656ebd29
RGS
1100 # v-string is now depreciated; use time() instead;
1101 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1102 #$yyyy += 1900, $mo +=1;
1103 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1104 return time();
aae85ceb
DK
1105}
1106
3ef515df 1107sub _print_expand{
3f60a930 1108 eval { require File::Basename } or die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
67d7b5ef 1109 File::Basename->import();
621b0f8d
DK
1110 my ($src, $dst, $clobber) = @_;
1111 if (!$clobber and -e $dst){
d1256cb1
RGS
1112 warn "$dst exists. skipping\n";
1113 return;
621b0f8d
DK
1114 }
1115 warn "Generating $dst...\n";
3ef515df
JH
1116 open my $in, $src or die "$src : $!";
1117 if ((my $d = dirname($dst)) ne '.'){
d1256cb1 1118 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
3ef515df 1119 }
15f5e486 1120 open my $out, ">", $dst or die "$!";
3ef515df
JH
1121 my $asis = 0;
1122 while (<$in>){
d1256cb1
RGS
1123 if (/^#### END_OF_HEADER/){
1124 $asis = 1; next;
1125 }
1126 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1127 print $out $_;
67d7b5ef 1128 }
67d7b5ef 1129}
67d7b5ef
JH
1130__END__
1131
1132=head1 NAME
1133
1134enc2xs -- Perl Encode Module Generator
1135
1136=head1 SYNOPSIS
1137
67d7b5ef 1138 enc2xs -[options]
aae85ceb
DK
1139 enc2xs -M ModName mapfiles...
1140 enc2xs -C
67d7b5ef
JH
1141
1142=head1 DESCRIPTION
1143
1144F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e
JH
1145Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1146Besides being used internally during the build process of the Encode
1147module, you can use F<enc2xs> to add your own encoding to perl.
1148No knowledge of XS is necessary.
67d7b5ef
JH
1149
1150=head1 Quick Guide
1151
0ab8f81e 1152If you want to know as little about Perl as possible but need to
67d7b5ef
JH
1153add a new encoding, just read this chapter and forget the rest.
1154
1155=over 4
1156
3cd3edd2 1157=item 0.Z<>
67d7b5ef 1158
0ab8f81e
JH
1159Have a .ucm file ready. You can get it from somewhere or you can write
1160your own from scratch or you can grab one from the Encode distribution
1161and customize it. For the UCM format, see the next Chapter. In the
1162example below, I'll call my theoretical encoding myascii, defined
1163in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef
JH
1164
1165 $ ls -F
1166 my.ucm
1167
3cd3edd2 1168=item 1.Z<>
67d7b5ef
JH
1169
1170Issue a command as follows;
1171
1172 $ enc2xs -M My my.ucm
3ef515df
JH
1173 generating Makefile.PL
1174 generating My.pm
1175 generating README
1176 generating Changes
67d7b5ef
JH
1177
1178Now take a look at your current directory. It should look like this.
1179
1180 $ ls -F
1181 Makefile.PL My.pm my.ucm t/
1182
0ab8f81e 1183The following files were created.
67d7b5ef 1184
0ab8f81e
JH
1185 Makefile.PL - MakeMaker script
1186 My.pm - Encode submodule
1187 t/My.t - test file
1188
1189=over 4
67d7b5ef 1190
3cd3edd2 1191=item 1.1.Z<>
037b88d6
JH
1192
1193If you want *.ucm installed together with the modules, do as follows;
1194
1195 $ mkdir Encode
1196 $ mv *.ucm Encode
1197 $ enc2xs -M My Encode/*ucm
1198
0ab8f81e
JH
1199=back
1200
3cd3edd2 1201=item 2.Z<>
67d7b5ef
JH
1202
1203Edit the files generated. You don't have to if you have no time AND no
1204intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1205the pod and to add more tests.
67d7b5ef 1206
3cd3edd2 1207=item 3.Z<>
67d7b5ef 1208
0ab8f81e 1209Now issue a command all Perl Mongers love:
67d7b5ef 1210
9160fdbd 1211 $ perl Makefile.PL
67d7b5ef
JH
1212 Writing Makefile for Encode::My
1213
3cd3edd2 1214=item 4.Z<>
67d7b5ef
JH
1215
1216Now all you have to do is make.
1217
1218 $ make
1219 cp My.pm blib/lib/Encode/My.pm
1220 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1221 -o encode_t.c -f encode_t.fnm
1222 Reading myascii (myascii)
1223 Writing compiled form
1224 128 bytes in string tables
cf9f87ce
RGS
1225 384 bytes (75%) saved spotting duplicates
1226 1 bytes (0.775%) saved using substrings
67d7b5ef
JH
1227 ....
1228 chmod 644 blib/arch/auto/Encode/My/My.bs
1229 $
1230
0ab8f81e
JH
1231The time it takes varies depending on how fast your machine is and
1232how large your encoding is. Unless you are working on something big
1233like euc-tw, it won't take too long.
67d7b5ef 1234
3cd3edd2 1235=item 5.Z<>
67d7b5ef
JH
1236
1237You can "make install" already but you should test first.
1238
1239 $ make test
1240 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1241 -e 'use Test::Harness qw(&runtests $verbose); \
1242 $verbose=0; runtests @ARGV;' t/*.t
1243 t/My....ok
1244 All tests successful.
1245 Files=1, Tests=2, 0 wallclock secs
1246 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1247
3cd3edd2 1248=item 6.Z<>
67d7b5ef
JH
1249
1250If you are content with the test result, just "make install"
1251
3cd3edd2 1252=item 7.Z<>
aae85ceb 1253
0ab8f81e 1254If you want to add your encoding to Encode's demand-loading list
aae85ceb
DK
1255(so you don't have to "use Encode::YourEncoding"), run
1256
1257 enc2xs -C
1258
1259to update Encode::ConfigLocal, a module that controls local settings.
1260After that, "use Encode;" is enough to load your encodings on demand.
1261
67d7b5ef
JH
1262=back
1263
1264=head1 The Unicode Character Map
1265
0ab8f81e
JH
1266Encode uses the Unicode Character Map (UCM) format for source character
1267mappings. This format is used by IBM's ICU package and was adopted
1268by Nick Ing-Simmons for use with the Encode module. Since UCM is
1269more flexible than Tcl's Encoding Map and far more user-friendly,
863b2ca0 1270this is the recommended format for Encode now.
67d7b5ef 1271
0ab8f81e 1272A UCM file looks like this.
67d7b5ef
JH
1273
1274 #
1275 # Comments
1276 #
1277 <code_set_name> "US-ascii" # Required
1278 <code_set_alias> "ascii" # Optional
1279 <mb_cur_min> 1 # Required; usually 1
1280 <mb_cur_max> 1 # Max. # of bytes/char
1281 <subchar> \x3F # Substitution char
1282 #
1283 CHARMAP
1284 <U0000> \x00 |0 # <control>
1285 <U0001> \x01 |0 # <control>
1286 <U0002> \x02 |0 # <control>
1287 ....
1288 <U007C> \x7C |0 # VERTICAL LINE
1289 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1290 <U007E> \x7E |0 # TILDE
1291 <U007F> \x7F |0 # <control>
1292 END CHARMAP
1293
1294=over 4
1295
1296=item *
1297
0ab8f81e 1298Anything that follows C<#> is treated as a comment.
67d7b5ef
JH
1299
1300=item *
1301
0ab8f81e
JH
1302The header section continues until a line containing the word
1303CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1304pair per line. Strings used as values must be quoted. Barewords are
1305treated as numbers. I<\xXX> represents a byte.
67d7b5ef
JH
1306
1307Most of the keywords are self-explanatory. I<subchar> means
1308substitution character, not subcharacter. When you decode a Unicode
1309sequence to this encoding but no matching character is found, the byte
1310sequence defined here will be used. For most cases, the value here is
0ab8f81e 1311\x3F; in ASCII, this is a question mark.
67d7b5ef
JH
1312
1313=item *
1314
1315CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1316follows:
67d7b5ef
JH
1317
1318 <UXXXX> \xXX.. |0 # comment
1319 ^ ^ ^
1320 | | +- Fallback flag
1321 | +-------- Encoded byte sequence
1322 +-------------- Unicode Character ID in hex
1323
0ab8f81e
JH
1324The format is roughly the same as a header section except for the
1325fallback flag: | followed by 0..3. The meaning of the possible
1326values is as follows:
67d7b5ef 1327
0ab8f81e 1328=over 4
67d7b5ef
JH
1329
1330=item |0
1331
0ab8f81e
JH
1332Round trip safe. A character decoded to Unicode encodes back to the
1333same byte sequence. Most characters have this flag.
67d7b5ef
JH
1334
1335=item |1
1336
1337Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1338character for the encode map only.
67d7b5ef
JH
1339
1340=item |2
1341
1342Skip sub-char mapping should there be no code point.
1343
1344=item |3
1345
1346Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1347character for the decode map only.
67d7b5ef
JH
1348
1349=back
1350
1351=item *
1352
1353And finally, END OF CHARMAP ends the section.
1354
1355=back
1356
6d1c0808 1357When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e
JH
1358or an existing encoding which is close to yours, rather than write
1359your own from scratch.
67d7b5ef
JH
1360
1361When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1362is, unless your environment is EBCDIC.
67d7b5ef
JH
1363
1364B<CAVEAT>: not all features in UCM are implemented. For example,
1365icu:state is not used. Because of that, you need to write a perl
0ab8f81e
JH
1366module if you want to support algorithmical encodings, notably
1367the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef
JH
1368L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1369
6d1c0808
JH
1370=head2 Coping with duplicate mappings
1371
1372When you create a map, you SHOULD make your mappings round-trip safe.
1373That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1374$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1375how to make sure:
6d1c0808 1376
0ab8f81e 1377=over 4
6d1c0808
JH
1378
1379=item *
1380
1381Sort your map in Unicode order.
1382
1383=item *
1384
1385When you have a duplicate entry, mark either one with '|1' or '|3'.
1386
1387=item *
1388
0ab8f81e 1389And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808
JH
1390
1391=back
1392
1393Here is an example from big5-eten.
1394
1395 <U2550> \xF9\xF9 |0
1396 <U2550> \xA2\xA4 |3
1397
1398Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1399this;
1400
1401 E to U U to E
1402 --------------------------------------
1403 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1404 \xA2\xA4 => U2550
1405
1406So it is round-trip safe for \xF9\xF9. But if the line above is upside
1407down, here is what happens.
1408
1409 E to U U to E
1410 --------------------------------------
1411 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1412 (\xF9\xF9 => U2550 is now overwritten!)
1413
1414The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e
JH
1415utility to check the integrity of a UCM file. Check under the
1416Encode/bin directory for this.
cf9f87ce
RGS
1417
1418When in doubt, you can use F<ucmsort>, yet another utility under
1419Encode/bin directory.
6d1c0808 1420
67d7b5ef
JH
1421=head1 Bookmarks
1422
0ab8f81e
JH
1423=over 4
1424
1425=item *
1426
67d7b5ef 1427ICU Home Page
64bc6d54 1428L<http://www.icu-project.org/>
67d7b5ef 1429
0ab8f81e
JH
1430=item *
1431
67d7b5ef 1432ICU Character Mapping Tables
b2deda17 1433L<http://site.icu-project.org/charts/charset>
67d7b5ef 1434
0ab8f81e
JH
1435=item *
1436
67d7b5ef 1437ICU:Conversion Data
64bc6d54 1438L<http://www.icu-project.org/userguide/conversion-data.html>
67d7b5ef 1439
0ab8f81e
JH
1440=back
1441
67d7b5ef
JH
1442=head1 SEE ALSO
1443
1444L<Encode>,
1445L<perlmod>,
1446L<perlpod>
1447
1448=cut
1449
1450# -Q to disable the duplicate codepoint test
1451# -S make mapping errors fatal
1452# -q to remove comments written to output files
1453# -O to enable the (brute force) substring optimiser
1454# -o <output> to specify the output file name (else it's the first arg)
1455# -f <inlist> to give a file with a list of input files (else use the args)
1456# -n <name> to name the encoding (else use the basename of the input file.
1457
1458With %seen holding array refs:
1459
1460 865.66 real 28.80 user 8.79 sys
1461 7904 maximum resident set size
1462 1356 average shared memory size
1463 18566 average unshared data size
1464 229 average unshared stack size
1465 46080 page reclaims
1466 33373 page faults
1467
1468With %seen holding simple scalars:
1469
1470 342.16 real 27.11 user 3.54 sys
1471 8388 maximum resident set size
1472 1394 average shared memory size
1473 14969 average unshared data size
1474 236 average unshared stack size
1475 28159 page reclaims
1476 9839 page faults
1477
1478Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1479how %seen is storing things its seen. So it is pathalogically bad on a 16M
1480RAM machine, but it's going to help even on modern machines.
1481Swapping is bad, m'kay :-)