This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Historic version labels
[perl5.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c
JH
3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef
JH
7}
8use strict;
9use Getopt::Std;
10my @orig_ARGV = @ARGV;
2d06ad02 11our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef
JH
12
13# These may get re-ordered.
14# RAW is a do_now as inserted by &enter
15# AGG is an aggreagated do_now, as built up by &process
a999c27c 16
67d7b5ef
JH
17use constant {
18 RAW_NEXT => 0,
19 RAW_IN_LEN => 1,
20 RAW_OUT_BYTES => 2,
21 RAW_FALLBACK => 3,
22
23 AGG_MIN_IN => 0,
24 AGG_MAX_IN => 1,
25 AGG_OUT_BYTES => 2,
26 AGG_NEXT => 3,
27 AGG_IN_LEN => 4,
28 AGG_OUT_LEN => 5,
29 AGG_FALLBACK => 6,
30};
a999c27c 31
67d7b5ef
JH
32# (See the algorithm in encengine.c - we're building structures for it)
33
34# There are two sorts of structures.
35# "do_now" (an array, two variants of what needs storing) is whatever we need
36# to do now we've read an input byte.
37# It's housed in a "do_next" (which is how we got to it), and in turn points
38# to a "do_next" which contains all the "do_now"s for the next input byte.
39
40# There will be a "do_next" which is the start state.
41# For a single byte encoding it's the only "do_next" - each "do_now" points
42# back to it, and each "do_now" will cause bytes. There is no state.
43
44# For a multi-byte encoding where all characters in the input are the same
45# length, then there will be a tree of "do_now"->"do_next"->"do_now"
46# branching out from the start state, one step for each input byte.
47# The leaf "do_now"s will all be at the same distance from the start state,
48# only the leaf "do_now"s cause output bytes, and they in turn point back to
49# the start state.
50
51# For an encoding where there are varaible length input byte sequences, you
52# will encounter a leaf "do_now" sooner for the shorter input sequences, but
53# as before the leaves will point back to the start state.
54
55# The system will cope with escape encodings (imagine them as a mostly
56# self-contained tree for each escape state, and cross links between trees
57# at the state-switching characters) but so far no input format defines these.
58
59# The system will also cope with having output "leaves" in the middle of
60# the bifurcating branches, not just at the extremities, but again no
61# input format does this yet.
62
63# There are two variants of the "do_now" structure. The first, smaller variant
64# is generated by &enter as the input file is read. There is one structure
65# for each input byte. Say we are mapping a single byte encoding to a
66# single byte encoding, with "ABCD" going "abcd". There will be
67# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
68
69# &process then walks the tree, building aggregate "do_now" structres for
70# adjacent bytes where possible. The aggregate is for a contiguous range of
71# bytes which each produce the same length of output, each move to the
72# same next state, and each have the same fallback flag.
73# So our 4 RAW "do_now"s above become replaced by a single structure
74# containing:
75# ["A", "D", "abcd", 1, ...]
76# ie, for an input byte $_ in "A".."D", output 1 byte, found as
77# substr ("abcd", (ord $_ - ord "A") * 1, 1)
78# which maps very nicely into pointer arithmetic in C for encengine.c
79
80sub encode_U
81{
82 # UTF-8 encode long hand - only covers part of perl's range
83 ## my $uv = shift;
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
88 utf8::encode($ch);
89 return $ch;
90}
91
92sub encode_S
93{
94 # encode single byte
95 ## my ($ch,$page) = @_; return chr($ch);
96 return chr $_[0];
97}
98
99sub encode_D
100{
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
104}
105
106sub encode_M
107{
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
111 ## return &encode_S;
112 return chr ($_[1]) . chr $_[0] if $_[1];
113 return chr $_[0];
114}
115
116my %encode_types = (U => \&encode_U,
117 S => \&encode_S,
118 D => \&encode_D,
119 M => \&encode_M,
120 );
121
122# Win32 does not expand globs on command line
123eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
124
125my %opt;
126# I think these are:
127# -Q to disable the duplicate codepoint test
128# -S make mapping errors fatal
129# -q to remove comments written to output files
130# -O to enable the (brute force) substring optimiser
131# -o <output> to specify the output file name (else it's the first arg)
132# -f <inlist> to give a file with a list of input files (else use the args)
133# -n <name> to name the encoding (else use the basename of the input file.
aae85ceb 134getopts('CM:SQqOo:f:n:',\%opt);
67d7b5ef
JH
135
136$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 137$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
67d7b5ef
JH
138
139# This really should go first, else the die here causes empty (non-erroneous)
140# output files to be written.
141my @encfiles;
142if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
147 close(FLIST);
148} else {
149 @encfiles = @ARGV;
150}
151
152my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153chmod(0666,$cname) if -f $cname && !-w $cname;
154open(C,">$cname") || die "Cannot open $cname:$!";
155
156my $dname = $cname;
157my $hname = $cname;
158
159my ($doC,$doEnc,$doUcm,$doPet);
160
161if ($cname =~ /\.(c|xs)$/)
162 {
163 $doC = 1;
e7cbefb8 164 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef
JH
165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
170
171 foreach my $fh (\*C,\*D,\*H)
172 {
173 print $fh <<"END" unless $opt{'q'};
174/*
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
177 $^X $0 @orig_ARGV
178*/
179END
180 }
181
182 if ($cname =~ /(\w+)\.xs$/)
183 {
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
188 }
189 print C "#include \"encode.h\"\n";
190
191 }
192elsif ($cname =~ /\.enc$/)
193 {
194 $doEnc = 1;
195 }
196elsif ($cname =~ /\.ucm$/)
197 {
198 $doUcm = 1;
199 }
200elsif ($cname =~ /\.pet$/)
201 {
202 $doPet = 1;
203 }
204
205my %encoding;
206my %strings;
207my $saved = 0;
208my $subsave = 0;
209my $strings = 0;
210
211sub cmp_name
212{
213 if ($a =~ /^.*-(\d+)/)
214 {
215 my $an = $1;
216 if ($b =~ /^.*-(\d+)/)
217 {
218 my $r = $an <=> $1;
219 return $r if $r;
220 }
221 }
222 return $a cmp $b;
223}
224
225
226foreach my $enc (sort cmp_name @encfiles)
227 {
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
230 if (open(E,$enc))
231 {
232 if ($sfx eq 'enc')
233 {
234 compile_enc(\*E,lc($name));
235 }
236 else
237 {
238 compile_ucm(\*E,lc($name));
239 }
240 }
241 else
242 {
243 warn "Cannot open $enc for $name:$!";
244 }
245 }
246
247if ($doC)
248 {
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
251 {
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
b2704119 255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef
JH
256 }
257 foreach my $enc (sort cmp_name keys %encoding)
258 {
b2704119
JH
259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
262 my $replen = 0;
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
85982a32 264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
67d7b5ef
JH
265 my $sym = "${enc}_encoding";
266 $sym =~ s/\W+/_/g;
267 print C "encode_t $sym = \n";
f0a41339
DK
268 # This is to make null encoding work -- dankogai
269 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
270 $info[$i] ||= 1;
271 }
272 # end of null tweak -- dankogai
67d7b5ef
JH
273 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
274 }
275
276 foreach my $enc (sort cmp_name keys %encoding)
277 {
278 my $sym = "${enc}_encoding";
279 $sym =~ s/\W+/_/g;
280 print H "extern encode_t $sym;\n";
281 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
282 }
283
284 if ($cname =~ /(\w+)\.xs$/)
285 {
286 my $mod = $1;
287 print C <<'END';
288
289static void
290Encode_XSEncoding(pTHX_ encode_t *enc)
291{
292 dSP;
293 HV *stash = gv_stashpv("Encode::XS", TRUE);
294 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
295 int i = 0;
296 PUSHMARK(sp);
297 XPUSHs(sv);
298 while (enc->name[i])
299 {
300 const char *name = enc->name[i++];
301 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
302 }
303 PUTBACK;
304 call_pv("Encode::define_encoding",G_DISCARD);
305 SvREFCNT_dec(sv);
306}
307
308END
309
310 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
311 print C "BOOT:\n{\n";
312 print C "#include \"$dname\"\n";
313 print C "}\n";
314 }
315 # Close in void context is bad, m'kay
316 close(D) or warn "Error closing '$dname': $!";
317 close(H) or warn "Error closing '$hname': $!";
318
319 my $perc_saved = $strings/($strings + $saved) * 100;
320 my $perc_subsaved = $strings/($strings + $subsave) * 100;
321 printf STDERR "%d bytes in string tables\n",$strings;
322 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
323 $saved, $perc_saved if $saved;
324 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
325 $subsave, $perc_subsaved if $subsave;
326 }
327elsif ($doEnc)
328 {
329 foreach my $name (sort cmp_name keys %encoding)
330 {
331 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
332 output_enc(\*C,$name,$e2u);
333 }
334 }
335elsif ($doUcm)
336 {
337 foreach my $name (sort cmp_name keys %encoding)
338 {
339 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
340 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
341 }
342 }
343
344# writing half meg files and then not checking to see if you just filled the
345# disk is bad, m'kay
346close(C) or die "Error closing '$cname': $!";
347
348# End of the main program.
349
350sub compile_ucm
351{
352 my ($fh,$name) = @_;
353 my $e2u = {};
354 my $u2e = {};
355 my $cs;
356 my %attr;
357 while (<$fh>)
358 {
359 s/#.*$//;
360 last if /^\s*CHARMAP\s*$/i;
361 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
362 {
363 $attr{$1} = $2;
364 }
365 }
366 if (!defined($cs = $attr{'code_set_name'}))
367 {
368 warn "No <code_set_name> in $name\n";
369 }
370 else
371 {
372 $name = $cs unless exists $opt{'n'};
373 }
374 my $erep;
375 my $urep;
376 my $max_el;
377 my $min_el;
378 if (exists $attr{'subchar'})
379 {
b2704119
JH
380 #my @byte;
381 #$attr{'subchar'} =~ /^\s*/cg;
382 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
383 #$erep = join('',map(chr(hex($_)),@byte));
384 $erep = $attr{'subchar'};
385 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef
JH
386 }
387 print "Reading $name ($cs)\n";
388 my $nfb = 0;
389 my $hfb = 0;
390 while (<$fh>)
391 {
392 s/#.*$//;
393 last if /^\s*END\s+CHARMAP\s*$/i;
394 next if /^\s*$/;
a999c27c
JH
395 my (@uni, @byte) = ();
396 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
397 or die "Bad line: $_";
398 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
399 push @uni, map { substr($_, 1) } split(/\+/, $1);
400 }
401 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
402 push @byte, $1;
403 }
404 if (@uni)
67d7b5ef 405 {
a999c27c 406 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef
JH
407 my $ech = join('',map(chr(hex($_)),@byte));
408 my $el = length($ech);
409 $max_el = $el if (!defined($max_el) || $el > $max_el);
410 $min_el = $el if (!defined($min_el) || $el < $min_el);
411 if (length($fb))
412 {
413 $fb = substr($fb,1);
414 $hfb++;
415 }
416 else
417 {
418 $nfb++;
419 $fb = '0';
420 }
421 # $fb is fallback flag
422 # 0 - round trip safe
423 # 1 - fallback for unicode -> enc
424 # 2 - skip sub-char mapping
425 # 3 - fallback enc -> unicode
426 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
427 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
428 }
429 else
430 {
431 warn $_;
432 }
433 }
434 if ($nfb && $hfb)
435 {
436 die "$nfb entries without fallback, $hfb entries with\n";
437 }
438 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
439}
440
441
442
443sub compile_enc
444{
445 my ($fh,$name) = @_;
446 my $e2u = {};
447 my $u2e = {};
448
449 my $type;
450 while ($type = <$fh>)
451 {
452 last if $type !~ /^\s*#/;
453 }
454 chomp($type);
455 return if $type eq 'E';
456 # Do the hash lookup once, rather than once per function call. 4% speedup.
457 my $type_func = $encode_types{$type};
458 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
459 warn "$type encoded $name\n";
460 my $rep = '';
461 # Save a defined test by setting these to defined values.
462 my $min_el = ~0; # A very big integer
463 my $max_el = 0; # Anything must be longer than 0
464 {
465 my $v = hex($def);
466 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
467 }
468 my $errors;
469 my $seen;
470 # use -Q to silence the seen test. Makefile.PL uses this by default.
471 $seen = {} unless $opt{Q};
472 do
473 {
474 my $line = <$fh>;
475 chomp($line);
476 my $page = hex($line);
477 my $ch = 0;
478 my $i = 16;
479 do
480 {
481 # So why is it 1% faster to leave the my here?
482 my $line = <$fh>;
483 $line =~ s/\r\n$/\n/;
484 die "$.:${line}Line should be exactly 65 characters long including
485 newline (".length($line).")" unless length ($line) == 65;
486 # Split line into groups of 4 hex digits, convert groups to ints
487 # This takes 65.35
488 # map {hex $_} $line =~ /(....)/g
489 # This takes 63.75 (2.5% less time)
490 # unpack "n*", pack "H*", $line
491 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
492 # Doing it as while ($line =~ /(....)/g) took 74.63
493 foreach my $val (unpack "n*", pack "H*", $line)
494 {
495 next if $val == 0xFFFD;
496 my $ech = &$type_func($ch,$page);
497 if ($val || (!$ch && !$page))
498 {
499 my $el = length($ech);
500 $max_el = $el if $el > $max_el;
501 $min_el = $el if $el < $min_el;
502 my $uch = encode_U($val);
503 if ($seen) {
504 # We're doing the test.
505 # We don't need to read this quickly, so storing it as a scalar,
506 # rather than 3 (anon array, plus the 2 scalars it holds) saves
507 # RAM and may make us faster on low RAM systems. [see __END__]
508 if (exists $seen->{$uch})
509 {
510 warn sprintf("U%04X is %02X%02X and %04X\n",
511 $val,$page,$ch,$seen->{$uch});
512 $errors++;
513 }
514 else
515 {
516 $seen->{$uch} = $page << 8 | $ch;
517 }
518 }
519 # Passing 2 extra args each time is 3.6% slower!
520 # Even with having to add $fallback ||= 0 later
521 enter_fb0($e2u,$ech,$uch);
522 enter_fb0($u2e,$uch,$ech);
523 }
524 else
525 {
526 # No character at this position
527 # enter($e2u,$ech,undef,$e2u);
528 }
529 $ch++;
530 }
531 } while --$i;
532 } while --$pages;
533 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
534 if $min_el > $max_el;
535 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
536 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
537}
538
539# my ($a,$s,$d,$t,$fb) = @_;
540sub enter {
541 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
542 # state we shift to after this (multibyte) input character defaults to same
543 # as current state.
544 $next ||= $current;
545 # Making sure it is defined seems to be faster than {no warnings;} in
546 # &process, or passing it in as 0 explicity.
547 # XXX $fallback ||= 0;
548
549 # Start at the beginning and work forwards through the string to zero.
550 # effectively we are removing 1 character from the front each time
551 # but we don't actually edit the string. [this alone seems to be 14% speedup]
552 # Hence -$pos is the length of the remaining string.
553 my $pos = -length $inbytes;
554 while (1) {
555 my $byte = substr $inbytes, $pos, 1;
556 # RAW_NEXT => 0,
557 # RAW_IN_LEN => 1,
558 # RAW_OUT_BYTES => 2,
559 # RAW_FALLBACK => 3,
560 # to unicode an array would seem to be better, because the pages are dense.
561 # from unicode can be very sparse, favouring a hash.
562 # hash using the bytes (all length 1) as keys rather than ord value,
563 # as it's easier to sort these in &process.
564
565 # It's faster to always add $fallback even if it's undef, rather than
566 # choosing between 3 and 4 element array. (hence why we set it defined
567 # above)
568 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
569 # When $pos was -1 we were at the last input character.
570 unless (++$pos) {
571 $do_now->[RAW_OUT_BYTES] = $outbytes;
572 $do_now->[RAW_NEXT] = $next;
573 return;
574 }
575 # Tail recursion. The intermdiate state may not have a name yet.
576 $current = $do_now->[RAW_NEXT];
577 }
578}
579
580# This is purely for optimistation. It's just &enter hard coded for $fallback
581# of 0, using only a 3 entry array ref to save memory for every entry.
582sub enter_fb0 {
583 my ($current,$inbytes,$outbytes,$next) = @_;
584 $next ||= $current;
585
586 my $pos = -length $inbytes;
587 while (1) {
588 my $byte = substr $inbytes, $pos, 1;
589 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
590 unless (++$pos) {
591 $do_now->[RAW_OUT_BYTES] = $outbytes;
592 $do_now->[RAW_NEXT] = $next;
593 return;
594 }
595 $current = $do_now->[RAW_NEXT];
596 }
597}
598
599
600sub outstring
601{
602 my ($fh,$name,$s) = @_;
603 my $sym = $strings{$s};
604 if ($sym)
605 {
606 $saved += length($s);
607 }
608 else
609 {
610 if ($opt{'O'}) {
611 foreach my $o (keys %strings)
612 {
613 next unless (my $i = index($o,$s)) >= 0;
614 $sym = $strings{$o};
615 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
616 # a hexadecimal floating point constant. Silly gcc. Only p
617 # introduces a floating point constant. Put the space in to stop it
618 # getting confused.
619 $sym .= sprintf(" +0x%02x",$i) if ($i);
620 $subsave += length($s);
621 return $strings{$s} = $sym;
622 }
623 }
624 $strings{$s} = $sym = $name;
625 $strings += length($s);
626 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
627 # Maybe we should assert that these are all <256.
628 $definition .= join(',',unpack "C*",$s);
629 # We have a single long line. Split it at convenient commas.
630 $definition =~ s/(.{74,77},)/$1\n/g;
631 print $fh "$definition };\n\n";
632 }
633 return $sym;
634}
635
636sub process
637{
638 my ($name,$a) = @_;
639 $name =~ s/\W+/_/g;
640 $a->{Cname} = $name;
641 my $raw = $a->{Raw};
642 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
643 my @ent;
644 $agg_max_in = 0;
645 foreach my $key (sort keys %$raw) {
646 # RAW_NEXT => 0,
647 # RAW_IN_LEN => 1,
648 # RAW_OUT_BYTES => 2,
649 # RAW_FALLBACK => 3,
650 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
651 # Now we are converting from raw to aggregate, switch from 1 byte strings
652 # to numbers
653 my $b = ord $key;
654 $fallback ||= 0;
655 if ($l &&
656 # If this == fails, we're going to reset $agg_max_in below anyway.
657 $b == ++$agg_max_in &&
658 # References in numeric context give the pointer as an int.
659 $agg_next == $next &&
660 $agg_in_len == $in_len &&
661 $agg_out_len == length $out_bytes &&
662 $agg_fallback == $fallback
663 # && length($l->[AGG_OUT_BYTES]) < 16
664 ) {
665 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
666 # we can aggregate this byte onto the end.
667 $l->[AGG_MAX_IN] = $b;
668 $l->[AGG_OUT_BYTES] .= $out_bytes;
669 } else {
670 # AGG_MIN_IN => 0,
671 # AGG_MAX_IN => 1,
672 # AGG_OUT_BYTES => 2,
673 # AGG_NEXT => 3,
674 # AGG_IN_LEN => 4,
675 # AGG_OUT_LEN => 5,
676 # AGG_FALLBACK => 6,
677 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
678 # (only gains .6% on euc-jp -- is it worth it?)
679 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
680 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
681 $agg_fallback = $fallback];
682 }
683 if (exists $next->{Cname}) {
684 $next->{'Forward'} = 1 if $next != $a;
685 } else {
686 process(sprintf("%s_%02x",$name,$b),$next);
687 }
688 }
689 # encengine.c rules say that last entry must be for 255
690 if ($agg_max_in < 255) {
691 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
692 }
693 $a->{'Entries'} = \@ent;
694}
695
696sub outtable
697{
698 my ($fh,$a) = @_;
699 my $name = $a->{'Cname'};
700 # String tables
701 foreach my $b (@{$a->{'Entries'}})
702 {
703 next unless $b->[AGG_OUT_LEN];
704 my $s = $b->[AGG_MIN_IN];
705 my $e = $b->[AGG_MAX_IN];
706 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
707 }
708 if ($a->{'Forward'})
709 {
f0a41339
DK
710 my $var = $^O eq 'MacOS' ? 'extern' : 'static';
711 print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
67d7b5ef
JH
712 }
713 $a->{'Done'} = 1;
714 foreach my $b (@{$a->{'Entries'}})
715 {
716 my ($s,$e,$out,$t,$end,$l) = @$b;
717 outtable($fh,$t) unless $t->{'Done'};
718 }
719 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
720 foreach my $b (@{$a->{'Entries'}})
721 {
722 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51 723 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef
JH
724 print $fh "{";
725 if ($l)
726 {
727 printf $fh outstring($fh,'',$out);
728 }
729 else
730 {
731 print $fh "0";
732 }
733 print $fh ",",$t->{Cname};
734 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
735 }
736 print $fh "};\n";
737}
738
739sub output
740{
741 my ($fh,$name,$a) = @_;
742 process($name,$a);
743 # Sub-tables
744 outtable($fh,$a);
745}
746
747sub output_enc
748{
749 my ($fh,$name,$a) = @_;
750 die "Changed - fix me for new structure";
751 foreach my $b (sort keys %$a)
752 {
753 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
754 }
755}
756
757sub decode_U
758{
759 my $s = shift;
760}
761
762my @uname;
763sub char_names
764{
765 my $s = do "unicore/Name.pl";
766 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
767 pos($s) = 0;
768 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
769 {
770 my $name = $3;
771 my $s = hex($1);
772 last if $s >= 0x10000;
773 my $e = length($2) ? hex($2) : $s;
774 for (my $i = $s; $i <= $e; $i++)
775 {
776 $uname[$i] = $name;
777# print sprintf("U%04X $name\n",$i);
778 }
779 }
780}
781
782sub output_ucm_page
783{
784 my ($cmap,$a,$t,$pre) = @_;
785 # warn sprintf("Page %x\n",$pre);
786 my $raw = $t->{Raw};
787 foreach my $key (sort keys %$raw) {
788 # RAW_NEXT => 0,
789 # RAW_IN_LEN => 1,
790 # RAW_OUT_BYTES => 2,
791 # RAW_FALLBACK => 3,
792 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
793 my $u = ord $key;
794 $fallback ||= 0;
795
796 if ($next != $a && $next != $t) {
797 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
798 } elsif (length $out_bytes) {
799 if ($pre) {
800 $u = $pre|($u &0x3f);
801 }
802 my $s = sprintf "<U%04X> ",$u;
803 #foreach my $c (split(//,$out_bytes)) {
804 # $s .= sprintf "\\x%02X",ord($c);
805 #}
806 # 9.5% faster changing that loop to this:
807 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
808 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
809 push(@$cmap,$s);
810 } else {
811 warn join(',',$u, @{$raw->{$key}},$a,$t);
812 }
813 }
814}
815
816sub output_ucm
817{
818 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
819 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
820 print $fh "<code_set_name> \"$name\"\n";
821 char_names();
822 if (defined $min_el)
823 {
824 print $fh "<mb_cur_min> $min_el\n";
825 }
826 if (defined $max_el)
827 {
828 print $fh "<mb_cur_max> $max_el\n";
829 }
830 if (defined $rep)
831 {
832 print $fh "<subchar> ";
833 foreach my $c (split(//,$rep))
834 {
835 printf $fh "\\x%02X",ord($c);
836 }
837 print $fh "\n";
838 }
839 my @cmap;
840 output_ucm_page(\@cmap,$h,$h,0);
841 print $fh "#\nCHARMAP\n";
842 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
843 {
844 print $fh $line;
845 }
846 print $fh "END CHARMAP\n";
847}
848
3ef515df
JH
849use vars qw(
850 $_Enc2xs
851 $_Version
852 $_Inc
b2704119 853 $_E2X
3ef515df
JH
854 $_Name
855 $_TableFiles
856 $_Now
857);
858
b2704119
JH
859sub find_e2x{
860 eval { require File::Find };
861 my (@inc, %e2x_dir);
862 for my $inc (@INC){
863 push @inc, $inc unless $inc eq '.'; #skip current dir
864 }
865 File::Find::find(
866 sub {
867 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
868 $atime,$mtime,$ctime,$blksize,$blocks)
869 = lstat($_) or return;
870 -f _ or return;
871 if (/^.*\.e2x$/o){
872 $e2x_dir{$File::Find::dir} ||= $mtime;
873 }
874 return;
875 }, @inc);
876 warn join("\n", keys %e2x_dir), "\n";
877 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
878 $_E2X = $d;
879 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
880 return $_E2X;
881 }
882}
883
67d7b5ef
JH
884sub make_makefile_pl
885{
886 eval { require Encode; };
887 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df
JH
888 # our used for variable expanstion
889 $_Enc2xs = $0;
890 $_Version = $VERSION;
b2704119 891 $_E2X = find_e2x();
3ef515df
JH
892 $_Name = shift;
893 $_TableFiles = join(",", map {qq('$_')} @_);
894 $_Now = scalar localtime();
b2704119 895
aae85ceb 896 eval { require File::Spec; };
b2704119 897 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119 898 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119 899 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119 900 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119 901 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df
JH
902 exit;
903}
904
aae85ceb
DK
905use vars qw(
906 $_ModLines
907 $_LocalVer
908 );
909
910sub make_configlocal_pm
911{
912 eval { require Encode; };
913 $@ and die "Unable to require Encode: $@\n";
914 eval { require File::Spec; };
915 # our used for variable expanstion
916 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
917 my %LocalMod = ();
918 for my $d (@INC){
919 my $inc = File::Spec->catfile($d, "Encode");
920 -d $inc or next;
921 opendir my $dh, $inc or die "$inc:$!";
922 warn "Checking $inc...\n";
923 for my $f (grep /\.pm$/o, readdir($dh)){
924 -f File::Spec->catfile($inc, "$f") or next;
925 $INC{"Encode/$f"} and next;
926 warn "require Encode/$f;\n";
927 eval { require "Encode/$f"; };
928 $@ and die "Can't require Encode/$f: $@\n";
929 for my $enc (Encode->encodings()){
930 $in_core{$enc} and next;
931 $Encode::Config::ExtModule{$enc} and next;
932 my $mod = "Encode/$f";
933 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
b2704119 934 $LocalMod{$enc} ||= $mod;
aae85ceb
DK
935 }
936 }
937 }
938 $_ModLines = "";
939 for my $enc (sort keys %LocalMod){
940 $_ModLines .=
941 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
942 }
b2704119 943 warn $_ModLines;
aae85ceb 944 $_LocalVer = _mkversion();
b2704119 945 $_E2X = find_e2x();
aae85ceb 946 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
b2704119 947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
621b0f8d
DK
948 File::Spec->catfile($_Inc,"ConfigLocal.pm"),
949 1);
aae85ceb
DK
950 exit;
951}
952
953sub _mkversion{
954 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
955 $yyyy += 1900, $mo +=1;
956 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
957}
958
3ef515df 959sub _print_expand{
67d7b5ef
JH
960 eval { require File::Basename; };
961 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
962 File::Basename->import();
621b0f8d
DK
963 my ($src, $dst, $clobber) = @_;
964 if (!$clobber and -e $dst){
965 warn "$dst exists. skipping\n";
966 return;
967 }
968 warn "Generating $dst...\n";
3ef515df
JH
969 open my $in, $src or die "$src : $!";
970 if ((my $d = dirname($dst)) ne '.'){
971 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
972 }
973 open my $out, ">$dst" or die "$!";
974 my $asis = 0;
975 while (<$in>){
976 if (/^#### END_OF_HEADER/){
977 $asis = 1; next;
978 }
979 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
980 print $out $_;
67d7b5ef 981 }
67d7b5ef 982}
67d7b5ef
JH
983__END__
984
985=head1 NAME
986
987enc2xs -- Perl Encode Module Generator
988
989=head1 SYNOPSIS
990
67d7b5ef 991 enc2xs -[options]
aae85ceb
DK
992 enc2xs -M ModName mapfiles...
993 enc2xs -C
67d7b5ef
JH
994
995=head1 DESCRIPTION
996
997F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e
JH
998Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
999Besides being used internally during the build process of the Encode
1000module, you can use F<enc2xs> to add your own encoding to perl.
1001No knowledge of XS is necessary.
67d7b5ef
JH
1002
1003=head1 Quick Guide
1004
0ab8f81e 1005If you want to know as little about Perl as possible but need to
67d7b5ef
JH
1006add a new encoding, just read this chapter and forget the rest.
1007
1008=over 4
1009
1010=item 0.
1011
0ab8f81e
JH
1012Have a .ucm file ready. You can get it from somewhere or you can write
1013your own from scratch or you can grab one from the Encode distribution
1014and customize it. For the UCM format, see the next Chapter. In the
1015example below, I'll call my theoretical encoding myascii, defined
1016in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef
JH
1017
1018 $ ls -F
1019 my.ucm
1020
1021=item 1.
1022
1023Issue a command as follows;
1024
1025 $ enc2xs -M My my.ucm
3ef515df
JH
1026 generating Makefile.PL
1027 generating My.pm
1028 generating README
1029 generating Changes
67d7b5ef
JH
1030
1031Now take a look at your current directory. It should look like this.
1032
1033 $ ls -F
1034 Makefile.PL My.pm my.ucm t/
1035
0ab8f81e 1036The following files were created.
67d7b5ef 1037
0ab8f81e
JH
1038 Makefile.PL - MakeMaker script
1039 My.pm - Encode submodule
1040 t/My.t - test file
1041
1042=over 4
67d7b5ef 1043
037b88d6
JH
1044=item 1.1.
1045
1046If you want *.ucm installed together with the modules, do as follows;
1047
1048 $ mkdir Encode
1049 $ mv *.ucm Encode
1050 $ enc2xs -M My Encode/*ucm
1051
0ab8f81e
JH
1052=back
1053
67d7b5ef
JH
1054=item 2.
1055
1056Edit the files generated. You don't have to if you have no time AND no
1057intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1058the pod and to add more tests.
67d7b5ef
JH
1059
1060=item 3.
1061
0ab8f81e 1062Now issue a command all Perl Mongers love:
67d7b5ef 1063
9160fdbd 1064 $ perl Makefile.PL
67d7b5ef
JH
1065 Writing Makefile for Encode::My
1066
1067=item 4.
1068
1069Now all you have to do is make.
1070
1071 $ make
1072 cp My.pm blib/lib/Encode/My.pm
1073 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1074 -o encode_t.c -f encode_t.fnm
1075 Reading myascii (myascii)
1076 Writing compiled form
1077 128 bytes in string tables
1078 384 bytes (25%) saved spotting duplicates
1079 1 bytes (99.2%) saved using substrings
1080 ....
1081 chmod 644 blib/arch/auto/Encode/My/My.bs
1082 $
1083
0ab8f81e
JH
1084The time it takes varies depending on how fast your machine is and
1085how large your encoding is. Unless you are working on something big
1086like euc-tw, it won't take too long.
67d7b5ef
JH
1087
1088=item 5.
1089
1090You can "make install" already but you should test first.
1091
1092 $ make test
1093 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1094 -e 'use Test::Harness qw(&runtests $verbose); \
1095 $verbose=0; runtests @ARGV;' t/*.t
1096 t/My....ok
1097 All tests successful.
1098 Files=1, Tests=2, 0 wallclock secs
1099 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1100
1101=item 6.
1102
1103If you are content with the test result, just "make install"
1104
aae85ceb
DK
1105=item 7.
1106
0ab8f81e 1107If you want to add your encoding to Encode's demand-loading list
aae85ceb
DK
1108(so you don't have to "use Encode::YourEncoding"), run
1109
1110 enc2xs -C
1111
1112to update Encode::ConfigLocal, a module that controls local settings.
1113After that, "use Encode;" is enough to load your encodings on demand.
1114
67d7b5ef
JH
1115=back
1116
1117=head1 The Unicode Character Map
1118
0ab8f81e
JH
1119Encode uses the Unicode Character Map (UCM) format for source character
1120mappings. This format is used by IBM's ICU package and was adopted
1121by Nick Ing-Simmons for use with the Encode module. Since UCM is
1122more flexible than Tcl's Encoding Map and far more user-friendly,
1123this is the recommended formet for Encode now.
67d7b5ef 1124
0ab8f81e 1125A UCM file looks like this.
67d7b5ef
JH
1126
1127 #
1128 # Comments
1129 #
1130 <code_set_name> "US-ascii" # Required
1131 <code_set_alias> "ascii" # Optional
1132 <mb_cur_min> 1 # Required; usually 1
1133 <mb_cur_max> 1 # Max. # of bytes/char
1134 <subchar> \x3F # Substitution char
1135 #
1136 CHARMAP
1137 <U0000> \x00 |0 # <control>
1138 <U0001> \x01 |0 # <control>
1139 <U0002> \x02 |0 # <control>
1140 ....
1141 <U007C> \x7C |0 # VERTICAL LINE
1142 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1143 <U007E> \x7E |0 # TILDE
1144 <U007F> \x7F |0 # <control>
1145 END CHARMAP
1146
1147=over 4
1148
1149=item *
1150
0ab8f81e 1151Anything that follows C<#> is treated as a comment.
67d7b5ef
JH
1152
1153=item *
1154
0ab8f81e
JH
1155The header section continues until a line containing the word
1156CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1157pair per line. Strings used as values must be quoted. Barewords are
1158treated as numbers. I<\xXX> represents a byte.
67d7b5ef
JH
1159
1160Most of the keywords are self-explanatory. I<subchar> means
1161substitution character, not subcharacter. When you decode a Unicode
1162sequence to this encoding but no matching character is found, the byte
1163sequence defined here will be used. For most cases, the value here is
0ab8f81e 1164\x3F; in ASCII, this is a question mark.
67d7b5ef
JH
1165
1166=item *
1167
1168CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1169follows:
67d7b5ef
JH
1170
1171 <UXXXX> \xXX.. |0 # comment
1172 ^ ^ ^
1173 | | +- Fallback flag
1174 | +-------- Encoded byte sequence
1175 +-------------- Unicode Character ID in hex
1176
0ab8f81e
JH
1177The format is roughly the same as a header section except for the
1178fallback flag: | followed by 0..3. The meaning of the possible
1179values is as follows:
67d7b5ef 1180
0ab8f81e 1181=over 4
67d7b5ef
JH
1182
1183=item |0
1184
0ab8f81e
JH
1185Round trip safe. A character decoded to Unicode encodes back to the
1186same byte sequence. Most characters have this flag.
67d7b5ef
JH
1187
1188=item |1
1189
1190Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1191character for the encode map only.
67d7b5ef
JH
1192
1193=item |2
1194
1195Skip sub-char mapping should there be no code point.
1196
1197=item |3
1198
1199Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1200character for the decode map only.
67d7b5ef
JH
1201
1202=back
1203
1204=item *
1205
1206And finally, END OF CHARMAP ends the section.
1207
1208=back
1209
6d1c0808 1210When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e
JH
1211or an existing encoding which is close to yours, rather than write
1212your own from scratch.
67d7b5ef
JH
1213
1214When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1215is, unless your environment is EBCDIC.
67d7b5ef
JH
1216
1217B<CAVEAT>: not all features in UCM are implemented. For example,
1218icu:state is not used. Because of that, you need to write a perl
0ab8f81e
JH
1219module if you want to support algorithmical encodings, notably
1220the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef
JH
1221L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1222
6d1c0808
JH
1223=head2 Coping with duplicate mappings
1224
1225When you create a map, you SHOULD make your mappings round-trip safe.
1226That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1227$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1228how to make sure:
6d1c0808 1229
0ab8f81e 1230=over 4
6d1c0808
JH
1231
1232=item *
1233
1234Sort your map in Unicode order.
1235
1236=item *
1237
1238When you have a duplicate entry, mark either one with '|1' or '|3'.
1239
1240=item *
1241
0ab8f81e 1242And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808
JH
1243
1244=back
1245
1246Here is an example from big5-eten.
1247
1248 <U2550> \xF9\xF9 |0
1249 <U2550> \xA2\xA4 |3
1250
1251Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1252this;
1253
1254 E to U U to E
1255 --------------------------------------
1256 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1257 \xA2\xA4 => U2550
1258
1259So it is round-trip safe for \xF9\xF9. But if the line above is upside
1260down, here is what happens.
1261
1262 E to U U to E
1263 --------------------------------------
1264 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1265 (\xF9\xF9 => U2550 is now overwritten!)
1266
1267The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e
JH
1268utility to check the integrity of a UCM file. Check under the
1269Encode/bin directory for this.
6d1c0808
JH
1270
1271
67d7b5ef
JH
1272=head1 Bookmarks
1273
0ab8f81e
JH
1274=over 4
1275
1276=item *
1277
67d7b5ef
JH
1278ICU Home Page
1279L<http://oss.software.ibm.com/icu/>
1280
0ab8f81e
JH
1281=item *
1282
67d7b5ef
JH
1283ICU Character Mapping Tables
1284L<http://oss.software.ibm.com/icu/charset/>
1285
0ab8f81e
JH
1286=item *
1287
67d7b5ef
JH
1288ICU:Conversion Data
1289L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1290
0ab8f81e
JH
1291=back
1292
67d7b5ef
JH
1293=head1 SEE ALSO
1294
1295L<Encode>,
1296L<perlmod>,
1297L<perlpod>
1298
1299=cut
1300
1301# -Q to disable the duplicate codepoint test
1302# -S make mapping errors fatal
1303# -q to remove comments written to output files
1304# -O to enable the (brute force) substring optimiser
1305# -o <output> to specify the output file name (else it's the first arg)
1306# -f <inlist> to give a file with a list of input files (else use the args)
1307# -n <name> to name the encoding (else use the basename of the input file.
1308
1309With %seen holding array refs:
1310
1311 865.66 real 28.80 user 8.79 sys
1312 7904 maximum resident set size
1313 1356 average shared memory size
1314 18566 average unshared data size
1315 229 average unshared stack size
1316 46080 page reclaims
1317 33373 page faults
1318
1319With %seen holding simple scalars:
1320
1321 342.16 real 27.11 user 3.54 sys
1322 8388 maximum resident set size
1323 1394 average shared memory size
1324 14969 average unshared data size
1325 236 average unshared stack size
1326 28159 page reclaims
1327 9839 page faults
1328
1329Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1330how %seen is storing things its seen. So it is pathalogically bad on a 16M
1331RAM machine, but it's going to help even on modern machines.
1332Swapping is bad, m'kay :-)