head	1.1;
branch	1.1.1;
access;
symbols
	netbsd-11-0-RC4:1.1.1.1
	netbsd-11-0-RC3:1.1.1.1
	netbsd-11-0-RC2:1.1.1.1
	netbsd-11-0-RC1:1.1.1.1
	perseant-exfatfs-base-20250801:1.1.1.1
	netbsd-11:1.1.1.1.0.4
	netbsd-11-base:1.1.1.1
	perseant-exfatfs-base-20240630:1.1.1.1
	perseant-exfatfs:1.1.1.1.0.2
	perseant-exfatfs-base:1.1.1.1
	LESS-643:1.1.1.1
	GREENWOODSOFTWARE:1.1.1;
locks; strict;
comment	@# @;


1.1
date	2023.10.06.04.32.48;	author simonb;	state Exp;
branches
	1.1.1.1;
next	;
commitid	5NbWE4l6mMQXkwHE;

1.1.1.1
date	2023.10.06.04.32.48;	author simonb;	state Exp;
branches;
next	;
commitid	5NbWE4l6mMQXkwHE;


desc
@@



1.1
log
@Initial revision
@
text
@#!/usr/bin/env perl
use strict;

my $USAGE = <<__EOF__;
   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
          -n = take non-matching types
          -f = zero-based type field (default 2)
__EOF__

use Getopt::Std;
use vars qw( $opt_f $opt_n );

# Zero-based index of the ';'-separated input field that holds the type.
# main() may override it from the -f option.
my $type_field = 2;

# Control characters that are expected to appear in normal text files.
# main() reports them as type 'Zs' regardless of the type in the data file.
my %force_space = map { ($_ => 1) } (
    0x08, # backspace
    0x09, # tab
    0x0a, # newline
    0x0c, # form feed
    0x0d, # carriage return
);

# Inclusive [first, last] code point ranges of the Hangul Jamo medial
# vowels and final consonants.  They should be zero width, so main()
# reports them as type 'Mn'.
my @@force_compose = (
    [0x1160, 0x11ff],
    [0xd7b0, 0xd7c6],
    [0xd7cb, 0xd7fb],
);

# Exit status 0 when main() returns true, 1 otherwise.
exit(main() ? 0 : 1);

# Parse the command line, read the Unicode data from the files named
# after '--' (or from standard input), and print one table line for each
# run of consecutive code points whose type was selected.
#
# Command line:  [-n] [-f#] type... [--] [file...]
#   -n   select code points whose type is NOT one of the listed types
#   -f#  zero-based index of the ';'-separated field holding the type
#
# Input lines have the form "code;..." or "lo..hi;...", with '#' starting
# a comment.  A "<..., First>" line followed by a "<..., Last>" line
# describes one range.
#
# Dies with the usage text on a bad option, and with a diagnostic when a
# First/Last pair is malformed.  Returns 1 on success.
sub main {
    # Remember the arguments for the banner before getopts() and the type
    # loop below consume them.
    my $args = join ' ', @@ARGV;
    die $USAGE if not getopts('f:n');
    # Test definedness rather than truth so that an explicit -f0 (the
    # first field) is honoured instead of being silently ignored.
    $type_field = $opt_f if defined $opt_f;

    # Everything up to '--' (or the end of the list) names a wanted type.
    # Looping on definedness keeps a "0" or empty argument from ending
    # the list early.
    my %types;
    while (defined(my $arg = shift @@ARGV)) {
        last if $arg eq '--';
        $types{$arg} = 1;
    }
    my %out = ( 'types' => \%types );

    # Expand the forced-compose ranges into a per-code-point lookup.
    my %force_compose;
    foreach my $comp (@@force_compose) {
        my ($lo, $hi) = @@$comp;
        $force_compose{$_} = 1 for $lo .. $hi;
    }

    my $date = `date`;
    chomp $date;
    print "/* Generated by \"$0 $args\" on $date */\n";

    my $last_code = 0;      # one past the last code point handed to output()
    my $start_range = 0;    # code point of a pending "First" line, else 0
    while (my $line = <>) {
        chomp $line;
        $line =~ s/#.*//;
        my @@fields = split /;/, $line;
        next if not @@fields;

        my ($lo_code, $hi_code);
        my $codes = $fields[0];
        if ($codes =~ /(\w+)\.\.(\w+)/) {
            $lo_code = hex $1;
            $hi_code = hex $2;
        } else {
            $lo_code = $hi_code = hex $codes;
        }

        if ($fields[1] =~ /, First>$/) {
            die "invalid Unicode data: First with range" if $hi_code != $lo_code;
            $start_range = $lo_code;
            next;
        }
        if ($fields[1] =~ /, Last>$/) {
            die "invalid Unicode data: Last without First" if not $start_range;
            # The range runs from the remembered First line to this one.
            $lo_code = $start_range;
            $start_range = 0;
        } elsif ($start_range) {
            die "invalid Unicode data: First without Last";
        }

        my $type = $fields[$type_field];
        $type =~ s/\s//g;
        # $last_code is deliberately not loop-local: its final value is
        # used for the flushing call after the input is exhausted.
        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
            my $forced = $force_space{$last_code}   ? 'Zs'
                       : $force_compose{$last_code} ? 'Mn'
                       :                              $type;
            output(\%out, $last_code, $forced);
        }
    }
    # A final call without a type ends whatever run is still open.
    output(\%out, $last_code);
    return 1;
}

# Feed one code point into the run tracker.
#   $out   hash ref holding the wanted 'types' plus the state of the
#          current run (in_run, run_type, start_code, prev_code)
#   $code  code point being reported
#   $type  its type; may be undefined (main() makes a final call that way)
# A selected code point either extends the current run or closes it and
# opens a new one; an unselected code point closes the current run.
# The global $opt_n inverts the selection.
sub output {
    my ($out, $code, $type) = @@_;
    my $previous = $out->{prev_code};

    my $selected = ($type and $out->{types}{$type});
    $selected = !$selected if $opt_n;

    if ($selected) {
        # The run continues only for the same type on the very next code.
        my $continues = ($out->{in_run}
            and $type eq $out->{run_type}
            and $code == $previous + 1);
        if (!$continues) {
            end_run($out, $previous);
            start_run($out, $code, $type);
        }
    } else {
        end_run($out, $previous);
    }
    $out->{prev_code} = $code;
}

# Open a new run in the tracker hash $out: it starts at $code, has type
# $type, and $code is also recorded as the most recent code point seen.
sub start_run {
    my ($out, $code, $type) = @@_;
    $out->{run_type} = $type;
    $out->{start_code} = $out->{prev_code} = $code;
    $out->{in_run} = 1;
}

# Close the run recorded in the tracker hash $out, if one is open, and
# print it as a "{ first, last }" table line annotated with the run type.
# $code is the last code point belonging to the run.
sub end_run {
    my ($out, $code) = @@_;
    return unless $out->{in_run};
    my ($first, $label) = @@{$out}{qw(start_code run_type)};
    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $first, $code, $label;
    $out->{in_run} = 0;
}
@


1.1.1.1
log
@Import less-643.  Way too many changes and bugfixes over the last ten
years to list.  Look at src/external/bsd/less/dist/version.c for a
complete list of changes.
@
text
@@
