#!/usr/bin/env perl
#
# word parser/code generator experiement for Lingua::Awkwords. okay for
# things like Toki Pona that use CV or VCV forms, less so for English,
# probably needs to better deconstruct at the syllable level for better
# results there?
#
#   ... | ./lingua-method > makewords
#   # (edit makewords code here as necessary)
#   perl makewords 10
#
# there are various things to TWEAK in this code. the input words will
# likely need to be cleaned up (lower case the input, cleanup or remove
# contractions, etc)

use 5.24.0;
use warnings;

my (%letters, %pattern, @plist, %ptypes, %tally, %gtally);

while (readline) {
    chomp;
    foreach my $word (split ' ') {
        parse_word(
            $word,
            vowels     => qr/[aeiou]+/,
            consonants => qr/[bcdfghjklmnpqrstvwxyz]+/
        );
    }
}

patternize(\%pattern, '', 0);

foreach my $type (keys %ptypes) {
    # TWEAK this here weight as necessary
    $letters{$type} = join '/', letter_weights($type, weight => 5);
}

print <<"HEADER";
#!/usr/bin/env perl
use 5.24.0;
use warnings;
use Lingua::Awkwords::Subpattern;
use Lingua::Awkwords;

Lingua::Awkwords::Subpattern->set_patterns(
HEADER

while (my ($L, $weights) = each %letters) {
    say '    ', $L, ' => Lingua::Awkwords->parse_string(q{ ', $weights, ' }),';
}

local $" = '/';
print <<"FOOTER";
);

# AB VC YZ are the subpatterns for beginning, middle, tail bits
my \$tree = Lingua::Awkwords->new( pattern => q{ @plist } );

foreach (1..(shift // 20)) {
    say \$tree->render;
}

FOOTER

# my excuse is I'm not a linguist
sub deconstruct {
    my ($s, %param) = @_;
    my @form;
  LEX: {
        last LEX if $s =~ m/\G$/cg;    # end of string
        if ($s =~ m/\G ($param{vowels}) /cgx) {
            push @form, [ 'V', $1 ];
            redo LEX;
        }
        if ($s =~ m/\G ($param{consonants}$param{vowels}) /cgx) {
            push @form, [ 'S', $1 ];
            redo LEX;
        }
        if ($s =~ m/\G ($param{consonants}) /cgx) {
            push @form, [ 'C', $1 ];
            redo LEX;
        }
        redo LEX if $s =~ m/\G./cgs;    # dunno, step forward
    }
    return @form;
}

# this allows the formation of new words by allowing "letters" from the
# global tally though with a weighting to more or less favor the form of
# the input
sub letter_weights {
    my ($type, %param) = @_;
    # TWEAK need a mapping here of all specialized head/tail subpatterns
    # back to their original global type (this, desonstruct, and
    # parse_word must all agree)
    my $gt = $type =~ tr/ABYZDX/VCVCSS/r;
    my @choices;
    while (my ($letter, $weight) = each $tally{$type}->%*) {
        push @choices,
          $letter . '*' . ($weight * $param{weight} + $gtally{$gt}{$letter});
    }
    while (my ($letter, $weight) = each $tally{$gt}->%*) {
        next if exists $tally{$type}{$letter};
        push @choices, $letter . '*' . $weight;
    }
    return @choices;
}

# TWEAK you may want a different parsage in which case different code
# will be required
sub parse_word {
    my ($s, %param) = @_;
    my @forms = deconstruct($s, %param);
    return if !@forms;
    foreach my $f (@forms) {
        $gtally{ $f->[0] }{ $f->[1] }++;
    }
    # beginnings and endings are important? well they are here
    $forms[0][0]  =~ tr/VCS/ABD/;
    $forms[-1][0] =~ tr/VCS/YZX/;
    my $p = \%pattern;
    foreach my $f (@forms) {
        $tally{ $f->[0] }{ $f->[1] }++;
        # build the tree used by patternize to figure out all the
        # different large scale word forms (e.g. CVC or VC) (which means
        # this tool only generates words using those same forms, though
        # with the letters there in randomized. edit the generated code
        # to adjust the patterns generated)
        $p->{ $f->[0] }[0]++;
        $p = $p->{ $f->[0] }[1] //= {};
    }
}

# recurse through built pattern tree and populate pattern list
sub patternize {
    my ($p, $str, $sum) = @_;
    if (!$p->%*) {
        push @plist, "$str*$sum" if $sum;
        return;
    }
    while (my ($form, $attr) = each $p->%*) {
        $ptypes{$form} = 1;
        patternize($attr->[1], $str . $form, $sum + $attr->[0]);
    }
}
