#!/usr/bin/perl -w
# massagevendor.pl: v0.03 2002/03/07 KELEMEN Peter <fuji@debian.org>
# This program is part of the arpwatch Debian package.

# Compile Ethernet vendor code listings from different sources into internal
# format used by arpwatch(8).

# Ethernet vendor listings recognized:
# IEEE OUI: http://standards.ieee.org/regauth/oui/oui.txt
# CaveBear: http://map-ne.com/Ethernet/Ethernet.txt

use Getopt::Long;
use strict;
use vars qw($opt_output $opt_ieee $opt_cavebear $opt_firstword $opt_help $opt_version);

# Program version, reported by version().
my $VERSION = q(0.03);

# Option defaults.  GetOptions() is called without explicit destinations, so
# it stores each value in the matching $opt_<name> package variable.
$opt_output = 'ethercodes.dat';
$opt_ieee = 'oui.txt';
$opt_cavebear = 'Ethernet.txt';
$opt_firstword = 0;
$opt_help = 0;
$opt_version = 0;

# Start of the run; used for the elapsed time report at the end.
my $time = time();

# GetOptions() complains about bad options on STDERR and returns false.
# Stop in that case rather than running with a half-understood command line.
GetOptions(qw(output=s ieee=s cavebear=s firstword help version))
	|| die "Try --help for usage information.\n";
version() if $opt_version;
usage() if $opt_help;

# Three-argument open(): the file name is always taken literally and can
# never be interpreted as an open mode, a pipe or a handle duplication.
open(IEEE, '<', $opt_ieee) || die "$opt_ieee: $!\n";
open(CAVEBEAR, '<', $opt_cavebear) || die "$opt_cavebear: $!\n";
open(OUTPUT, '>', $opt_output) || die "$opt_output: $!\n";

# Vendor tables, keyed by the prefix notation produced by mangle_prefix().
my %ieee = ();
my %cavebear = ();

# Process IEEE OUI listing.  Only the "XX-XX-XX   (hex)   Vendor" lines are
# of interest; everything else is skipped.
while (my $line = <IEEE>) {
	# Cheap filter first: the line must start with a dashed prefix.
	next unless $line =~ /^[[:xdigit:]]{2}-[[:xdigit:]]{2}-[[:xdigit:]]{2}\s/;
	# Drop carriage returns in case the listing has DOS line endings,
	# the same way the CaveBear loop below does.
	$line =~ s/\r//g;
	chomp $line;
	# A line can pass the filter above and still not have the full
	# "prefix (tag) vendor" shape; skip it instead of storing undef values.
	my ($prefix, $vendor) = $line =~ m/^([[:xdigit:]]{2}-[[:xdigit:]]{2}-[[:xdigit:]]{2})\s+\([^(]+\)\s+(.*)$/
		or next;
	$prefix =~ s/-//g;
	$ieee{mangle_prefix($prefix)} = mangle_vendor($vendor);
}

# Process CaveBear Ethernet vendor codes ("XXXXXX vendor" lines).
while (my $line = <CAVEBEAR>) {
	next unless $line =~ /^[0-9A-F]{6}\s/;
	$line =~ s/\r//g;
	chomp $line;
	# After stripping the line ending there may be nothing left after the
	# prefix; such lines carry no vendor name and are skipped.
	my ($prefix, $vendor) = $line =~ m/^([0-9A-F]{6})\s+(.*)$/
		or next;
	$vendor =~ s/\s{2,}/ /g;		# Condense whitespace.
	$vendor =~ s/([a-z])([A-Z])/$1 $2/g;	# Split runTogether words.
	$cavebear{mangle_prefix($prefix)} = $vendor;
}
print STDERR scalar keys %ieee, " (IEEE), ", scalar keys %cavebear, " (CaveBear).\n";

# Attempt to merge the CaveBear table into the IEEE table.
# Counters for the statistics printed at the end of the run:
#   $equal       both sources name the same vendor (after sanitizing)
#   $first_word  considered equal because the first word matches (--firstword)
#   $merged      names differ; CaveBear name appended in brackets
#   $c_only      prefix is listed by CaveBear only
my $equal = 0;
my $first_word = 0;
my $merged = 0;
my $c_only = 0;
foreach my $oui (sort keys %cavebear) {
	# Prefix unknown to IEEE: take the CaveBear entry as is.
	unless (exists $ieee{$oui}) {
		$ieee{$oui} = $cavebear{$oui};
		++$c_only;
		next;
	}

	# Treat as equal if sanitized vendor name equals.
	my $i = lc($ieee{$oui});
	my $c = lc($cavebear{$oui});
	$i =~ s/\W//g;
	$c =~ s/\W//g;
	if ($i eq $c) {
		++$equal;
		next;
	}

	if ($opt_firstword) {
		# Treat as equal if first word matches.  This branch used to be
		# guarded by "0 and", which made --firstword a no-op.
		my ($i_word) = lc($ieee{$oui}) =~ m/(\w+)/;
		my ($c_word) = lc($cavebear{$oui}) =~ m/(\w+)/;
		if (defined $i_word && defined $c_word && $i_word eq $c_word) {
			# Keep the longer of the two names, measured on the
			# sanitized full names ($i, $c); CaveBear wins a tie.
			# Comparing the first words here would be pointless,
			# they are known to be equal.
			if (length($c) >= length($i)) {
				$ieee{$oui} = $cavebear{$oui};
			}
			++$first_word;
			next;
		}
	}

	# At this point we're helpless; simple heuristics could not
	# determine if the two vendors were the same.  Merge them.
	# Always use vendor name from IEEE, add vendor name from
	# CaveBear in brackets ([]).
	$ieee{$oui} .= " [" . $cavebear{$oui} . ']';
	$ieee{$oui} =~ s/\t/; /g;	# Tab is the output field separator.
	++$merged;
}

# Write the merged table, one "prefix<TAB>vendor" record per line, sorted by
# prefix string.
foreach my $oui (sort keys %ieee) {
	print OUTPUT "$oui\t$ieee{$oui}\n" or die "$opt_output: $!\n";
}
# Buffered write errors (e.g. disk full) only surface when the handle is
# closed, so the result of close() must be checked for the output file.
close(OUTPUT) or die "$opt_output: $!\n";
close(IEEE);
close(CAVEBEAR);

# Report totals and the statistics gathered while merging.
$time = time()-$time;
print scalar keys %ieee, " total. (processed in $time seconds)\n";
print STDERR $equal, " equal, ", $first_word, " equal based on company name.\n";
print STDERR $merged, " merged, ", $c_only, " listed only by CaveBear.\n";

# Mangle prefix code.  Wish there was some standard notation...
#
# Takes a vendor prefix written as six hexadecimal digits (any case) and
# returns it as three lowercase, colon-separated octets with one leading zero
# stripped from each, e.g. "00000C" -> "0:0:c".  Returns the empty string if
# the argument is not exactly six hexadecimal digits.
sub mangle_prefix {
	my $prefix = lc(shift);		# Lowercase.

	# Extract XX:XX:XX MAC address prefix.  An own array is used so the
	# argument list is not overwritten.
	my @octets = $prefix =~ m/^([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/;

	s/^0// for @octets;		# Strip leading zeros.
	return join(':', @octets);
}

# Mangle vendor name since the IEEE listing is really carelessly compiled.
# Heavy heuristics here, you've been warned.
#
# Takes the raw vendor name and returns the cleaned-up name.  The steps are
# order dependent: later patterns rely on the case produced by earlier ones.
sub mangle_vendor {
	my $name = shift;

	# Generic cleanup.
	$name =~ s/^\W+//;		# Strip leading garbage.
	$name =~ s/\s{2,}/ /g;		# Condense whitespace.
	$name =~ s/ \W$//;		# Strip trailing garbage.
	$name =~ s/(\w{4,})/\u\L$1/g;	# Capitalize words 4 or more chars.

	# A name consisting solely of one of these abbreviations (which the
	# previous step has just capitalized) is put back to all uppercase.
	$name =~ s/^(Ecci|Ris(c|q)|Seel|Uunet)$/\U$1/g;

	# Uppercase words that become "Xxx" (first letter up, rest down).
	my @capitalize = (
		# Abbreviations shorter than 4 characters.
		qr/\b(COM?|IN(C|T)|LTD|DIV|SYS|PUB|IND|PT(Y|E)|LAB|TEC|SEL|EON)\b/,
		# Ordinary words.
		qr/\b(BAY|THE|ZUR|END|ONE|SAN|SUN|NET|WAY|TOP|BOX)/,
		qr/\b(BUG|BUS|PIG|TEN|LAW|NOT|SEA|LEE)\b/,
		# French words and abbreviations.
		qr/\b(LA|LE|SOC|FA)\b/,
	);
	foreach my $pattern (@capitalize) {
		$name =~ s/$pattern/\u\L$1/g;
	}

	# Matches that become all lowercase.
	my @lowercase = (
		# English, French, German and Spanish conjunctions.
		qr/\b(AND|OF|TO|IN|FOR)\b/,
		qr/\b(DES|UND|DE|DI|DU)\b/,
		# French prefixes (L', D').
		qr/\b((L|D)')/,
		# A single letter following an "'".
		qr/('[A-Z])\b/,
	);
	foreach my $pattern (@lowercase) {
		$name =~ s/$pattern/\L$1/g;
	}

	# Treat GmbH specially.
	$name =~ s/\bGmbh\b/GmbH/g;

	return $name;
}

# Print the program name with its version and the copyright notice to
# standard output, then exit with status 0.
sub version {
	my @banner = (
		"massagevendor $VERSION",
		'Copyright (C) 2000-2001 KELEMEN Peter <fuji@debian.org>',
	);
	print map { "$_\n" } @banner;
	exit(0);
}

# Print the command line help to standard output, then exit with status 0.
# The text lists every option accepted by the GetOptions() call, including
# --version.
sub usage {
	print <<EOF
Massage IEEE OUI listing and Cavebear Ethernet vendor database into arpwatch(8) format.

Usage: massagevendor [--ieee=<file>] [--cavebear=<file>] [--output=<file>] [--help] [--firstword] [--version]

Options:
	--ieee=<file>		Read IEEE OUI listing from this file [oui.txt]
	--cavebear=<file>	Read Cavebear Ethernet vendor info from here [Ethernet.txt]
	--output=<file>		Write massaged output to this file [ethercodes.dat]
	--firstword		Enable equality heuristics based on the first word of
				a vendor's name.  This might produce unwanted results.
	--help			You are reading this now.
	--version		Print version information and exit.

EOF
	;
	exit(0);
}
