# Lumas.pm
#
# Copyright (c) 2003 Tech-Know-Ware Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it according to Perl's artistic licence, subject to
# the following notice:
# 
# NOTICE
#
# THIS FILE IS PRESENTED 'AS IS' AND COMES WITH NO WARRENTY OF 
# ANY KIND.  
# 
# TECH-KNOW-WARE LTD SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED 
# BY ANYONE OR ANYTHING DUE TO THE USE OF THIS FILE HOWEVER THEY 
# MAY BE CAUSED.
# 
# This file, or any derivative of it, may be used for any 
# purpose, including commercial purposes.
# 
# Users are permitted to modify the contents of the file, as 
# long as this notice and the copyright notice remain.
# 
# Users are permitted to extract elements of this file into
# other files, subject to this notice and the copyright notice 
# also being copied into said files.
# 
# For more information about Lumas, go to:
# 
# 	http://www.tech-know-ware.com/lumas
# 
# END OF NOTICE


package Lumas;


# Required libraries
use strict;
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
require Exporter;


# Global variables
$VERSION = '0.7';

@ISA     = qw( Exporter );
@EXPORT  = qw( LMS_ERROR LMS_FINISHED LMS_TAG LMS_LIST_TAG LMS_VOID_CONST_UASCII	
				LMS_BOOL LMS_INT LMS_FLOAT LMS_DATE LMS_TIME LMS_OID 
				LMS_IPV4 LMS_IPV6 LMS_ASCII LMS_UNICODE LMS_BYTES 
				LMS_STRUCT_START LMS_STRUCT_END LMS_EMBED_START LMS_EMBED_END
				LMS_NOTHING );
@EXPORT_OK  = qw( get_next get_named get_named_nm get_named_cm is_uascii skip_value );


####################################################################################
#                               'Constants'
####################################################################################

sub LMS_ERROR				{ 0 }
sub LMS_FINISHED			{ 1 }
sub LMS_TAG					{ 2 }
sub LMS_LIST_TAG			{ 3 }
sub LMS_VOID_CONST_UASCII	{ 4 }
sub LMS_BOOL				{ 5 }
sub LMS_INT					{ 6 }
sub LMS_FLOAT				{ 7 }
sub LMS_DATE				{ 8 }
sub LMS_TIME				{ 9 }
sub LMS_OID					{ 10 }
sub LMS_IPV4				{ 11 }
sub LMS_IPV6				{ 12 }
sub LMS_ASCII				{ 13 }
sub LMS_UNICODE				{ 14 }
sub LMS_BYTES				{ 15 }
sub LMS_STRUCT_START		{ 16 }
sub LMS_STRUCT_END			{ 17 }
sub LMS_EMBEDDED			{ 18 }

sub LMS_NOTHING				{ 30 }

####################################################################################
#                             Public functions
####################################################################################

sub get_next( $ $ $ )
{
	# Get the next token from the message
	# $_[0] - The message
	# $_[1] - The variable into which the type code in the form of LMS_XXXX is to be placed
	# $_[2] - Where the actual read value is to be placed

	# Skip leading white space and comments
	while( $_[0] =~ /\G\s+/gc 
			|| $_[0] =~ m#\G//([^\n\r]*)#gc 
			|| $_[0] =~ m#\G/\*(.*?)\*/#gc ) {
	}

	if( $_[0] =~ /\G\s*$/gc ) {
		$_[1] = LMS_FINISHED;
		$_[2] = '<finished>';
		return 0;
	}
	elsif( $_[0] =~ /\G([^\d{}()[\-=][^{}()=\s]*)\s*=/gc ) {
		$_[1] = LMS_TAG;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G,/gc ) {
		$_[1] = LMS_LIST_TAG;
		$_[2] = ',';
	}
	elsif( $_[0] =~ /\G(True)[\s,})]/igc || $_[0] =~ /\G(False)[\s,})]/igc 
			|| $_[0] =~ /\G(T)[\s,})]/igc || $_[0] =~ /\G(F)[\s,})]/igc ) {
		--pos($_[0]);		# Might end in ',' which we want next time round
		$_[1] = LMS_BOOL;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(-?\d+)[\s,})]/gc ) {
		--pos($_[0]);		# Might end in ',' which we want next time round
		$_[1] = LMS_INT;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)[\s,})]/gc
			|| $_[0] =~ /\G(-?INF)[\s,})]/gc
			|| $_[0] =~ /\G(NaN)[\s,})]/gc ) {
		--pos($_[0]);		# Might end in ',' which we want next time round
		$_[1] = LMS_FLOAT;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(\d{4}-\d{2}-\d{2})[\s,})]/gc ) {
		--pos($_[0]);
		$_[1] = LMS_DATE;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(\d{2}:\d{2}(?::\d{2})?)[\s,})]/gc ) {
		--pos($_[0]);
		$_[1] = LMS_TIME;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s,})]/gc ) {
		--pos($_[0]);
		$_[1] = LMS_IPV4;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G([\da-f]{1,4}(?::[\da-f]{1,4})*::(?:[\da-f]{1,4}(?::[\da-f]{1,4})*))[\s,})]/igc # hexseq "::" [ hexseq ]
			|| $_[0] =~ /\G(::[\da-f]{1,4}(?::[\da-f]{1,4})*)[\s,})]/igc		# "::" [ hexseq ]
			|| $_[0] =~ /\G([\da-f]{1,4}(?::[\da-f]{1,4})*)[\s,})]/igc ) {		# hexseq

		# hexseq         =  hex4 *( ":" hex4)	-> [\da-f]{1,4}(?::[\da-f]{1,4}
		# hex4           =  1*4HEXDIG			-> [\da-f]{1,4}

		--pos($_[0]);
		$_[1] = LMS_IPV6;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G(\d+(?:~\d+)*)[\s,})]/gc ) {
		--pos($_[0]);
		$_[1] = LMS_OID;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G''/gc ) {
		$_[1] = LMS_ASCII;
		$_[2] = '';
	}
	elsif( $_[0] =~ /\G'((?:\\\\)*)'/gc ) {	# A whole string of \ pairs
		$_[1] = LMS_ASCII;
		$_[2] = $1;
		$_[2] =~ s/\\([\\'])/$1/g;
	}
	elsif( $_[0] =~ /\G'(.*[^\\](?:\\\\)*)'/gc ) {	# Something that ends with pairs of \
		$_[1] = LMS_ASCII;
		$_[2] = $1;
		$_[2] =~ s/\\([\\'])/$1/g;
	}
	elsif( $_[0] =~ /\G""/gc ) {
		$_[1] = LMS_UNICODE;
		$_[2] = '';
	}
	elsif( $_[0] =~ /\G"((?:\\\\)*)"/gc ) {	# A whole string of \ pairs
		$_[1] = LMS_UNICODE;
		$_[2] = $1;
		$_[2] =~ s/\\([\\"])/$1/g;
	}
	elsif( $_[0] =~ /\G"(.*[^\\](?:\\\\)*)"/gc ) {	# Something that ends with pairs of \
		$_[1] = LMS_UNICODE;
		$_[2] = $1;
		$_[2] =~ s/\\([\\"])/$1/g;
	}
	elsif( $_[0] =~ /\G\[\s*([\sa-zA-Z0-9_\/]*={0,2})\s*\]/gc ) {
		$_[1] = LMS_BYTES;
		$_[2] = $1;
	}
	elsif( $_[0] =~ /\G{/gc ) {
		$_[1] = LMS_STRUCT_START;
		$_[2] = '<struct_start>';
	}
	elsif( $_[0] =~ /\G}/gc ) {
		$_[1] = LMS_STRUCT_END;
		$_[2] = '<struct_end>';
	}
	elsif( $_[0] =~ /\G\(/gc ) {
		$_[1] = LMS_EMBEDDED;
		my $start = pos( $_[0] );
		skip_to_compound_end( $_[0] );
		$_[2] = substr( $_[0], $start, pos( $_[0] ) - $start - 1 );
	}
	elsif( $_[0] =~ /\G([^\d{(^\-})\]]\w*)/gc ) {	# Needs to be defined last
		$_[1] = LMS_VOID_CONST_UASCII;
		$_[2] = $1;
	}
	else {
		$_[0] =~ /\G(.{1,8})/gc;
		$_[1] = LMS_ERROR;
		$_[2] = "<error:$1>";
		return 0;
	}

	return 1;
}


sub get_named_nm( $ $ $ $ )		# No move
{
	my $pos = pos( $_[0] );
	my $result = &get_named;
	pos( $_[0] ) = $pos;
	return $result;
}


sub get_named_cm( $ $ $ $ )		# Conditional move
{
	my $pos = pos( $_[0] );
	my $result = &get_named;
	pos( $_[0] ) = $pos if !$result;
	return $result;
}


sub get_named( $ $ $ $ )
{
	# $_[0] = message
	# $_[1] = Name - of the form '0fill,2=4mine';
	# $_[2] = Where return value is placed

	my $name;
	my $utag;
	my $ttag;
	my $instance;
	my $werehere;		# $werehere = we're here
	my $at_void = 0;

	$_[2] = LMS_NOTHING;		# Prevent struct nesting on first pass through for loop

	pos( $_[0] ) = 0 if $_[1] =~ m#^=#;	# Rewind if root requested

	return 1 if $_[1] eq '=';	# Just move to start of message


	# Strategy: get a value and say 'is this the value we want'.  
	# If not, get next value and repeat

	return 0 if ! get_next( $_[0], $_[2], $_[3] );

	LOOP:
	foreach $name ( split /=/, $_[1] ) {
		if( $at_void ) {
			return 0;
		}
		elsif( $name ne '' ) {
			return 0 if $_[2] == LMS_STRUCT_START and ! get_next( $_[0], $_[2], $_[3] );

			($utag, $ttag, $instance) = ($name =~ /^(\d*)([^,]*)(?:,(\d+))?$/);

			my $ctag = 0;
			++$ctag if $ttag ne '';	# This is a bit of a hack!!!
			my $cinstance = 0;

			while( $ctag <= $utag and $_[2] != LMS_STRUCT_END ) {	# Untagged phase

				next LOOP if $ttag eq '' and $ctag == $utag and ++$cinstance > $instance;

				return 0 if $_[2] == LMS_TAG and ! skip_value( $_[0] );	# Untagged union
				return 0 if $_[2] == LMS_STRUCT_START 
								and ! skip_to_compound_end( $_[0] );

				return 0 if ! get_next( $_[0], $_[2], $_[3] );

				if( $_[2] == LMS_LIST_TAG ) {
					return 0 if ! get_next( $_[0], $_[2], $_[3] );
					return 0 if $_[2] == LMS_LIST_TAG;
				}
				else {
					++$ctag;
				}
			}

			if( $ttag ne '' ) {		# Tagged phase
				my $last_tag = '';

				# Make sure we have a tag before entering the loop below
				# -Allows us to omit the number of untagged items in the search spec
				until( $_[2] == LMS_TAG or $_[2] == LMS_VOID_CONST_UASCII ) {
					return 0 if $_[2] == LMS_STRUCT_START 
									and ! skip_to_compound_end( $_[0] );
					return 0 if ! get_next( $_[0], $_[2], $_[3] );
				}

				while( 1 ) {
					$last_tag = $_[3] if $_[2] == LMS_TAG or $_[2] == LMS_VOID_CONST_UASCII;

					if( $_[2] == LMS_TAG || $_[2] == LMS_LIST_TAG || $_[2] == LMS_VOID_CONST_UASCII ) {
						if( $last_tag eq $ttag and ++$cinstance > $instance) {
							return 0 if $_[2] != LMS_VOID_CONST_UASCII and ! get_next( $_[0], $_[2], $_[3] );

							$at_void if $_[2] == LMS_VOID_CONST_UASCII;
							next LOOP;
						}
						return 0 if $_[2] == LMS_TAG || $_[2] == LMS_LIST_TAG and ! skip_value( $_[0] );
						
						do {	# This allows us to omit the number of untagged items in the search spec
							return 0 if ! get_next( $_[0], $_[2], $_[3] );
							return 0 if $_[2] == LMS_STRUCT_END;
						} until( $_[2] == LMS_TAG or $_[2] == LMS_LIST_TAG 
													or $_[2] == LMS_VOID_CONST_UASCII );
					}
					else {
						return 0;
					}
				}
			}

			return 0;	# We've failed if we get to here (next LOOP is the way out)
		}
	}

	return 1;
}


sub is_uascii( $ )
{
	# An unquoted-ascii value might look like any of the below types.  Therefore, 
	# to test if something is unquoted-ascii it is best to call 
	# "Lumas::is_uascii( $type )" where $type is the value supplied by the 
	# get_next() or get_named() functions.

	return $_[0] == LMS_VOID_CONST_UASCII
			|| $_[0] == LMS_BOOL
			|| $_[0] == LMS_INT
			|| $_[0] == LMS_FLOAT
			|| $_[0] == LMS_DATE
			|| $_[0] == LMS_TIME
			|| $_[0] == LMS_IPV4
			|| $_[0] == LMS_IPV6
			|| $_[0] == LMS_OID;
}


sub is_float( $ )
{
	# A float value might look like any of the below types.  Therefore, to test
	# if something is float it is best to call "Lumas::is_float( $type )" 
	# where $type is the value supplied by the get_next() or get_named() functions.

	return $_[0] == LMS_FLOAT || $_[0] == LMS_INT;
}


sub skip_value( $ )
{
	# Skips the next value
	my $type;
	my $value;

	# Keep reading while read value is a union tag
	while( get_next( $_[0], $type, $value ) && $type == LMS_TAG ) {
	}

	return skip_to_compound_end( $_[0] ) if $type == LMS_STRUCT_START;

	return 1;	# Success
}


####################################################################################
#                             Private functions
####################################################################################

sub skip_to_compound_end
{
	# Only interested in {}() characters, but must ignore them within '' and "" 
	# quotes.
	# Therefore, skip '' or "" block, and non-white space blocks, 
	# excluding { } ( )

	my $nesting = 1;
	while( $nesting > 0 ) {
		# Skip leading white space and comments
		while( $_[0] =~ /\G\s+/gc 
				|| $_[0] =~ m#\G//[^\n\r]*#gc 
				|| $_[0] =~ m#\G/\*.*?\*/#gc ) {
		}

		if( $_[0] =~ /\G'((?:\\\\)*)'/gc				# A whole string of \ pairs
				|| $_[0] =~ /\G'(.*[^\\](?:\\\\)*)'/gc	# Something that ends with pairs of \
				|| $_[0] =~ /\G"((?:\\\\)*)"/gc			# A whole string of \ pairs
				|| $_[0] =~ /\G"(.*[^\\](?:\\\\)*)"/gc	# Populated "" quoted block
				|| $_[0] =~ /\G\[[^\]]*\]/gc			# Bytes
				|| $_[0] =~ /\G[^\s{}()]+/gc ) {		# Non white space except specials
			# Nothing to do here
		} elsif( $_[0] =~ /\G{/gc ) {
			++$nesting;
		} elsif( $_[0] =~ /\G}/gc ) {
			--$nesting;
		} elsif( $_[0] =~ /\G\(/gc ) {
			++$nesting;
		} elsif( $_[0] =~ /\G\)/gc ) {
			--$nesting;
		} elsif( $_[0] =~ /\G$/gc ) {
			$nesting = 0;
		} else {
			return 0;	# Failed
		}
	}

	return 1;
}

1;

__END__


=head1 NAME

Lumas - Language for Universal Message Abstraction and Specification

=head1 SYNOPSIS

    use Lumas;

	$in_message = "123 a = b";

	get_next( $in_message, $out_type, $out_value );

	skip_value( $in_message );

	get_named( $in_message, '=1a', $out_type, $out_value );

	get_named_nm( $in_message, 'a', $out_type, $out_value );

	get_named_cm( $in_message, '1a', $out_type, $out_value );


=head1 DESCRIPTION

C<Lumas> allows for the reading of Lumas encoded messages.  For more on Lumas and
its encoding format, see:

	http://www.tech-know-ware.com/lumas

=head1 USER FUNCTIONS

These functions provide the user interface to C<Lumas>.

=over 4

=item get_next ( IN_MESSAGE, OUT_TYPE, OUT_VALUE )

On each call to C<get_next> the next message token is retrieved.  This allows
the message to be parsed in a method akin to XML's SAX function.  However,
it is the responsibility of the user's parsing code to call C<get_next> rather
than the user's code being called by the library which is the case for SAX.  This
approach allows the context of the read to be easily represented by the function 
from which the C<get_next> is read.

C<IN_MESSAGE> is the message to be parsed.  The module uses progressive matching,
so once set, the string used for IN_MESSAGE should not have user regular
expressions applied to it.  In some circumstances it may be appropriate to make
a copy of the message, and use that when calling C<get_next>.

When the function is complete, the variable C<OUT_TYPE> will be set to one of
the C<LMS_XXXXX> constants and the variable C<OUT_VALUE> will contain the value
read.

The return value is C<1> for success and C<0> for failure.


=item get_named ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE )

C<get_named> allows a named parameter from a message to be read.  The parameters
C<IN_MESSAGE>, C<OUT_TYPE> and C<OUT_VALUE> have the same meanings as for
C<get_next>.

C<IN_PATH> specifies the parameter to be read in a similar way to how a file is
specified within a directory hierachy.  Each level in the hierarchy is made up 
of 3 separate and optional parts, the I<untagged count>, the I<text tag> and 
the I<instance>.  The I<untagged count> specifies either 0 based count of the
untagged parameter that is being sought, or the total number of untagged values
if a I<text tag> is specified.   (Note: I<untagged count> can be omitted from the
specification if it is known that the parser can not confuse an untagged parameter
with a sought tagged parameter.)   I<text tag> specifies any text tag that is
to be looked for.  I<instance> specifies the instance of the parameter that is to be
looked for, starting at 0.  For example, in there are 3 occurances of a particular
value in a message section, the 2nd instance can be referenced by setting the
I<instance> part of the specification to 1.  

The parts are combined into a single string, using the format 
'<untagged count><text tag>,<instance>'.  For example '2mine,1', skips 2 untagged 
values and then looks for the 2nd (1th) instance of the 'mine' parameter.

Each level of the hierarchy is separated by an = sign.  E.g. 
'2mine,1=3', which gets the 4th untagged parameter within the '2mine,1' struct.

If the path starts with an '=', (e.g. '=2mine,1=3') the path is relative to 
the start of the message, otherwise it is relative to the match operation just 
ended.


=item get_named_nm ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE )

C<get_named_nm> (nm = no move) is similar to C<get_named> except that it 
records the location in the string before an attempt is made to locate the 
specified parameter, and returns the that position once the location operation is
complete.

This allows you to locate the start of a struct using C<get_named> and then do
searches relative to the start of the struct using C<get_named_nm> after that.
This will be more efficient, and probably easier to code.


=item get_named_cm ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE )

C<get_named_cm> (cm = conditional move) is similar to C<get_named_nm> except 
that it will return to the stored location only if the parameter in NOT found.
If the parameter is found, it will leave the string position at the location
where the search ended.  This allows you to conditionally move into a struct 
depending on whether it is present or not.  

You may also use Perl's I<pos( IN_MESSAGE )> to record positions in the message
during a parsing operation.


=item skip_value ( IN_MESSAGE )

C<skip_value> skips the next value, including whole structs and embedded items.
When using C<get_next> to parse a message it can be used to skip over items that
you are not interested in.  

N.B. You must not call C<skip_value> for void types.


=item is_uascii( IN_TYPE )

An unquoted-ascii value might look like a number of other types, such as 
integers, dates etc.  If the unquoted ascii value looks like one of the other
types, the tokeniser will label it as the other type.  If you know you are looking
for an unquoted ascii value that may look like one of the other types, call 
C<is_uascii> with the type returned from the parser.  It will return C<1> if the
type could actually be unquoted ascii, and C<0> otherwise.


=item is_float( IN_TYPE )

A float value might look like a regular integer.  If the float value looks like 
an integer the tokeniser will label it as such.  If you know you are looking
for a float value that may look like an integer, call C<is_float> with the type 
returned from the parser.  It will return C<1> if the type could actually 
be float, and C<0> otherwise.


=back

=head1 EXPORTS

C<Lumas> exports a number of constant functions of the form C<LMS_XXXXX> that 
are used to identify the read types.  These are: 

LMS_ERROR, LMS_FINISHED, LMS_TAG, LMS_LIST_TAG, LMS_VOID_CONST_UASCII, 
LMS_BOOL, LMS_INT, LMS_DATE, LMS_TIME, LMS_OID, LMS_IPV4, LMS_IPV6, 
LMS_ASCII, LMS_UNICODE, LMS_BYTES, LMS_STRUCT_START, LMS_STRUCT_END, 
LMS_EMBEDDED, LMS_NOTHING.

=head1 BUGS

Fixed in Version 0.7:
	(When a ascii or unicode string ends in something like \\' the tokeniser will only 
	 grab \' and decide that that is not the end of the string.  Really need something 
	 that says if there's an even number of \ before a ', then it's the end of the 
	 string, whereas, if there's an odd number, it's not the end of the string.)

=head1 AUTHOR

Pete Cordell <lumas@tech-know-ware.com>

=head1 COPYRIGHT

Copyright (c) 2003 Tech-Know-Ware Ltd. All rights reserved.

=cut