# Lumas.pm # # Copyright (c) 2003 Tech-Know-Ware Ltd. All rights reserved. # This program is free software; you can redistribute it and/or # modify it according to Perl's artistic licence, subject to # the following notice: # # NOTICE # # THIS FILE IS PRESENTED 'AS IS' AND COMES WITH NO WARRENTY OF # ANY KIND. # # TECH-KNOW-WARE LTD SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED # BY ANYONE OR ANYTHING DUE TO THE USE OF THIS FILE HOWEVER THEY # MAY BE CAUSED. # # This file, or any derivative of it, may be used for any # purpose, including commercial purposes. # # Users are permitted to modify the contents of the file, as # long as this notice and the copyright notice remain. # # Users are permitted to extract elements of this file into # other files, subject to this notice and the copyright notice # also being copied into said files. # # For more information about Lumas, go to: # # http://www.tech-know-ware.com/lumas # # END OF NOTICE package Lumas; # Required libraries use strict; use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); require Exporter; # Global variables $VERSION = '0.7'; @ISA = qw( Exporter ); @EXPORT = qw( LMS_ERROR LMS_FINISHED LMS_TAG LMS_LIST_TAG LMS_VOID_CONST_UASCII LMS_BOOL LMS_INT LMS_FLOAT LMS_DATE LMS_TIME LMS_OID LMS_IPV4 LMS_IPV6 LMS_ASCII LMS_UNICODE LMS_BYTES LMS_STRUCT_START LMS_STRUCT_END LMS_EMBED_START LMS_EMBED_END LMS_NOTHING ); @EXPORT_OK = qw( get_next get_named get_named_nm get_named_cm is_uascii skip_value ); #################################################################################### # 'Constants' #################################################################################### sub LMS_ERROR { 0 } sub LMS_FINISHED { 1 } sub LMS_TAG { 2 } sub LMS_LIST_TAG { 3 } sub LMS_VOID_CONST_UASCII { 4 } sub LMS_BOOL { 5 } sub LMS_INT { 6 } sub LMS_FLOAT { 7 } sub LMS_DATE { 8 } sub LMS_TIME { 9 } sub LMS_OID { 10 } sub LMS_IPV4 { 11 } sub LMS_IPV6 { 12 } sub LMS_ASCII { 13 } sub LMS_UNICODE { 14 } sub LMS_BYTES { 15 } sub LMS_STRUCT_START { 16 } sub LMS_STRUCT_END { 17 } sub LMS_EMBEDDED { 18 } sub LMS_NOTHING { 30 } #################################################################################### # Public functions #################################################################################### sub get_next( $ $ $ ) { # Get the next token from the message # $_[0] - The message # $_[1] - The variable into which the type code in the form of LMS_XXXX is to be placed # $_[2] - Where the actual read value is to be placed # Skip leading white space and comments while( $_[0] =~ /\G\s+/gc || $_[0] =~ m#\G//([^\n\r]*)#gc || $_[0] =~ m#\G/\*(.*?)\*/#gc ) { } if( $_[0] =~ /\G\s*$/gc ) { $_[1] = LMS_FINISHED; $_[2] = ''; return 0; } elsif( $_[0] =~ /\G([^\d{}()[\-=][^{}()=\s]*)\s*=/gc ) { $_[1] = LMS_TAG; $_[2] = $1; } elsif( $_[0] =~ /\G,/gc ) { $_[1] = LMS_LIST_TAG; $_[2] = ','; } elsif( $_[0] =~ /\G(True)[\s,})]/igc || $_[0] =~ /\G(False)[\s,})]/igc || $_[0] =~ /\G(T)[\s,})]/igc || $_[0] =~ /\G(F)[\s,})]/igc ) { --pos($_[0]); # Might end in ',' which we want next time round $_[1] = LMS_BOOL; $_[2] = $1; } elsif( $_[0] =~ /\G(-?\d+)[\s,})]/gc ) { --pos($_[0]); # Might end in ',' which we want next time round $_[1] = LMS_INT; $_[2] = $1; } elsif( $_[0] =~ /\G(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)[\s,})]/gc || $_[0] =~ /\G(-?INF)[\s,})]/gc || $_[0] =~ /\G(NaN)[\s,})]/gc ) { --pos($_[0]); # Might end in ',' which we want next time round $_[1] = LMS_FLOAT; $_[2] = $1; } elsif( $_[0] =~ /\G(\d{4}-\d{2}-\d{2})[\s,})]/gc ) { --pos($_[0]); $_[1] = LMS_DATE; $_[2] = $1; } elsif( $_[0] =~ /\G(\d{2}:\d{2}(?::\d{2})?)[\s,})]/gc ) { --pos($_[0]); $_[1] = LMS_TIME; $_[2] = $1; } elsif( $_[0] =~ /\G(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s,})]/gc ) { --pos($_[0]); $_[1] = LMS_IPV4; $_[2] = $1; } elsif( $_[0] =~ /\G([\da-f]{1,4}(?::[\da-f]{1,4})*::(?:[\da-f]{1,4}(?::[\da-f]{1,4})*))[\s,})]/igc # hexseq "::" [ hexseq ] || $_[0] =~ /\G(::[\da-f]{1,4}(?::[\da-f]{1,4})*)[\s,})]/igc # "::" [ hexseq ] || $_[0] =~ /\G([\da-f]{1,4}(?::[\da-f]{1,4})*)[\s,})]/igc ) { # hexseq # hexseq = hex4 *( ":" hex4) -> [\da-f]{1,4}(?::[\da-f]{1,4} # hex4 = 1*4HEXDIG -> [\da-f]{1,4} --pos($_[0]); $_[1] = LMS_IPV6; $_[2] = $1; } elsif( $_[0] =~ /\G(\d+(?:~\d+)*)[\s,})]/gc ) { --pos($_[0]); $_[1] = LMS_OID; $_[2] = $1; } elsif( $_[0] =~ /\G''/gc ) { $_[1] = LMS_ASCII; $_[2] = ''; } elsif( $_[0] =~ /\G'((?:\\\\)*)'/gc ) { # A whole string of \ pairs $_[1] = LMS_ASCII; $_[2] = $1; $_[2] =~ s/\\([\\'])/$1/g; } elsif( $_[0] =~ /\G'(.*[^\\](?:\\\\)*)'/gc ) { # Something that ends with pairs of \ $_[1] = LMS_ASCII; $_[2] = $1; $_[2] =~ s/\\([\\'])/$1/g; } elsif( $_[0] =~ /\G""/gc ) { $_[1] = LMS_UNICODE; $_[2] = ''; } elsif( $_[0] =~ /\G"((?:\\\\)*)"/gc ) { # A whole string of \ pairs $_[1] = LMS_UNICODE; $_[2] = $1; $_[2] =~ s/\\([\\"])/$1/g; } elsif( $_[0] =~ /\G"(.*[^\\](?:\\\\)*)"/gc ) { # Something that ends with pairs of \ $_[1] = LMS_UNICODE; $_[2] = $1; $_[2] =~ s/\\([\\"])/$1/g; } elsif( $_[0] =~ /\G\[\s*([\sa-zA-Z0-9_\/]*={0,2})\s*\]/gc ) { $_[1] = LMS_BYTES; $_[2] = $1; } elsif( $_[0] =~ /\G{/gc ) { $_[1] = LMS_STRUCT_START; $_[2] = ''; } elsif( $_[0] =~ /\G}/gc ) { $_[1] = LMS_STRUCT_END; $_[2] = ''; } elsif( $_[0] =~ /\G\(/gc ) { $_[1] = LMS_EMBEDDED; my $start = pos( $_[0] ); skip_to_compound_end( $_[0] ); $_[2] = substr( $_[0], $start, pos( $_[0] ) - $start - 1 ); } elsif( $_[0] =~ /\G([^\d{(^\-})\]]\w*)/gc ) { # Needs to be defined last $_[1] = LMS_VOID_CONST_UASCII; $_[2] = $1; } else { $_[0] =~ /\G(.{1,8})/gc; $_[1] = LMS_ERROR; $_[2] = ""; return 0; } return 1; } sub get_named_nm( $ $ $ $ ) # No move { my $pos = pos( $_[0] ); my $result = &get_named; pos( $_[0] ) = $pos; return $result; } sub get_named_cm( $ $ $ $ ) # Conditional move { my $pos = pos( $_[0] ); my $result = &get_named; pos( $_[0] ) = $pos if !$result; return $result; } sub get_named( $ $ $ $ ) { # $_[0] = message # $_[1] = Name - of the form '0fill,2=4mine'; # $_[2] = Where return value is placed my $name; my $utag; my $ttag; my $instance; my $werehere; # $werehere = we're here my $at_void = 0; $_[2] = LMS_NOTHING; # Prevent struct nesting on first pass through for loop pos( $_[0] ) = 0 if $_[1] =~ m#^=#; # Rewind if root requested return 1 if $_[1] eq '='; # Just move to start of message # Strategy: get a value and say 'is this the value we want'. # If not, get next value and repeat return 0 if ! get_next( $_[0], $_[2], $_[3] ); LOOP: foreach $name ( split /=/, $_[1] ) { if( $at_void ) { return 0; } elsif( $name ne '' ) { return 0 if $_[2] == LMS_STRUCT_START and ! get_next( $_[0], $_[2], $_[3] ); ($utag, $ttag, $instance) = ($name =~ /^(\d*)([^,]*)(?:,(\d+))?$/); my $ctag = 0; ++$ctag if $ttag ne ''; # This is a bit of a hack!!! my $cinstance = 0; while( $ctag <= $utag and $_[2] != LMS_STRUCT_END ) { # Untagged phase next LOOP if $ttag eq '' and $ctag == $utag and ++$cinstance > $instance; return 0 if $_[2] == LMS_TAG and ! skip_value( $_[0] ); # Untagged union return 0 if $_[2] == LMS_STRUCT_START and ! skip_to_compound_end( $_[0] ); return 0 if ! get_next( $_[0], $_[2], $_[3] ); if( $_[2] == LMS_LIST_TAG ) { return 0 if ! get_next( $_[0], $_[2], $_[3] ); return 0 if $_[2] == LMS_LIST_TAG; } else { ++$ctag; } } if( $ttag ne '' ) { # Tagged phase my $last_tag = ''; # Make sure we have a tag before entering the loop below # -Allows us to omit the number of untagged items in the search spec until( $_[2] == LMS_TAG or $_[2] == LMS_VOID_CONST_UASCII ) { return 0 if $_[2] == LMS_STRUCT_START and ! skip_to_compound_end( $_[0] ); return 0 if ! get_next( $_[0], $_[2], $_[3] ); } while( 1 ) { $last_tag = $_[3] if $_[2] == LMS_TAG or $_[2] == LMS_VOID_CONST_UASCII; if( $_[2] == LMS_TAG || $_[2] == LMS_LIST_TAG || $_[2] == LMS_VOID_CONST_UASCII ) { if( $last_tag eq $ttag and ++$cinstance > $instance) { return 0 if $_[2] != LMS_VOID_CONST_UASCII and ! get_next( $_[0], $_[2], $_[3] ); $at_void if $_[2] == LMS_VOID_CONST_UASCII; next LOOP; } return 0 if $_[2] == LMS_TAG || $_[2] == LMS_LIST_TAG and ! skip_value( $_[0] ); do { # This allows us to omit the number of untagged items in the search spec return 0 if ! get_next( $_[0], $_[2], $_[3] ); return 0 if $_[2] == LMS_STRUCT_END; } until( $_[2] == LMS_TAG or $_[2] == LMS_LIST_TAG or $_[2] == LMS_VOID_CONST_UASCII ); } else { return 0; } } } return 0; # We've failed if we get to here (next LOOP is the way out) } } return 1; } sub is_uascii( $ ) { # An unquoted-ascii value might look like any of the below types. Therefore, # to test if something is unquoted-ascii it is best to call # "Lumas::is_uascii( $type )" where $type is the value supplied by the # get_next() or get_named() functions. return $_[0] == LMS_VOID_CONST_UASCII || $_[0] == LMS_BOOL || $_[0] == LMS_INT || $_[0] == LMS_FLOAT || $_[0] == LMS_DATE || $_[0] == LMS_TIME || $_[0] == LMS_IPV4 || $_[0] == LMS_IPV6 || $_[0] == LMS_OID; } sub is_float( $ ) { # A float value might look like any of the below types. Therefore, to test # if something is float it is best to call "Lumas::is_float( $type )" # where $type is the value supplied by the get_next() or get_named() functions. return $_[0] == LMS_FLOAT || $_[0] == LMS_INT; } sub skip_value( $ ) { # Skips the next value my $type; my $value; # Keep reading while read value is a union tag while( get_next( $_[0], $type, $value ) && $type == LMS_TAG ) { } return skip_to_compound_end( $_[0] ) if $type == LMS_STRUCT_START; return 1; # Success } #################################################################################### # Private functions #################################################################################### sub skip_to_compound_end { # Only interested in {}() characters, but must ignore them within '' and "" # quotes. # Therefore, skip '' or "" block, and non-white space blocks, # excluding { } ( ) my $nesting = 1; while( $nesting > 0 ) { # Skip leading white space and comments while( $_[0] =~ /\G\s+/gc || $_[0] =~ m#\G//[^\n\r]*#gc || $_[0] =~ m#\G/\*.*?\*/#gc ) { } if( $_[0] =~ /\G'((?:\\\\)*)'/gc # A whole string of \ pairs || $_[0] =~ /\G'(.*[^\\](?:\\\\)*)'/gc # Something that ends with pairs of \ || $_[0] =~ /\G"((?:\\\\)*)"/gc # A whole string of \ pairs || $_[0] =~ /\G"(.*[^\\](?:\\\\)*)"/gc # Populated "" quoted block || $_[0] =~ /\G\[[^\]]*\]/gc # Bytes || $_[0] =~ /\G[^\s{}()]+/gc ) { # Non white space except specials # Nothing to do here } elsif( $_[0] =~ /\G{/gc ) { ++$nesting; } elsif( $_[0] =~ /\G}/gc ) { --$nesting; } elsif( $_[0] =~ /\G\(/gc ) { ++$nesting; } elsif( $_[0] =~ /\G\)/gc ) { --$nesting; } elsif( $_[0] =~ /\G$/gc ) { $nesting = 0; } else { return 0; # Failed } } return 1; } 1; __END__ =head1 NAME Lumas - Language for Universal Message Abstraction and Specification =head1 SYNOPSIS use Lumas; $in_message = "123 a = b"; get_next( $in_message, $out_type, $out_value ); skip_value( $in_message ); get_named( $in_message, '=1a', $out_type, $out_value ); get_named_nm( $in_message, 'a', $out_type, $out_value ); get_named_cm( $in_message, '1a', $out_type, $out_value ); =head1 DESCRIPTION C allows for the reading of Lumas encoded messages. For more on Lumas and its encoding format, see: http://www.tech-know-ware.com/lumas =head1 USER FUNCTIONS These functions provide the user interface to C. =over 4 =item get_next ( IN_MESSAGE, OUT_TYPE, OUT_VALUE ) On each call to C the next message token is retrieved. This allows the message to be parsed in a method akin to XML's SAX function. However, it is the responsibility of the user's parsing code to call C rather than the user's code being called by the library which is the case for SAX. This approach allows the context of the read to be easily represented by the function from which the C is read. C is the message to be parsed. The module uses progressive matching, so once set, the string used for IN_MESSAGE should not have user regular expressions applied to it. In some circumstances it may be appropriate to make a copy of the message, and use that when calling C. When the function is complete, the variable C will be set to one of the C constants and the variable C will contain the value read. The return value is C<1> for success and C<0> for failure. =item get_named ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE ) C allows a named parameter from a message to be read. The parameters C, C and C have the same meanings as for C. C specifies the parameter to be read in a similar way to how a file is specified within a directory hierachy. Each level in the hierarchy is made up of 3 separate and optional parts, the I, the I and the I. The I specifies either 0 based count of the untagged parameter that is being sought, or the total number of untagged values if a I is specified. (Note: I can be omitted from the specification if it is known that the parser can not confuse an untagged parameter with a sought tagged parameter.) I specifies any text tag that is to be looked for. I specifies the instance of the parameter that is to be looked for, starting at 0. For example, in there are 3 occurances of a particular value in a message section, the 2nd instance can be referenced by setting the I part of the specification to 1. The parts are combined into a single string, using the format ','. For example '2mine,1', skips 2 untagged values and then looks for the 2nd (1th) instance of the 'mine' parameter. Each level of the hierarchy is separated by an = sign. E.g. '2mine,1=3', which gets the 4th untagged parameter within the '2mine,1' struct. If the path starts with an '=', (e.g. '=2mine,1=3') the path is relative to the start of the message, otherwise it is relative to the match operation just ended. =item get_named_nm ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE ) C (nm = no move) is similar to C except that it records the location in the string before an attempt is made to locate the specified parameter, and returns the that position once the location operation is complete. This allows you to locate the start of a struct using C and then do searches relative to the start of the struct using C after that. This will be more efficient, and probably easier to code. =item get_named_cm ( IN_MESSAGE, IN_PATH, OUT_TYPE, OUT_VALUE ) C (cm = conditional move) is similar to C except that it will return to the stored location only if the parameter in NOT found. If the parameter is found, it will leave the string position at the location where the search ended. This allows you to conditionally move into a struct depending on whether it is present or not. You may also use Perl's I to record positions in the message during a parsing operation. =item skip_value ( IN_MESSAGE ) C skips the next value, including whole structs and embedded items. When using C to parse a message it can be used to skip over items that you are not interested in. N.B. You must not call C for void types. =item is_uascii( IN_TYPE ) An unquoted-ascii value might look like a number of other types, such as integers, dates etc. If the unquoted ascii value looks like one of the other types, the tokeniser will label it as the other type. If you know you are looking for an unquoted ascii value that may look like one of the other types, call C with the type returned from the parser. It will return C<1> if the type could actually be unquoted ascii, and C<0> otherwise. =item is_float( IN_TYPE ) A float value might look like a regular integer. If the float value looks like an integer the tokeniser will label it as such. If you know you are looking for a float value that may look like an integer, call C with the type returned from the parser. It will return C<1> if the type could actually be float, and C<0> otherwise. =back =head1 EXPORTS C exports a number of constant functions of the form C that are used to identify the read types. These are: LMS_ERROR, LMS_FINISHED, LMS_TAG, LMS_LIST_TAG, LMS_VOID_CONST_UASCII, LMS_BOOL, LMS_INT, LMS_DATE, LMS_TIME, LMS_OID, LMS_IPV4, LMS_IPV6, LMS_ASCII, LMS_UNICODE, LMS_BYTES, LMS_STRUCT_START, LMS_STRUCT_END, LMS_EMBEDDED, LMS_NOTHING. =head1 BUGS Fixed in Version 0.7: (When a ascii or unicode string ends in something like \\' the tokeniser will only grab \' and decide that that is not the end of the string. Really need something that says if there's an even number of \ before a ', then it's the end of the string, whereas, if there's an odd number, it's not the end of the string.) =head1 AUTHOR Pete Cordell =head1 COPYRIGHT Copyright (c) 2003 Tech-Know-Ware Ltd. All rights reserved. =cut