#!/usr/bin/perl
#####################################
# iswf - checks an XML file to see if it is well-formed.                    #
#                                                                          #
#                                                                          #
# iswf < XMLINPUT                                                          #
# writes error messages to STDOUT                                          #
#                                                                          #
# M.Leventhal, Grif, S.A.                                                  #
# michael@grif.fr                                                            #
# 1 Sept 1997                                                              #
#                                                                          #
# Notes: based on 07-Aug-97 XML Working draft. Not complete, does no        #
# entity checks, ASCII-only, among other omissions, but catches lots of    #
# stuff.                                                                    #
#                                                                          #
# Unrestricted use is hereby granted as long as the author is credited or  #
# discredited as the case may be.                                           #
#####################################
 
# The first two lines cause the entire document is read into the
# $file variable. This spares me certain
# complications which arise from reading it line by line
# and Perl is able to do this sort of thing fairly
# efficiently.
 
undef($/);
$file = <>;
&parse($file);
# I loop through the file, processing each start or end
# tag when it is seen.

sub parse {
my($file) = @_; 
while ($file =~ /([^<]*)<(\/)?([^>]+)>/) { 
  $st_or_et = $2;
  $gi = $3;
  $file = $';
  $pretext = $1; 
# I recognize the following kinds of objects: XML declaration
# (a particular type of processing instruction), processing
# instructions, comments, doctype declaration, cdata marked
# sections, and elements. Since the document production has
# order rules I set a flag when a particlar type of object
# has been processed. I invoke a subroutine to process each
# type of object.
 
 if ($gi =~ /^\?XML/) {
   &process_decl;
   $decl_seen = 1;
 }
 elsif ($gi =~ /^\?/) {
   &process_pi;
   $misc_seen = 1;
 }
 elsif ($gi =~ /^!\-\-/) {
   &process_comment;
   $misc_seen;
 }
 elsif ($gi =~ /^!DOCTYPE/) {
   &process_doctype;
   $doctype_seen = 1;
 }
 elsif ($gi =~ /^\!\[CDATA\[/) {
   &process_cdata;
 }
 else {
   &process_element;
   $element_seen = 1;
 }
}

}
 
#--------------------------------------------------------------------------# 
sub check_error_count {
if ($error_count == 0) {
  print "This document appears to be well-formed.\n"; } 
}
#--------------------------------------------------------------------------# 
 
# Check to see if the ancestor stack containing all parents up to the
# root is empty.
 
sub check_empty_stack {
if ($#ancestors > -1) {
  &print_error_at_context;
}
}
#--------------------------------------------------------------------------# 
 
# Check to see if there is any uncontained PCDATA lying around (white space
# at the end of the document doesn't count). I check also to see that
# a root to the document was found which catches a null file error. 
 
sub check_uncontained_pcdata {
if ($file !~ /^\s*$/ || $ROOT eq "") {
  $error_count++;
  print "\nNot well formed uncontained #PCDATA or null file\n"; 
}
}
#--------------------------------------------------------------------------# 
 
# Check that the XML declaration is coded properly and in the correct
# position (before any other object in the file and occuring only
# once.)
 
sub process_decl {
if ($decl_seen || $misc_seen || $doctype_seen || $element_seen) { 
  $error_count++;
  print "XML declaration can only be at the head of the document.\n"; 
}
 
# No checks are performed on processing instructions but the following
# will be used to store the PI in the $gi variable and advance the
# file pointer.
 
&process_pi;
 
# This is slightly lazy since we allow version='1.0". It is quite simple
# to fix just by making an OR of each parameter with either ' ' or " "
# quote marks.
 
if ($gi !~/\?XML\s+version=[\'\"]1.0[\'\"](\s+encoding=[\'\"][^\'\"]*[\'\"])?
(\s+RMD=[\'\"](NONE|INTERNAL|ALL)[\'\"])?\s*\?/) 
{
  $error_count++;
  print "Format of XML declaration is wrong.\n"; 
}
}
#--------------------------------------------------------------------------# 
 
# Check that the Doctype statement is in the right position and, otherwise,
# make no attempt to parse its contents, including the root element. The
# root element will determined from the element production itself and
# the "claim" of the Doctype won't be verified. 
 
sub process_doctype {
if ($doctype_seen || $element_seen) {
  $error_count++;
  print "Doctype can only appear once and must be within prolog.\n";
}
if ($gi =~ /\[/ && $gi !~ /\]$/) {
  $file =~ /\]>/;
  $file = $';
  $gi = $gi.$`.$&;
}
}
#--------------------------------------------------------------------------# 
 
# Performs the well-formed check necessary to verify that CDATA is not
# nested. We will pick up the wrong end of CDATA marker if this is the
# case so the error message is critical. 
 
sub process_cdata {
if ($gi !~ /\]\]$/) {
  $file =~ /\]\]>/;
  $file = $';
  $gi = $gi.$`."]]";
}
$gi =~ /\!\[CDATA\[(.*)\]\]/;
$body = $1;
if ($body =~ /<\!\[CDATA\[/) {
  print "Nested CDATA.\n";
  &print_error_at_context;
}
}
#--------------------------------------------------------------------------# 
 
# Performs the well-formed check of ensuring that '--' is not nested
# in the comment body which would cause problems for SGML processors. 
 
sub process_comment {
if ($gi !~ /\-\-$/) {
  $file =~ /\-\->/;
  $file = $';
  $gi = $gi.$`."--";
}
$gi =~ /\!\-\-((.|\n)*)\-\-/;
$body = $1;
if ($body =~ /\-\-/) {
  $error_count++;
  print "Comment contains --.\n";
}
}
#--------------------------------------------------------------------------# 
 
# This is the main subroutine which handles the ancestor stack (in an
# array) checking the proper nesting of the element part of the document
# production.
 
sub process_element {
print "characters: ($pretext)\n";
print "process_element: ($gi) $st_or_et\n";
# Distinguish between empty elements which do not add a parent to the
# ancestor stack and elements which can have content. 
 
if ($gi =~ /\/$/) {
  $xml_empty = 1;
  $gi =~ s/\/$//;
 
# XML well-formedness says every document must have a container so an
# empty element cannot be the root, even if it is the only element in
# the document.
 
   if (!$element_seen) {
     print "Empty element <$gi/> cannot be the root.\n"; 
   }
}
else {
  $xml_empty = 0;
}
 
# Check to see that attributes are well-formed. 
 
if ($gi =~ /\s/) {
  $gi = $`;
  $attrline = $';
  $attrs = $attrline;
 
# This time we properly check to see that either ' ' or " " is
# used to surround the attribute values. 
 
  while ($attrs =~ /\s*([^\s=]*)\s*=\s*(("[^"]*")|('[^']*'))/) { 
 
# An end tag may not, of course, have attributes. 
	print "\tAttr: $1 $2\n"; 
      if ($st_or_et eq "\/") {
        print "Attributes may not be placed on end tags.\n"; 
        &print_error_at_context;
      }
      $attrname = $1;
 
# Check for a valid attribute name.
 
      &check_name($attrname);
      $attrs = $';
     }
     $attrs =~ s/\s//g;
 
# The above regex should have processed all the attributes. If anything
# is left after getting rid of white space it is because the attribute
# expressesion was malformed.
 
     if ($attrs ne "") {
       print "Malformed attributes.\n";
       &print_error_at_context;
     }
  }
 
# If XML is declared case-sensitive the following line should be
# removed. At the moment it isn't so I set everything to lower
# case so we can match start and end tags irrespective of case
# differences.
 
$gi =~ tr/A-Z/a-z/;
if (!$element_seen) {
  $ROOT = $gi; }
 
# Check to see that the generic identifier is a well-formed name. 
 
&check_name($gi);
 
# If I have an end tag I just check the top of the stack, the
# end tag must match the last parent or it is an error. If I
# find an error I have I could either pop or not pop the stack.
# What I want is to perform some manner of error recovery so
# I can continue to report well-formed errors on the rest of
# the document. If I pop the stack and my problem was caused
# by a missing end tag I will end up reporting errors on every
# tag thereafter. If I don't pop the stack and the problem
# was caused by a misspelled end tag name I will also report
# errors on every following tag. I happened to chose the latter. 
 
if ($st_or_et eq "\/") {
  $parent = $ancestors[$#ancestors];
  if ($parent ne $gi) {
    if (@ancestors eq $ROOT) { @ancestors = ""; } 
    else {
      &print_error_at_context;
    }
  }
  else {
    pop @ancestors;
  }
}
else {
 
# This is either an empty tag or a start tag. In the latter case
# push the generic identifier onto the ancestor stack. 
 
   if (!$xml_empty) {
     push (@ancestors, $gi); }
}
 
}
#--------------------------------------------------------------------------# 
 
# Skip over processing instructions.
 
sub process_pi {
if ($gi !~ /\?$/) {
   $file =~ /\?>/;
   $gi = $gi.$`."?";
   $file = $';
}
}
#--------------------------------------------------------------------------# 
sub print_error_at_context {
 
# This routine prints out an error message with the contents of the
# ancestor stack so the context of the error can be identified. 
 
# It would be most helpful to have line numbers. In principle it
# is possible but more difficult since we choose to not process the
# document line by line. We could still count line break characters
# as we scan the document.
 
# Nesting errors can cause every tag thereafter to generate an error
# so stop at 10.
 
if ($error_count == 10) {
  print "More than 10 errors ...\n";
  $error_count++;
}
else {
  $error_count++;
  print "Not well formed at context ";
 
# Just cycle through the ancestor stack. 
 
  foreach $element (@ancestors) {
    print "$first$element";
    $first = "->";
  }
  $first = "";
  print " tag: <$st_or_et$gi $attrline>\n"; 
}
 
}
#--------------------------------------------------------------------------# 
 
# Check for a well-formed Name as defined in the Name production. 
 
sub check_name {
local($name) = @_;
 
if ($name !~ /^[A-Za-z_:][\w\.\-:]*$/) { 
  print "Invalid element or attribute name: $name\n"; 
  &print_error_at_context;
}
}
#---------------------
