#!/usr/bin/perl

#--------------------------------------------------------
# lyx2cnx-alpha-1.2
#
# Lyx to CNXML Translator
# First Alpha Version (second revision)
#
# By Chris Winstead
# Utah State University
# winstead@engineering.usu.edu
#
# Project initiated: Oct. 20, 2005
# Most recent revision: Feb. 23, 2006
#
# This Perl script translates basic LyX paragraph
# fields into corresponding CNXML fields. It also
# uses Tralics for translating equations, which 
# is an easy-to-install-and-use external program
# available at:
#
# http://www-sop.inria.fr/apics/tralics/
#
#
# For documentation on this page, see the Connexions
# module titled "Creating Connexions Content Using LyX" at:
#
# http://cnx.org/content/m13238/latest/
#
# Connexions, by the way, is an online system for 
# authoring, managing, sharing and deploying web-based
# instructional content for University courses.
# The Connexions system is currently based at
#
# http://cnx.rice.edu
#
#=========================================================
# ------------
# --  USAGE --
# ------------
# lyx2cnx lyxfile cnxfile
#
# When executed, the script looks for a file called 
# template.cnxml, which is the empty document generated
# by Connexions when a module is first created.  LyX2cnx
# builds the document from this template by inserting your
# translated content between the <content> and </content>
# tags in the template. The resulting cnxfile can be 
# imported to Connexions as Plain XML.
#
# The template file is optional, but without a template
# the output file will not have appropriate header and
# footer content (so you have to cut-and-paste it into
# Connexions).
#
# WARNING: LyX2cnx also creates a directory called "temp" 
# where equations are translated. This directory is removed
# when the script is finished. If you already have a "temp"
# subdirectory, LyX2cnx will erase it.
#
#==========================================================
#
# Enjoy.
#
# All use, redistribution and modification are AUTHORIZED,
# but attribution must be given to the author, Chris Winstead.
# The author would also like to be informed of any useful
# modifications to or bugs in this script.
#
#
# There is no express or implied warranty, etc. for this
# software. It is provided as-is, and the author accepts
# no liability for any damages which may result from its
# use. 
#
#--------------------------------------------------------

# Document text ($f), spare buffer ($f2) and template text ($t).
$f = "";
$f2 = "";
$t = "";

# Validate and open the command-line files BEFORE creating the scratch
# directory, so that a usage error does not leave a stray "temp" behind.
# The three-argument form of open() prevents characters such as '>' or '|'
# in a file name from being interpreted as an open mode.  The explicit
# defined/length checks are required because a three-argument open() with an
# undefined name would otherwise open an anonymous temporary file.
(defined($ARGV[0]) && length($ARGV[0]) && open(srcfile, '<', $ARGV[0]))
    || die "Invalid input file.\n";
(defined($ARGV[1]) && length($ARGV[1]) && open(outfile, '>', $ARGV[1]))
    || die "Invalid output or no file specified.\n";

# The template is optional: $tmp_exist is false when it cannot be opened and
# the output is then written without header and footer.
$tmp_exist = open(template, '<', "template.cnxml");

# Scratch directory in which the equations are translated.
unless (-d "temp")
{
    mkdir("temp") || warn "Could not create directory \"temp\": $!\n";
}

# Read the whole LyX document into $f.
$f = join("", <srcfile>);
close(srcfile);

# Read the whole template (when present) into $t.
if ($tmp_exist)
{
    $t = join("", <template>);
    close(template);
}


# NOTE(review): the three variables below are not referenced anywhere else in
# the visible script; they are kept in case external code relies on them.
@Roots = ("\\begin_inset", "\\begin_inset Float figure");
$RootTags = "begin_inset";
$CloseTag = "end_inset";


#----------------------------------------------------------------------------------
# Separate out the equations.
#
# Users can write equations with the LyX editor, which are translated by Tralics.
# Optionally, users can supply their own MathML translation within a Minipage 
# environment positioned immediately after the LyX equation. Equations to be 
# translated are in the @TexEquations array. User translations are placed in
# the @UserMathML array.
#----------------------------------------------------------------------------------

# Parallel arrays: $TexEquations[$n] is the LaTeX source of the n-th Formula
# inset, $UserMathML[$n] is the text of the Minipage that immediately follows
# it (or "" when there is none).
@TexEquations = ();
@UserMathML = ();

while ($f =~ /(\\begin_inset\s*?Formula([\w\W]+?)\\end_inset)([\s]*?
      \\begin_inset\sMinipage[\w\W]*?\\layout[\w\W]*?\n([\w\W]*?)\s*?\\end_inset)?/gcx)
  {
    push(@TexEquations, $2);
    # $4 is undefined when the optional Minipage group did not take part in
    # the match; store an empty string so later length() tests are well defined.
    push(@UserMathML, defined($4) ? $4 : "");
  }


# Now remove the Minipages.
$f =~ s{\\begin_inset\sMinipage[\w\W]*?\\end_inset}{}gx;

# Now replace the XML special characters '&', '<' and '>'.
# The ampersand must be handled first so that the entities produced for
# '<' and '>' are not escaped a second time.  An ampersand that already
# starts an entity or character reference (e.g. "&amp;" or "&#169;") is left
# alone, so documents that were escaped by hand keep working.
$f =~ s{&(?!(?:[A-Za-z][A-Za-z0-9]*|\#[0-9]+|\#x[0-9A-Fa-f]+);)}{&amp;}g;
$f =~ s{\<}{\&lt;}gmx;
$f =~ s{\>}{\&gt;}gmx;


#----------------------------------------------------------------------------------
# Process Section, Figure, Paragraphs, and other LyX environments:
#----------------------------------------------------------------------------------

# Section processor (labeled):
# A "\layout Section" whose title starts with a \label inset becomes
# <section id="LABEL">; the section body runs up to the next
# "\layout Section" or to "\the_end".
$f =~ s{\\layout Section\s*(\\begin_inset LatexCommand \\label\{(.+?)\}\s*\\end_inset)(.*?)(\\layout.*?)(?=(\\layout Section)|(\\the_end))}{\n<section id="$2">\n<name>$3</name>\n\n$4\n</section>}sg;

# Section processor (unlabeled):
# The remaining sections receive generated ids "sec1", "sec2", ...
# The look-ahead also stops at "<section" followed by a word boundary: the
# labeled pass above has already emitted tags of the form <section id="...">,
# and without this an unlabeled section would swallow every labeled section
# that follows it.  The counter is advanced in the replacement (/e), i.e.
# exactly once per section actually converted.
my($secid) = 0;
$f =~ s{\\layout Section\s*([^\\]*)(\\layout.*?)(?=(\\layout Section)|(\\the_end)|(<section\b))}{ $secid++; qq{\n<section id="sec$secid">\n<name>$1</name>\n\n$2\n</section>} }sge;


# Remove superfluous (empty) paragraphs
$f =~ s{\\layout\s*?Standard[\n\s\t]*?(?=\\begin_inset\s*?Float)}{}gmx;


# Figure processor:
# Each figure float becomes a <figure> element holding a <media> element;
# $1 is the image file name without its directory part, $2 its extension.
# Figures without a \label get generated ids "fig1", "fig2", ...  The counter
# is advanced in the replacement code (/e), so it counts converted figures
# only and does not depend on how often the regex engine attempts a match.
my($fignum) = 0;
# Labeled figures:
$f =~ 
  s{\\begin_inset Float figure[^\\]*(?:\\layout Standard)?\s*(?:\\align center)?\s*\\begin_inset Graphics\s*filename\s*?(?:\w*/)*([\w\d\_-]+?\.([\w\d]+?))\s[^\\]*\\end_inset[^\\]*\\layout Caption[^\\]*(?:\\begin_inset LatexCommand \\label\{(.+?)\}\s*\\end_inset)([^\\]*)\\end_inset}
{\n<figure id=\"$3\">\n  <media type="image/$2" src="$1"/>
  <caption>\n$4\n  </caption>\n</figure>\n}sg;

# Unlabeled figures:
$f =~
  s{\\begin_inset Float figure[^\\]*(?:\\layout Standard)?\s*(?:\\align center)?\s*\\begin_inset Graphics[^\\]*filename\s*?(?:\w*/)*([\w\d\_-]+?\.([\w\d]+?))\s[^\\]*\\end_inset[^\\]*\\layout Caption([^\\]*)\\end_inset}
   { $fignum++;
     qq{\n<figure id="fig$fignum">\n  <media type="image/$2" src="$1"/>\n  <caption>\n$3\n  </caption>\n</figure>\n}
   }sge;

# Captionless figures:
$f =~
  s{\\begin_inset Float figure[^\\]*(?:\\layout Standard)?\s*(?:\\align center)?\s*\\begin_inset Graphics[^\\]*filename\s*?(?:\w*/)*([\w\d\_-]+?\.([\w\d]+?))\s[^\\]*\\end_inset\s*\\end_inset}
   { $fignum++;
     qq{\n<figure id="fig$fignum">\n  <media type="image/$2" src="$1"/>\n </figure>\n}
   }sge;


# Cross-reference processor:
# Replaces every LyX "\ref" inset with an empty <cnxn target="..."/> element
# (surrounded by one space on each side).  The target is the run of word
# characters, whitespace, '-' and ':' captured as $2 from inside the inset.
# Any "Figure", "Fig.", "Section", "Sec." or "(" directly in front of the
# inset (matched case-insensitively) and any ")" directly behind it are part
# of the match and are therefore removed from the text.
$f =~
  s{(Figure|Fig\.|Section|Sec\.|\()*\s*\\begin_inset\s*?LatexCommand\s*?\\ref\W*?([-:\w\s\d]+?)\W*?\s*?\\end_inset\s*\)*}
  { <cnxn target="$2"/> }imgx;

# List processor (unordered)
# Two passes, and their order matters:
#  1. wrap the text from a "\layout Itemize" up to the next '<' or the next
#     "\layout" that is not an Itemize in <list> ... </list>;
#  2. turn every "\layout Itemize" entry (text up to the next "\layout" or
#     '<') into an <item> element.
$f =~ s{(\s*\\layout\s*Itemize[\w\W]+?)(<|\\layout(?!\s*Itemize))}{\n<list>\n$1\n</list>\n$2}igx;
$f =~ s{(\s*\\layout\s*Itemize\s*)([\w\W]+?)\s*(?=\\layout|<)}{\n<item>$2</item>\n}igx;

# List processor (ordered)
# Same two passes for "\layout Enumerate"; the wrapper is
# <list type="enumerated">.
$f =~ s{(\s*\\layout\s*Enumerate[\w\W]+?)(<|\\layout(?!\s*Enumerate))}{\n<list type="enumerated">\n$1\n</list>\n$2}igx;
$f =~ s{(\s*\\layout\s*Enumerate\s*)([\w\W]+?)\s*(?=\\layout|<)}{\n<item>$2</item>\n}igx;

# Paragraph processor:
# Every "\layout Standard" block becomes a <para> element.  A paragraph ends
# at the next "\layout Standard", at an already generated <list, <section,
# </section or <figure tag, or at "\the_end".
$f =~
  s{\s*\\layout\s*Standard\s*([\w\W]+?)\s*(?=\\layout\s*Standard|<list|<section|</section|<figure|\\the_end)}
  {\n<para>\n $1 \n</para>\n}igx;

# Add IDs to paragraphs ("para1", "para2", ...).
# The counter is advanced in the replacement code (/e) so that it is tied to
# the number of tags rewritten, not to regex-engine internals.
my($paridx) = 0;
$f =~ s{<para>}{ $paridx++; qq{<para id="para$paridx">} }ge;

# Add IDs to lists ("list1", "list2", ...).  "<list" also matches the
# <list type="enumerated"> tags; the id is inserted in front of the type.
my($listidx) = 0;
$f =~ s{<list}{ $listidx++; qq{<list id="list$listidx"} }ge;


#----------------------------------------------------------------------------------
# Miscellaneous operations:
#----------------------------------------------------------------------------------


# Convert emphasis tags
# Text between "\emph on" and "\emph default" is wrapped in
# <emphasis>...</emphasis> followed by one space; newlines in front of the
# opening marker and whitespace around both markers are discarded.
$f =~ s{[\n]*\\emph\son[\n\s]*([\w\W]*?)[\n\s]*\\emph\sdefault[\s\n]*}{\<emphasis\>$1\</emphasis\> }igx;

# Convert "typewriter" to "code"
# Only matches when the text between the two \family markers contains no
# backslash (i.e. no nested LyX command).
$f =~ s{\\family typewriter([^\\]*)\\family default}{\<code\>$1\</code\> }sg;

# Convert "bold" to "term"
# Same restriction as above: the bold text must not contain a backslash.
$f =~ s{\\series bold([^\\]*)\\series default}{\<term\>$1\</term\> }sg;

# Convert URLs
# A "\url[TEXT]{ADDRESS}" inset becomes <link src="ADDRESS">TEXT</link>.
# The bracketed TEXT part is mandatory in this pattern.
$f =~ s{[\n]*\\begin_inset\sLatexCommand\s\\url\[([^\]]+)\]\{([^\}]+)\}[^\\]*\\end_inset}
  {\<link src=\"$2\"\>$1\</link\>}gx;


#----------------------------------------------------------------------------------
# Now translate the equations 
#----------------------------------------------------------------------------------

# Spot labels within TexEquations:
# $EquationLabels[$n] is the argument of the first \label{...} found in the
# n-th equation, or "" when the equation has no label.
@EquationLabels = ();
foreach my $tex (@TexEquations)
  {
    push(@EquationLabels, ($tex =~ /\\label\{([\w\W]+?)\}/) ? $1 : "");
  }


# Translate all equations; ProcessEquations() appends its results to @Equations.
@Equations = ();
ProcessEquations();

# Now substitute User MathML where it is available, and wrap labeled
# equations in <equation id="LABEL">.
#
# The loop runs over the equations found in the document rather than over the
# translator output, so user-supplied MathML is still used when the external
# translation produced fewer results than expected (or none at all).
# Any non-empty label is honoured; short labels such as "e1" used to be
# dropped, which left cross-references to them without a target.
foreach my $idx (0 .. $#TexEquations)
  {
    my $label  = $EquationLabels[$idx];
    my $mathml = $UserMathML[$idx];

    if (defined($mathml) && length($mathml) > 2)
      {
	# User translation takes precedence; it is stored on a single line.
	$mathml =~ s/\n//g;
	$UserMathML[$idx] = $mathml;
      }
    elsif (defined($Equations[$idx]))
      {
	# An unlabeled machine translation is already in its final form.
	next unless length($label);
	$mathml = $Equations[$idx];
      }
    else
      {
	# Neither a user translation nor a machine translation exists.
	next;
      }

    if (length($label))
      {
	$mathml = "<equation id=\"" . $label . "\">\n" . $mathml . "\n</equation>\n";
      }

    $Equations[$idx] = $mathml;
  }


# Now replace the Tex equations with MathML versions:
# The n-th Formula inset in the document is replaced by a newline followed by
# the n-th entry of @Equations.  After the statement $eqidx holds the number
# of insets that were replaced.
$eqidx = 0;
$f =~ s{(\\begin_inset\s*Formula.*?\\end_inset)}{ $eqidx++; "\n" . $Equations[$eqidx-1] }gise;


#----------------------------------------------------------------------------------
# Final document construction.
#----------------------------------------------------------------------------------

# Now chop the header and footer
# Everything in front of the first generated <para, <section or <figure tag
# is LyX preamble and is dropped, as is the closing "\the_end" marker.
$f =~ s{[\w\W]*?(?=<para|<section|<figure)}{}mx;
$f =~ s{\\the_end}{}mx;

# Miscellaneous cleanup:
$f =~ s{[\n]*?[.]}{.}gmx;
$f =~ s{\\backslash}{\\}gmx;
$f =~ s{\n\\newline}{}gmx;
$f =~ s{\n[\n]*}{\n}gx;
$f =~ s{[\n\s]*\\begin_inset\sQuotes\seld\s\\end_inset[\s\n\r]*}{ \"}gx;
$f =~ s{[\n\s]*\\begin_inset\sQuotes\serd\s\\end_inset[\s\n]*}{\" }gx;
$f =~ s{\s\.}{\.}gx;

# Split the template around its <content> ... </content> element.
# The capture variables are only read when the match succeeded: after a
# failed match $1 and $2 still hold the groups of an earlier regex, which
# must never end up in the output.  Without a usable template the content is
# written with an empty header and footer.
my($header, $footer) = ("", "");
if ($t =~ /^(.*<content>).*(<\/content>.*)$/s)
  {
    ($header, $footer) = ($1, $2);
  }
print outfile $header."\n\n".$f."\n\n".$footer;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so close it explicitly and report a failure.
close(outfile) || warn "Error while writing the output file: $!\n";

# Remove the scratch directory.
system("rm -R temp");

#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# END OF MAIN CODE
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------


#----------------------------------------------------------------------------------
# Subroutine for translating equations (requires TRALICS)
#----------------------------------------------------------------------------------

# ProcessEquations()
#
# Writes every entry of the global @TexEquations into temp/equations.tex,
# runs "tralics -xml" on that file inside the "temp" directory, and appends
# one entry to the global @Equations for every line of temp/equations.xml
# that contains a <math> or <simplemath> element.
#
# Takes no arguments and returns nothing.  Failures (scratch directory
# missing, Tralics not installed, no output file) are reported with warn()
# and leave @Equations without further entries; they do not abort the script.
sub ProcessEquations
  {
    my $eqns;
    unless (open($eqns, '>', "temp/equations.tex"))
      {
	warn "Cannot write temp/equations.tex: $!\n";
	return;
      }
    print $eqns "\\documentclass{article}\n\\makeatletter";
    print $eqns "\n\\makeatother\n\\begin{document}\n";
    print $eqns join("\n\n",@TexEquations);
    print $eqns "\n\\end{document}\n\n";
    close($eqns) || warn "Error while writing temp/equations.tex: $!\n";

    # Only step back out of "temp" when stepping into it succeeded;
    # otherwise the script would end up in the parent of its working
    # directory and later clean up the wrong place.
    unless (chdir 'temp')
      {
	warn "Cannot change into directory temp: $!\n";
	return;
      }
    system("tralics -xml equations.tex > dump") == 0
      || warn "tralics did not run successfully; equations may be missing.\n";
    chdir '..' || warn "Cannot change back to the parent directory: $!\n";

    my $mathml;
    unless (open($mathml, '<', "./temp/equations.xml"))
      {
	warn "Cannot read temp/equations.xml: $!\n";
	return;
      }

    while (my $equ = <$mathml>)
      {
	# Lines without a formula are of no interest.
	next unless $equ =~ /<math|<simplemath/;

	$equ =~ s{</*p>}{}g;                              # drop <p> and </p>
	$equ =~ s/xmlns=\'.+?\'//g;                       # drop xmlns='...' attributes
	$equ =~ s/<simplemath>/<math><mo>/g;              # <simplemath>x</simplemath>
	$equ =~ s/<\/simplemath>/<\/mo><\/math>/g;        #   becomes <math><mo>x</mo></math>
	$equ =~ s/(<\/*)/$1m:/g;                          # prefix every tag name with "m:"
	$equ =~ s/<\/*m:formula.*?>//g;                   # drop the <formula> wrapper tags
	$equ =~ s/(<m:mfenced.*?>)/$1<m:mrow>/sg;         # wrap the content of every
	$equ =~ s/(<\/m:mfenced.*?>)/<\/m:mrow>$1/sg;     #   mfenced element in an mrow
	push(@Equations, $equ);
      }

    close($mathml);
    return;
  }



