/*
 * Copyright (c) 1995, 1996 Gunther Schadow.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "pg_config.h"

IDENT("@(#) unitparse.cc (Gunther Schadow) 06/27/96");

#include "Unit.h"

#include <ctype.h>
#include <math.h>
#include <stdio.h> // for sprintf() only!
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include "exception.h"
#include "logfile.h"

/*
 * The unit parser
 *
 * Synopsis: [10*[+|-]exponent][prefix]unit[[+|-]exponent]
 *
 * The tricky point here is that prefix and unit are ambiguous. The
 * solution might be to always scan the best fit for the unit then
 * take the rest for the prefix. It is easy if unit is enclosed in
 * parenthesis but there are normally no prefixes in those cases.
 *
 * Parentheses seem to be used for two different ends:
 * 1. to enclose new, mostly dimensionless pseudo-units. The use of
 *    this remains unclear to me, since the prefix is enclosed together
 *    with the unit. Examples:
 *
 *    (tot)   particles total count
 *    (cfu)   colony forming units
 *    (ppm)   parts per million
 *    (kat)   katal
 *    (kkat)  kilocatal
 *    (ph)    pH
 *
 *    Let's hope that there are no true ambiguities, as there would be
 *    if we allowed the unit ``at'' (Atü = atmospheric pressure excess),
 *    which would turn a katal (kat) into a kilo-Atü (k-at).
 *
 * 2. The parenthesized term is appended to the prefix-unit term and
 *    encloses information about the substance that the measurement
 *    relates to. Examples:
 *
 *    cm(h2o) cm of water column, a unit for pressure, not for length
 *    g(creat)    grams of creatinine
 *    g(hgb)      grams of hemoglobin
 *    g(tot_prot) grams of total protein
 *    g(wet_tis)  grams of wet tissue
 *
 *    This group again divides into two subgroups: While in ``cm(h2o)''
 *    the parenthesized term totally changes the dimension of the unit,
 *    the parentheses in ``g(...)'' do not change the dimension but
 *    attach information about the substance to it. This is particularly
 *    valuable in the creatinine and haemoglobin cases, since it allows
 *    us to calculate the substance quantity (moles). However, in the
 *    protein or tissue case we cannot use the term. Anyway, to make proper
 *    use of the substance information we need a substance code database
 *    which would provide us with information like the molar mass
 *    or density in order to calculate the conversions. On the other hand,
 *    in the cases of haemoglobin and creatinine the information in paren-
 *    theses duplicates the information that was already supplied in the
 *    observation identifier.
 *      For now, since establishing a sophisticated substance code and
 *    database is too much and unsafe (non-standard) work, I decide to
 *    generally forget about the parenthesized substance terms, if they
 *    are not known as a whole to the units database. I.e. in case of
 *    cm(h2o) we certainly must value the parenthesized ``h2o'' but we
 *    do this by adding the whole unit term m(h2o) as a unit of pressure
 *    to the unit database. The gram terms, particularly the ``wet tissue''
 *    will not be valued at all. It is still possible to value g(creat)
 *    as a dimensionless unit of substance quantity (in moles).
 *
 * Finally (if it wasn't already enough :-() there is also the custom to
 * use terms like ``10*3/ml'' as a unit instead of the proper standard
 * unit ``/ul''. This must be taken care of too.
 */

/* set(const char *) -- The unit parser
 *
 * Parses the unit term `t' (for instance "kg.m/s2", "/ul", "10*3/ml" or
 * "cm(h2o)") factor by factor and accumulates base, coefficient and
 * conversion functions in the member `unit'.  The term is first copied
 * with strdup() into unit.name and lower-cased there; parsing works on
 * that copy.  Code::set() is called once the whole term was consumed.
 *
 * Method outline:
 * 1. Scan an operator followed by one unit factor with exponent. The
 *    first factor has the implied operator '.'.
 * 2. Try to interpret the whole symbol as a unit with no prefix. If
 *    this fails, assume the first character to be the prefix and try
 *    the rest as the unit; if that fails, extend the prefix by one more
 *    character, and so forth (see tounit()).
 * 3. If a unit symbol starts with a parenthesis, just leave the
 *    parenthesis away.
 * 4. If a unit is followed by a parenthesized suffix, try to find a
 *    unit including that suffix. If this fails, retry without the
 *    suffix BUT ISSUE A WARNING, since dropping it is not safe when the
 *    other party intends an effect as with cm(h2o).
 *
 * Problems are reported through EPARSE.
 * NOTE(review): the scanning loops below assume that EPARSE does not
 * return to the caller (presumably it throws) -- confirm in exception.h.
 */

void Unit::set(const char *t)
{
  unit = UnitAtom::null_unit;
  unit.name = strdup(t);

  // convert to lower case, I don't want this, but HL7 says so!
  // The <ctype.h> functions are only defined for values representable
  // as unsigned char (or EOF), hence the detour through unsigned char.
  for(char *p = unit.name; *p != '\0'; p++)
    *p = static_cast<char>(tolower(static_cast<unsigned char>(*p)));
  t = unit.name;

  const int op_div = -1;
  const int op_mul = +1;

  bool first_factor = true;

  /*
   * First factor has an optional operator
   */
  int op = op_mul; // default operator is '.' on first factor

  if(*t == '/')
    {
      op = op_div;
      t++;
    }
  else if(*t == '.')
    {
      t++;
    }

  /*
   * Enter the factor's loop
   */
  do
    {
      bool outer_paren = false;

      if(*t == '(') // parenthesized unit
        {
          outer_paren = true;
          ++t;
        }

      /*
       * find end of unit symbol
       */
      const char *d = t;

      if(*t == '1')
        {
          if(t[1] == '0' && t[2] == '*') // The pseudo-unit "10*"
            d = &t[3];
          else if(!isdigit(static_cast<unsigned char>(t[1]))) // pseudo unit "1"
            d = &t[1];
          // otherwise d == t: the empty symbol is rejected by tounit() below
        }
      else
        {
          // The symbol ends at an operator, sign, digit, parenthesis or at
          // the end of the term (strchr() would match the NUL as well; the
          // explicit test only makes that visible).
          while(*d != '\0' && !strchr("./+-0123456789()", *d))
            d++;
        }

      int pfx = 0;
      UnitAtom ua;

      if(*d == '(') // postponed parenthesis
        {
          const char *d1 = d;
          while(*d1 != ')') // scan to end of parenthesis
            {
              if(*d1 == '\0')
                {
                  EPARSE("premature end of term: `%s'", t);
                }
              else
                {
                  d1++;
                }
            }
          d1++; // step behind the ')'

          // try to resolve the symbol with parentheses
          if(tounit(t, d1 - t, pfx, ua))
            {
              d = d1;
            }
          else
            {
              LOGWARNING("unable to resolve symbol: `%s'", t);

              // try to resolve the symbol without parenthesis
              if(tounit(t, d - t, pfx, ua))
                {
                  d = d1; // discard postponed parentheses
                }
              else
                {
                  EPARSE("unable to resolve symbol: `%s'", t);
                }
            }
        }
      else if(!tounit(t, d - t, pfx, ua)) // try to resolve the symbol
        {
          EPARSE("unable to resolve symbol: `%s'", t);
        }

      t = d;

      // factor was parenthesized: try the right parenthesis here; if it
      // is not found yet, it is looked for again behind the exponent
      if(outer_paren && *t == ')')
        {
          t++;
          outer_paren = false;
        }

      /*
       * We found a valid unit, pfx contains the prefix and ua the unit.
       *
       * Read an optional exponent:
       */
      long exponent = 1; // default exponent is 1

      if(*t != '\0' && strchr("+-0123456789", *t))
        {
          char *end;
          const long parsed = strtol(t, &end, 10);
          if(end != t)
            {
              exponent = parsed;
              t = end;
            }
          // else: a sign without digits; the exponent stays 1 and t still
          // points at the sign, which one of the checks below reports.
        }

      if(outer_paren) // there is still an open parenthesis
        {
          if(*t == ')') // try the right parenthesis
            {
              t++;
            }
          else
            {
              EPARSE("missing right parenthesis `%s'", t);
            }
        }

      /*
       * OK, we've got everything, now update the unit structure:
       */
      unit.base += ua->base * ( op * exponent );

      if(op == op_mul)
        unit.coeff_mantissa *= ::pow(ua->coeff_mantissa, (double)exponent);
      else
        unit.coeff_mantissa /= ::pow(ua->coeff_mantissa, (double)exponent);

      unit.coeff_exponent += op * ( pfx + ua->coeff_exponent ) * exponent;

      /* Allow conversion functions only for isolated units, i.e. no
       * exponents other than 1 and must be the single factor of a
       * unit term.
       */
      if(ua->cnv_from != NULL)
        {
          if((op * exponent) != 1 || !first_factor || *t != '\0')
            {
              EPARSE("unit has conversion functions but is not isolated: `%s'"
                     , unit.name);
            }
          else
            {
              unit.cnv_from = ua->cnv_from;
              unit.cnv_to   = ua->cnv_to;
            }
        }

      first_factor = false;

      /***********************************************************
       * end of factor
       * now scan for next operator:
       */
      if(*t == '.')
        {
          op = op_mul;
        }
      else if(*t == '/')
        {
          op = op_div;
        }
      else if(*t != '\0') // an operator must follow or end of term
        {
          EPARSE("syntax error in unit term: `%s'", t);
        }
    }
  while(*t++ != '\0'); // step over the operator; stop at end of term

  Code::set();
}

/* tounit -- resolve a symbol into an optional prefix plus a unit
 *
 * sym, len  the symbol: the first `len' characters of `sym'
 * pfx       receives the value returned by prefices::lookup() for the
 *           prefix length that was tried last
 * ua        unit atom on which lookup() is called with the remainder
 *           of the symbol
 *
 * Prefix lengths 0, 1, ... len-1 are tried in this order and the first
 * combination of a known prefix and a known unit wins.  Returns true
 * on success and false if no combination matched (in particular for
 * len == 0, in which case pfx is left untouched).
 */
bool
Unit::tounit(const char *sym, size_t len, int &pfx, UnitAtom &ua)
{
  // NUL-terminated private copy of the symbol.  A heap-backed buffer
  // replaces the former variable-length array, which is not standard
  // C++ and put a caller-sized object on the stack.
  std::vector<char> buf(len + 1, '\0');
  strncpy(&buf[0], sym, len);

  const char *root = NULL;

  for(size_t pfxlen = 0; pfxlen < len; pfxlen++)
    {
      pfx = prefices::lookup(&buf[0], pfxlen, &root);

      // Check the prefix first: `root' is only consulted once the prefix
      // lookup reported success.
      // NOTE(review): assumes prefices::lookup() stores the remainder of
      // the symbol in `root' whenever it finds a prefix -- confirm.
      if(pfx != prefices::not_found && ua.lookup(root))
        return true;
    }

  return false;
}

/* Invert a unit string by exchanging the operators '/' and '.':
 *
 * "kg.m/s2" -> "/kg/m.s2"
 *
 * The result is not pretty, but it is correct, and that is what
 * matters here.
 *
 * NOTE: The result is a fresh buffer obtained with new char[]; the
 * caller has to release it after a call to invert(const char*).
 */

char *Unit::invert(const char *u) // invert a unit string
{
  // every input character, one possibly added leading '/', and the NUL
  char *const result = new char[strlen(u) + 2];
  char *out = result;
  const char *in = u;

  // The first factor: a leading '/' is dropped, otherwise a '/' is put
  // in front (replacing an explicit leading '.', if there is one).
  if(*in == '/')
    {
      ++in;
    }
  else
    {
      if(*in == '.')
        ++in;
      *out++ = '/';
    }

  // The remainder: swap the two operators, copy everything else.
  for(; *in != '\0'; ++in, ++out)
    {
      if(*in == '.')
        *out = '/';
      else if(*in == '/')
        *out = '.';
      else
        *out = *in;
    }

  *out = '\0';
  return result;
}

/* Raise a unit string to the n-th power, i.e. add or update the
 * exponent of every factor:
 *
 * kg.m/s2 (^ -2) --> kg-2.m-2/s-4
 *
 * Every factor is read with the grammar that set(const char*) accepts,
 *
 *   ['('] symbol ['(' substance ')'] [')'] [exponent] [')']
 *
 * where symbol is "10*", "1" or a run of characters other than
 * "./+-0123456789()".  An existing exponent is multiplied by n, a
 * missing one becomes n; exponents are always written with a sign.
 * Operators, parentheses and the substance term (e.g. the "(h2o)" in
 * "cm(h2o)") are copied unchanged, as is a leading '/' or '.'.
 * Characters that do not fit the grammar are copied unchanged as well,
 * so that a later set() can report them.
 *
 * Note that pow(t, -1) produces different results from invert(t).
 *
 * The result is a new string obtained from strdup(); the caller has to
 * release it with free().
 */

/* Append e to out in decimal with an explicit sign ("+2", "-4"). */
static void append_signed_exponent(std::string &out, long e)
{
  char digits[32]; // ample for the sign and digits of a 64-bit long
  snprintf(digits, sizeof digits, "%+ld", e);
  out += digits;
}

char* Unit::pow(const char *u, int n)
{
  std::string out;
  out.reserve(strlen(u) + 16); // only a hint, the string grows on demand

  const char *p = u;

  // optional operator in front of the first factor
  if(*p == '/' || *p == '.')
    out += *p++;

  // At least one factor is processed, so an empty term yields just the
  // exponent.
  do
    {
      bool outer_paren = false;

      if(*p == '(') // parenthesized factor
        {
          outer_paren = true;
          out += *p++;
        }

      // the unit symbol
      if(p[0] == '1' && p[1] == '0' && p[2] == '*') // pseudo-unit "10*"
        {
          out.append(p, 3);
          p += 3;
        }
      else if(p[0] == '1' && !isdigit(static_cast<unsigned char>(p[1])))
        {
          out += *p++; // pseudo-unit "1"
        }
      else
        {
          while(*p != '\0' && !strchr("./+-0123456789()", *p))
            out += *p++;
        }

      // substance term behind the symbol: digits in there are part of
      // the name and not an exponent, so copy it as it is
      if(*p == '(')
        {
          while(*p != '\0' && *p != ')')
            out += *p++;
          if(*p == ')')
            out += *p++;
        }

      // the right parenthesis may come before ...
      if(outer_paren && *p == ')')
        {
          out += *p++;
          outer_paren = false;
        }

      // the exponent: scale an existing one, otherwise add n
      long exponent = n;
      if(*p != '\0' && strchr("+-0123456789", *p))
        {
          char *end;
          const long old_exponent = strtol(p, &end, 10);
          if(end != p) // a sign without digits is no exponent
            {
              exponent = static_cast<long>(n) * old_exponent;
              p = end;
            }
        }
      append_signed_exponent(out, exponent);

      // ... or behind the exponent
      if(outer_paren && *p == ')')
        out += *p++;

      // The operator that introduces the next factor.  Anything else is
      // copied too, which guarantees progress on malformed input.
      if(*p != '\0')
        out += *p++;
    }
  while(*p != '\0');

  return strdup(out.c_str());
}