/* * Copyright (c) 1995, 1996 Gunther Schadow. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "pg_config.h" IDENT("@(#) unitparse.cc (Gunther Schadow) 06/27/96"); #include "Unit.h" #include #include // for sprintf() only! #include #include #include #include "exception.h" #include "logfile.h" /* * The unit parser * * Synopsis: [10*[+|-]exponent][prefix]unit[[+|-]exponent] * * The tricky point here is that prefix and unit are ambiguous. The * solution might be to always scan the best fit for the unit then * take the rest for the prefix. It is easy if unit is enclosed in * parenthesis but there are normally no prefixes in those cases. * * Parentheses seem to be used for two different ends: * 1. to enclose new, mostly dimensionless pseudo-units. The use of * this remains unclear to me, since the prefix is enclosed together * with the unit. Examples: * * (tot) particles total count * (cfu) colony forming units * (ppm) parts per million * (kat) katal * (kkat) kilocatal * (ph) pH * * let's hope that there are no true ambiguities like if we would * allow the unit ``at'' (Atü = athmospheric pressure excess) which * would make a Katal (kat) to a kilo-Atü (k-at). * * 2. The parentheses term is postponed to the prefix-unit term and * encloses information about the substance that the measurement * relates to. 
Examples:
 *
 *      cm(h2o)         cm of water column, a unit for pressure, not for length
 *      g(creat)        grams of creatinine
 *      g(hgb)          grams of hemoglobin
 *      g(tot_prot)     grams of total protein
 *      g(wet_tis)      grams of wet tissue
 *
 *    This group again divides into two subgroups: While in ``cm(h2o)''
 *    the parentheses term totally changes the dimension of the unit,
 *    the parentheses in ``g(...)'' do not change the dimension but
 *    attach information about the substance to it. This is particularly
 *    valuable in the creatinine and haemoglobin case, since it allows
 *    us to calculate the substance quantity (moles). However, in the
 *    protein or tissue case we cannot use the term. Anyway, to make proper
 *    use of the substance information we need a substance code database
 *    which would provide us with information such as the molar mass
 *    or density in order to calculate the conversions. On the other hand,
 *    in the cases of haemoglobin and creatinine the information in paren-
 *    theses duplicates the information that was already supplied in the
 *    observation identifier.
 *    For now, since establishing a sophisticated substance code and
 *    database is too much and unsafe (non-standard) work, I decide to
 *    generally forget about the parenthesized substance terms, if they
 *    are not known as a whole to the units database. I.e. in case of
 *    cm(h2o) we certainly must value the parenthesized ``h2o'' but we
 *    do this by adding the whole unit term m(h2o) as a unit of pressure
 *    to the unit database. The gram terms, particularly the ``wet tissue'',
 *    will not be valued at all. It is still possible to value g(creat)
 *    as a dimensionless unit of substance quantity (in moles).
 *
 * Finally (if it wasn't already enough :-() there is also the custom to
 * use terms like ``10*3/ml'' as a unit instead of the proper standard
 * unit ``/ul''. This must be taken care of too.
 */

/* set(const char *) -- The unit parser
 *
 * Method outline:
 * 1. Scan an operator followed by one unit factor with exponent.
First * factor has implied operator '.'. * 2. Try to interpret the whole symbol as a unit with no prefix. If * this fails, assume first character as the prefix, if successful * try the unit again, if failed assume the prefix to include the * next character and so forth... * 3. If a unit symbol starts with a parentheses just leave away the * parenthese. * 4. If a unit is followed by a parentheses, try to find a unit with * the parenthesized suffix. If this fails, try to leave away the * parentheses BUT AT LEAST ISSUE A WARNING, since this is not save * in case the other party intents an effect as with cm(h2o) (see * above). */ void Unit::set(const char *t) { unit = UnitAtom::null_unit; unit.name = strdup(t); // convert to lower case, I don't want this, but HL7 says so! for(char *p = unit.name; *p != 0; p++) *p = tolower((int)*p); t = unit.name; const int op_div = -1; const int op_mul = +1; bool cnvf_found = FALSE; bool first_factor = TRUE; /* * First factor has an optional operator */ int op = op_mul; // default operator is '.' on first factor if(*t == '/') { op = op_div; t++; } else if(*t == '.') t++; /* * Enter the factor's loop */ do { bool outer_paren = FALSE; if(*t == '(') // parenthesized unit { outer_paren = TRUE; ++t; } /* * find end of unit symbol */ const char *d = t; if(*t == '1') { if(t[1] == '0' && t[2] == '*') // The pseudo-unit "10*" d = &t[3]; else if(!isdigit(t[1])) // The pseudo unit "1" d = &t[1]; } else while(! 
strchr("./+-0123456789()", *d)) if(*d == '\0') EPARSE("premature end of term: `%s'", t); else d++; int pfx; UnitAtom ua; if(*d == '(') // postponed parenthesis { const char *d1 = d; while(*d1 != ')') // scan to end of parenthesis if(*d1 == '\0') EPARSE("premature end of term: `%s'", t); else d1++; d1++; // try to resolve the symbol with parentheses if(tounit(t, d1 - t, pfx, ua)) d = d1; else { LOGWARNING("unable to resolve symbol: `%s'", t); // try to resolve the symbol without parenthesis if(tounit(t, d - t, pfx, ua)) d = d1; // discard postponed parentheses else EPARSE("unable to resolve symbol: `%s'", t); } } else if(! tounit(t, d - t, pfx, ua)) // try to resolve the symbol EPARSE("unable to resolve symbol: `%s'", t); t = d; if(outer_paren) // factor was parenthesized if(*t == ')') // try the right parenthesis { t++; outer_paren = FALSE; } // if right parenthesis could not be found here, try later /* * We found a valid unit, pfx contains prefix and un a pointer to a * unit structure. * * Read an optional exponent: */ long exp = 1; // default exponent is 1 if(strchr("+-0123456789", *t)) { exp = strtol(t, (char **)&d, 10); if(t == d) exp = 1; // FIXME: this is actually an error else t = d; } if(outer_paren) // there is still an open parenthesis if(*t == ')') // try the right parenthesis t++; else EPARSE("missing right parenthesis `%s'", t); /* * OK, we've got everything, now update the unit structure: */ unit.base += ua->base * ( op * exp ); if(op == +1) unit.coeff_mantissa *= ::pow(ua->coeff_mantissa, (double)exp); else unit.coeff_mantissa /= ::pow(ua->coeff_mantissa, (double)exp); unit.coeff_exponent += op * ( pfx + ua->coeff_exponent ) * exp; /* Allow conversion functions only for isolated units, i.e. no * exponents other than 1 and must be the single factor of a * unit term. 
*/ if(ua->cnv_from != NULL) if((op * exp) != 1 || !first_factor || *t != '\0') EPARSE("unit has conversion functions but is not isolated: `%s'" , unit.name); else { unit.cnv_from = ua -> cnv_from; unit.cnv_to = ua -> cnv_to; cnvf_found = TRUE; } first_factor = FALSE; /*********************************************************** * end of factor * now scan for next operator: */ if(*t == '.') op = op_mul; else if(*t == '/') op = op_div; else if(*t != '\0') // an operator must follow or end of term EPARSE("syntax error in unit term: `%s'", t); } while(*t++ != '\0'); Code::set(); } bool Unit::tounit(const char *sym, size_t len, int &pfx, UnitAtom &ua) { const char *root; char buf[len + 1]; strncpy(buf, sym, len); buf[len] = '\0'; for(size_t pfxlen = 0; pfxlen < len; pfxlen++) { pfx = prefices::lookup(buf, pfxlen, &root); if(ua.lookup(root) && pfx != prefices::not_found) return TRUE; } return FALSE; } /* invert a unit string, i.e. exchange the operators '/' and '.': * * "kg.m/s2" -> "/kg/m.s2" * * This method does not produce beautiful results, but they are correct * and this is what is important here. * * NOTE: A new string is allocated, thus there may be somthing to delete * after a call to invert(const char*). */ char *Unit::invert(const char *u) // invert a unit string { char *vbase = new char[strlen(u) + 2]; char *v = vbase; if(u[0] == '/') u++; else { if(u[0] == '.') u++; *v++ = '/'; } while(TRUE) { switch(*u) { case '\0': *v = *u; return vbase; case '.': *v = '/'; break; case '/': *v = '.'; break; default: *v = *u; break; } u++; v++; } } /* raise a unit string to the i-th power, i.e. add or update the exponents. * * kg.m/s2 (^ -2) --> kg-2.m-2/s-4 * * Note that pow(t, -1) produces different results from invert(t). * Again pow(const char*, int) allocates a new result string. 
*/

/* Unit::pow -- details
 *
 *   u	unit term in the syntax understood by set(const char *)
 *   n	the power every factor is raised to
 *
 * The term is split at the operators '.' and '/'. Within a factor an
 * existing exponent e is replaced by n * e, a factor without exponent
 * gets the exponent n appended; exponents are always written with
 * their sign. Things that look like exponents but are not are copied
 * unchanged:
 *   - the pseudo-units "10*" and "1" at the start of a factor,
 *   - a parenthesized suffix following the symbol, as in cm(h2o).
 * Parentheses around a whole factor are copied and the factor inside
 * is treated as usual: (tot)2 and (tot2) both get their exponent updated.
 *
 * The result is produced by strdup(), i.e. it is malloc'ed memory (and
 * NULL if strdup() fails).
 */
char* Unit::pow(const char *u, int n)
{
  /* Scratch buffer sized for the worst case: every input character
   * yields at most one signed number (20 characters for a 64 bit long)
   * or one copied character, and the end of the term may add one more
   * number plus the terminator.
   */
  const size_t room_per_char = 24;
  char *vbase = new char[(strlen(u) + 1) * room_per_char];
  char *v = vbase;

  if(u[0] == '/' || u[0] == '.')	// optional operator of first factor
    *v++ = *u++;

  bool symbol_start = true;	// nothing of the factor's symbol seen yet
  bool did_exponent = false;	// current factor has got its exponent

  for(;;)
    {
      const char c = *u;

      if(c == '\0' || c == '.' || c == '/')	// end of factor
	{
	  if(! did_exponent)
	    v += sprintf(v, "%+d", n);
	  if(c == '\0')
	    break;

	  *v++ = *u++;			// copy operator, start next factor
	  symbol_start = true;
	  did_exponent = false;
	}
      else if(symbol_start && c == '(')	// parenthesis around the factor
	*v++ = *u++;
      else if(symbol_start && c == '1' && u[1] == '0' && u[2] == '*')
	{				// The pseudo-unit "10*"
	  *v++ = *u++;
	  *v++ = *u++;
	  *v++ = *u++;
	  symbol_start = false;
	}
      else if(symbol_start && c == '1' && !isdigit((unsigned char)u[1]))
	{				// The pseudo unit "1"
	  *v++ = *u++;
	  symbol_start = false;
	}
      else if(c == '(')			// postponed parentheses
	{
	  // the text in parentheses belongs to the symbol, digits in
	  // there (h2o) are no exponents
	  while(*u != '\0' && *u != ')')
	    *v++ = *u++;
	  if(*u == ')')
	    *v++ = *u++;
	}
      else if(c == '+' || c == '-' || isdigit((unsigned char)c))
	{				// i.e. an exponent is already there
	  char *d;
	  long e = strtol(u, &d, 10);
	  if(u == d)
	    {
	      e = 1;			// FIXME: this is actually an error
	      u++;			// skip the sign without digits
	    }
	  else
	    u = d;

	  v += sprintf(v, "%+ld", n * e);
	  did_exponent = true;
	  symbol_start = false;
	}
      else				// ordinary symbol character
	{
	  *v++ = *u++;
	  symbol_start = false;
	}
    }
  *v = '\0';

  // hand out a copy of exactly the needed size
  char *r = strdup(vbase);
  delete [] vbase;
  return r;
}