LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
00001 /***************************************************************************
00002           ofx_preproc.cpp
00003                              -------------------
00004     copyright            : (C) 2002 by Benoit Grégoire
00005     email                : benoitg@coeus.ca
00006 ***************************************************************************/
00012 /***************************************************************************
00013  *                                                                         *
00014  *   This program is free software; you can redistribute it and/or modify  *
00015  *   it under the terms of the GNU General Public License as published by  *
00016  *   the Free Software Foundation; either version 2 of the License, or     *
00017  *   (at your option) any later version.                                   *
00018  *                                                                         *
00019  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <string>
#ifdef OS_WIN32
# include <io.h>     // for close()
#else
# include <unistd.h> // for close()
#endif
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
00036 
00037 #ifdef OS_WIN32
00038 # define DIRSEP "\\"
00039 #else
00040 # define DIRSEP "/"
00041 #endif
00042 
00043 #ifdef OS_WIN32
00044 # include "win32.hh"
00045 # include <windows.h> // for GetModuleFileName()
00046 # undef ERROR
00047 # undef DELETE
00048 #endif
00049 
00050 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
00051 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
00052 
00053 using namespace std;
/**
 * Directories probed, in order, by find_dtd() once the context's DTD
 * directory and the OFX_DTD_PATH environment variable have not produced a
 * readable file.  The build-time MAKEFILE_DTD_PATH, when defined, is tried
 * first.
 *
 * NOTE(review): the last entry is the literal string "~"; find_dtd() in this
 * file passes it to ifstream unchanged, so no home-directory expansion is
 * performed here.
 */
const char *DTD_SEARCH_PATH[] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH ,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};

/** Number of entries in DTD_SEARCH_PATH, derived from the array itself. */
const int DTD_SEARCH_PATH_NUM = sizeof(DTD_SEARCH_PATH) / sizeof(DTD_SEARCH_PATH[0]);

/** Size, in bytes, of the fixed buffers used when reading the input file. */
const unsigned int READ_BUFFER_SIZE = 1024;
00076 
00081 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
00082 {
00083   LibofxContext *libofx_context;
00084   bool ofx_start = false;
00085   bool ofx_end = false;
00086   bool file_is_xml = false;
00087 
00088   ifstream input_file;
00089   ofstream tmp_file;
00090   char buffer[READ_BUFFER_SIZE];
00091   char *iconv_buffer;
00092   string s_buffer;
00093   char *filenames[3];
00094   char tmp_filename[256];
00095   int tmp_file_fd;
00096 #ifdef HAVE_ICONV
00097   iconv_t conversion_descriptor;
00098 #endif
00099   libofx_context = (LibofxContext*)ctx;
00100 
00101   if (p_filename != NULL && strcmp(p_filename, "") != 0)
00102   {
00103     message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
00104 
00105     input_file.open(p_filename);
00106     if (!input_file)
00107     {
00108       message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
00109     }
00110 
00111     mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
00112 
00113     message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
00114     tmp_file_fd = mkstemp(tmp_filename);
00115     if (tmp_file_fd)
00116     {
00117       tmp_file.open(tmp_filename);
00118       if (!tmp_file)
00119       {
00120         message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
00121         return -1;
00122       }
00123     }
00124     else
00125     {
00126       message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
00127       return -1;
00128     }
00129 
00130     if (input_file && tmp_file)
00131     {
00132       int header_separator_idx;
00133       string header_name;
00134       string header_value;
00135       string ofx_encoding;
00136       string ofx_charset;
00137       bool end_of_line;
00138       do
00139       {
00140         s_buffer.clear();
00141         bool end_of_line = false;
00142         do
00143         {
00144           input_file.get(buffer, sizeof(buffer), '\n');
00145           //cout<< "got: " << buffer<<"\n";
00146           s_buffer.append(buffer);
00147 
00148           // Watch out: If input_file is in eof(), any subsequent read or
00149           // peek() will fail. However, the while() condition will
00150           // correctly catch this. Otherwise we need to check for this like so:
00151           //if (input_file.eof()) break;
00152 
00153           //cout<<"input_file.gcount(): "<<input_file.gcount()<<" sizeof(buffer): "<<sizeof(buffer)<<endl;
00154           if ( !input_file.eof() && (input_file.peek() == '\n'))
00155           {
00156             input_file.get(); // Discard the newline
00157             s_buffer.append("\n");
00158             end_of_line = true;
00159           }
00160           else if ( !input_file.eof() && input_file.fail())
00161           {
00162             input_file.clear();
00163           }
00164         }
00165         // Continue reading as long as we're not at EOF *and* we've not yet
00166         // reached an end-of-line.
00167         while (!input_file.eof() && !end_of_line);
00168 
00169         if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
00170         {
00171           message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
00172           file_is_xml = true;
00173         }
00174 
00175         int ofx_start_idx;
00176         if (ofx_start == false &&
00177             (
00178               (libofx_context->currentFileType() == OFX &&
00179                ((ofx_start_idx = s_buffer.find("<OFX>")) !=
00180                 string::npos || (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
00181               || (libofx_context->currentFileType() == OFC &&
00182                   ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
00183                    (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
00184             )
00185            )
00186         {
00187           ofx_start = true;
00188           if(file_is_xml==false)
00189           {
00190             s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
00191           }
00192           message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
00193 
00194           if(file_is_xml==true)
00195           {
00196             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00197             if(putenv(sp_charset_fixed)!=0)
00198             {
00199               message_out(ERROR, "ofx_proc_file(): putenv failed");
00200             }
00201             /* Normally the following would be "xml".
00202              * Unfortunately, opensp's generic api will garble UTF-8 if this is
00203              * set to xml.  So we set any single byte encoding to avoid messing
00204              * up UTF-8.  Unfortunately this means that non-UTF-8 files will not
00205              * get properly translated.  We'd need to manually detect the
00206              * encoding in the XML header and convert the xml with iconv like we
00207              * do for SGML to work around the problem.  Most unfortunate. */
00208             static char sp_encoding[] = "SP_ENCODING=ms-dos";
00209             if(putenv(sp_encoding)!=0)
00210             {
00211               message_out(ERROR, "ofx_proc_file(): putenv failed");
00212             }
00213           }
00214           else
00215           {
00216             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00217             if(putenv(sp_charset_fixed)!=0)
00218             {
00219               message_out(ERROR, "ofx_proc_file(): putenv failed");
00220             }
00221             static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
00222             if(putenv(sp_encoding)!=0)
00223             {
00224               message_out(ERROR, "ofx_proc_file(): putenv failed");
00225             }
00226 #ifdef HAVE_ICONV
00227           string fromcode;
00228           string tocode;
00229           if (ofx_encoding.compare("USASCII") == 0)
00230           {
00231             if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
00232             {
00233               //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
00234               fromcode = "ISO-8859-1";
00235             }
00236             else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
00237             {
00238               //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
00239               fromcode = "CP1252";
00240             }
00241             else if (ofx_charset.compare("NONE") == 0)
00242             {
00243               fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00244             }
00245             else
00246             {
00247               fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00248             }
00249           }
00250           else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
00251           {
00252                 //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
00253             fromcode = "UTF-8";
00254           }
00255           else
00256           {
00257             fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00258           }
00259           tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
00260           message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
00261           conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
00262 #endif
00263           }
00264         }
00265         else
00266         {
00267           //We are still in the headers
00268           if ((header_separator_idx = s_buffer.find(':')) != string::npos)
00269           {
00270             //Header processing
00271             header_name.assign(s_buffer.substr(0, header_separator_idx));
00272             header_value.assign(s_buffer.substr(header_separator_idx + 1));
00273             while ( header_value[header_value.length() -1 ] == '\n' ||
00274                     header_value[header_value.length() -1 ] == '\r' )
00275               header_value.erase(header_value.length() - 1);
00276             message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
00277             if (header_name.compare("ENCODING") == 0)
00278             {
00279               ofx_encoding.assign(header_value);
00280             }
00281             if (header_name.compare("CHARSET") == 0)
00282             {
00283               ofx_charset.assign(header_value);
00284             }
00285           }
00286         }
00287 
00288         if (file_is_xml==true || (ofx_start == true && ofx_end == false))
00289         {
00290           if(ofx_start == true)
00291           {
00292             /* The above test won't help us if the <OFX> tag is on the same line
00293              * as the xml header, but as opensp can't be used to parse it anyway
00294              * this isn't a great loss for now.
00295              */
00296             s_buffer = sanitize_proprietary_tags(s_buffer);
00297           }
00298           //cout<< s_buffer<<"\n";
00299           if(file_is_xml==false)
00300           {
00301 #ifdef HAVE_ICONV
00302             size_t inbytesleft = strlen(s_buffer.c_str());
00303             size_t outbytesleft = inbytesleft * 2 - 1;
00304             iconv_buffer = (char*) malloc (inbytesleft * 2);
00305             memset(iconv_buffer, 0, inbytesleft * 2);
00306 #ifdef OS_WIN32
00307             const char * inchar = (const char *)s_buffer.c_str();
00308 #else
00309             char * inchar = (char *)s_buffer.c_str();
00310 #endif
00311             char * outchar = iconv_buffer;
00312             int iconv_retval = iconv (conversion_descriptor,
00313                                       &inchar, &inbytesleft,
00314                                       &outchar, &outbytesleft);
00315             if (iconv_retval == -1)
00316             {
00317               message_out(ERROR, "ofx_proc_file(): Conversion error");
00318             }
00319             s_buffer = iconv_buffer;
00320             free (iconv_buffer);
00321 #endif
00322           }
00323           cout<<s_buffer<<"\n";
00324           tmp_file.write(s_buffer.c_str(), s_buffer.length());
00325         }
00326 
00327         if (ofx_start == true &&
00328             (
00329               (libofx_context->currentFileType() == OFX &&
00330                ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
00331                 (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
00332               || (libofx_context->currentFileType() == OFC &&
00333                   ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
00334                    (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
00335             )
00336            )
00337         {
00338           ofx_end = true;
00339           message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC>  has been found");
00340         }
00341 
00342       }
00343       while (!input_file.eof() && !input_file.bad());
00344     }
00345     input_file.close();
00346     tmp_file.close();
00347 #ifdef HAVE_ICONV
00348     if(file_is_xml==false)
00349     {
00350       iconv_close(conversion_descriptor);
00351     }
00352 #endif
00353     char filename_openspdtd[255];
00354     char filename_dtd[255];
00355     char filename_ofx[255];
00356     strncpy(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME).c_str(), 255); //The opensp sgml dtd file
00357     if (libofx_context->currentFileType() == OFX)
00358     {
00359       strncpy(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME).c_str(), 255); //The ofx dtd file
00360     }
00361     else if (libofx_context->currentFileType() == OFC)
00362     {
00363       strncpy(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME).c_str(), 255); //The ofc dtd file
00364     }
00365     else
00366     {
00367       message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00368     }
00369 
00370     if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
00371     {
00372       strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
00373       filenames[0] = filename_openspdtd;
00374       filenames[1] = filename_dtd;
00375       filenames[2] = filename_ofx;
00376       if (libofx_context->currentFileType() == OFX)
00377       {
00378         ofx_proc_sgml(libofx_context, 3, filenames);
00379       }
00380       else if (libofx_context->currentFileType() == OFC)
00381       {
00382         ofc_proc_sgml(libofx_context, 3, filenames);
00383       }
00384       else
00385       {
00386         message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00387       }
00388       if (remove(tmp_filename) != 0)
00389       {
00390         message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
00391       }
00392     }
00393     else
00394     {
00395       message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
00396     }
00397   }
00398   else
00399   {
00400     message_out(ERROR, "ofx_proc_file():No input file specified");
00401   }
00402   return 0;
00403 }
00404 
00405 
00410 string sanitize_proprietary_tags(string input_string)
00411 {
00412   unsigned int i;
00413   size_t input_string_size;
00414   bool strip = false;
00415   bool tag_open = false;
00416   int tag_open_idx = 0; //Are we within < > ?
00417   bool closing_tag_open = false; //Are we within </ > ?
00418   int orig_tag_open_idx = 0;
00419   bool proprietary_tag = false; //Are we within a proprietary element?
00420   bool proprietary_closing_tag = false;
00421   int crop_end_idx = 0;
00422   char buffer[READ_BUFFER_SIZE] = "";
00423   char tagname[READ_BUFFER_SIZE] = "";
00424   int tagname_idx = 0;
00425   char close_tagname[READ_BUFFER_SIZE] = "";
00426 
00427   for (i = 0; i < READ_BUFFER_SIZE; i++)
00428   {
00429     buffer[i] = 0;
00430     tagname[i] = 0;
00431     close_tagname[i] = 0;
00432   }
00433 
00434   input_string_size = input_string.size();
00435 
00436   for (i = 0; i <= input_string_size; i++)
00437   {
00438     if (input_string.c_str()[i] == '<')
00439     {
00440       tag_open = true;
00441       tag_open_idx = i;
00442       if (proprietary_tag == true && input_string.c_str()[i+1] == '/')
00443       {
00444         //We are now in a closing tag
00445         closing_tag_open = true;
00446         //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl;
00447         if (strncmp(tagname, &(input_string.c_str()[i+2]), strlen(tagname)) != 0)
00448         {
00449           //If it is the begining of an other tag
00450           //cout<<"DIFFERENT!"<<endl;
00451           crop_end_idx = i - 1;
00452           strip = true;
00453         }
00454         else
00455         {
00456           //Otherwise, it is the start of the closing tag of the proprietary tag
00457           proprietary_closing_tag = true;
00458         }
00459       }
00460       else if (proprietary_tag == true)
00461       {
00462         //It is the start of a new tag, following a proprietary tag
00463         crop_end_idx = i - 1;
00464         strip = true;
00465       }
00466     }
00467     else if (input_string.c_str()[i] == '>')
00468     {
00469       tag_open = false;
00470       closing_tag_open = false;
00471       tagname[tagname_idx] = 0;
00472       tagname_idx = 0;
00473       if (proprietary_closing_tag == true)
00474       {
00475         crop_end_idx = i;
00476         strip = true;
00477       }
00478     }
00479     else if (tag_open == true && closing_tag_open == false)
00480     {
00481       if (input_string.c_str()[i] == '.')
00482       {
00483         if (proprietary_tag != true)
00484         {
00485           orig_tag_open_idx = tag_open_idx;
00486           proprietary_tag = true;
00487         }
00488       }
00489       tagname[tagname_idx] = input_string.c_str()[i];
00490       tagname_idx++;
00491     }
00492     //cerr <<i<<endl;
00493     if (strip == true && orig_tag_open_idx < input_string.size())
00494     {
00495       input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00496       message_out(INFO, "sanitize_proprietary_tags() (end tag or new tag) removed: " + string(buffer));
00497       input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00498       i = orig_tag_open_idx - 1;
00499       proprietary_tag = false;
00500       proprietary_closing_tag = false;
00501       closing_tag_open = false;
00502       tag_open = false;
00503       strip = false;
00504     }
00505 
00506   }//end for
00507   if (proprietary_tag == true && orig_tag_open_idx < input_string.size())
00508   {
00509     if (crop_end_idx == 0)   //no closing tag
00510     {
00511       crop_end_idx = input_string.size() - 1;
00512     }
00513     input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00514     message_out(INFO, "sanitize_proprietary_tags() (end of line) removed: " + string(buffer));
00515     input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00516   }
00517   return input_string;
00518 }
00519 
00520 
00521 #ifdef OS_WIN32
00522 static std::string get_dtd_installation_directory()
00523 {
00524   // Partial implementation of
00525   // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
00526   char ch_fn[MAX_PATH], *p;
00527   std::string str_fn;
00528 
00529   if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return "";
00530 
00531   if ((p = strrchr(ch_fn, '\\')) != NULL)
00532     * p = '\0';
00533 
00534   p = strrchr(ch_fn, '\\');
00535   if (p && (_stricmp(p + 1, "bin") == 0 ||
00536             _stricmp(p + 1, "lib") == 0))
00537     *p = '\0';
00538 
00539   str_fn = ch_fn;
00540   str_fn += "\\share\\libofx\\dtd";
00541 
00542   return str_fn;
00543 }
00544 #endif
00545 
00546 
00559 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
00560 {
00561   string dtd_path_filename;
00562   char *env_dtd_path;
00563 
00564   dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
00565   if (!dtd_path_filename.empty())
00566   {
00567     dtd_path_filename.append(dtd_filename);
00568     ifstream dtd_file(dtd_path_filename.c_str());
00569     if (dtd_file)
00570     {
00571       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00572       return dtd_path_filename;
00573     }
00574   }
00575 
00576 #ifdef OS_WIN32
00577   dtd_path_filename = get_dtd_installation_directory();
00578   if (!dtd_path_filename.empty())
00579   {
00580     dtd_path_filename.append(DIRSEP);
00581     dtd_path_filename.append(dtd_filename);
00582     ifstream dtd_file(dtd_path_filename.c_str());
00583     if (dtd_file)
00584     {
00585       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00586       return dtd_path_filename;
00587     }
00588   }
00589 #endif
00590   /* Search in environement variable OFX_DTD_PATH */
00591   env_dtd_path = getenv("OFX_DTD_PATH");
00592   if (env_dtd_path)
00593   {
00594     dtd_path_filename.append(env_dtd_path);
00595     dtd_path_filename.append(DIRSEP);
00596     dtd_path_filename.append(dtd_filename);
00597     ifstream dtd_file(dtd_path_filename.c_str());
00598     if (!dtd_file)
00599     {
00600       message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
00601     }
00602     else
00603     {
00604       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00605       return dtd_path_filename;
00606     }
00607   }
00608 
00609   for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
00610   {
00611     dtd_path_filename = DTD_SEARCH_PATH[i];
00612     dtd_path_filename.append(DIRSEP);
00613     dtd_path_filename.append(dtd_filename);
00614     ifstream dtd_file(dtd_path_filename.c_str());
00615     if (!dtd_file)
00616     {
00617       message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
00618     }
00619     else
00620     {
00621       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00622       return dtd_path_filename;
00623     }
00624   }
00625 
00626   /* Last resort, look in source tree relative path (useful for development) */
00627   dtd_path_filename = "";
00628   dtd_path_filename.append("..");
00629   dtd_path_filename.append(DIRSEP);
00630   dtd_path_filename.append("dtd");
00631   dtd_path_filename.append(DIRSEP);
00632   dtd_path_filename.append(dtd_filename);
00633   ifstream dtd_file(dtd_path_filename.c_str());
00634   if (!dtd_file)
00635   {
00636     message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
00637   }
00638   else
00639   {
00640     message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00641     return dtd_path_filename;
00642   }
00643 
00644 
00645   message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
00646   return "";
00647 }
00648 
00649