Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  
itparser.h
00001 #ifndef _MIMETIC_PARSER_ITPARSER_H_
00002 #define _MIMETIC_PARSER_ITPARSER_H_
00003 #include <iterator>
00004 #include <algorithm>
00005 #include <stack>
00006 #include <iostream>
00007 #include <mimetic/tree.h>
00008 #include <mimetic/utils.h>
00009 #include <mimetic/mimeentity.h>
00010 
00011 
00012 // FIXME: handle HigherLevelClosingBoundary
00013 
00014 namespace mimetic
00015 {
00016 
/// Parser that reads its input through a pair of iterators.
/// The primary template is deliberately empty; the working code is provided
/// by the specializations on the iterator category of \p Iterator.
template<class Iterator,
    class ItCategory = typename std::iterator_traits<Iterator>::iterator_category>
struct IteratorParser
{
};
00023 
00024 /*
00025  * Input Iterator
00026  */
00027 template<typename Iterator>
00028 struct IteratorParser<Iterator, std::input_iterator_tag>
00029 {
00030 
00031     IteratorParser(MimeEntity& me)
00032     : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
00033     {
00034         m_entityStack.push(&m_me);
00035     }
00036     virtual ~IteratorParser()
00037     {
00038     }
00039     /**
00040      * set the Ignore Mask to \p mask
00041      */
00042     void iMask(size_t mask)    {    m_iMask = mask;        }
00043     /**
00044      * get the Ignore Mask 
00045      */
00046     size_t iMask() const    {    return m_iMask;        }
00047     /**
00048      * start parsing
00049      */
00050     void run(Iterator bit, Iterator eit)
00051     {
00052         m_bit = bit;
00053         m_eit = eit;
00054         doLoad();
00055     }
00056 protected:
          // Boundary strings of the multipart entities being parsed; each one
          // is stored with its leading "--" (see getBoundary()) and
          // loadMultipart() pushes the innermost one at the front.
00057     typedef std::list<std::string> BoundaryList;
          // Characters recognised as line terminators by isnl().
00058     enum { 
00059         CR = 0xD, 
00060         LF = 0xA, 
00061         NL = '\n' 
00062     };
          // Destination of a block handed to onBlock(): dropped (peIgnore) or
          // appended to the preamble, body or epilogue of the current entity.
00063     enum /* ParsingElem */ { 
00064         peIgnore, 
00065         pePreamble, 
00066         peBody, 
00067         peEpilogue 
00068     };
          // Result of isBoundary(). NoBoundary is 0 so the result can be
          // tested directly in a condition. HigherLevelBoundary means the
          // line matched a boundary that is not the first one in the list.
00069     enum BoundaryType {
00070         NoBoundary = 0,
00071         Boundary,
00072         ClosingBoundary,
00073         HigherLevelBoundary
00074         //, HigherLevelClosingBoundary
00075     };
          // Kind of entity as classified by getType() from its Content-Type.
00076     enum EntityType { 
00077         etRfc822, 
00078         etMsgRfc822, 
00079         etMultipart 
00080     };
00081     // vars
          // Entity passed to the constructor; bottom of m_entityStack.
00082     MimeEntity& m_me;
          // Current read position and end of the input given to run().
00083     Iterator m_bit, m_eit;
00084     size_t m_iMask; // ignore mask
00085     BoundaryList m_boundaryList;
          // Kind of the last boundary seen; written by isBoundary() and read
          // by loadMultipart().
00086     BoundaryType m_lastBoundary;
          // Entities being filled; the top one receives fields and blocks.
00087     std::stack<MimeEntity*> m_entityStack;
00088 
00089 protected:
00090     void appendPreambleBlock(const char* buf, int sz)
00091     {
00092         MimeEntity* pMe = m_entityStack.top();
00093         pMe->body().preamble().append(buf,sz);
00094     }
00095     
00096     void appendEpilogueBlock(const char* buf, int sz)
00097     {
00098         MimeEntity* pMe = m_entityStack.top();
00099         pMe->body().epilogue().append(buf,sz);
00100     }
00101     
00102     void appendBodyBlock(const char* buf, int sz)
00103     {
00104         MimeEntity* pMe = m_entityStack.top();
00105         pMe->body().append(buf, sz);
00106     }
00107     
00108     std::string getBoundary()
00109     {
00110         const MimeEntity* pMe = m_entityStack.top();
00111         const ContentType& ct = pMe->header().contentType();
00112         return std::string("--") + ct.param("boundary");
00113     }
00114     
00115     void popChild()
00116     {
00117         m_entityStack.pop();
00118     }
00119     
00120     void pushNewChild()
00121     {
00122         MimeEntity* pMe = m_entityStack.top();
00123         MimeEntity* pChild = new MimeEntity;
00124         pMe->body().parts().push_back(pChild);
00125         m_entityStack.push(pChild);
00126     }
00127     
00128     EntityType getType()
00129     {
00130         MimeEntity* pMe = m_entityStack.top();
00131         const Header& h = pMe->header();
00132         // will NOT be automatically created if it doesn't exists;
00133         // null ContentType will be returned
00134         const ContentType& ct = h.contentType();
00135         if(ct.isMultipart())
00136             return etMultipart;
00137         else if    (ct.type() == "message" && ct.subtype() == "rfc822") 
00138             return etMsgRfc822;
00139         else
00140             return etRfc822;
00141     }
00142     
00143     void addField(const std::string& name, const std::string& value)
00144     {
00145         MimeEntity* pMe = m_entityStack.top();
00146         Header& h = pMe->header();
00147         Header::iterator it = h.insert(h.end(), Field());
00148         it->name(name);
00149         it->value(value);
00150     }
00151 
00152     BoundaryType isBoundary(const std::string& line) 
00153     {
00154         if(line.length() == 0 || line[0] != '-')
00155             return m_lastBoundary = NoBoundary;
00156 
00157         int level = 0; // multipart nesting level
00158         int lineLen = line.length();
00159         BoundaryList::const_iterator bit,eit;
00160         bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00161         for(;bit != eit; ++bit, ++level)
00162         {
00163             const std::string& b = *bit;
00164             int bLen = b.length();
00165             if(line.compare(0, bLen, b) == 0)
00166             { 
00167                 // not the expected boundary, malformed msg
00168                 if(level > 0)
00169                     return m_lastBoundary=HigherLevelBoundary;
00170                 // plain boundary or closing boundary?
00171                 if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
00172                     return m_lastBoundary = ClosingBoundary;
00173                 else
00174                     return m_lastBoundary = Boundary;
00175             }
00176         }
00177         return m_lastBoundary = NoBoundary;
00178     }
00179     // is new line
00180     inline bool isnl(char c) const
00181     {
00182         return (c == CR || c == LF);
00183     }
00184     // is a two char newline
00185     inline bool isnl(char a, char b) const
00186     {
00187         if(a == CR || a == LF)
00188             if(b == (a == CR ? LF : CR))
00189                 return true;
00190         return false;
00191     }
00192     void doLoad()
00193     {
00194         loadHeader();
00195         loadBody();
00196     }
00197     bool valid() const
00198     {
00199         return m_bit != m_eit;
00200     }
00201     void append(char*& buf, size_t& bufsz, char c, size_t& pos)
00202     {
00203         enum { alloc_block = 128};
00204         if(pos == bufsz) 
00205         {
00206             // allocate and init buffer
00207             char* tmp = buf;
00208             int oldBufsz = bufsz;
00209             while(pos >= bufsz)
00210                 bufsz = bufsz + alloc_block;
00211             buf = new char[bufsz+1];    
00212             if(tmp != 0)
00213             {
00214                 assert(oldBufsz > 0);
00215                 memset(buf, 0, bufsz);
00216                 memcpy(buf, tmp, oldBufsz);
00217                 delete[] tmp;
00218             }
00219         }
00220         buf[pos++] = c;
00221     }
00222     // parses the header and calls addField and pushChild
00223     // to add fields and nested entities
00224     void loadHeader()
00225     {
00226         enum { 
00227             sInit,
00228             sIgnoreLine,
00229             sNewline,
00230             sWaitingName, 
00231             sWaitingValue, 
00232             sWaitingFoldedValue,
00233             sName, 
00234             sValue,
00235             sIgnoreHeader
00236         };
00237         register int status;
00238         int pos;
00239         char *name, *value;
00240         size_t nBufSz, vBufSz, nPos, vPos;
00241         char prev, c = 0;
00242 
00243         name = value = 0;
00244         pos = nBufSz = vBufSz = nPos = vPos = 0;
00245         status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
00246         //status = sInit;
00247         while(m_bit != m_eit)
00248         {
00249             c = *m_bit;
00250             switch(status)
00251             {
00252             case sInit:
00253                 if(isnl(c))
00254                     status = sNewline;
00255                 else
00256                     status = sName;
00257                 continue;
00258             case sIgnoreLine:
00259                 if(!isnl(c))
00260                     break;
00261                 status = sNewline;
00262                 continue;
00263             case sNewline:
00264                 status = sWaitingName;
00265                 if(pos > 0)
00266                 {
00267                     pos = 0;
00268                     prev = c;
00269                     if(++m_bit == m_eit) goto out; //eof
00270                     c = *m_bit;
00271                     if(c == (prev == CR ? LF : CR))
00272                     {
00273                         --pos;
00274                         break;
00275                     } else 
00276                         continue;
00277                 } else {
00278                     // empty line, end of header
00279                     prev = c;
00280                     if(++m_bit == m_eit) goto out; //eof
00281                     c = *m_bit;
00282                     if(c == (prev == CR ? LF : CR))
00283                         ++m_bit;    
00284                     goto out;
00285                 }
00286             case sWaitingName:
00287                 if(isblank(c))
00288                 {
00289                     // folded value
00290                     status = sWaitingFoldedValue;
00291                     continue;
00292                 } 
00293                 // not blank, new field or empty line 
00294                 if(nPos)
00295                 {
00296                     name[nPos] = 0;
00297                     // is not an empty field (name: \n)
00298                     if(vPos) 
00299                     {
00300                         value[vPos] = 0;
00301                         addField(name,value);
00302                     } else
00303                         addField(name,"");
00304                     nPos = vPos = 0;
00305                 }
00306                 status = (isnl(c) ? sNewline : sName);
00307                 continue;
00308             case sWaitingValue:
00309                 if(isblank(c))
00310                     break; // eat leading blanks
00311                 status = sValue;
00312                 continue;
00313             case sWaitingFoldedValue:
00314                 if(isblank(c))
00315                     break; // eat leading blanks
00316                 append(value, vBufSz, ' ', vPos);
00317                 status = sValue;
00318                 continue;
00319             case sName:
00320                 if(c > 32 && c < 127 && c != ':') {
00321                     if(nPos > 0 && isblank(name[nPos-1]))
00322                     {
00323                         /* "FIELDNAME BLANK+ c" found, consider that the first 
00324                            body line */
00325                         onBlock(name, nPos, peBody);
00326                         goto out;
00327                     }
00328                     append(name, nBufSz, c, nPos);
00329                 } else if(c == ':') {
00330                     if(nPos == 0)
00331                     {
00332                         /* header line starting with ':', ignore the line */
00333                         status = sIgnoreLine;
00334                         continue;
00335                     }
00336 
00337                     /* malformed fix: remove any trailing blanks of the field 
00338                        name */
00339                     while(nPos > 0 && isblank(name[nPos-1]))
00340                         nPos--;
00341 
00342                     status = sWaitingValue;
00343                 } else if(isblank(c)) {
00344                     /* blank after the field name -> malformed; it may be a 
00345                        malformed field with trailing blank or
00346                        the start of the body; save the char so we can try to 
00347                        recover later trimming the field name or push the
00348                        whole line to the body part with onBlock() */
00349                     append(name, nBufSz, c, nPos);
00350                 } else {
00351                     /* bad header line or blank line between header and body is
00352                        missing; consider we're in the first line of the body */
00353                     onBlock(name, nPos, peBody);
00354                     goto out;
00355                 }
00356                 break;
00357             case sValue:
00358                 if(isnl(c))
00359                 {
00360                     status = sNewline;
00361                     continue;
00362                 }
00363                 append(value, vBufSz, c, vPos);
00364                 break;
00365             case sIgnoreHeader:
00366                 if(isnl(c))
00367                 {
00368                     prev = c;
00369                     if(++m_bit == m_eit) goto out; //eof
00370                     c = *m_bit;
00371                     if(c == (prev == CR ? LF : CR))
00372                         ++m_bit;    
00373                     if(pos == 0)    
00374                         goto out; //empty line, eoh
00375                     pos = 0;
00376                     continue;
00377                 } 
00378                 break;
00379             }
00380             ++m_bit; ++pos;
00381         }
00382     out:
00383         if(name)
00384             delete[] name;
00385         if(value)
00386             delete[] value;
00387         return;
00388     }
00389     void loadBody()
00390     {
00391         switch(getType())
00392         {
00393         case etRfc822:
00394             if(m_iMask & imBody)
00395                 jump_to_next_boundary();
00396             else
00397                 copy_until_boundary(peBody);
00398             break;
00399         case etMultipart:
00400             loadMultipart();
00401             break;
00402         case etMsgRfc822:
00403             if(m_iMask & imChildParts)
00404                 jump_to_next_boundary();
00405             else {
00406                 pushNewChild();
00407                 doLoad(); // load child entities
00408                 popChild();
00409             }
00410             break;
00411         }
00412     }
00413     void loadMultipart()
00414     {
00415         std::string boundary = getBoundary();
00416         m_boundaryList.push_front(boundary);
00417         ParsingElem pe;
00418         // preamble
00419         pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
00420         copy_until_boundary(pe);
00421         while(m_bit != m_eit)
00422         {
00423             switch(m_lastBoundary)
00424             {
00425             case NoBoundary:
00426                 return; // eof
00427             case Boundary:
00428                 if(m_iMask & imChildParts)
00429                     jump_to_next_boundary();
00430                 else {
00431                     pushNewChild();
00432                     doLoad();
00433                     popChild();
00434                 }
00435                 break;
00436             case ClosingBoundary:
00437                 m_boundaryList.erase(m_boundaryList.begin());
00438                 // epilogue
00439                 pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
00440                 copy_until_boundary(pe);
00441                 return;
00442             case HigherLevelBoundary:
00443                 m_boundaryList.erase(m_boundaryList.begin());
00444                 return;
00445             }
00446         }
00447     }
00448     inline void onBlock(const char* block, int sz, ParsingElem pe)
00449     {
00450         switch(pe)
00451         {
00452         case peIgnore:
00453             return;
00454         case pePreamble:
00455             appendPreambleBlock(block, sz);
00456             break;
00457         case peEpilogue:
00458             appendEpilogueBlock(block, sz);
00459             break;
00460         case peBody:
00461             appendBodyBlock(block, sz);
00462             break;
00463         }
00464     }
00465     void jump_to_next_boundary()
00466     {
00467         copy_until_boundary(peIgnore);
00468     }
00469     // this is where most of execution time is spent when parsing
00470     // large messages; I'm using a plain char[] buffer instead of
00471     // std::string because I want to be as fast as possible here
00472     virtual void copy_until_boundary(ParsingElem pe)
00473     {
00474         size_t pos, lines, eomsz = 0;
00475         register char c;
00476         enum { nlsz = 1 };
00477         const char *eom = 0;
00478 
00479         enum { blksz = 4096 };
00480         char block[blksz];
00481         size_t blkpos = 0;
00482         size_t sl_off = 0; // start of line offset into *block
00483 
00484         pos = lines = 0;
00485         while(m_bit != m_eit)
00486         {
00487             // if buffer is full
00488             if(blkpos >= blksz - 2 - nlsz)
00489             {
00490                 if(sl_off == 0)
00491                 { 
00492                     // very long line found, assume it 
00493                     // can't be a boundary and flush the buf
00494                     // with the partial line
00495                     block[blkpos] = 0;
00496                     onBlock(block, blkpos, pe);
00497                     blkpos = sl_off = 0;
00498                 } else {
00499                     // flush the buffer except the last
00500                     // (probably incomplete) line
00501                     size_t llen = blkpos - sl_off;
00502                     onBlock(block, sl_off, pe);
00503                     memmove(block, block + sl_off, llen);
00504                     sl_off = 0;
00505                     blkpos = llen;
00506                 }
00507             }
00508             c = *m_bit;
00509             if(isnl(c))
00510             {
00511                 char nlbuf[3] = { 0, 0, 0 };
00512 
00513                 nlbuf[0] = c; // save the current NL char in nlbuf
00514 
00515                 // save the second char of the NL sequence (if any) in nlbuf
00516                 if(++m_bit != m_eit) 
00517                 {
00518                     char next = *m_bit;
00519                     if(next == (c == CR ? LF : CR))
00520                     {
00521                         nlbuf[1] = next; // save the next char in the NL seq
00522                         ++m_bit;
00523                     }
00524                 }
00525 
00526                 if(pos)
00527                 {
00528                     // not an empty row, is this a boundary?
00529                     block[blkpos] = 0;
00530                     if(block[sl_off] == '-' && sl_off < blkpos &&
00531                          block[sl_off+1] == '-')
00532                     {
00533                         std::string Line(block+sl_off, blkpos-sl_off);
00534                         if(isBoundary(Line))
00535                         {
00536                             // trim last newline
00537                             if (sl_off>=2) 
00538                             {
00539                                 int i = sl_off;
00540                                 char a = block[--i];
00541                                 char b = block[--i];
00542 
00543                                 if(isnl(a,b))
00544                                     sl_off -= 2;
00545                                 else if(isnl(a))
00546                                     sl_off--;
00547 
00548                             } else if (sl_off==1 && isnl(block[0])) {
00549                                 sl_off--;
00550                             }
00551                             onBlock(block, sl_off, pe);
00552                             return;
00553                         }
00554                     }
00555                     // exit if this is the end of message 
00556                     // marker
00557                     if(eom && pos >= eomsz)
00558                     {
00559                         char *line = block + sl_off;
00560                         size_t i = 0;
00561                         for(; i < eomsz; i++)
00562                             if(eom[i] != line[i])
00563                                 break;
00564                         if(i==eomsz) // if eom found
00565                         {
00566                             onBlock(block, sl_off,
00567                                 pe);
00568                             return; 
00569                         }
00570                     }
00571                 }
00572                 // append the saved NL sequence
00573                 for(int i = 0; nlbuf[i] != 0; i++)
00574                     block[blkpos++] = nlbuf[i];
00575                 block[blkpos] = 0;
00576                 sl_off = blkpos;
00577                 pos = 0;
00578             } else {
00579                 pos++; // line pos
00580                 block[blkpos++] = c;
00581                 ++m_bit; 
00582             }
00583         }
00584         // eof
00585         block[blkpos] = 0;
00586         onBlock(block, blkpos, pe);
00587     }
00588 };
00589 
00590 
00591 /*
00592  * Forward Iterator
00593  */
00594 template<typename Iterator>
00595 struct IteratorParser<Iterator, std::forward_iterator_tag>: 
00596     public IteratorParser<Iterator, std::input_iterator_tag>
00597 {
00598     /* input_iterator ops
00599      * *it = xxx
00600      * X& op++
00601      * X& op++(int)
00602      */
00603     typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
00604     IteratorParser(MimeEntity& me)
00605     : base_type(me)
00606     {
00607     }
00608 };
00609 
00610 /*
00611  * Bidirectional Iterator
00612  */
00613 template<typename Iterator>
00614 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
00615     public IteratorParser<Iterator, std::forward_iterator_tag>
00616 {
00617     typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
00618     IteratorParser(MimeEntity& me)
00619     : base_type(me)
00620     {
00621     }
00622 };
00623 
00624 /*
00625  * Random Access Iterator
00626  */
00627 template<typename Iterator>
00628 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
00629     public IteratorParser<Iterator, std::bidirectional_iterator_tag>
00630 {
00631     typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
00632     IteratorParser(MimeEntity& me)
00633     : base_type(me)
00634     {
00635     }
00636 private:
          // The base class depends on the template parameter, so its members
          // are not found by unqualified name lookup; the using-declarations
          // below make the ones used in this class visible.
          // Block destinations accepted by onBlock().
00637     using base_type::peIgnore;
00638     using base_type::pePreamble;
00639     using base_type::peBody;
00640     using base_type::peEpilogue;
00641     
          // Values stored in m_lastBoundary by copy_until_boundary().
00642     using base_type::NoBoundary;
00643     using base_type::Boundary;
00644     using base_type::ClosingBoundary;
00645     using base_type::HigherLevelBoundary;
00646     
          // Parser state and helpers shared with the base class.
00647     using base_type::m_boundaryList;
00648     using base_type::m_lastBoundary;
00649     using base_type::m_entityStack;
00650     using base_type::m_me;
00651     using base_type::m_iMask;
00652     using base_type::m_bit;
00653     using base_type::m_eit;
00654     using base_type::isnl;
00655     
          // Tree of characters filled by buildBoundaryTree().
00656     typedef TreeNode<char> BoundaryTree;
00657     inline void onBlock(Iterator bit, int size, ParsingElem pe)
00658     {
00659         if(pe == peIgnore)
00660             return;
00661         Iterator eit = bit + size;
00662         MimeEntity* pMe = m_entityStack.top();
00663         switch(pe)
00664         {
00665         case pePreamble:
00666             pMe->body().preamble().append(bit, eit);
00667             break;
00668         case peEpilogue:
00669             pMe->body().epilogue().append(bit, eit);
00670             break;
00671         case peBody:
00672             pMe->body().append(bit, eit);
00673             break;
00674         }
00675     }
00676     void copy_until_boundary(ParsingElem pe)
00677     {
00678         // if we don't have any boundary copy until m_eit and return
00679         if(m_boundaryList.empty())
00680         {
00681             onBlock(m_bit, m_eit-m_bit, pe);
00682             m_bit = m_eit;
00683             return;
00684         }
00685         // search for current boundary; if not found (i.e. malformed
00686         // message) repeat the search for higher level boundary
00687         // (slow just for malformed msg, very fast otherwise)
00688         typename base_type::BoundaryList::const_iterator 
00689             bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
00690         m_lastBoundary = NoBoundary;
00691         int depth = 0;
00692         for( ;bBit != bEit; ++bBit, ++depth)
00693         {
00694             const std::string& boundary = *bBit;
00695             Iterator off;
00696             if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
00697             {
00698                 Iterator base = m_bit;
00699                 size_t block_sz = off - base;
00700                 m_lastBoundary = 
00701                     (depth ? HigherLevelBoundary: Boundary);
00702                 off += boundary.length();
00703                 m_bit = off;
00704                 if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
00705                 {
00706                     m_lastBoundary = ClosingBoundary;
00707                     m_bit = off + 2;
00708                 }
00709                 if(m_bit < m_eit-1 && isnl(*m_bit)) 
00710                 {
00711                     char c = *m_bit++;
00712                     char next = *m_bit;
00713                     if(isnl(next) && next != c)
00714                         ++m_bit;
00715                 }
00716 
00717                 // trim last newline
00718                 if(block_sz)
00719                 {
00720                     Iterator p = base + block_sz;
00721                     char a = *--p, b = *--p;
00722                     if(isnl(a,b))
00723                         block_sz -= 2;
00724                     else if(isnl(a))
00725                         block_sz--;
00726                 }
00727                 onBlock(base, block_sz, pe);
00728                 return;
00729             } else {
00730                 onBlock(m_bit, m_eit-m_bit, pe);
00731                 m_bit = m_eit;
00732             }
00733         }
00734     }
00735     BoundaryTree m_boundaryTree;
00736     void buildBoundaryTree()
00737     {
00738         m_boundaryTree = BoundaryTree(); // clear
00739         typename base_type::BoundaryList::const_iterator 
00740             bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00741         BoundaryTree::NodeList *pChilds;
00742         BoundaryTree::NodeList::iterator it;
00743         int depth = 0;
00744         for( ; bit != eit; ++bit)
00745         {
00746             pChilds = &m_boundaryTree.childList();
00747             it = pChilds->begin();
00748             const char *w = bit->c_str();
00749             do
00750             {
00751                 it = find_if(pChilds->begin(), pChilds->end(), 
00752                         FindNodePred<char>(*w));
00753                 if( it == pChilds->end() )
00754                     it = pChilds->insert(pChilds->end(),*w);
00755                 pChilds = &it->childList();
00756                 depth++;
00757             } while(*(++w));
00758         }
00759     }
00760 
00761 };
00762 
00763 }
00764 
00765 #endif