1:怎么获取Markup.cpp 和 Markup.h

  首先打开 http://www.firstobject.com/dn_markup.htm,下载 Release 11.5 zip (579k) C++ source code for Linux, Mac, Windows。解压后可以看到一个 Test 文件夹以及 Markup.cpp 和 Markup.h 两个文件。将 Markup.h 和 Markup.cpp 拷贝并添加到工程中。第一次编译时可能会出现预编译头错误,解决方法是在 Markup.cpp 的最前面加上 #include "stdafx.h",或者关闭预编译头。

这是工程的大致结构

markup.h

  1. // Markup.h: interface for the CMarkup class.
  2. //
  3. // Markup Release 11.5
  4. // Copyright (C) 2011 First Objective Software, Inc. All rights reserved
  5. // Go to www.firstobject.com for the latest CMarkup and EDOM documentation
  6. // Use in commercial applications requires written permission
  7. // This software is provided "as is", with no warranty.
  8.  
  9. #if !defined(_MARKUP_H_INCLUDED_)
  10. #define _MARKUP_H_INCLUDED_
  11.  
  12. #include <stdlib.h>
  13. #include <string.h> // memcpy, memset, strcmp...
  14.  
  // Major build options
  // Each option is a preprocessor symbol. Options that the build does not set
  // explicitly are defaulted below from the detected compiler and platform.
  // MARKUP_WCHAR wide char (2-byte UTF-16 on Windows, 4-byte UTF-32 on Linux and OS X)
  // MARKUP_MBCS ANSI/double-byte strings on Windows
  // MARKUP_STL (default except VC++) use STL strings instead of MFC strings
  // MARKUP_SAFESTR to use string _s functions in VC++ 2005 (_MSC_VER >= 1400)
  // MARKUP_WINCONV (default for VC++) for Windows API character conversion
  // MARKUP_ICONV (default for GNU) for character conversion on Linux and OS X and other platforms
  // MARKUP_STDCONV to use neither WINCONV or ICONV, falls back to setlocale based conversion for ANSI
  //
  #if ! defined(MARKUP_WINDOWS)
    #if defined(_WIN32) || defined(WIN32)
      #define MARKUP_WINDOWS
    #endif // WIN32 or _WIN32
  #endif // not MARKUP_WINDOWS
  #if _MSC_VER > 1000 // VC++
    #pragma once
    #if ! defined(MARKUP_SAFESTR) // not VC++ safe strings
      #pragma warning(disable:4996) // VC++ 2005 deprecated function warnings
    #endif // not VC++ safe strings
    #if defined(MARKUP_STL) && _MSC_VER < 1400 // STL pre VC++ 2005
      #pragma warning(disable:4786) // std::string long names
    #endif // VC++ 2005 STL
  #else // not VC++
    // Any compiler other than VC++ always uses STL strings
    #if ! defined(MARKUP_STL)
      #define MARKUP_STL
    #endif // not STL
    // GNU compilers off Windows default to iconv unless a converter was chosen
    #if defined(__GNUC__) && ! defined(MARKUP_ICONV) && ! defined(MARKUP_STDCONV) && ! defined(MARKUP_WINCONV)
      #if ! defined(MARKUP_WINDOWS)
        #define MARKUP_ICONV
      #endif // not Windows
    #endif // GNUC and not ICONV not STDCONV not WINCONV
  #endif // not VC++
  #if (defined(_UNICODE) || defined(UNICODE)) && ! defined(MARKUP_WCHAR)
    #define MARKUP_WCHAR
  #endif // _UNICODE or UNICODE
  #if (defined(_MBCS) || defined(MBCS)) && ! defined(MARKUP_MBCS)
    #define MARKUP_MBCS
  #endif // _MBCS and not MBCS
  // MARKUP_SIZEOFWCHAR is 4 or 2, taken from the compiler's predefined wchar_t macros
  #if ! defined(MARKUP_SIZEOFWCHAR)
    #if __SIZEOF_WCHAR_T__ == 4 || __WCHAR_MAX__ > 0x10000
      #define MARKUP_SIZEOFWCHAR 4
    #else // sizeof(wchar_t) != 4
      #define MARKUP_SIZEOFWCHAR 2
    #endif // sizeof(wchar_t) != 4
  #endif // not MARKUP_SIZEOFWCHAR
  // If no converter has been selected by this point, fall back to WINCONV
  #if ! defined(MARKUP_WINCONV) && ! defined(MARKUP_STDCONV) && ! defined(MARKUP_ICONV)
    #define MARKUP_WINCONV
  #endif // not WINCONV not STDCONV not ICONV
  #if ! defined(MARKUP_FILEBLOCKSIZE)
    #define MARKUP_FILEBLOCKSIZE 16384
  #endif
  66.  
  // Text type and function defines (compiler and build-option dependent)
  // MCD_CHAR / MCD_PCSZ name the character and constant-string types, and the
  // MCD_PSZ* / MCD_SPRINTF / MCD_FOPEN macros map onto the matching wide or
  // narrow C runtime functions, so the rest of the code is written once.
  //
  #define MCD_ACP 0
  #define MCD_UTF8 65001
  #define MCD_UTF16 1200
  #define MCD_UTF32 65005
  #if defined(MARKUP_WCHAR)
    #define MCD_CHAR wchar_t
    #define MCD_PCSZ const wchar_t*
    #define MCD_PSZLEN (int)wcslen
    #define MCD_PSZCHR wcschr
    #define MCD_PSZSTR wcsstr
    #define MCD_PSZTOL wcstol
    #if defined(MARKUP_SAFESTR) // VC++ safe strings
      // MCD_SSZ expands to "buffer, element count"; sz must be a real array
      #define MCD_SSZ(sz) sz,(sizeof(sz)/sizeof(MCD_CHAR))
      #define MCD_PSZCPY(sz,p) wcscpy_s(MCD_SSZ(sz),p)
      #define MCD_PSZNCPY(sz,p,n) wcsncpy_s(MCD_SSZ(sz),p,n)
      #define MCD_SPRINTF swprintf_s
      #define MCD_FOPEN(f,n,m) {if(_wfopen_s(&f,n,m)!=0)f=NULL;}
    #else // not VC++ safe strings
      #if defined(__GNUC__) && ! defined(MARKUP_WINDOWS) // non-Windows GNUC
        #define MCD_SSZ(sz) sz,(sizeof(sz)/sizeof(MCD_CHAR))
      #else // not non-Windows GNUC
        #define MCD_SSZ(sz) sz
      #endif // not non-Windows GNUC
      #define MCD_PSZCPY wcscpy
      #define MCD_PSZNCPY wcsncpy
      #define MCD_SPRINTF swprintf
      #define MCD_FOPEN(f,n,m) f=_wfopen(n,m)
    #endif // not VC++ safe strings
    #define MCD_T(s) L ## s
    #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
      #define MCD_ENC MCD_T("UTF-32")
    #else // sizeof(wchar_t) == 2
      #define MCD_ENC MCD_T("UTF-16")
    #endif
    #define MCD_CLEN(p) 1
  #else // not MARKUP_WCHAR
    #define MCD_CHAR char
    #define MCD_PCSZ const char*
    #define MCD_PSZLEN (int)strlen
    #define MCD_PSZCHR strchr
    #define MCD_PSZSTR strstr
    #define MCD_PSZTOL strtol
    #if defined(MARKUP_SAFESTR) // VC++ safe strings
      #define MCD_SSZ(sz) sz,(sizeof(sz)/sizeof(MCD_CHAR))
      #define MCD_PSZCPY(sz,p) strcpy_s(MCD_SSZ(sz),p)
      #define MCD_PSZNCPY(sz,p,n) strncpy_s(MCD_SSZ(sz),p,n)
      #define MCD_SPRINTF sprintf_s
      #define MCD_FOPEN(f,n,m) {if(fopen_s(&f,n,m)!=0)f=NULL;}
    #else // not VC++ safe strings
      #define MCD_SSZ(sz) sz
      #define MCD_PSZCPY strcpy
      #define MCD_PSZNCPY strncpy
      #define MCD_SPRINTF sprintf
      #define MCD_FOPEN(f,n,m) f=fopen(n,m)
    #endif // not VC++ safe strings
    #define MCD_T(s) s
    #if defined(MARKUP_MBCS) // MBCS/double byte
      #define MCD_ENC MCD_T("")
      // MCD_CLEN(p) is the byte length of the character starting at p
      #if defined(MARKUP_WINCONV)
        #define MCD_CLEN(p) (int)_mbclen((const unsigned char*)p)
      #else // not WINCONV
        #define MCD_CLEN(p) (int)mblen(p,MB_CUR_MAX)
      #endif // not WINCONV
    #else // not MBCS/double byte
      #define MCD_ENC MCD_T("UTF-8")
      #define MCD_CLEN(p) 1
    #endif // not MBCS/double byte
  #endif // not MARKUP_WCHAR
  #if _MSC_VER < 1000 // not VC++
    #define MCD_STRERROR strerror(errno)
  #endif // not VC++
  140.  
  141. // String type and function defines (compiler and build-option dependent)
  142. // Define MARKUP_STL to use STL strings
  143. //
  144. #if defined(MARKUP_STL) // STL
  145. #include <string>
  146. #if defined(MARKUP_WCHAR)
  147. #define MCD_STR std::wstring
  148. #else // not MARKUP_WCHAR
  149. #define MCD_STR std::string
  150. #endif // not MARKUP_WCHAR
  151. #define MCD_2PCSZ(s) s.c_str()
  152. #define MCD_STRLENGTH(s) (int)s.size()
  153. #define MCD_STRCLEAR(s) s.erase()
  154. #define MCD_STRCLEARSIZE(s) MCD_STR t; s.swap(t)
  155. #define MCD_STRISEMPTY(s) s.empty()
  156. #define MCD_STRMID(s,n,l) s.substr(n,l)
  157. #define MCD_STRASSIGN(s,p,n) s.assign(p,n)
  158. #define MCD_STRCAPACITY(s) (int)s.capacity()
  159. #define MCD_STRINSERTREPLACE(d,i,r,s) d.replace(i,r,s)
  160. #define MCD_GETBUFFER(s,n) new MCD_CHAR[n+1]; if ((int)s.capacity()<(int)n) s.reserve(n)
  161. #define MCD_RELEASEBUFFER(s,p,n) s.replace(0,s.size(),p,n); delete[]p
  162. #define MCD_BLDRESERVE(s,n) s.reserve(n)
  163. #define MCD_BLDCHECK(s,n,d) ;
  164. #define MCD_BLDRELEASE(s) ;
  165. #define MCD_BLDAPPENDN(s,p,n) s.append(p,n)
  166. #define MCD_BLDAPPEND(s,p) s.append(p)
  167. #define MCD_BLDAPPEND1(s,c) s+=(MCD_CHAR)(c)
  168. #define MCD_BLDLEN(s) (int)s.size()
  169. #define MCD_BLDTRUNC(s,n) s.resize(n)
  170. #else // not STL, i.e. MFC
  171. // afx.h provides CString, to avoid "WINVER not defined" #include stdafh.x in Markup.cpp
  172. #include <afx.h>
  173. #define MCD_STR CString
  174. #define MCD_2PCSZ(s) ((MCD_PCSZ)s)
  175. #define MCD_STRLENGTH(s) s.GetLength()
  176. #define MCD_STRCLEAR(s) s.Empty()
  177. #define MCD_STRCLEARSIZE(s) s=MCD_STR()
  178. #define MCD_STRISEMPTY(s) s.IsEmpty()
  179. #define MCD_STRMID(s,n,l) s.Mid(n,l)
  180. #define MCD_STRASSIGN(s,p,n) memcpy(s.GetBuffer(n),p,(n)*sizeof(MCD_CHAR));s.ReleaseBuffer(n);
  181. #define MCD_STRCAPACITY(s) (((CStringData*)((MCD_PCSZ)s)-1)->nAllocLength)
  182. #define MCD_GETBUFFER(s,n) s.GetBuffer(n)
  183. #define MCD_RELEASEBUFFER(s,p,n) s.ReleaseBuffer(n)
  184. #define MCD_BLDRESERVE(s,n) MCD_CHAR*pD=s.GetBuffer(n); int nL=0
  185. #define MCD_BLDCHECK(s,n,d) if(nL+(int)(d)>n){s.ReleaseBuffer(nL);n<<=2;pD=s.GetBuffer(n);}
  186. #define MCD_BLDRELEASE(s) s.ReleaseBuffer(nL)
  187. #define MCD_BLDAPPENDN(s,p,n) MCD_PSZNCPY(&pD[nL],p,n);nL+=n
  188. #define MCD_BLDAPPEND(s,p) MCD_PSZCPY(&pD[nL],p);nL+=MCD_PSZLEN(p)
  189. #define MCD_BLDAPPEND1(s,c) pD[nL++]=(MCD_CHAR)(c)
  190. #define MCD_BLDLEN(s) nL
  191. #define MCD_BLDTRUNC(s,n) nL=n
  192. #endif // not STL
  193. #define MCD_STRTOINT(s) MCD_PSZTOL(MCD_2PCSZ(s),NULL,10)
  194.  
  195. // Allow function args to accept string objects as constant string pointers
  196. struct MCD_CSTR
  197. {
  198. MCD_CSTR() { pcsz=NULL; };
  199. MCD_CSTR( MCD_PCSZ p ) { pcsz=p; };
  200. MCD_CSTR( const MCD_STR& s ) { pcsz = MCD_2PCSZ(s); };
  201. operator MCD_PCSZ() const { return pcsz; };
  202. MCD_PCSZ pcsz;
  203. };
  204.  
  // On Linux and OS X, filenames are not specified in wchar_t
  // For wide-char GNU builds, file names stay narrow (const char*): MCD_FOPEN
  // is redefined to plain fopen and a separate narrow wrapper struct is used.
  // In every other build the filename types are aliases of the text types.
  #if defined(MARKUP_WCHAR) && defined(__GNUC__)
    #undef MCD_FOPEN
    #define MCD_FOPEN(f,n,m) f=fopen(n,m)
    #define MCD_T_FILENAME(s) s
    #define MCD_PCSZ_FILENAME const char*
    struct MCD_CSTR_FILENAME
    {
      MCD_CSTR_FILENAME() { pcsz=NULL; };
      MCD_CSTR_FILENAME( MCD_PCSZ_FILENAME p ) { pcsz=p; };
      MCD_CSTR_FILENAME( const std::string& s ) { pcsz = s.c_str(); };
      operator MCD_PCSZ_FILENAME() const { return pcsz; };
      MCD_PCSZ_FILENAME pcsz; // borrowed pointer, not copied
    };
  #else // not WCHAR GNUC
    #define MCD_CSTR_FILENAME MCD_CSTR
    #define MCD_T_FILENAME MCD_T
    #define MCD_PCSZ_FILENAME MCD_PCSZ
  #endif // not WCHAR GNUC
  224.  
  // File fseek, ftell and offset type
  // MCD_FSEEK / MCD_FTELL / MCD_INTFILEOFFSET select the seek function, the
  // tell function and the integer type that holds a file offset.
  #if defined(__GNUC__) && ! defined(MARKUP_WINDOWS) // non-Windows GNUC
    #define MCD_FSEEK fseeko
    #define MCD_FTELL ftello
    #define MCD_INTFILEOFFSET off_t
  #elif _MSC_VER >= 1000 && defined(MARKUP_HUGEFILE) // VC++ HUGEFILE
    #if _MSC_VER < 1400 // before VC++ 2005
      // Declared here because these are only needed for compilers older than VC++ 2005
      extern "C" int __cdecl _fseeki64(FILE *, __int64, int);
      extern "C" __int64 __cdecl _ftelli64(FILE *);
    #endif // before VC++ 2005
    #define MCD_FSEEK _fseeki64
    #define MCD_FTELL _ftelli64
    #define MCD_INTFILEOFFSET __int64
  #else // not non-Windows GNUC or VC++ HUGEFILE
    #define MCD_FSEEK fseek
    #define MCD_FTELL ftell
    #define MCD_INTFILEOFFSET long
  #endif // not non-Windows GNUC or VC++ HUGEFILE
  243.  
  // End of line choices: none, return, newline, or CRLF
  // An explicit MARKUP_EOL_* option wins; otherwise Windows builds get CRLF
  // and every other build gets a newline.
  #if defined(MARKUP_EOL_NONE)
    #define MCD_EOL MCD_T("")
  #elif defined(MARKUP_EOL_RETURN) // rare; only used on some old operating systems
    #define MCD_EOL MCD_T("\r")
  #elif defined(MARKUP_EOL_NEWLINE) // Unix standard
    #define MCD_EOL MCD_T("\n")
  #elif defined(MARKUP_EOL_CRLF) || defined(MARKUP_WINDOWS) // Windows standard
    #define MCD_EOL MCD_T("\r\n")
  #else // not Windows and not otherwise specified
    #define MCD_EOL MCD_T("\n")
  #endif // not Windows and not otherwise specified
  #define MCD_EOLLEN (sizeof(MCD_EOL)/sizeof(MCD_CHAR)-1) // string length of MCD_EOL
  257.  
  // Forward declarations of helper structs; this header only uses them
  // through pointers and references.
  struct FilePos;
  struct TokenPos;
  struct NodePos;
  struct PathPos;
  struct SavedPosMapArray;
  struct ElemPosTree;
  264.  
  265. class CMarkup
  266. {
  267. public:
  268. CMarkup() { x_InitMarkup(); SetDoc( NULL ); };
  269. CMarkup( MCD_CSTR szDoc ) { x_InitMarkup(); SetDoc( szDoc ); };
  270. CMarkup( int nFlags ) { x_InitMarkup(); SetDoc( NULL ); m_nDocFlags = nFlags; };
  271. CMarkup( const CMarkup& markup ) { x_InitMarkup(); *this = markup; };
  272. void operator=( const CMarkup& markup );
  273. ~CMarkup();
  274.  
  275. // Navigate
  276. bool Load( MCD_CSTR_FILENAME szFileName );
  277. bool SetDoc( MCD_PCSZ pDoc );
  278. bool SetDoc( const MCD_STR& strDoc );
  279. bool IsWellFormed();
  280. bool FindElem( MCD_CSTR szName=NULL );
  281. bool FindChildElem( MCD_CSTR szName=NULL );
  282. bool IntoElem();
  283. bool OutOfElem();
  284. void ResetChildPos() { x_SetPos(m_iPosParent,m_iPos,); };
  285. void ResetMainPos() { x_SetPos(m_iPosParent,,); };
  286. void ResetPos() { x_SetPos(,,); };
  287. MCD_STR GetTagName() const;
  288. MCD_STR GetChildTagName() const { return x_GetTagName(m_iPosChild); };
  289. MCD_STR GetData() { return x_GetData(m_iPos); };
  290. MCD_STR GetChildData() { return x_GetData(m_iPosChild); };
  291. MCD_STR GetElemContent() const { return x_GetElemContent(m_iPos); };
  292. MCD_STR GetAttrib( MCD_CSTR szAttrib ) const { return x_GetAttrib(m_iPos,szAttrib); };
  293. MCD_STR GetChildAttrib( MCD_CSTR szAttrib ) const { return x_GetAttrib(m_iPosChild,szAttrib); };
  294. bool GetNthAttrib( int n, MCD_STR& strAttrib, MCD_STR& strValue ) const;
  295. MCD_STR GetAttribName( int n ) const;
  296. int FindNode( int nType= );
  297. int GetNodeType() { return m_nNodeType; };
  298. bool SavePos( MCD_CSTR szPosName=MCD_T(""), int nMap = );
  299. bool RestorePos( MCD_CSTR szPosName=MCD_T(""), int nMap = );
  300. bool SetMapSize( int nSize, int nMap = );
  301. MCD_STR GetError() const;
  302. const MCD_STR& GetResult() const { return m_strResult; };
  303. int GetDocFlags() const { return m_nDocFlags; };
  304. void SetDocFlags( int nFlags ) { m_nDocFlags = (nFlags & ~(MDF_READFILE|MDF_WRITEFILE|MDF_APPENDFILE)); };
  305. enum MarkupDocFlags
  306. {
  307. MDF_UTF16LEFILE = ,
  308. MDF_UTF8PREAMBLE = ,
  309. MDF_IGNORECASE = ,
  310. MDF_READFILE = ,
  311. MDF_WRITEFILE = ,
  312. MDF_APPENDFILE = ,
  313. MDF_UTF16BEFILE = ,
  314. MDF_TRIMWHITESPACE = ,
  315. MDF_COLLAPSEWHITESPACE =
  316. };
  317. enum MarkupNodeFlags
  318. {
  319. MNF_WITHCDATA = 0x01,
  320. MNF_WITHNOLINES = 0x02,
  321. MNF_WITHXHTMLSPACE = 0x04,
  322. MNF_WITHREFS = 0x08,
  323. MNF_WITHNOEND = 0x10,
  324. MNF_ESCAPEQUOTES = 0x100,
  325. MNF_NONENDED = 0x100000,
  326. MNF_ILLDATA = 0x200000
  327. };
  328. enum MarkupNodeType
  329. {
  330. MNT_ELEMENT = , // 0x0001
  331. MNT_TEXT = , // 0x0002
  332. MNT_WHITESPACE = , // 0x0004
  333. MNT_TEXT_AND_WHITESPACE = , // 0x0006
  334. MNT_CDATA_SECTION = , // 0x0008
  335. MNT_PROCESSING_INSTRUCTION = , // 0x0010
  336. MNT_COMMENT = , // 0x0020
  337. MNT_DOCUMENT_TYPE = , // 0x0040
  338. MNT_EXCLUDE_WHITESPACE = , // 0x007b
  339. MNT_LONE_END_TAG = , // 0x0080
  340. MNT_NODE_ERROR = // 0x8000
  341. };
  342.  
  343. // Create
  344. bool Save( MCD_CSTR_FILENAME szFileName );
  345. const MCD_STR& GetDoc() const { return m_strDoc; };
  346. bool AddElem( MCD_CSTR szName, MCD_CSTR szData=NULL, int nFlags= ) { return x_AddElem(szName,szData,nFlags); };
  347. bool InsertElem( MCD_CSTR szName, MCD_CSTR szData=NULL, int nFlags= ) { return x_AddElem(szName,szData,nFlags|MNF_INSERT); };
  348. bool AddChildElem( MCD_CSTR szName, MCD_CSTR szData=NULL, int nFlags= ) { return x_AddElem(szName,szData,nFlags|MNF_CHILD); };
  349. bool InsertChildElem( MCD_CSTR szName, MCD_CSTR szData=NULL, int nFlags= ) { return x_AddElem(szName,szData,nFlags|MNF_INSERT|MNF_CHILD); };
  350. bool AddElem( MCD_CSTR szName, int nValue, int nFlags= ) { return x_AddElem(szName,nValue,nFlags); };
  351. bool InsertElem( MCD_CSTR szName, int nValue, int nFlags= ) { return x_AddElem(szName,nValue,nFlags|MNF_INSERT); };
  352. bool AddChildElem( MCD_CSTR szName, int nValue, int nFlags= ) { return x_AddElem(szName,nValue,nFlags|MNF_CHILD); };
  353. bool InsertChildElem( MCD_CSTR szName, int nValue, int nFlags= ) { return x_AddElem(szName,nValue,nFlags|MNF_INSERT|MNF_CHILD); };
  354. bool AddAttrib( MCD_CSTR szAttrib, MCD_CSTR szValue ) { return x_SetAttrib(m_iPos,szAttrib,szValue); };
  355. bool AddChildAttrib( MCD_CSTR szAttrib, MCD_CSTR szValue ) { return x_SetAttrib(m_iPosChild,szAttrib,szValue); };
  356. bool AddAttrib( MCD_CSTR szAttrib, int nValue ) { return x_SetAttrib(m_iPos,szAttrib,nValue); };
  357. bool AddChildAttrib( MCD_CSTR szAttrib, int nValue ) { return x_SetAttrib(m_iPosChild,szAttrib,nValue); };
  358. bool AddSubDoc( MCD_CSTR szSubDoc ) { return x_AddSubDoc(szSubDoc,); };
  359. bool InsertSubDoc( MCD_CSTR szSubDoc ) { return x_AddSubDoc(szSubDoc,MNF_INSERT); };
  360. MCD_STR GetSubDoc() { return x_GetSubDoc(m_iPos); };
  361. bool AddChildSubDoc( MCD_CSTR szSubDoc ) { return x_AddSubDoc(szSubDoc,MNF_CHILD); };
  362. bool InsertChildSubDoc( MCD_CSTR szSubDoc ) { return x_AddSubDoc(szSubDoc,MNF_CHILD|MNF_INSERT); };
  363. MCD_STR GetChildSubDoc() { return x_GetSubDoc(m_iPosChild); };
  364. bool AddNode( int nType, MCD_CSTR szText ) { return x_AddNode(nType,szText,); };
  365. bool InsertNode( int nType, MCD_CSTR szText ) { return x_AddNode(nType,szText,MNF_INSERT); };
  366.  
  367. // Modify
  368. bool RemoveElem();
  369. bool RemoveChildElem();
  370. bool RemoveNode();
  371. bool SetAttrib( MCD_CSTR szAttrib, MCD_CSTR szValue, int nFlags= ) { return x_SetAttrib(m_iPos,szAttrib,szValue,nFlags); };
  372. bool SetChildAttrib( MCD_CSTR szAttrib, MCD_CSTR szValue, int nFlags= ) { return x_SetAttrib(m_iPosChild,szAttrib,szValue,nFlags); };
  373. bool SetAttrib( MCD_CSTR szAttrib, int nValue, int nFlags= ) { return x_SetAttrib(m_iPos,szAttrib,nValue,nFlags); };
  374. bool SetChildAttrib( MCD_CSTR szAttrib, int nValue, int nFlags= ) { return x_SetAttrib(m_iPosChild,szAttrib,nValue,nFlags); };
  375. bool SetData( MCD_CSTR szData, int nFlags= ) { return x_SetData(m_iPos,szData,nFlags); };
  376. bool SetChildData( MCD_CSTR szData, int nFlags= ) { return x_SetData(m_iPosChild,szData,nFlags); };
  377. bool SetData( int nValue ) { return x_SetData(m_iPos,nValue); };
  378. bool SetChildData( int nValue ) { return x_SetData(m_iPosChild,nValue); };
  379. bool SetElemContent( MCD_CSTR szContent ) { return x_SetElemContent(szContent); };
  380.  
  381. // Utility
  382. static bool ReadTextFile( MCD_CSTR_FILENAME szFileName, MCD_STR& strDoc, MCD_STR* pstrResult=NULL, int* pnDocFlags=NULL, MCD_STR* pstrEncoding=NULL );
  383. static bool WriteTextFile( MCD_CSTR_FILENAME szFileName, const MCD_STR& strDoc, MCD_STR* pstrResult=NULL, int* pnDocFlags=NULL, MCD_STR* pstrEncoding=NULL );
  384. static MCD_STR EscapeText( MCD_CSTR szText, int nFlags = );
  385. static MCD_STR UnescapeText( MCD_CSTR szText, int nTextLength = -, int nFlags = );
  386. static int UTF16To8( char *pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count );
  387. static int UTF8To16( unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count );
  388. static MCD_STR UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed = NULL );
  389. static MCD_STR AToUTF8( MCD_CSTR pszANSI );
  390. static void EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len );
  391. static int DecodeCharUTF8( const char*& pszUTF8, const char* pszUTF8End = NULL );
  392. static void EncodeCharUTF16( int nUChar, unsigned short* pwszUTF16, int& nUTF16Len );
  393. static int DecodeCharUTF16( const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End = NULL );
  394. static bool DetectUTF8( const char* pText, int nTextLen, int* pnNonASCII = NULL, bool* bErrorAtEnd = NULL );
  395. static MCD_STR GetDeclaredEncoding( MCD_CSTR szDoc );
  396. static int GetEncodingCodePage( MCD_CSTR pszEncoding );
  397.  
  398. protected:
  399.  
  400. #if defined(_DEBUG)
  401. MCD_PCSZ m_pDebugCur;
  402. MCD_PCSZ m_pDebugPos;
  403. #endif // DEBUG
  404.  
  405. MCD_STR m_strDoc;
  406. MCD_STR m_strResult;
  407.  
  408. int m_iPosParent;
  409. int m_iPos;
  410. int m_iPosChild;
  411. int m_iPosFree;
  412. int m_iPosDeleted;
  413. int m_nNodeType;
  414. int m_nNodeOffset;
  415. int m_nNodeLength;
  416. int m_nDocFlags;
  417.  
  418. FilePos* m_pFilePos;
  419. SavedPosMapArray* m_pSavedPosMaps;
  420. ElemPosTree* m_pElemPosTree;
  421.  
  422. enum MarkupNodeFlagsInternal
  423. {
  424. MNF_INSERT = 0x002000,
  425. MNF_CHILD = 0x004000
  426. };
  427.  
  428. #if defined(_DEBUG) // DEBUG
  429. void x_SetDebugState();
  430. #define MARKUP_SETDEBUGSTATE x_SetDebugState()
  431. #else // not DEBUG
  432. #define MARKUP_SETDEBUGSTATE
  433. #endif // not DEBUG
  434.  
  435. void x_InitMarkup();
  436. void x_SetPos( int iPosParent, int iPos, int iPosChild );
  437. int x_GetFreePos();
  438. bool x_AllocElemPos( int nNewSize = );
  439. int x_GetParent( int i );
  440. bool x_ParseDoc();
  441. int x_ParseElem( int iPos, TokenPos& token );
  442. int x_FindElem( int iPosParent, int iPos, PathPos& path ) const;
  443. MCD_STR x_GetPath( int iPos ) const;
  444. MCD_STR x_GetTagName( int iPos ) const;
  445. MCD_STR x_GetData( int iPos );
  446. MCD_STR x_GetAttrib( int iPos, MCD_PCSZ pAttrib ) const;
  447. static MCD_STR x_EncodeCDATASection( MCD_PCSZ szData );
  448. bool x_AddElem( MCD_PCSZ pName, MCD_PCSZ pValue, int nFlags );
  449. bool x_AddElem( MCD_PCSZ pName, int nValue, int nFlags );
  450. MCD_STR x_GetSubDoc( int iPos );
  451. bool x_AddSubDoc( MCD_PCSZ pSubDoc, int nFlags );
  452. bool x_SetAttrib( int iPos, MCD_PCSZ pAttrib, MCD_PCSZ pValue, int nFlags= );
  453. bool x_SetAttrib( int iPos, MCD_PCSZ pAttrib, int nValue, int nFlags= );
  454. bool x_AddNode( int nNodeType, MCD_PCSZ pText, int nNodeFlags );
  455. void x_RemoveNode( int iPosParent, int& iPos, int& nNodeType, int& nNodeOffset, int& nNodeLength );
  456. static bool x_CreateNode( MCD_STR& strNode, int nNodeType, MCD_PCSZ pText );
  457. int x_InsertNew( int iPosParent, int& iPosRel, NodePos& node );
  458. void x_AdjustForNode( int iPosParent, int iPos, int nShift );
  459. void x_Adjust( int iPos, int nShift, bool bAfterPos = false );
  460. void x_LinkElem( int iPosParent, int iPosBefore, int iPos );
  461. int x_UnlinkElem( int iPos );
  462. int x_UnlinkPrevElem( int iPosParent, int iPosBefore, int iPos );
  463. int x_ReleaseSubDoc( int iPos );
  464. int x_ReleasePos( int iPos );
  465. void x_CheckSavedPos();
  466. bool x_SetData( int iPos, MCD_PCSZ szData, int nFlags );
  467. bool x_SetData( int iPos, int nValue );
  468. int x_RemoveElem( int iPos );
  469. MCD_STR x_GetElemContent( int iPos ) const;
  470. bool x_SetElemContent( MCD_PCSZ szContent );
  471. void x_DocChange( int nLeft, int nReplace, const MCD_STR& strInsert );
  472. };
  473.  
  474. #endif // !defined(_MARKUP_H_INCLUDED_)

markup.cpp

  1. // Markup.cpp: implementation of the CMarkup class.
  2. //
  3. // Markup Release 11.5
  4. // Copyright (C) 2011 First Objective Software, Inc. All rights reserved
  5. // Go to www.firstobject.com for the latest CMarkup and EDOM documentation
  6. // Use in commercial applications requires written permission
  7. // This software is provided "as is", with no warranty.
  8. //
  9. #include "stdafx.h"
  10. #include <stdio.h>
  11. #include "Markup.h"
  12.  
  // Platform-dependent includes and compiler settings for Markup.cpp.
  // Which headers are pulled in depends on the build options resolved in Markup.h.
  #if defined(MCD_STRERROR) // C error routine
    #include <errno.h>
  #endif // C error routine

  #if defined (MARKUP_ICONV)
    #include <iconv.h>
  #endif

  #define x_ATTRIBQUOTE '\"' // can be double or single quote

  #if defined(MARKUP_STL) && ( defined(MARKUP_WINCONV) || (! defined(MCD_STRERROR)))
    #include <windows.h> // for MultiByteToWideChar, WideCharToMultiByte, FormatMessage
  #endif // need windows.h when STL and (not setlocale or not strerror), MFC afx.h includes it already

  #if defined(MARKUP_MBCS) // MBCS/double byte
    #pragma message( "Note: MBCS build (not UTF-8)" )
    // For UTF-8, remove MBCS from project settings C/C++ preprocessor definitions
    #if defined (MARKUP_WINCONV)
      #include <mbstring.h> // for VC++ _mbclen
    #endif // WINCONV
  #endif // MBCS/double byte

  #if defined(_DEBUG) && _MSC_VER > 1000 // VC++ DEBUG
    #undef THIS_FILE
    static char THIS_FILE[]=__FILE__;
    #if defined(DEBUG_NEW)
      #define new DEBUG_NEW
    #endif // DEBUG_NEW
  #endif // VC++ DEBUG

  // Disable "while ( 1 )" warning in VC++ 2002
  #if _MSC_VER >= 1300 // VC++ 2002 (7.0)
    #pragma warning(disable:4127)
  #endif // VC++ 2002 (7.0)
  47.  
  48. //////////////////////////////////////////////////////////////////////
  49. // Internal static utility functions
  50. //
  51. void x_StrInsertReplace( MCD_STR& str, int nLeft, int nReplace, const MCD_STR& strInsert )
  52. {
  53. // Insert strInsert into str at nLeft replacing nReplace chars
  54. // Reduce reallocs on growing string by reserving string space
  55. // If realloc needed, allow for 1.5 times the new length
  56. //
  57. int nStrLength = MCD_STRLENGTH(str);
  58. int nInsLength = MCD_STRLENGTH(strInsert);
  59. int nNewLength = nInsLength + nStrLength - nReplace;
  60. int nAllocLen = MCD_STRCAPACITY(str);
  61. #if defined(MCD_STRINSERTREPLACE) // STL, replace method
  62. if ( nNewLength > nAllocLen )
  63. MCD_BLDRESERVE( str, (nNewLength + nNewLength/ + ) );
  64. MCD_STRINSERTREPLACE( str, nLeft, nReplace, strInsert );
  65. #else // MFC, no replace method
  66. int nBufferLen = nNewLength;
  67. if ( nNewLength > nAllocLen )
  68. nBufferLen += nBufferLen/ + ;
  69. MCD_CHAR* pDoc = MCD_GETBUFFER( str, nBufferLen );
  70. if ( nInsLength != nReplace && nLeft+nReplace < nStrLength )
  71. memmove( &pDoc[nLeft+nInsLength], &pDoc[nLeft+nReplace], (nStrLength-nLeft-nReplace)*sizeof(MCD_CHAR) );
  72. if ( nInsLength )
  73. memcpy( &pDoc[nLeft], strInsert, nInsLength*sizeof(MCD_CHAR) );
  74. MCD_RELEASEBUFFER( str, pDoc, nNewLength );
  75. #endif // MFC, no replace method
  76. }
  77.  
  78. int x_Hash( MCD_PCSZ p, int nSize )
  79. {
  80. unsigned int n=;
  81. while (*p)
  82. n += (unsigned int)(*p++);
  83. return n % nSize;
  84. }
  85.  
  86. MCD_STR x_IntToStr( int n )
  87. {
  88. MCD_CHAR sz[];
  89. MCD_SPRINTF(MCD_SSZ(sz),MCD_T("%d"),n);
  90. MCD_STR s=sz;
  91. return s;
  92. }
  93.  
  94. int x_StrNCmp( MCD_PCSZ p1, MCD_PCSZ p2, int n, int bIgnoreCase = )
  95. {
  96. // Fast string compare to determine equality
  97. if ( bIgnoreCase )
  98. {
  99. bool bNonAsciiFound = false;
  100. MCD_CHAR c1, c2;
  101. while ( n-- )
  102. {
  103. c1 = *p1++;
  104. c2 = *p2++;
  105. if ( c1 != c2 )
  106. {
  107. if ( bNonAsciiFound )
  108. return c1 - c2;
  109. if ( c1 >= 'a' && c1 <= 'z' )
  110. c1 = (MCD_CHAR)( c1 - ('a'-'A') );
  111. if ( c2 >= 'a' && c2 <= 'z' )
  112. c2 = (MCD_CHAR)( c2 - ('a'-'A') );
  113. if ( c1 != c2 )
  114. return c1 - c2;
  115. }
  116. else if ( (unsigned int)c1 > )
  117. bNonAsciiFound = true;
  118. }
  119. }
  120. else
  121. {
  122. while ( n-- )
  123. {
  124. if ( *p1 != *p2 )
  125. return *p1 - *p2;
  126. p1++;
  127. p2++;
  128. }
  129. }
  130. return ;
  131. }
  132.  
  // Bit flags for the nResultCode argument of x_AddResult. They are tested
  // there with '&', so each one must be a distinct bit.
  // NOTE(review): the numeric values were missing from the published listing;
  // these are restored from the upstream CMarkup 11.5 source - confirm.
  enum MarkupResultCode
  {
    MRC_COUNT = 1,
    MRC_TYPE = 2,
    MRC_NUMBER = 4,
    MRC_ENCODING = 8,
    MRC_LENGTH = 16,
    MRC_MODIFY = 32,
    MRC_MSG = 64
  };
  143.  
  144. void x_AddResult( MCD_STR& strResult, MCD_CSTR pszID, MCD_CSTR pszVal = NULL, int nResultCode = , int n = -, int n2 = - )
  145. {
  146. // Call this to append an error result to strResult, discard if accumulating too large
  147. if ( MCD_STRLENGTH(strResult) < )
  148. {
  149. // Use a temporary CMarkup object but keep strResult in a string to minimize memory footprint
  150. CMarkup mResult( strResult );
  151. if ( nResultCode & MRC_MODIFY )
  152. mResult.FindElem( pszID );
  153. else
  154. mResult.AddElem( pszID, MCD_T(""), CMarkup::MNF_WITHNOLINES );
  155. if ( pszVal.pcsz )
  156. {
  157. if ( nResultCode & MRC_TYPE )
  158. mResult.SetAttrib( MCD_T("type"), pszVal );
  159. else if ( nResultCode & MRC_ENCODING )
  160. mResult.SetAttrib( MCD_T("encoding"), pszVal );
  161. else if ( nResultCode & MRC_MSG )
  162. mResult.SetAttrib( MCD_T("msg"), pszVal );
  163. else
  164. mResult.SetAttrib( MCD_T("tagname"), pszVal );
  165. }
  166. if ( nResultCode & MRC_NUMBER )
  167. mResult.SetAttrib( MCD_T("n"), n );
  168. else if ( nResultCode & MRC_COUNT )
  169. mResult.SetAttrib( MCD_T("count"), n );
  170. else if ( nResultCode & MRC_LENGTH )
  171. mResult.SetAttrib( MCD_T("length"), n );
  172. else if ( n != - )
  173. mResult.SetAttrib( MCD_T("offset"), n );
  174. if ( n2 != - )
  175. mResult.SetAttrib( MCD_T("offset2"), n2 );
  176. strResult = mResult.GetDoc();
  177. }
  178. }
  179.  
  180. //////////////////////////////////////////////////////////////////////
  181. // Encoding conversion struct and methods
  182. //
  183. struct TextEncoding
  184. {
  185. TextEncoding( MCD_CSTR pszFromEncoding, const void* pFromBuffer, int nFromBufferLen )
  186. {
  187. m_strFromEncoding = pszFromEncoding;
  188. m_pFrom = pFromBuffer;
  189. m_nFromLen = nFromBufferLen;
  190. m_nFailedChars = ;
  191. m_nToCount = ;
  192. };
  193. int PerformConversion( void* pTo, MCD_CSTR pszToEncoding = NULL );
  194. bool FindRaggedEnd( int& nTruncBeforeBytes );
  195. #if defined(MARKUP_ICONV)
  196. static const char* IConvName( char* szEncoding, MCD_CSTR pszEncoding );
  197. int IConv( void* pTo, int nToCharSize, int nFromCharSize );
  198. #endif // ICONV
  199. #if ! defined(MARKUP_WCHAR)
  200. static bool CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding );
  201. #endif // WCHAR
  202. MCD_STR m_strToEncoding;
  203. MCD_STR m_strFromEncoding;
  204. const void* m_pFrom;
  205. int m_nFromLen;
  206. int m_nToCount;
  207. int m_nFailedChars;
  208. };
  209.  
  210. // Encoding names
  211. // This is a precompiled ASCII hash table for speed and minimum memory requirement
  212. // Each entry consists of a 2 digit name length, 5 digit code page, and the encoding name
  213. // Each table slot can have multiple entries, table size 155 was chosen for even distribution
  214. //
  // Hash table mapping lowercase encoding names to Windows code page numbers.
  // x_GetEncodingCodePage selects one slot with x_Hash(lowercased name, number of slots) and scans
  // only that slot's entries, so an entry must stay in the slot its name hashes to.
  // MCD_T("") is an empty slot: the scan stops at the terminating NUL of the slot string.
  215. MCD_PCSZ EncodingNameTable[] =
  216. {
  217. MCD_T("0800949ksc_5601"),MCD_T("1920932cseucpkdfmtjapanese0920003x-cp20003"),
  218. MCD_T("1250221_iso-2022-jp0228591l10920004x-cp20004"),
  219. MCD_T("0228592l20920005x-cp20005"),
  220. MCD_T("0228593l30600850ibm8501000858ccsid00858"),
  221. MCD_T("0228594l40600437ibm4370701201ucs-2be0600860ibm860"),
  222. MCD_T("0600852ibm8520501250ms-ee0600861ibm8610228599l50751932cp51932"),
  223. MCD_T("0600862ibm8620620127ibm3670700858cp008581010021x-mac-thai0920261x-cp20261"),
  224. MCD_T("0600737ibm7370500869cp-gr1057003x-iscii-be0600863ibm863"),
  225. MCD_T("0750221ms502210628591ibm8190600855ibm8550600864ibm864"),
  226. MCD_T("0600775ibm7751057002x-iscii-de0300949uhc0228605l91028591iso-ir-1000600865ibm865"),
  227. MCD_T("1028594iso-ir-1101028592iso-ir-1010600866ibm8660500861cp-is0600857ibm857"),
  228. MCD_T("0950227x-cp50227"),
  229. MCD_T("0320866koi1628598csisolatinhebrew1057008x-iscii-ka"),
  230. MCD_T("1000950big5-hkscs1220106x-ia5-german0600869ibm869"),
  231. MCD_T("1057009x-iscii-ma0701200ucs-2le0712001utf32be0920269x-cp20269"),
  232. MCD_T("0800708asmo-7080500437cspc81765000unicode-1-1-utf-70612000utf-320920936x-cp20936"),
  233. MCD_T("1200775ebcdic-cp-be0628598hebrew0701201utf16be1765001unicode-1-1-utf-81765001unicode-2-0-utf-80551932x-euc"),
  234. MCD_T("1028595iso-ir-1441028597iso-ir-1260728605latin-90601200utf-161057011x-iscii-pa"),
  235. MCD_T("1028596iso-ir-1271028593iso-ir-1090751932ms51932"),
  236. MCD_T("0801253ms-greek0600949korean1050225iso2022-kr1128605iso_8859-150920949x-cp20949"),
  237. MCD_T("1200775ebcdic-cp-ch1028598iso-ir-1381057006x-iscii-as1450221iso-2022-jp-ms"),
  238. MCD_T("1057004x-iscii-ta1028599iso-ir-148"),
  239. MCD_T("1000949iso-ir-1490820127us-ascii"),MCD_T(""),
  240. MCD_T("1000936gb_2312-801900850cspc850multilingual0712000utf32le"),
  241. MCD_T("1057005x-iscii-te1300949csksc560119871965000x-unicode-2-0-utf-7"),
  242. MCD_T("0701200utf16le1965001x-unicode-2-0-utf-80928591iso8859-1"),
  243. MCD_T("0928592iso8859-21420002x_chinese-eten0520866koi8r1000932x-ms-cp932"),
  244. MCD_T("1320000x-chinese-cns1138598iso8859-8-i1057010x-iscii-gu0928593iso8859-3"),
  245. MCD_T("0928594iso8859-4"),MCD_T("0928595iso8859-51150221csiso2022jp"),
  246. MCD_T("0928596iso8859-60900154csptcp154"),
  247. MCD_T("0928597iso8859-70900932shift_jis1400154cyrillic-asian"),
  248. MCD_T("0928598iso8859-81057007x-iscii-or1150225csiso2022kr"),
  249. MCD_T("0721866koi8-ru0928599iso8859-9"),MCD_T("0910000macintosh"),MCD_T(""),
  250. MCD_T(""),MCD_T(""),
  251. MCD_T("1210004x-mac-arabic0800936gb2312800628598visual1520108x-ia5-norwegian"),
  252. MCD_T(""),MCD_T("0829001x-europa"),MCD_T(""),MCD_T("1510079x-mac-icelandic"),
  253. MCD_T("0800932sjis-win1128591csisolatin1"),MCD_T("1128592csisolatin2"),
  254. MCD_T("1400949ks_c_5601-19871128593csisolatin3"),MCD_T("1128594csisolatin4"),
  255. MCD_T("0400950big51128595csisolatin51400949ks_c_5601-1989"),
  256. MCD_T("0500775cp5001565000csunicode11utf7"),MCD_T("0501361johab"),
  257. MCD_T("1100932windows-9321100437codepage437"),
  258. MCD_T("1800862cspc862latinhebrew1310081x-mac-turkish"),MCD_T(""),
  259. MCD_T("0701256ms-arab0800775csibm5000500154cp154"),
  260. MCD_T("1100936windows-9360520127ascii"),
  261. MCD_T("1528597csisolatingreek1100874windows-874"),MCD_T("0500850cp850"),
  262. MCD_T("0700720dos-7200500950cp9500500932cp9320500437cp4370500860cp8601650222_iso-2022-jp$sio"),
  263. MCD_T("0500852cp8520500861cp8610700949ksc56010812001utf-32be"),
  264. MCD_T("0528597greek0500862cp8620520127cp3670500853cp853"),
  265. MCD_T("0500737cp7371150220iso-2022-jp0801201utf-16be0500863cp863"),
  266. MCD_T("0500936cp9360528591cp8194520932extended_unix_code_packed_format_for_japanese0500855cp8550500864cp864"),
  267. MCD_T("0500775cp7750500874cp8740800860csibm8600500865cp865"),
  268. MCD_T("0500866cp8660800861csibm8611150225iso-2022-kr0500857cp8571101201unicodefffe"),
  269. MCD_T("0700862dos-8620701255ms-hebr0500858cp858"),
  270. MCD_T("1210005x-mac-hebrew0500949cp9490800863csibm863"),
  271. MCD_T("0500869cp8691600437cspc8codepage4370700874tis-6200800855csibm8550800864csibm864"),
  272. MCD_T("0800950x-x-big50420866koi80800932ms_kanji0700874dos-8740800865csibm865"),
  273. MCD_T("0800866csibm8661210003x-mac-korean0800857csibm8570812000utf-32le"),
  274. MCD_T(""),MCD_T("0500932ms9320801200utf-16le1028591iso-8859-10500154pt154"),
  275. MCD_T("1028592iso-8859-20620866koi8-r0800869csibm869"),
  276. MCD_T("1500936csiso58gb2312800828597elot_9281238598iso-8859-8-i1028593iso-8859-30820127iso-ir-6"),
  277. MCD_T("1028594iso-8859-4"),
  278. MCD_T("0800852cspcp8520500936ms9361028595iso-8859-50621866koi8-u0701252ms-ansi"),
  279. MCD_T("1028596iso-8859-60220127us2400858pc-multilingual-850+euro"),
  280. MCD_T("1028597iso-8859-71028603iso8859-13"),
  281. MCD_T("1320000x-chinese_cns1028598iso-8859-8"),
  282. MCD_T("1828595csisolatincyrillic1028605iso8859-151028599iso-8859-9"),
  283. MCD_T("0465001utf8"),MCD_T("1510017x-mac-ukrainian"),MCD_T(""),
  284. MCD_T("0828595cyrillic"),MCD_T("0900936gb2312-80"),MCD_T(""),
  285. MCD_T("0720866cskoi8r1528591iso_8859-1:1987"),MCD_T("1528592iso_8859-2:1987"),
  286. MCD_T("1354936iso-4873:1986"),MCD_T("0700932sjis-ms1528593iso_8859-3:1988"),
  287. MCD_T("1528594iso_8859-4:19880600936gb23120701251ms-cyrl"),
  288. MCD_T("1528596iso_8859-6:19871528595iso_8859-5:1988"),
  289. MCD_T("1528597iso_8859-7:1987"),
  290. MCD_T("1201250windows-12501300932shifft_jis-ms"),
  291. MCD_T("0810029x-mac-ce1201251windows-12511528598iso_8859-8:19880900949ks_c_56011110000csmacintosh"),
  292. MCD_T("0601200cp12001201252windows-1252"),
  293. MCD_T("1052936hz-gb-23121201253windows-12531400949ks_c_5601_19871528599iso_8859-9:19890601201cp1201"),
  294. MCD_T("1201254windows-1254"),MCD_T("1000936csgb2312801201255windows-1255"),
  295. MCD_T("1201256windows-12561100932windows-31j"),
  296. MCD_T("1201257windows-12570601250cp12500601133cp1133"),
  297. MCD_T("0601251cp12511201258windows-12580601125cp1125"),
  298. MCD_T("0701254ms-turk0601252cp1252"),MCD_T("0601253cp12530601361cp1361"),
  299. MCD_T("0800949ks-c56010601254cp1254"),MCD_T("0651936euc-cn0601255cp1255"),
  300. MCD_T("0601256cp1256"),MCD_T("0601257cp12570600950csbig50800858ibm00858"),
  301. MCD_T("0601258cp1258"),MCD_T("0520105x-ia5"),
  302. MCD_T("0801250x-cp12501110006x-mac-greek0738598logical"),
  303. MCD_T("0801251x-cp1251"),MCD_T(""),
  304. MCD_T("1410001x-mac-japanese1200932cswindows31j"),
  305. MCD_T("0700936chinese0720127csascii0620932euc-jp"),
  306. MCD_T("0851936x-euc-cn0501200ucs-2"),MCD_T("0628597greek8"),
  307. MCD_T("0651949euc-kr"),MCD_T(""),MCD_T("0628591latin1"),
  308. MCD_T("0628592latin21100874iso-8859-11"),
  309. MCD_T("0628593latin31420127ansi_x3.4-19681420127ansi_x3.4-19861028591iso_8859-1"),
  310. MCD_T("0628594latin41028592iso_8859-20701200unicode1128603iso-8859-13"),
  311. MCD_T("1028593iso_8859-30628599latin51410082x-mac-croatian"),
  312. MCD_T("1028594iso_8859-41128605iso-8859-150565000utf-70851932x-euc-jp"),
  313. MCD_T("1300775cspc775baltic1028595iso_8859-50565001utf-80512000utf32"),
  314. MCD_T("1028596iso_8859-61710002x-mac-chinesetrad0601252x-ansi"),
  315. MCD_T("1028597iso_8859-70628605latin90501200utf160700154ptcp1541410010x-mac-romanian"),
  316. MCD_T("0900936iso-ir-581028598iso_8859-8"),MCD_T("1028599iso_8859-9"),
  317. MCD_T("1350221iso2022-jp-ms0400932sjis"),MCD_T("0751949cseuckr"),
  318. MCD_T("1420002x-chinese-eten"),MCD_T("1410007x-mac-cyrillic"),
  319. MCD_T("1000932shifft_jis"),MCD_T("0828596ecma-114"),MCD_T(""),
  320. MCD_T("0900932shift-jis"),MCD_T("0701256cp1256 1320107x-ia5-swedish"),
  321. MCD_T("0828597ecma-118"),
  322. MCD_T("1628596csisolatinarabic1710008x-mac-chinesesimp0600932x-sjis"),MCD_T(""),
  323. MCD_T("0754936gb18030"),MCD_T("1350221windows-502210712000cp12000"),
  324. MCD_T("0628596arabic0500936cn-gb0900932sjis-open0712001cp12001"),MCD_T(""),
  325. MCD_T(""),MCD_T("0700950cn-big50920127iso646-us1001133ibm-cp1133"),MCD_T(""),
  326. MCD_T("0800936csgb23120900949ks-c-56010310000mac"),
  327. MCD_T("1001257winbaltrim0750221cp502211020127iso-ir-6us"),
  328. MCD_T("1000932csshiftjis"),MCD_T("0300936gbk0765001cp65001"),
  329. MCD_T("1620127iso_646.irv:19911351932windows-519320920001x-cp20001")
  330. };
  331.  
  332. int x_GetEncodingCodePage( MCD_CSTR pszEncoding )
  333. {
  334. // redo for completeness, the iconv set, UTF-32, and uppercase
  335.  
  336. // Lookup strEncoding in EncodingNameTable and return Windows code page
  337. int nCodePage = -;
  338. int nEncLen = MCD_PSZLEN( pszEncoding );
  339. if ( ! nEncLen )
  340. nCodePage = MCD_ACP;
  341. else if ( x_StrNCmp(pszEncoding,MCD_T("UTF-32"),) == )
  342. nCodePage = MCD_UTF32;
  343. else if ( nEncLen < )
  344. {
  345. MCD_CHAR szEncodingLower[];
  346. for ( int nEncChar=; nEncChar<nEncLen; ++nEncChar )
  347. {
  348. MCD_CHAR cEncChar = pszEncoding[nEncChar];
  349. szEncodingLower[nEncChar] = (cEncChar>='A' && cEncChar<='Z')? (MCD_CHAR)(cEncChar+('a'-'A')) : cEncChar;
  350. }
  351. szEncodingLower[nEncLen] = '\0';
  352. MCD_PCSZ pEntry = EncodingNameTable[x_Hash(szEncodingLower,sizeof(EncodingNameTable)/sizeof(MCD_PCSZ))];
  353. while ( *pEntry )
  354. {
  355. // e.g. entry: 0565001utf-8 means length 05, code page 65001, encoding name utf-8
  356. int nEntryLen = (*pEntry - '') * ;
  357. ++pEntry;
  358. nEntryLen += (*pEntry - '');
  359. ++pEntry;
  360. MCD_PCSZ pCodePage = pEntry;
  361. pEntry += ;
  362. if ( nEntryLen == nEncLen && x_StrNCmp(szEncodingLower,pEntry,nEntryLen) == )
  363. {
  364. // Convert digits to integer up to code name which always starts with alpha
  365. nCodePage = MCD_PSZTOL( pCodePage, NULL, );
  366. break;
  367. }
  368. pEntry += nEntryLen;
  369. }
  370. }
  371. return nCodePage;
  372. }
  373.  
  374. #if ! defined(MARKUP_WCHAR)
  375. bool TextEncoding::CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding )
  376. {
  377. // Return true if MB to MB conversion is possible
  378. #if defined(MARKUP_ICONV)
  379. // iconv_open should fail if either encoding not supported or one is alias for other
  380. char szTo[], szFrom[];
  381. iconv_t cd = iconv_open( IConvName(szTo,pszToEncoding), IConvName(szFrom,pszFromEncoding) );
  382. if ( cd == (iconv_t)- )
  383. return false;
  384. iconv_close(cd);
  385. #else
  386. int nToCP = x_GetEncodingCodePage( pszToEncoding );
  387. int nFromCP = x_GetEncodingCodePage( pszFromEncoding );
  388. if ( nToCP == - || nFromCP == - )
  389. return false;
  390. #if defined(MARKUP_WINCONV)
  391. if ( nToCP == MCD_ACP || nFromCP == MCD_ACP ) // either ACP ANSI?
  392. {
  393. int nACP = GetACP();
  394. if ( nToCP == MCD_ACP )
  395. nToCP = nACP;
  396. if ( nFromCP == MCD_ACP )
  397. nFromCP = nACP;
  398. }
  399. #else // no conversion API, but we can do AToUTF8 and UTF8ToA
  400. if ( nToCP != MCD_UTF8 && nFromCP != MCD_UTF8 ) // either UTF-8?
  401. return false;
  402. #endif // no conversion API
  403. if ( nToCP == nFromCP )
  404. return false;
  405. #endif // not ICONV
  406. return true;
  407. }
  408. #endif // not WCHAR
  409.  
  410. #if defined(MARKUP_ICONV)
  411. const char* TextEncoding::IConvName( char* szEncoding, MCD_CSTR pszEncoding )
  412. {
  413. // Make upper case char-based name from strEncoding which consists only of characters in the ASCII range
  414. int nEncChar = ;
  415. while ( pszEncoding[nEncChar] )
  416. {
  417. char cEncChar = (char)pszEncoding[nEncChar];
  418. szEncoding[nEncChar] = (cEncChar>='a' && cEncChar<='z')? (cEncChar-('a'-'A')) : cEncChar;
  419. ++nEncChar;
  420. }
  421. if ( nEncChar == && x_StrNCmp(szEncoding,"UTF-16",) == )
  422. {
  423. szEncoding[nEncChar++] = 'B';
  424. szEncoding[nEncChar++] = 'E';
  425. }
  426. szEncoding[nEncChar] = '\0';
  427. return szEncoding;
  428. }
  429.  
  430. int TextEncoding::IConv( void* pTo, int nToCharSize, int nFromCharSize )
  431. {
  432. // Converts from m_pFrom to pTo
  433. char szTo[], szFrom[];
  434. iconv_t cd = iconv_open( IConvName(szTo,m_strToEncoding), IConvName(szFrom,m_strFromEncoding) );
  435. int nToLenBytes = ;
  436. if ( cd != (iconv_t)- )
  437. {
  438. size_t nFromLenRemaining = (size_t)m_nFromLen * nFromCharSize;
  439. size_t nToCountRemaining = (size_t)m_nToCount * nToCharSize;
  440. size_t nToCountRemainingBefore;
  441. char* pToChar = (char*)pTo;
  442. char* pFromChar = (char*)m_pFrom;
  443. char* pToTempBuffer = NULL;
  444. const size_t nTempBufferSize = ;
  445. size_t nResult;
  446. if ( ! pTo )
  447. {
  448. pToTempBuffer = new char[nTempBufferSize];
  449. pToChar = pToTempBuffer;
  450. nToCountRemaining = nTempBufferSize;
  451. }
  452. while ( nFromLenRemaining )
  453. {
  454. nToCountRemainingBefore = nToCountRemaining;
  455. nResult = iconv( cd, &pFromChar, &nFromLenRemaining, &pToChar, &nToCountRemaining );
  456. nToLenBytes += (int)(nToCountRemainingBefore - nToCountRemaining);
  457. if ( nResult == (size_t)- )
  458. {
  459. int nErrno = errno;
  460. if ( nErrno == EILSEQ )
  461. {
  462. // Bypass bad char, question mark denotes problem in source string
  463. pFromChar += nFromCharSize;
  464. nFromLenRemaining -= nFromCharSize;
  465. if ( nToCharSize == )
  466. *pToChar = '?';
  467. else if ( nToCharSize == )
  468. *((unsigned short*)pToChar) = (unsigned short)'?';
  469. else if ( nToCharSize == )
  470. *((unsigned int*)pToChar) = (unsigned int)'?';
  471. pToChar += nToCharSize;
  472. nToCountRemaining -= nToCharSize;
  473. nToLenBytes += nToCharSize;
  474. size_t nInitFromLen = , nInitToCount = ;
  475. iconv(cd, NULL, &nInitFromLen ,NULL, &nInitToCount );
  476. }
  477. else if ( nErrno == EINVAL )
  478. break; // incomplete character or shift sequence at end of input
  479. else if ( nErrno == E2BIG && !pToTempBuffer )
  480. break; // output buffer full should only happen when using a temp buffer
  481. }
  482. else
  483. m_nFailedChars += nResult;
  484. if ( pToTempBuffer && nToCountRemaining < )
  485. {
  486. nToCountRemaining = nTempBufferSize;
  487. pToChar = pToTempBuffer;
  488. }
  489. }
  490. if ( pToTempBuffer )
  491. delete[] pToTempBuffer;
  492. iconv_close(cd);
  493. }
  494. return nToLenBytes / nToCharSize;
  495. }
  496. #endif
  497.  
  #if defined(MARKUP_WINCONV)
  // Return true when WideCharToMultiByte must be called with a NULL lpUsedDefaultChar for
  // code page nCP. Callers in this file pass NULL instead of &m_nFailedChars when this is true.
  // Listed: UTF-7 (65000), UTF-8 (65001), the ISO-2022 family (50220-50229 entries below),
  // HZ-GB-2312 (52936), GB18030 (54936) and the ISCII range 57002-57011.
  bool x_NoDefaultChar( int nCP )
  {
    // WideCharToMultiByte fails if lpUsedDefaultChar is non-NULL for these code pages:
    return (bool)(nCP == 65000 || nCP == 65001 || nCP == 50220 || nCP == 50221 || nCP == 50222 || nCP == 50225 ||
      nCP == 50227 || nCP == 50229 || nCP == 52936 || nCP == 54936 || (nCP >= 57002 && nCP <= 57011) );
  }
  #endif
  506.  
  507. int TextEncoding::PerformConversion( void* pTo, MCD_CSTR pszToEncoding/*=NULL*/ )
  508. {
  509. // If pTo is not NULL, it must be large enough to hold result, length of result is returned
  510. // m_nFailedChars will be set to >0 if characters not supported in strToEncoding
  511. int nToLen = ;
  512. if ( pszToEncoding.pcsz )
  513. m_strToEncoding = pszToEncoding;
  514. int nToCP = x_GetEncodingCodePage( m_strToEncoding );
  515. if ( nToCP == - )
  516. nToCP = MCD_ACP;
  517. int nFromCP = x_GetEncodingCodePage( m_strFromEncoding );
  518. if ( nFromCP == - )
  519. nFromCP = MCD_ACP;
  520. m_nFailedChars = ;
  521.  
  522. #if ! defined(MARKUP_WINCONV) && ! defined(MARKUP_ICONV)
  523. // Only non-Unicode encoding supported is locale charset, must call setlocale
  524. if ( nToCP != MCD_UTF8 && nToCP != MCD_UTF16 && nToCP != MCD_UTF32 )
  525. nToCP = MCD_ACP;
  526. if ( nFromCP != MCD_UTF8 && nFromCP != MCD_UTF16 && nFromCP != MCD_UTF32 )
  527. nFromCP = MCD_ACP;
  528. if ( nFromCP == MCD_ACP )
  529. {
  530. const char* pA = (const char*)m_pFrom;
  531. int nALenRemaining = m_nFromLen;
  532. int nCharLen;
  533. wchar_t wcChar;
  534. char* pU = (char*)pTo;
  535. while ( nALenRemaining )
  536. {
  537. nCharLen = mbtowc( &wcChar, pA, nALenRemaining );
  538. if ( nCharLen < )
  539. {
  540. wcChar = (wchar_t)'?';
  541. nCharLen = ;
  542. }
  543. pA += nCharLen;
  544. nALenRemaining -= nCharLen;
  545. if ( nToCP == MCD_UTF8 )
  546. CMarkup::EncodeCharUTF8( (int)wcChar, pU, nToLen );
  547. else if ( nToCP == MCD_UTF16 )
  548. CMarkup::EncodeCharUTF16( (int)wcChar, (unsigned short*)pU, nToLen );
  549. else // UTF32
  550. {
  551. if ( pU )
  552. ((unsigned int*)pU)[nToLen] = (unsigned int)wcChar;
  553. ++nToLen;
  554. }
  555. }
  556. }
  557. else if ( nToCP == MCD_ACP )
  558. {
  559. union pUnicodeUnion { const char* p8; const unsigned short* p16; const unsigned int* p32; } pU;
  560. pU.p8 = (const char*)m_pFrom;
  561. const char* pUEnd = pU.p8 + m_nFromLen;
  562. if ( nFromCP == MCD_UTF16 )
  563. pUEnd = (char*)( pU.p16 + m_nFromLen );
  564. else if ( nFromCP == MCD_UTF32 )
  565. pUEnd = (char*)( pU.p32 + m_nFromLen );
  566. int nCharLen;
  567. char* pA = (char*)pTo;
  568. char szA[];
  569. int nUChar;
  570. while ( pU.p8 != pUEnd )
  571. {
  572. if ( nFromCP == MCD_UTF8 )
  573. nUChar = CMarkup::DecodeCharUTF8( pU.p8, pUEnd );
  574. else if ( nFromCP == MCD_UTF16 )
  575. nUChar = CMarkup::DecodeCharUTF16( pU.p16, (const unsigned short*)pUEnd );
  576. else // UTF32
  577. nUChar = *(pU.p32)++;
  578. if ( nUChar == - )
  579. nCharLen = -;
  580. else if ( nUChar & ~0xffff )
  581. nCharLen = -;
  582. else
  583. nCharLen = wctomb( pA?pA:szA, (wchar_t)nUChar );
  584. if ( nCharLen < )
  585. {
  586. if ( nCharLen == - )
  587. ++m_nFailedChars;
  588. nCharLen = ;
  589. if ( pA )
  590. *pA = '?';
  591. }
  592. if ( pA )
  593. pA += nCharLen;
  594. nToLen += nCharLen;
  595. }
  596. }
  597. #endif // not WINCONV and not ICONV
  598.  
  599. if ( nFromCP == MCD_UTF32 )
  600. {
  601. const unsigned int* p32 = (const unsigned int*)m_pFrom;
  602. const unsigned int* p32End = p32 + m_nFromLen;
  603. if ( nToCP == MCD_UTF8 )
  604. {
  605. char* p8 = (char*)pTo;
  606. while ( p32 != p32End )
  607. CMarkup::EncodeCharUTF8( *p32++, p8, nToLen );
  608. }
  609. else if ( nToCP == MCD_UTF16 )
  610. {
  611. unsigned short* p16 = (unsigned short*)pTo;
  612. while ( p32 != p32End )
  613. CMarkup::EncodeCharUTF16( (int)*p32++, p16, nToLen );
  614. }
  615. else // to ANSI
  616. {
  617. // WINCONV not supported for 32To8, since only used for sizeof(wchar_t) == 4
  618. #if defined(MARKUP_ICONV)
  619. nToLen = IConv( pTo, , );
  620. #endif // ICONV
  621. }
  622. }
  623. else if ( nFromCP == MCD_UTF16 )
  624. {
  625. // UTF16To8 will be deprecated since weird output buffer size sensitivity not worth implementing here
  626. const unsigned short* p16 = (const unsigned short*)m_pFrom;
  627. const unsigned short* p16End = p16 + m_nFromLen;
  628. int nUChar;
  629. if ( nToCP == MCD_UTF32 )
  630. {
  631. unsigned int* p32 = (unsigned int*)pTo;
  632. while ( p16 != p16End )
  633. {
  634. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  635. if ( nUChar == - )
  636. nUChar = '?';
  637. if ( p32 )
  638. p32[nToLen] = (unsigned int)nUChar;
  639. ++nToLen;
  640. }
  641. }
  642. #if defined(MARKUP_WINCONV)
  643. else // to UTF-8 or other multi-byte
  644. {
  645. nToLen = WideCharToMultiByte(nToCP,,(const wchar_t*)m_pFrom,m_nFromLen,(char*)pTo,
  646. m_nToCount?m_nToCount+:,NULL,x_NoDefaultChar(nToCP)?NULL:&m_nFailedChars);
  647. }
  648. #else // not WINCONV
  649. else if ( nToCP == MCD_UTF8 )
  650. {
  651. char* p8 = (char*)pTo;
  652. while ( p16 != p16End )
  653. {
  654. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  655. if ( nUChar == - )
  656. nUChar = '?';
  657. CMarkup::EncodeCharUTF8( nUChar, p8, nToLen );
  658. }
  659. }
  660. else // to ANSI
  661. {
  662. #if defined(MARKUP_ICONV)
  663. nToLen = IConv( pTo, , );
  664. #endif // ICONV
  665. }
  666. #endif // not WINCONV
  667. }
  668. else if ( nToCP == MCD_UTF16 ) // to UTF-16 from UTF-8/ANSI
  669. {
  670. #if defined(MARKUP_WINCONV)
  671. nToLen = MultiByteToWideChar(nFromCP,,(const char*)m_pFrom,m_nFromLen,(wchar_t*)pTo,m_nToCount);
  672. #else // not WINCONV
  673. if ( nFromCP == MCD_UTF8 )
  674. {
  675. const char* p8 = (const char*)m_pFrom;
  676. const char* p8End = p8 + m_nFromLen;
  677. int nUChar;
  678. unsigned short* p16 = (unsigned short*)pTo;
  679. while ( p8 != p8End )
  680. {
  681. nUChar = CMarkup::DecodeCharUTF8( p8, p8End );
  682. if ( nUChar == - )
  683. nUChar = '?';
  684. if ( p16 )
  685. p16[nToLen] = (unsigned short)nUChar;
  686. ++nToLen;
  687. }
  688. }
  689. else // from ANSI
  690. {
  691. #if defined(MARKUP_ICONV)
  692. nToLen = IConv( pTo, , );
  693. #endif // ICONV
  694. }
  695. #endif // not WINCONV
  696. }
  697. else if ( nToCP == MCD_UTF32 ) // to UTF-32 from UTF-8/ANSI
  698. {
  699. if ( nFromCP == MCD_UTF8 )
  700. {
  701. const char* p8 = (const char*)m_pFrom;
  702. const char* p8End = p8 + m_nFromLen;
  703. int nUChar;
  704. unsigned int* p32 = (unsigned int*)pTo;
  705. while ( p8 != p8End )
  706. {
  707. nUChar = CMarkup::DecodeCharUTF8( p8, p8End );
  708. if ( nUChar == - )
  709. nUChar = '?';
  710. if ( p32 )
  711. p32[nToLen] = (unsigned int)nUChar;
  712. ++nToLen;
  713. }
  714. }
  715. else // from ANSI
  716. {
  717. // WINCONV not supported for ATo32, since only used for sizeof(wchar_t) == 4
  718. #if defined(MARKUP_ICONV)
  719. // nToLen = IConv( pTo, 4, 1 );
  720. // Linux: had trouble getting IConv to leave the BOM off of the UTF-32 output stream
  721. // So converting via UTF-16 with native endianness
  722. unsigned short* pwszUTF16 = new unsigned short[m_nFromLen];
  723. MCD_STR strToEncoding = m_strToEncoding;
  724. m_strToEncoding = MCD_T("UTF-16BE");
  725. short nEndianTest = ;
  726. if ( ((char*)&nEndianTest)[] ) // Little-endian?
  727. m_strToEncoding = MCD_T("UTF-16LE");
  728. m_nToCount = m_nFromLen;
  729. int nUTF16Len = IConv( pwszUTF16, , );
  730. m_strToEncoding = strToEncoding;
  731. const unsigned short* p16 = (const unsigned short*)pwszUTF16;
  732. const unsigned short* p16End = p16 + nUTF16Len;
  733. int nUChar;
  734. unsigned int* p32 = (unsigned int*)pTo;
  735. while ( p16 != p16End )
  736. {
  737. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  738. if ( nUChar == - )
  739. nUChar = '?';
  740. if ( p32 )
  741. *p32++ = (unsigned int)nUChar;
  742. ++nToLen;
  743. }
  744. delete[] pwszUTF16;
  745. #endif // ICONV
  746. }
  747. }
  748. else
  749. {
  750. #if defined(MARKUP_ICONV)
  751. nToLen = IConv( pTo, , );
  752. #elif defined(MARKUP_WINCONV)
  753. wchar_t* pwszUTF16 = new wchar_t[m_nFromLen];
  754. int nUTF16Len = MultiByteToWideChar(nFromCP,,(const char*)m_pFrom,m_nFromLen,pwszUTF16,m_nFromLen);
  755. nToLen = WideCharToMultiByte(nToCP,,pwszUTF16,nUTF16Len,(char*)pTo,m_nToCount,NULL,
  756. x_NoDefaultChar(nToCP)?NULL:&m_nFailedChars);
  757. delete[] pwszUTF16;
  758. #endif // WINCONV
  759. }
  760.  
  761. // Store the length in case this is called again after allocating output buffer to fit
  762. m_nToCount = nToLen;
  763. return nToLen;
  764. }
  765.  
  // Determine how many bytes at the end of the m_pFrom buffer belong to an incomplete character.
  // nTruncBeforeBytes receives the result; the return value is bSuccess, which only the
  // conversion-based test below can set to false.
  // Side effects visible here: the conversion-based test overwrites m_strToEncoding, m_nToCount,
  // m_pFrom and m_nFromLen and does not put m_pFrom / m_nFromLen back afterwards.
  // NOTE(review): this listing has lost its decimal literals (e.g. "nTruncBeforeBytes = ;" and
  // every "case :" label below), so the function does not compile as shown; the code page
  // numbers must be recovered from the upstream source.
  766. bool TextEncoding::FindRaggedEnd( int& nTruncBeforeBytes )
  767. {
  768. // Check for ragged end UTF-16 or multi-byte according to m_strToEncoding, expects at least 40 bytes to work with
  769. bool bSuccess = true;
  770. nTruncBeforeBytes = ;
  771. int nCP = x_GetEncodingCodePage( m_strFromEncoding );
  772. if ( nCP == MCD_UTF16 )
  773. {
  // UTF-16: try to decode starting at the last unit of the buffer
  774. unsigned short* pUTF16Buffer = (unsigned short*)m_pFrom;
  775. const unsigned short* pUTF16Last = &pUTF16Buffer[m_nFromLen-];
  776. if ( CMarkup::DecodeCharUTF16(pUTF16Last,&pUTF16Buffer[m_nFromLen]) == - )
  777. nTruncBeforeBytes = ;
  778. }
  779. else // UTF-8, SBCS DBCS
  780. {
  781. if ( nCP == MCD_UTF8 )
  782. {
  // UTF-8: step backwards from the end while decoding at that position fails
  783. char* pUTF8Buffer = (char*)m_pFrom;
  784. char* pUTF8End = &pUTF8Buffer[m_nFromLen];
  785. int nLast = m_nFromLen - ;
  786. const char* pUTF8Last = &pUTF8Buffer[nLast];
  787. while ( nLast > && CMarkup::DecodeCharUTF8(pUTF8Last,pUTF8End) == - )
  788. pUTF8Last = &pUTF8Buffer[--nLast];
  789. nTruncBeforeBytes = (int)(pUTF8End - pUTF8Last);
  790. }
  791. else
  792. {
  793. // Do a conversion-based test unless we can determine it is not multi-byte
  794. // If m_strEncoding="" default code page then GetACP can tell us the code page, otherwise just do the test
  795. #if defined(MARKUP_WINCONV)
  796. if ( nCP == )
  797. nCP = GetACP();
  798. #endif
  799. int nMultibyteCharsToTest = ;
  800. switch ( nCP )
  801. {
  802. case :
  803. nMultibyteCharsToTest = ;
  // no break: the first case falls through into the shared list below
  804. case : case : case : case : case : case : case : // Japanese
  805. case : case : case : case : case : case : // Korean
  806. case : case : case : case : case : // Taiwan
  807. case : case : case : case : case : case : case : // EBCDIC
  808. case : case : case : case : // Chinese
  809. case : case : case : case : case : case : // Chinese
  810. nCP = ;
  811. break;
  812. }
  813. if ( nMultibyteCharsToTest > m_nFromLen )
  814. nMultibyteCharsToTest = m_nFromLen;
  815. if ( nCP == && nMultibyteCharsToTest )
  816. {
  817. /*
  818. 1. convert the piece to Unicode with MultiByteToWideChar
  819. 2. Identify at least two Unicode code point boundaries at the end of
  820. the converted piece by stepping backwards from the end and re-
  821. converting the final 2 bytes, 3 bytes, 4 bytes etc, comparing the
  822. converted end string to the end of the entire converted piece to find
  823. a valid code point boundary.
  824. 3. Upon finding a code point boundary, I still want to make sure it
  825. will convert the same separately on either side of the divide as it
  826. does together, so separately convert the first byte and the remaining
  827. bytes and see if the result together is the same as the whole end, if
  828. not try the first two bytes and the remaining bytes. etc., until I
  829. find a useable dividing point. If none found, go back to step 2 and
  830. get a longer end string to try.
  831. */
  // Step 1: convert the whole buffer to UTF-16 as the reference result
  832. m_strToEncoding = MCD_T("UTF-16");
  833. m_nToCount = m_nFromLen*;
  834. unsigned short* pUTF16Buffer = new unsigned short[m_nToCount];
  835. int nUTF16Len = PerformConversion( (void*)pUTF16Buffer );
  836. int nOriginalByteLen = m_nFromLen;
  837.  
  838. // Guaranteed to have at least MARKUP_FILEBLOCKSIZE/2 bytes to work with
  // NOTE(review): the header comment mentions 40 bytes; presumably that is this limit -- confirm
  839. const int nMaxBytesToTry = ;
  840. unsigned short wsz16End[nMaxBytesToTry*];
  841. unsigned short wsz16EndDivided[nMaxBytesToTry*];
  842. const char* pszOriginalBytes = (const char*)m_pFrom;
  843. int nBoundariesFound = ;
  844. bSuccess = false;
  845. while ( nTruncBeforeBytes < nMaxBytesToTry && ! bSuccess )
  846. {
  // Step 2: convert a tail that grows by one byte per pass and compare with the reference tail
  847. ++nTruncBeforeBytes;
  848. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes];
  849. m_nFromLen = nTruncBeforeBytes;
  850. m_nToCount = nMaxBytesToTry*;
  851. int nEndUTF16Len = PerformConversion( (void*)wsz16End );
  852. if ( nEndUTF16Len && memcmp(wsz16End,&pUTF16Buffer[nUTF16Len-nEndUTF16Len],nEndUTF16Len*) == )
  853. {
  854. ++nBoundariesFound;
  855. if ( nBoundariesFound > )
  856. {
  // Step 3: look for a split point where both halves convert to the same result as the whole tail
  857. int nDivideAt = ;
  858. while ( nDivideAt < nTruncBeforeBytes )
  859. {
  860. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes];
  861. m_nFromLen = nDivideAt;
  862. m_nToCount = nMaxBytesToTry*;
  863. int nDividedUTF16Len = PerformConversion( (void*)wsz16EndDivided );
  864. if ( nDividedUTF16Len )
  865. {
  866. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes+nDivideAt];
  867. m_nFromLen = nTruncBeforeBytes-nDivideAt;
  868. m_nToCount = nMaxBytesToTry*-nDividedUTF16Len;
  869. nDividedUTF16Len += PerformConversion( (void*)&wsz16EndDivided[nDividedUTF16Len] );
  // m_nToCount here is the length PerformConversion stored for the second half
  870. if ( m_nToCount && nEndUTF16Len == nDividedUTF16Len && memcmp(wsz16End,wsz16EndDivided,nEndUTF16Len) == )
  871. {
  872. nTruncBeforeBytes -= nDivideAt;
  873. bSuccess = true;
  874. break;
  875. }
  876. }
  877. ++nDivideAt;
  878. }
  879. }
  880. }
  881. }
  882. delete [] pUTF16Buffer;
  883. }
  884. }
  885. }
  886. return bSuccess;
  887. }
  888.  
  889. bool x_EndianSwapRequired( int nDocFlags )
  890. {
  891. short nWord = ;
  892. char cFirstByte = ((char*)&nWord)[];
  893. if ( cFirstByte ) // LE
  894. {
  895. if ( nDocFlags & CMarkup::MDF_UTF16BEFILE )
  896. return true;
  897. }
  898. else if ( nDocFlags & CMarkup::MDF_UTF16LEFILE )
  899. return true;
  900. return false;
  901. }
  902.  
  // Swap the two bytes of every 16-bit unit in pBuffer, in place.
  //
  // pBuffer:  array of 16-bit code units
  // nCharLen: number of units to swap; the loop runs from the last unit down to the first
  void x_EndianSwapUTF16( unsigned short* pBuffer, int nCharLen )
  {
    unsigned short cChar;
    while ( nCharLen-- )
    {
      cChar = pBuffer[nCharLen];
      pBuffer[nCharLen] = (unsigned short)((cChar<<8) | (cChar>>8));
    }
  }
  912.  
  913. //////////////////////////////////////////////////////////////////////
  914. // Element position indexes
  915. // This is the primary means of storing the layout of the document
  916. //
  // Position record for one element: offsets and lengths plus links to related ElemPos
  // entries by index. The default constructor leaves all fields uninitialized.
  struct ElemPos
  {
    ElemPos() {};
    ElemPos( const ElemPos& pos ) { *this = pos; };
    int StartTagLen() const { return nStartTagLen; };
    void SetStartTagLen( int n ) { nStartTagLen = n; };
    void AdjustStartTagLen( int n ) { nStartTagLen += n; };
    int EndTagLen() const { return nEndTagLen; };
    void SetEndTagLen( int n ) { nEndTagLen = n; };
    // Empty element: the start tag spans the whole element
    bool IsEmptyElement() { return (StartTagLen()==nLength)?true:false; };
    int StartContent() const { return nStart + StartTagLen(); };
    int ContentLen() const { return nLength - StartTagLen() - EndTagLen(); };
    int StartAfter() const { return nStart + nLength; };
    // Level lives in the low 16 bits of nFlags, the remaining bits are flags
    int Level() const { return nFlags & 0xffff; };
    void SetLevel( int nLev ) { nFlags = (nFlags & ~0xffff) | nLev; };
    void ClearVirtualParent() { memset(this,0,sizeof(ElemPos)); };
    // An end tag length of 1 is the marker for "not parsed yet"
    void SetEndTagLenUnparsed() { SetEndTagLen(1); };
    bool IsUnparsed() { return EndTagLen() == 1; };

    // Memory size: 8 32-bit integers == 32 bytes
    int nStart;
    int nLength;
    unsigned int nStartTagLen : 22; // 4MB limit for start tag
    unsigned int nEndTagLen : 10; // 1K limit for end tag
    int nFlags; // 16 bits flags, 16 bits level 65536 depth limit
    int iElemParent;
    int iElemChild; // first child
    int iElemNext; // next sibling
    int iElemPrev; // if this is first, iElemPrev points to last
  };
  947.  
  // Internal node flag values.
  // NOTE(review): MNF_PUBLIC and MNF_USER set more than one bit, so they look like masks
  // rather than single flags -- confirm against their users.
  enum MarkupNodeFlagsInternal2
  {
    MNF_REPLACE   = 0x0001000,
    MNF_QUOTED    = 0x0008000,
    MNF_EMPTY     = 0x0010000,
    MNF_DELETED   = 0x0020000,
    MNF_FIRST     = 0x0080000,
    MNF_PUBLIC    = 0x0300000,
    MNF_ILLFORMED = 0x0800000,
    MNF_USER      = 0xf000000
  };
  959.  
  960. struct ElemPosTree
  961. {
  962. ElemPosTree() { Clear(); };
  963. ~ElemPosTree() { Release(); };
  964. enum { PA_SEGBITS = , PA_SEGMASK = 0xffff };
  965. void ReleaseElemPosTree() { Release(); Clear(); };
  966. void Release() { for (int n=;n<SegsUsed();++n) delete[] (char*)m_pSegs[n]; if (m_pSegs) delete[] (char*)m_pSegs; };
  967. void Clear() { m_nSegs=; m_nSize=; m_pSegs=NULL; };
  968. int GetSize() const { return m_nSize; };
  969. int SegsUsed() const { return ((m_nSize-)>>PA_SEGBITS) + ; };
  970. ElemPos& GetRefElemPosAt(int i) const { return m_pSegs[i>>PA_SEGBITS][i&PA_SEGMASK]; };
  971. void CopyElemPosTree( ElemPosTree* pOtherTree, int n );
  972. void GrowElemPosTree( int nNewSize );
  973. private:
  974. ElemPos** m_pSegs;
  975. int m_nSize;
  976. int m_nSegs;
  977. };
  978.  
  979. void ElemPosTree::CopyElemPosTree( ElemPosTree* pOtherTree, int n )
  980. {
  981. ReleaseElemPosTree();
  982. m_nSize = n;
  983. if ( m_nSize < )
  984. m_nSize = ;
  985. m_nSegs = SegsUsed();
  986. if ( m_nSegs )
  987. {
  988. m_pSegs = (ElemPos**)(new char[m_nSegs*sizeof(char*)]);
  989. int nSegSize = << PA_SEGBITS;
  990. for ( int nSeg=; nSeg < m_nSegs; ++nSeg )
  991. {
  992. if ( nSeg + == m_nSegs )
  993. nSegSize = m_nSize - (nSeg << PA_SEGBITS);
  994. m_pSegs[nSeg] = (ElemPos*)(new char[nSegSize*sizeof(ElemPos)]);
  995. memcpy( m_pSegs[nSeg], pOtherTree->m_pSegs[nSeg], nSegSize*sizeof(ElemPos) );
  996. }
  997. }
  998. }
  999.  
  1000. void ElemPosTree::GrowElemPosTree( int nNewSize )
  1001. {
  1002. // Called by x_AllocElemPos when the document is created or the array is filled
  1003. // The ElemPosTree class is implemented using segments to reduce contiguous memory requirements
  1004. // It reduces reallocations (copying of memory) since this only occurs within one segment
  1005. // The "Grow By" algorithm ensures there are no reallocations after 2 segments
  1006. //
  1007. // Grow By: new size can be at most one more complete segment
  1008. int nSeg = (m_nSize?m_nSize-:) >> PA_SEGBITS;
  1009. int nNewSeg = (nNewSize-) >> PA_SEGBITS;
  1010. if ( nNewSeg > nSeg + )
  1011. {
  1012. nNewSeg = nSeg + ;
  1013. nNewSize = (nNewSeg+) << PA_SEGBITS;
  1014. }
  1015.  
  1016. // Allocate array of segments
  1017. if ( m_nSegs <= nNewSeg )
  1018. {
  1019. int nNewSegments = + nNewSeg * ;
  1020. char* pNewSegments = new char[nNewSegments*sizeof(char*)];
  1021. if ( SegsUsed() )
  1022. memcpy( pNewSegments, m_pSegs, SegsUsed()*sizeof(char*) );
  1023. if ( m_pSegs )
  1024. delete[] (char*)m_pSegs;
  1025. m_pSegs = (ElemPos**)pNewSegments;
  1026. m_nSegs = nNewSegments;
  1027. }
  1028.  
  1029. // Calculate segment sizes
  1030. int nSegSize = m_nSize - (nSeg << PA_SEGBITS);
  1031. int nNewSegSize = nNewSize - (nNewSeg << PA_SEGBITS);
  1032.  
  1033. // Complete first segment
  1034. int nFullSegSize = << PA_SEGBITS;
  1035. if ( nSeg < nNewSeg && nSegSize < nFullSegSize )
  1036. {
  1037. char* pNewFirstSeg = new char[ nFullSegSize * sizeof(ElemPos) ];
  1038. if ( nSegSize )
  1039. {
  1040. // Reallocate
  1041. memcpy( pNewFirstSeg, m_pSegs[nSeg], nSegSize * sizeof(ElemPos) );
  1042. delete[] (char*)m_pSegs[nSeg];
  1043. }
  1044. m_pSegs[nSeg] = (ElemPos*)pNewFirstSeg;
  1045. }
  1046.  
  1047. // New segment
  1048. char* pNewSeg = new char[ nNewSegSize * sizeof(ElemPos) ];
  1049. if ( nNewSeg == nSeg && nSegSize )
  1050. {
  1051. // Reallocate
  1052. memcpy( pNewSeg, m_pSegs[nSeg], nSegSize * sizeof(ElemPos) );
  1053. delete[] (char*)m_pSegs[nSeg];
  1054. }
  1055. m_pSegs[nNewSeg] = (ElemPos*)pNewSeg;
  1056. m_nSize = nNewSize;
  1057. }
  1058.  
  // Shorthand for the ElemPos entry at index i; expands to a call on m_pElemPosTree,
  // so it can only be used where that member is in scope
  #define ELEM(i) m_pElemPosTree->GetRefElemPosAt(i)
  1060.  
  1061. //////////////////////////////////////////////////////////////////////
  1062. // NodePos stores information about an element or node during document creation and parsing
  1063. //
  1064. struct NodePos
  1065. {
  1066. NodePos() {};
  1067. NodePos( int n ) { nNodeFlags=n; nNodeType=; nStart=; nLength=; };
  1068. int nNodeType;
  1069. int nStart;
  1070. int nLength;
  1071. int nNodeFlags;
  1072. MCD_STR strMeta;
  1073. };
  1074.  
  //////////////////////////////////////////////////////////////////////
  // "Is Char" defines
  // Quickly determine if a character matches a limited set
  //
  // x_ISONEOF(c,f,l,s): if c lies in the inclusive range [f,l], yield the table
  // entry s[c-f] as an int (non-zero means "in the set"), otherwise 0.
  // Note that c is evaluated more than once, so pass an expression without side effects.
  #define x_ISONEOF(c,f,l,s) (((c)>=(f)&&(c)<=(l))?(int)((s)[(c)-(f)]):0)
  // classic whitespace " \t\n\r"
  #define x_ISWHITESPACE(c) x_ISONEOF(c,9,32,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // end of word in a path " =/[]"
  #define x_ISENDPATHWORD(c) x_ISONEOF(c,32,93,"\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\5")
  // end of a name " \t\n\r/>"
  #define x_ISENDNAME(c) x_ISONEOF(c,9,62,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // a small set of chars cannot be second last in attribute value " \t\n\r\"\'"
  #define x_ISNOTSECONDLASTINVAL(c) x_ISONEOF(c,9,39,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\5\0\0\0\0\1")
  // first char of doc type tag name "EAN"
  #define x_ISDOCTYPESTART(c) x_ISONEOF(c,65,78,"\2\0\0\0\1\0\0\0\0\0\0\0\0\3")
  // attrib special char "<&>\"\'"
  #define x_ISATTRIBSPECIAL(c) x_ISONEOF(c,34,62,"\4\0\0\0\2\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\3")
  // parsed text special char "<&>"
  #define x_ISSPECIAL(c) x_ISONEOF(c,38,62,"\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\3")
  // end of any name " \t\n\r<>=\\/?!\"';"
  #define x_ISENDANYNAME(c) x_ISONEOF(c,9,92,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\1\1\0\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\1\5\1\1\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // end of unquoted attrib value " \t\n\r>"
  #define x_ISENDUNQUOTED(c) x_ISONEOF(c,9,62,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5")
  // end of attrib name "= \t\n\r>/?"
  #define x_ISENDATTRIBNAME(c) x_ISONEOF(c,9,63,"\3\4\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\1\1\1")
  // start of entity reference "A-Za-z#_:"
  #define x_ISSTARTENTREF(c) x_ISONEOF(c,35,122,"\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\1\2\3\4\5\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\1\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1")
  // within entity reference "A-Za-z0-9_:-."
  #define x_ISINENTREF(c) x_ISONEOF(c,45,122,"\1\1\0\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\1\2\3\4\5\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\1\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1")
  1104.  
  1105. //////////////////////////////////////////////////////////////////////
  1106. // Token struct and tokenizing functions
  1107. // TokenPos handles parsing operations on a constant text pointer
  1108. //
  1109. struct TokenPos
  1110. {
  1111. TokenPos( MCD_CSTR sz, int n, FilePos* p=NULL ) { Clear(); m_pDocText=sz; m_nTokenFlags=n; m_pReaderFilePos=p; };
  1112. void Clear() { m_nL=; m_nR=-; m_nNext=; };
  1113. int Length() const { return m_nR - m_nL + ; };
  1114. MCD_PCSZ GetTokenPtr() const { return &m_pDocText[m_nL]; };
  1115. MCD_STR GetTokenText() const { return MCD_STR( GetTokenPtr(), Length() ); };
  1116. MCD_CHAR NextChar() { m_nNext += MCD_CLEN(&m_pDocText[m_nNext]); return m_pDocText[m_nNext]; };
  1117. int WhitespaceToTag( int n ) { m_nNext = n; if (FindAny()&&m_pDocText[m_nNext]!='<') { m_nNext=n; m_nR=n-; } return m_nNext; };
  1118. bool FindAny()
  1119. {
  1120. // Go to non-whitespace or end
  1121. MCD_CHAR cNext = m_pDocText[m_nNext];
  1122. while ( cNext && x_ISWHITESPACE(cNext) )
  1123. cNext = m_pDocText[++m_nNext];
  1124. m_nL = m_nNext;
  1125. m_nR = m_nNext-;
  1126. return m_pDocText[m_nNext]!='\0';
  1127. };
  1128. bool FindName()
  1129. {
  1130. if ( ! FindAny() ) // go to first non-whitespace
  1131. return false;
  1132. MCD_CHAR cNext = m_pDocText[m_nNext];
  1133. while ( cNext && ! x_ISENDANYNAME(cNext) )
  1134. cNext = NextChar();
  1135. if ( m_nNext == m_nL )
  1136. ++m_nNext; // it is a special char
  1137. m_nR = m_nNext - ;
  1138. return true;
  1139. }
  1140. bool Match( MCD_CSTR szName )
  1141. {
  1142. int nLen = Length();
  1143. return ( (x_StrNCmp( GetTokenPtr(), szName, nLen, m_nTokenFlags & CMarkup::MDF_IGNORECASE ) == )
  1144. && ( szName[nLen] == '\0' || x_ISENDPATHWORD(szName[nLen]) ) );
  1145. };
  1146. bool FindAttrib( MCD_PCSZ pAttrib, int n = , MCD_STR* pstrAttrib = NULL );
  1147. int ParseNode( NodePos& node );
  1148. int m_nL;
  1149. int m_nR;
  1150. int m_nNext;
  1151. MCD_PCSZ m_pDocText;
  1152. int m_nTokenFlags;
  1153. int m_nPreSpaceStart;
  1154. int m_nPreSpaceLength;
  1155. FilePos* m_pReaderFilePos;
  1156. };
  1157.  
  1158. bool TokenPos::FindAttrib( MCD_PCSZ pAttrib, int n/*=0*/, MCD_STR* pstrAttrib/*=NULL*/ )
  1159. {
  1160. // Return true if found, otherwise false and token.m_nNext is new insertion point
  1161. // If pAttrib is NULL find attrib n and leave token at attrib name
  1162. // If pAttrib is given, find matching attrib and leave token at value
  1163. // support non-well-formed attributes e.g. href=/advanced_search?hl=en, nowrap
  1164. // token also holds start and length of preceeding whitespace to support remove
  1165. //
  1166. int nTempPreSpaceStart;
  1167. int nTempPreSpaceLength;
  1168. MCD_CHAR cFirstChar, cNext;
  1169. int nAttrib = -; // starts at tag name
  1170. int nFoundAttribNameR = ;
  1171. bool bAfterEqual = false;
  1172. while ( )
  1173. {
  1174. // Starting at m_nNext, bypass whitespace and find the next token
  1175. nTempPreSpaceStart = m_nNext;
  1176. if ( ! FindAny() )
  1177. break;
  1178. nTempPreSpaceLength = m_nNext - nTempPreSpaceStart;
  1179.  
  1180. // Is it an opening quote?
  1181. cFirstChar = m_pDocText[m_nNext];
  1182. if ( cFirstChar == '\"' || cFirstChar == '\'' )
  1183. {
  1184. m_nTokenFlags |= MNF_QUOTED;
  1185.  
  1186. // Move past opening quote
  1187. ++m_nNext;
  1188. m_nL = m_nNext;
  1189.  
  1190. // Look for closing quote
  1191. cNext = m_pDocText[m_nNext];
  1192. while ( cNext && cNext != cFirstChar )
  1193. cNext = NextChar();
  1194.  
  1195. // Set right to before closing quote
  1196. m_nR = m_nNext - ;
  1197.  
  1198. // Set m_nNext past closing quote unless at end of document
  1199. if ( cNext )
  1200. ++m_nNext;
  1201. }
  1202. else
  1203. {
  1204. m_nTokenFlags &= ~MNF_QUOTED;
  1205.  
  1206. // Go until special char or whitespace
  1207. m_nL = m_nNext;
  1208. cNext = m_pDocText[m_nNext];
  1209. if ( bAfterEqual )
  1210. {
  1211. while ( cNext && ! x_ISENDUNQUOTED(cNext) )
  1212. cNext = NextChar();
  1213. }
  1214. else
  1215. {
  1216. while ( cNext && ! x_ISENDATTRIBNAME(cNext) )
  1217. cNext = NextChar();
  1218. }
  1219.  
  1220. // Adjust end position if it is one special char
  1221. if ( m_nNext == m_nL )
  1222. ++m_nNext; // it is a special char
  1223. m_nR = m_nNext - ;
  1224. }
  1225.  
  1226. if ( ! bAfterEqual && ! (m_nTokenFlags&MNF_QUOTED) )
  1227. {
  1228. // Is it an equal sign?
  1229. MCD_CHAR cChar = m_pDocText[m_nL];
  1230. if ( cChar == '=' )
  1231. {
  1232. bAfterEqual = true;
  1233. continue;
  1234. }
  1235.  
  1236. // Is it the end of the tag?
  1237. if ( cChar == '>' || cChar == '/' || cChar == '?' )
  1238. {
  1239. m_nNext = nTempPreSpaceStart;
  1240. break; // attrib not found
  1241. }
  1242.  
  1243. if ( nFoundAttribNameR )
  1244. break;
  1245.  
  1246. // Attribute name
  1247. if ( nAttrib != - )
  1248. {
  1249. if ( ! pAttrib )
  1250. {
  1251. if ( nAttrib == n )
  1252. {
  1253. // found by number
  1254. if ( pstrAttrib )
  1255. {
  1256. *pstrAttrib = GetTokenText();
  1257. nFoundAttribNameR = m_nR;
  1258. }
  1259. else
  1260. return true;
  1261. }
  1262. }
  1263. else if ( Match(pAttrib) )
  1264. {
  1265. // Matched attrib name, go forward to value
  1266. nFoundAttribNameR = m_nR;
  1267. }
  1268. if ( nFoundAttribNameR ) // either by n or name match
  1269. {
  1270. m_nPreSpaceStart = nTempPreSpaceStart;
  1271. m_nPreSpaceLength = nTempPreSpaceLength;
  1272. }
  1273. }
  1274. ++nAttrib;
  1275. }
  1276. else if ( nFoundAttribNameR )
  1277. break;
  1278. bAfterEqual = false;
  1279. }
  1280.  
  1281. if ( nFoundAttribNameR )
  1282. {
  1283. if ( ! bAfterEqual )
  1284. {
  1285. // when attribute has no value the value is the attribute name
  1286. m_nL = m_nPreSpaceStart + m_nPreSpaceLength;
  1287. m_nR = nFoundAttribNameR;
  1288. m_nNext = nFoundAttribNameR + ;
  1289. }
  1290. return true; // found by name
  1291. }
  1292. return false; // not found
  1293. }
  1294.  
  1295. //////////////////////////////////////////////////////////////////////
  1296. // Element tag stack: an array of TagPos structs to track nested elements
  1297. // This is used during parsing to match end tags with corresponding start tags
  1298. // For x_ParseElem only ElemStack::iTop is used with PushIntoLevel, PopOutOfLevel, and Current
  1299. // For file mode then the full capabilities are used to track counts of sibling tag names for path support
  1300. //
  1301. struct TagPos
  1302. {
  1303. TagPos() { Init(); };
  1304. void SetTagName( MCD_PCSZ pName, int n ) { MCD_STRASSIGN(strTagName,pName,n); };
  1305. void Init( int i=, int n= ) { nCount=; nTagNames=n; iNext=i; iPrev=; nSlot=-; iSlotPrev=; iSlotNext=; };
  1306. void IncCount() { if (nCount) ++nCount; };
  1307. MCD_STR strTagName;
  1308. int nCount;
  1309. int nTagNames;
  1310. int iParent;
  1311. int iNext;
  1312. int iPrev;
  1313. int nSlot;
  1314. int iSlotNext;
  1315. int iSlotPrev;
  1316. };
  1317.  
  1318. struct ElemStack
  1319. {
  1320. enum { LS_TABLESIZE = };
  1321. ElemStack() { iTop=; iUsed=; iPar=; nLevel=; nSize=; pL=NULL; Alloc(); pL[].Init(); InitTable(); };
  1322. ~ElemStack() { if (pL) delete [] pL; };
  1323. TagPos& Current() { return pL[iTop]; };
  1324. void InitTable() { memset(anTable,,sizeof(int)*LS_TABLESIZE); };
  1325. TagPos& NextParent( int& i ) { int iCur=i; i=pL[i].iParent; return pL[iCur]; };
  1326. TagPos& GetRefTagPosAt( int i ) { return pL[i]; };
  1327. void Push( MCD_PCSZ pName, int n ) { ++iUsed; if (iUsed==nSize) Alloc(nSize*); pL[iUsed].SetTagName(pName,n); pL[iUsed].iParent=iPar; iTop=iUsed; };
  1328. void IntoLevel() { iPar = iTop; ++nLevel; };
  1329. void OutOfLevel() { if (iPar!=iTop) Pop(); iPar = pL[iTop].iParent; --nLevel; };
  1330. void PushIntoLevel( MCD_PCSZ pName, int n ) { ++iTop; if (iTop==nSize) Alloc(nSize*); pL[iTop].SetTagName(pName,n); };
  1331. void PopOutOfLevel() { --iTop; };
  1332. void Pop() { iTop = iPar; while (iUsed && pL[iUsed].iParent==iPar) { if (pL[iUsed].nSlot!=-) Unslot(pL[iUsed]); --iUsed; } };
  1333. void Slot( int n ) { pL[iUsed].nSlot=n; int i=anTable[n]; anTable[n]=iUsed; pL[iUsed].iSlotNext=i; if (i) pL[i].iSlotPrev=iUsed; };
  1334. void Unslot( TagPos& lp ) { int n=lp.iSlotNext,p=lp.iSlotPrev; if (n) pL[n].iSlotPrev=p; if (p) pL[p].iSlotNext=n; else anTable[lp.nSlot]=n; };
  1335. static int CalcSlot( MCD_PCSZ pName, int n, bool bIC );
  1336. void PushTagAndCount( TokenPos& token );
  1337. int iTop;
  1338. int nLevel;
  1339. int iPar;
  1340. protected:
  1341. void Alloc( int nNewSize ) { TagPos* pLNew = new TagPos[nNewSize]; Copy(pLNew); nSize=nNewSize; };
  1342. void Copy( TagPos* pLNew ) { for(int n=;n<nSize;++n) pLNew[n]=pL[n]; if (pL) delete [] pL; pL=pLNew; };
  1343. TagPos* pL;
  1344. int iUsed;
  1345. int nSize;
  1346. int anTable[LS_TABLESIZE];
  1347. };
  1348.  
  1349. int ElemStack::CalcSlot( MCD_PCSZ pName, int n, bool bIC )
  1350. {
  1351. // If bIC (ASCII ignore case) then return an ASCII case insensitive hash
  1352. unsigned int nHash = ;
  1353. MCD_PCSZ pEnd = pName + n;
  1354. while ( pName != pEnd )
  1355. {
  1356. nHash += (unsigned int)(*pName);
  1357. if ( bIC && *pName >= 'A' && *pName <= 'Z' )
  1358. nHash += ('a'-'A');
  1359. ++pName;
  1360. }
  1361. return nHash%LS_TABLESIZE;
  1362. }
  1363.  
  1364. void ElemStack::PushTagAndCount( TokenPos& token )
  1365. {
  1366. // Check for a matching tag name at the top level and set current if found or add new one
  1367. // Calculate hash of tag name, support ignore ASCII case for MDF_IGNORECASE
  1368. int nSlot = -;
  1369. int iNext = ;
  1370. MCD_PCSZ pTagName = token.GetTokenPtr();
  1371. if ( iTop != iPar )
  1372. {
  1373. // See if tag name is already used, first try previous sibling (almost always)
  1374. iNext = iTop;
  1375. if ( token.Match(Current().strTagName) )
  1376. {
  1377. iNext = -;
  1378. Current().IncCount();
  1379. }
  1380. else
  1381. {
  1382. nSlot = CalcSlot( pTagName, token.Length(), (token.m_nTokenFlags & CMarkup::MDF_IGNORECASE)?true:false );
  1383. int iLookup = anTable[nSlot];
  1384. while ( iLookup )
  1385. {
  1386. TagPos& tag = pL[iLookup];
  1387. if ( tag.iParent == iPar && token.Match(tag.strTagName) )
  1388. {
  1389. pL[tag.iPrev].iNext = tag.iNext;
  1390. if ( tag.iNext )
  1391. pL[tag.iNext].iPrev = tag.iPrev;
  1392. tag.nTagNames = Current().nTagNames;
  1393. tag.iNext = iTop;
  1394. tag.IncCount();
  1395. iTop = iLookup;
  1396. iNext = -;
  1397. break;
  1398. }
  1399. iLookup = tag.iSlotNext;
  1400. }
  1401. }
  1402. }
  1403. if ( iNext != - )
  1404. {
  1405. // Turn off in the rare case where a document uses unique tag names like record1, record2, etc, more than 256
  1406. int nTagNames = ;
  1407. if ( iNext )
  1408. nTagNames = Current().nTagNames;
  1409. if ( nTagNames == )
  1410. {
  1411. MCD_STRASSIGN( (Current().strTagName), pTagName, (token.Length()) );
  1412. Current().nCount = ;
  1413. Unslot( Current() );
  1414. }
  1415. else
  1416. {
  1417. Push( pTagName, token.Length() );
  1418. Current().Init( iNext, nTagNames+ );
  1419. }
  1420. if ( nSlot == - )
  1421. nSlot = CalcSlot( pTagName, token.Length(), (token.m_nTokenFlags & CMarkup::MDF_IGNORECASE)?true:false );
  1422. Slot( nSlot );
  1423. }
  1424. }
  1425.  
  1426. //////////////////////////////////////////////////////////////////////
  1427. // FilePos is created for a file while it is open
  1428. // In file mode the file stays open between CMarkup calls and is stored in m_pFilePos
  1429. //
  1430. struct FilePos
  1431. {
  1432. FilePos()
  1433. {
  1434. m_fp=NULL; m_nDocFlags=; m_nFileByteLen=; m_nFileByteOffset=; m_nOpFileByteLen=; m_nBlockSizeBasis=MARKUP_FILEBLOCKSIZE;
  1435. m_nFileCharUnitSize=; m_nOpFileTextLen=; m_pstrBuffer=NULL; m_nReadBufferStart=; m_nReadBufferRemoved=; m_nReadGatherStart=-;
  1436. };
  1437. bool FileOpen( MCD_CSTR_FILENAME szFileName );
  1438. bool FileRead( void* pBuffer );
  1439. bool FileReadText( MCD_STR& strDoc );
  1440. bool FileCheckRaggedEnd( void* pBuffer );
  1441. bool FileReadNextBuffer();
  1442. void FileGatherStart( int nStart );
  1443. int FileGatherEnd( MCD_STR& strSubDoc );
  1444. bool FileWrite( void* pBuffer, const void* pConstBuffer = NULL );
  1445. bool FileWriteText( const MCD_STR& strDoc, int nWriteStrLen = - );
  1446. bool FileFlush( MCD_STR& strBuffer, int nWriteStrLen = -, bool bFflush = false );
  1447. bool FileClose();
  1448. void FileSpecifyEncoding( MCD_STR* pstrEncoding );
  1449. bool FileAtTop();
  1450. bool FileErrorAddResult();
  1451.  
  1452. FILE* m_fp;
  1453. int m_nDocFlags;
  1454. int m_nOpFileByteLen;
  1455. int m_nBlockSizeBasis;
  1456. MCD_INTFILEOFFSET m_nFileByteLen;
  1457. MCD_INTFILEOFFSET m_nFileByteOffset;
  1458. int m_nFileCharUnitSize;
  1459. int m_nOpFileTextLen;
  1460. MCD_STR m_strIOResult;
  1461. MCD_STR m_strEncoding;
  1462. MCD_STR* m_pstrBuffer;
  1463. ElemStack m_elemstack;
  1464. int m_nReadBufferStart;
  1465. int m_nReadBufferRemoved;
  1466. int m_nReadGatherStart;
  1467. MCD_STR m_strReadGatherMarkup;
  1468. };
  1469.  
  1470. struct BomTableStruct { const char* pszBom; int nBomLen; MCD_PCSZ pszBomEnc; int nBomFlag; } BomTable[] =
  1471. {
  1472. { "\xef\xbb\xbf", , MCD_T("UTF-8"), CMarkup::MDF_UTF8PREAMBLE },
  1473. { "\xff\xfe", , MCD_T("UTF-16LE"), CMarkup::MDF_UTF16LEFILE },
  1474. { "\xfe\xff", , MCD_T("UTF-16BE"), CMarkup::MDF_UTF16BEFILE },
  1475. { NULL,,NULL, }
  1476. };
  1477.  
  1478. bool FilePos::FileErrorAddResult()
  1479. {
  1480. // strerror has difficulties cross-platform
  1481. // VC++ leaves MCD_STRERROR undefined and uses FormatMessage
  1482. // Non-VC++ use strerror (even for MARKUP_WCHAR and convert)
  1483. // additional notes:
  1484. // _WIN32_WCE (Windows CE) has no strerror (Embedded VC++ uses FormatMessage)
  1485. // _MSC_VER >= 1310 (VC++ 2003/7.1) has _wcserror (but not used)
  1486. //
  1487. const int nErrorBufferSize = ;
  1488. int nErr = ;
  1489. MCD_CHAR szError[nErrorBufferSize+];
  1490. #if defined(MCD_STRERROR) // C error routine
  1491. nErr = (int)errno;
  1492. #if defined(MARKUP_WCHAR)
  1493. char szMBError[nErrorBufferSize+];
  1494. strncpy( szMBError, MCD_STRERROR, nErrorBufferSize );
  1495. szMBError[nErrorBufferSize] = '\0';
  1496. TextEncoding textencoding( MCD_T(""), (const void*)szMBError, strlen(szMBError) );
  1497. textencoding.m_nToCount = nErrorBufferSize;
  1498. int nWideLen = textencoding.PerformConversion( (void*)szError, MCD_ENC );
  1499. szError[nWideLen] = '\0';
  1500. #else
  1501. MCD_PSZNCPY( szError, MCD_STRERROR, nErrorBufferSize );
  1502. szError[nErrorBufferSize] = '\0';
  1503. #endif
  1504. #else // no C error routine, use Windows API
  1505. DWORD dwErr = ::GetLastError();
  1506. if ( ::FormatMessage(0x1200,,dwErr,,szError,nErrorBufferSize,) < )
  1507. szError[] = '\0';
  1508. nErr = (int)dwErr;
  1509. #endif // no C error routine
  1510. MCD_STR strError = szError;
  1511. for ( int nChar=; nChar<MCD_STRLENGTH(strError); ++nChar )
  1512. if ( strError[nChar] == '\r' || strError[nChar] == '\n' )
  1513. {
  1514. strError = MCD_STRMID( strError, , nChar ); // no trailing newline
  1515. break;
  1516. }
  1517. x_AddResult( m_strIOResult, MCD_T("file_error"), strError, MRC_MSG|MRC_NUMBER, nErr );
  1518. return false;
  1519. }
  1520.  
  1521. void FilePos::FileSpecifyEncoding( MCD_STR* pstrEncoding )
  1522. {
  1523. // In ReadTextFile, WriteTextFile and Open, the pstrEncoding argument can override or return the detected encoding
  1524. if ( pstrEncoding && m_strEncoding != *pstrEncoding )
  1525. {
  1526. if ( m_nFileCharUnitSize == && *pstrEncoding != MCD_T("") )
  1527. m_strEncoding = *pstrEncoding; // override the encoding
  1528. else // just report the encoding
  1529. *pstrEncoding = m_strEncoding;
  1530. }
  1531. }
  1532.  
  1533. bool FilePos::FileAtTop()
  1534. {
  1535. // Return true if in the first block of file mode, max BOM < 5 bytes
  1536. if ( ((m_nDocFlags & CMarkup::MDF_READFILE) && m_nFileByteOffset < (MCD_INTFILEOFFSET)m_nOpFileByteLen + )
  1537. || ((m_nDocFlags & CMarkup::MDF_WRITEFILE) && m_nFileByteOffset < ) )
  1538. return true;
  1539. return false;
  1540. }
  1541.  
  1542. bool FilePos::FileOpen( MCD_CSTR_FILENAME szFileName )
  1543. {
  1544. MCD_STRCLEAR( m_strIOResult );
  1545.  
  1546. // Open file
  1547. MCD_PCSZ_FILENAME pMode = MCD_T_FILENAME("rb");
  1548. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1549. pMode = MCD_T_FILENAME("ab");
  1550. else if ( m_nDocFlags & CMarkup::MDF_WRITEFILE )
  1551. pMode = MCD_T_FILENAME("wb");
  1552. m_fp = NULL;
  1553. MCD_FOPEN( m_fp, szFileName, pMode );
  1554. if ( ! m_fp )
  1555. return FileErrorAddResult();
  1556.  
  1557. // Prepare file
  1558. bool bSuccess = true;
  1559. int nBomLen = ;
  1560. m_nFileCharUnitSize = ; // unless UTF-16 BOM
  1561. if ( m_nDocFlags & CMarkup::MDF_READFILE )
  1562. {
  1563. // Get file length
  1564. MCD_FSEEK( m_fp, , SEEK_END );
  1565. m_nFileByteLen = MCD_FTELL( m_fp );
  1566. MCD_FSEEK( m_fp, , SEEK_SET );
  1567.  
  1568. // Read the top of the file to check BOM and encoding
  1569. int nReadTop = ;
  1570. if ( m_nFileByteLen < nReadTop )
  1571. nReadTop = (int)m_nFileByteLen;
  1572. if ( nReadTop )
  1573. {
  1574. char* pFileTop = new char[nReadTop];
  1575. if ( nReadTop )
  1576. bSuccess = ( fread( pFileTop, nReadTop, , m_fp ) == );
  1577. if ( bSuccess )
  1578. {
  1579. // Check for Byte Order Mark (preamble)
  1580. int nBomCheck = ;
  1581. m_nDocFlags &= ~( CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF8PREAMBLE );
  1582. while ( BomTable[nBomCheck].pszBom )
  1583. {
  1584. while ( nBomLen < BomTable[nBomCheck].nBomLen )
  1585. {
  1586. if ( nBomLen >= nReadTop || pFileTop[nBomLen] != BomTable[nBomCheck].pszBom[nBomLen] )
  1587. break;
  1588. ++nBomLen;
  1589. }
  1590. if ( nBomLen == BomTable[nBomCheck].nBomLen )
  1591. {
  1592. m_nDocFlags |= BomTable[nBomCheck].nBomFlag;
  1593. if ( nBomLen == )
  1594. m_nFileCharUnitSize = ;
  1595. m_strEncoding = BomTable[nBomCheck].pszBomEnc;
  1596. break;
  1597. }
  1598. ++nBomCheck;
  1599. nBomLen = ;
  1600. }
  1601. if ( nReadTop > nBomLen )
  1602. MCD_FSEEK( m_fp, nBomLen, SEEK_SET );
  1603.  
  1604. // Encoding check
  1605. if ( ! nBomLen )
  1606. {
  1607. MCD_STR strDeclCheck;
  1608. #if defined(MARKUP_WCHAR) // WCHAR
  1609. TextEncoding textencoding( MCD_T("UTF-8"), (const void*)pFileTop, nReadTop );
  1610. MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDeclCheck,nReadTop);
  1611. textencoding.m_nToCount = nReadTop;
  1612. int nDeclWideLen = textencoding.PerformConversion( (void*)pWideBuffer, MCD_ENC );
  1613. MCD_RELEASEBUFFER(strDeclCheck,pWideBuffer,nDeclWideLen);
  1614. #else // not WCHAR
  1615. MCD_STRASSIGN(strDeclCheck,pFileTop,nReadTop);
  1616. #endif // not WCHAR
  1617. m_strEncoding = CMarkup::GetDeclaredEncoding( strDeclCheck );
  1618. }
  1619. // Assume markup files starting with < sign are UTF-8 if otherwise unknown
  1620. if ( MCD_STRISEMPTY(m_strEncoding) && pFileTop[] == '<' )
  1621. m_strEncoding = MCD_T("UTF-8");
  1622. }
  1623. delete [] pFileTop;
  1624. }
  1625. }
  1626. else if ( m_nDocFlags & CMarkup::MDF_WRITEFILE )
  1627. {
  1628. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1629. {
  1630. // fopen for append does not move the file pointer to the end until first I/O operation
  1631. MCD_FSEEK( m_fp, , SEEK_END );
  1632. m_nFileByteLen = MCD_FTELL( m_fp );
  1633. }
  1634. int nBomCheck = ;
  1635. while ( BomTable[nBomCheck].pszBom )
  1636. {
  1637. if ( m_nDocFlags & BomTable[nBomCheck].nBomFlag )
  1638. {
  1639. nBomLen = BomTable[nBomCheck].nBomLen;
  1640. if ( nBomLen == )
  1641. m_nFileCharUnitSize = ;
  1642. m_strEncoding = BomTable[nBomCheck].pszBomEnc;
  1643. if ( m_nFileByteLen ) // append
  1644. nBomLen = ;
  1645. else // write BOM
  1646. bSuccess = ( fwrite(BomTable[nBomCheck].pszBom,nBomLen,,m_fp) == );
  1647. break;
  1648. }
  1649. ++nBomCheck;
  1650. }
  1651. }
  1652. if ( ! bSuccess )
  1653. return FileErrorAddResult();
  1654.  
  1655. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1656. m_nFileByteOffset = m_nFileByteLen;
  1657. else
  1658. m_nFileByteOffset = (MCD_INTFILEOFFSET)nBomLen;
  1659. if ( nBomLen )
  1660. x_AddResult( m_strIOResult, MCD_T("bom") );
  1661. return bSuccess;
  1662. }
  1663.  
  1664. bool FilePos::FileRead( void* pBuffer )
  1665. {
  1666. bool bSuccess = ( fread( pBuffer,m_nOpFileByteLen,,m_fp) == );
  1667. m_nOpFileTextLen = m_nOpFileByteLen / m_nFileCharUnitSize;
  1668. if ( bSuccess )
  1669. {
  1670. m_nFileByteOffset += m_nOpFileByteLen;
  1671. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, m_nOpFileTextLen );
  1672.  
  1673. // Microsoft components can produce apparently valid docs with some nulls at ends of values
  1674. int nNullCount = ;
  1675. int nNullCheckCharsRemaining = m_nOpFileTextLen;
  1676. char* pAfterNull = NULL;
  1677. char* pNullScan = (char*)pBuffer;
  1678. bool bSingleByteChar = m_nFileCharUnitSize == ;
  1679. while ( nNullCheckCharsRemaining-- )
  1680. {
  1681. if ( bSingleByteChar? (! *pNullScan) : (! (*(unsigned short*)pNullScan)) )
  1682. {
  1683. if ( pAfterNull && pNullScan != pAfterNull )
  1684. memmove( pAfterNull - (nNullCount*m_nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull );
  1685. pAfterNull = pNullScan + m_nFileCharUnitSize;
  1686. ++nNullCount;
  1687. }
  1688. pNullScan += m_nFileCharUnitSize;
  1689. }
  1690. if ( pAfterNull && pNullScan != pAfterNull )
  1691. memmove( pAfterNull - (nNullCount*m_nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull );
  1692. if ( nNullCount )
  1693. {
  1694. x_AddResult( m_strIOResult, MCD_T("nulls_removed"), NULL, MRC_COUNT, nNullCount );
  1695. m_nOpFileTextLen -= nNullCount;
  1696. }
  1697.  
  1698. // Big endian/little endian conversion
  1699. if ( m_nFileCharUnitSize > && x_EndianSwapRequired(m_nDocFlags) )
  1700. {
  1701. x_EndianSwapUTF16( (unsigned short*)pBuffer, m_nOpFileTextLen );
  1702. x_AddResult( m_strIOResult, MCD_T("endian_swap") );
  1703. }
  1704. }
  1705. if ( ! bSuccess )
  1706. FileErrorAddResult();
  1707. return bSuccess;
  1708. }
  1709.  
  1710. bool FilePos::FileCheckRaggedEnd( void* pBuffer )
  1711. {
  1712. // In file read mode, piece of file text in memory must end on a character boundary
  1713. // This check must happen after the encoding has been decided, so after UTF-8 autodetection
  1714. // If ragged, adjust file position, m_nOpFileTextLen and m_nOpFileByteLen
  1715. int nTruncBeforeBytes = ;
  1716. TextEncoding textencoding( m_strEncoding, pBuffer, m_nOpFileTextLen );
  1717. if ( ! textencoding.FindRaggedEnd(nTruncBeforeBytes) )
  1718. {
  1719.  
  1720. // Input must be garbled? decoding error before potentially ragged end, add error result and continue
  1721. MCD_STR strEncoding = m_strEncoding;
  1722. if ( MCD_STRISEMPTY(strEncoding) )
  1723. strEncoding = MCD_T("ANSI");
  1724. x_AddResult( m_strIOResult, MCD_T("truncation_error"), strEncoding, MRC_ENCODING );
  1725. }
  1726. else if ( nTruncBeforeBytes )
  1727. {
  1728. nTruncBeforeBytes *= -;
  1729. m_nFileByteOffset += nTruncBeforeBytes;
  1730. MCD_FSEEK( m_fp, m_nFileByteOffset, SEEK_SET );
  1731. m_nOpFileByteLen += nTruncBeforeBytes;
  1732. m_nOpFileTextLen += nTruncBeforeBytes / m_nFileCharUnitSize;
  1733. x_AddResult( m_strIOResult, MCD_T("read"), NULL, MRC_MODIFY|MRC_LENGTH, m_nOpFileTextLen );
  1734. }
  1735. return true;
  1736. }
  1737.  
  1738. bool FilePos::FileReadText( MCD_STR& strDoc )
  1739. {
  1740. bool bSuccess = true;
  1741. MCD_STRCLEAR( m_strIOResult );
  1742. if ( ! m_nOpFileByteLen )
  1743. {
  1744. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, );
  1745. return bSuccess;
  1746. }
  1747.  
  1748. // Only read up to end of file (a single read byte length cannot be over the capacity of int)
  1749. bool bCheckRaggedEnd = true;
  1750. MCD_INTFILEOFFSET nBytesRemaining = m_nFileByteLen - m_nFileByteOffset;
  1751. if ( (MCD_INTFILEOFFSET)m_nOpFileByteLen >= nBytesRemaining )
  1752. {
  1753. m_nOpFileByteLen = (int)nBytesRemaining;
  1754. bCheckRaggedEnd = false;
  1755. }
  1756.  
  1757. if ( m_nDocFlags & (CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF16BEFILE) )
  1758. {
  1759. int nUTF16Len = m_nOpFileByteLen / ;
  1760. #if defined(MARKUP_WCHAR) // WCHAR
  1761. int nBufferSizeForGrow = nUTF16Len + nUTF16Len/; // extra 1%
  1762. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  1763. unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+];
  1764. bSuccess = FileRead( pUTF16Buffer );
  1765. if ( bSuccess )
  1766. {
  1767. if ( bCheckRaggedEnd )
  1768. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1769. TextEncoding textencoding( MCD_T("UTF-16"), (const void*)pUTF16Buffer, m_nOpFileTextLen );
  1770. textencoding.m_nToCount = nBufferSizeForGrow;
  1771. MCD_CHAR* pUTF32Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1772. int nUTF32Len = textencoding.PerformConversion( (void*)pUTF32Buffer, MCD_T("UTF-32") );
  1773. MCD_RELEASEBUFFER(strDoc,pUTF32Buffer,nUTF32Len);
  1774. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_T("UTF-32"), MRC_ENCODING|MRC_LENGTH, nUTF32Len );
  1775. }
  1776. #else // sizeof(wchar_t) == 2
  1777. MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1778. bSuccess = FileRead( pUTF16Buffer );
  1779. if ( bSuccess && bCheckRaggedEnd )
  1780. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1781. MCD_RELEASEBUFFER(strDoc,pUTF16Buffer,m_nOpFileTextLen);
  1782. #endif // sizeof(wchar_t) == 2
  1783. #else // not WCHAR
  1784. // Convert file from UTF-16; it needs to be in memory as UTF-8 or MBCS
  1785. unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+];
  1786. bSuccess = FileRead( pUTF16Buffer );
  1787. if ( bSuccess && bCheckRaggedEnd )
  1788. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1789. TextEncoding textencoding( MCD_T("UTF-16"), (const void*)pUTF16Buffer, m_nOpFileTextLen );
  1790. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1791. int nBufferSizeForGrow = nMBLen + nMBLen/; // extra 1%
  1792. MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1793. textencoding.PerformConversion( (void*)pMBBuffer );
  1794. delete [] pUTF16Buffer;
  1795. MCD_RELEASEBUFFER(strDoc,pMBBuffer,nMBLen);
  1796. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1797. if ( textencoding.m_nFailedChars )
  1798. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1799. #endif // not WCHAR
  1800. }
  1801. else // single or multibyte file (i.e. not UTF-16)
  1802. {
  1803. #if defined(MARKUP_WCHAR) // WCHAR
  1804. char* pBuffer = new char[m_nOpFileByteLen];
  1805. bSuccess = FileRead( pBuffer );
  1806. if ( MCD_STRISEMPTY(m_strEncoding) )
  1807. {
  1808. int nNonASCII;
  1809. bool bErrorAtEnd;
  1810. if ( CMarkup::DetectUTF8(pBuffer,m_nOpFileByteLen,&nNonASCII,&bErrorAtEnd) || (bCheckRaggedEnd && bErrorAtEnd) )
  1811. {
  1812. m_strEncoding = MCD_T("UTF-8");
  1813. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_MODIFY|MRC_ENCODING );
  1814. }
  1815. x_AddResult( m_strIOResult, MCD_T("utf8_detection") );
  1816. }
  1817. if ( bSuccess && bCheckRaggedEnd )
  1818. FileCheckRaggedEnd( (void*)pBuffer );
  1819. TextEncoding textencoding( m_strEncoding, (const void*)pBuffer, m_nOpFileTextLen );
  1820. int nWideLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1821. int nBufferSizeForGrow = nWideLen + nWideLen/; // extra 1%
  1822. MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1823. textencoding.PerformConversion( (void*)pWideBuffer );
  1824. MCD_RELEASEBUFFER( strDoc, pWideBuffer, nWideLen );
  1825. delete [] pBuffer;
  1826. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWideLen );
  1827. #else // not WCHAR
  1828. // After loading a file with unknown multi-byte encoding
  1829. bool bAssumeUnknownIsNative = false;
  1830. if ( MCD_STRISEMPTY(m_strEncoding) )
  1831. {
  1832. bAssumeUnknownIsNative = true;
  1833. m_strEncoding = MCD_ENC;
  1834. }
  1835. if ( TextEncoding::CanConvert(MCD_ENC,m_strEncoding) )
  1836. {
  1837. char* pBuffer = new char[m_nOpFileByteLen];
  1838. bSuccess = FileRead( pBuffer );
  1839. if ( bSuccess && bCheckRaggedEnd )
  1840. FileCheckRaggedEnd( (void*)pBuffer );
  1841. TextEncoding textencoding( m_strEncoding, (const void*)pBuffer, m_nOpFileTextLen );
  1842. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1843. int nBufferSizeForGrow = nMBLen + nMBLen/; // extra 1%
  1844. MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1845. textencoding.PerformConversion( (void*)pMBBuffer );
  1846. MCD_RELEASEBUFFER( strDoc, pMBBuffer, nMBLen );
  1847. delete [] pBuffer;
  1848. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1849. if ( textencoding.m_nFailedChars )
  1850. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1851. }
  1852. else // load directly into string
  1853. {
  1854. int nBufferSizeForGrow = m_nOpFileByteLen + m_nOpFileByteLen/; // extra 1%
  1855. MCD_CHAR* pBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1856. bSuccess = FileRead( pBuffer );
  1857. bool bConvertMB = false;
  1858. if ( bAssumeUnknownIsNative )
  1859. {
  1860. // Might need additional conversion if we assumed an encoding
  1861. int nNonASCII;
  1862. bool bErrorAtEnd;
  1863. bool bIsUTF8 = CMarkup::DetectUTF8( pBuffer, m_nOpFileByteLen, &nNonASCII, &bErrorAtEnd ) || (bCheckRaggedEnd && bErrorAtEnd);
  1864. MCD_STR strDetectedEncoding = bIsUTF8? MCD_T("UTF-8"): MCD_T("");
  1865. if ( nNonASCII && m_strEncoding != strDetectedEncoding ) // only need to convert non-ASCII
  1866. bConvertMB = true;
  1867. m_strEncoding = strDetectedEncoding;
  1868. if ( bIsUTF8 )
  1869. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_MODIFY|MRC_ENCODING );
  1870. }
  1871. if ( bSuccess && bCheckRaggedEnd )
  1872. FileCheckRaggedEnd( (void*)pBuffer );
  1873. MCD_RELEASEBUFFER( strDoc, pBuffer, m_nOpFileTextLen );
  1874. if ( bConvertMB )
  1875. {
  1876. TextEncoding textencoding( m_strEncoding, MCD_2PCSZ(strDoc), m_nOpFileTextLen );
  1877. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1878. nBufferSizeForGrow = nMBLen + nMBLen/; // extra 1%
  1879. MCD_STR strConvDoc;
  1880. pBuffer = MCD_GETBUFFER(strConvDoc,nBufferSizeForGrow);
  1881. textencoding.PerformConversion( (void*)pBuffer );
  1882. MCD_RELEASEBUFFER( strConvDoc, pBuffer, nMBLen );
  1883. strDoc = strConvDoc;
  1884. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1885. if ( textencoding.m_nFailedChars )
  1886. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1887. }
  1888. if ( bAssumeUnknownIsNative )
  1889. x_AddResult( m_strIOResult, MCD_T("utf8_detection") );
  1890. }
  1891. #endif // not WCHAR
  1892. }
  1893. return bSuccess;
  1894. }
  1895.  
  1896. bool FilePos::FileWrite( void* pBuffer, const void* pConstBuffer /*=NULL*/ )
  1897. {
  1898. m_nOpFileByteLen = m_nOpFileTextLen * m_nFileCharUnitSize;
  1899. if ( ! pConstBuffer )
  1900. pConstBuffer = pBuffer;
  1901. unsigned short* pTempEndianBuffer = NULL;
  1902. if ( x_EndianSwapRequired(m_nDocFlags) )
  1903. {
  1904. if ( ! pBuffer )
  1905. {
  1906. pTempEndianBuffer = new unsigned short[m_nOpFileTextLen];
  1907. memcpy( pTempEndianBuffer, pConstBuffer, m_nOpFileTextLen * );
  1908. pBuffer = pTempEndianBuffer;
  1909. pConstBuffer = pTempEndianBuffer;
  1910. }
  1911. x_EndianSwapUTF16( (unsigned short*)pBuffer, m_nOpFileTextLen );
  1912. x_AddResult( m_strIOResult, MCD_T("endian_swap") );
  1913. }
  1914. bool bSuccess = ( fwrite( pConstBuffer, m_nOpFileByteLen, , m_fp ) == );
  1915. if ( pTempEndianBuffer )
  1916. delete [] pTempEndianBuffer;
  1917. if ( bSuccess )
  1918. {
  1919. m_nFileByteOffset += m_nOpFileByteLen;
  1920. x_AddResult( m_strIOResult, MCD_T("write"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, m_nOpFileTextLen );
  1921. }
  1922. else
  1923. FileErrorAddResult();
  1924. return bSuccess;
  1925. }
  1926.  
  1927. bool FilePos::FileWriteText( const MCD_STR& strDoc, int nWriteStrLen/*=-1*/ )
  1928. {
  1929. bool bSuccess = true;
  1930. MCD_STRCLEAR( m_strIOResult );
  1931. MCD_PCSZ pDoc = MCD_2PCSZ(strDoc);
  1932. if ( nWriteStrLen == - )
  1933. nWriteStrLen = MCD_STRLENGTH(strDoc);
  1934. if ( ! nWriteStrLen )
  1935. {
  1936. x_AddResult( m_strIOResult, MCD_T("write"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, );
  1937. return bSuccess;
  1938. }
  1939.  
  1940. if ( m_nDocFlags & (CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF16BEFILE) )
  1941. {
  1942. #if defined(MARKUP_WCHAR) // WCHAR
  1943. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  1944. TextEncoding textencoding( MCD_T("UTF-32"), (const void*)pDoc, nWriteStrLen );
  1945. m_nOpFileTextLen = textencoding.PerformConversion( NULL, MCD_T("UTF-16") );
  1946. unsigned short* pUTF16Buffer = new unsigned short[m_nOpFileTextLen];
  1947. textencoding.PerformConversion( (void*)pUTF16Buffer );
  1948. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_T("UTF-32"), MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1949. bSuccess = FileWrite( pUTF16Buffer );
  1950. delete [] pUTF16Buffer;
  1951. #else // sizeof(wchar_t) == 2
  1952. m_nOpFileTextLen = nWriteStrLen;
  1953. bSuccess = FileWrite( NULL, pDoc );
  1954. #endif
  1955. #else // not WCHAR
  1956. TextEncoding textencoding( MCD_ENC, (const void*)pDoc, nWriteStrLen );
  1957. m_nOpFileTextLen = textencoding.PerformConversion( NULL, MCD_T("UTF-16") );
  1958. unsigned short* pUTF16Buffer = new unsigned short[m_nOpFileTextLen];
  1959. textencoding.PerformConversion( (void*)pUTF16Buffer );
  1960. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1961. bSuccess = FileWrite( pUTF16Buffer );
  1962. delete [] pUTF16Buffer;
  1963. #endif // not WCHAR
  1964. }
  1965. else // single or multibyte file (i.e. not UTF-16)
  1966. {
  1967. #if ! defined(MARKUP_WCHAR) // not WCHAR
  1968. if ( ! TextEncoding::CanConvert(m_strEncoding,MCD_ENC) )
  1969. {
  1970. // Same or unsupported multi-byte to multi-byte, so save directly from string
  1971. m_nOpFileTextLen = nWriteStrLen;
  1972. bSuccess = FileWrite( NULL, pDoc );
  1973. return bSuccess;
  1974. }
  1975. #endif // not WCHAR
  1976. TextEncoding textencoding( MCD_ENC, (const void*)pDoc, nWriteStrLen );
  1977. m_nOpFileTextLen = textencoding.PerformConversion( NULL, m_strEncoding );
  1978. char* pMBBuffer = new char[m_nOpFileTextLen];
  1979. textencoding.PerformConversion( (void*)pMBBuffer );
  1980. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1981. if ( textencoding.m_nFailedChars )
  1982. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1983. bSuccess = FileWrite( pMBBuffer );
  1984. delete [] pMBBuffer;
  1985. }
  1986.  
  1987. return bSuccess;
  1988. }
  1989.  
  1990. bool FilePos::FileClose()
  1991. {
  1992. if ( m_fp )
  1993. {
  1994. if ( fclose(m_fp) )
  1995. FileErrorAddResult();
  1996. m_fp = NULL;
  1997. m_nDocFlags &= ~(CMarkup::MDF_WRITEFILE|CMarkup::MDF_READFILE|CMarkup::MDF_APPENDFILE);
  1998. return true;
  1999. }
  2000. return false;
  2001. }
  2002.  
  2003. bool FilePos::FileReadNextBuffer()
  2004. {
  2005. // If not end of file, returns amount to subtract from offsets
  2006. if ( m_nFileByteOffset < m_nFileByteLen )
  2007. {
  2008. // Prepare to put this node at beginning
  2009. MCD_STR& str = *m_pstrBuffer;
  2010. int nDocLength = MCD_STRLENGTH( str );
  2011. int nRemove = m_nReadBufferStart;
  2012. m_nReadBufferRemoved = nRemove;
  2013.  
  2014. // Gather
  2015. if ( m_nReadGatherStart != - )
  2016. {
  2017. if ( m_nReadBufferStart > m_nReadGatherStart )
  2018. {
  2019. // In case it is a large subdoc, reduce reallocs by using x_StrInsertReplace
  2020. MCD_STR strAppend = MCD_STRMID( str, m_nReadGatherStart, m_nReadBufferStart - m_nReadGatherStart );
  2021. x_StrInsertReplace( m_strReadGatherMarkup, MCD_STRLENGTH(m_strReadGatherMarkup), , strAppend );
  2022. }
  2023. m_nReadGatherStart = ;
  2024. }
  2025.  
  2026. // Increase capacity if keeping more than half of nDocLength
  2027. int nKeepLength = nDocLength - nRemove;
  2028. if ( nKeepLength > nDocLength / )
  2029. m_nBlockSizeBasis *= ;
  2030. if ( nRemove )
  2031. x_StrInsertReplace( str, , nRemove, MCD_STR() );
  2032. MCD_STR strRead;
  2033. m_nOpFileByteLen = m_nBlockSizeBasis - nKeepLength;
  2034. m_nOpFileByteLen += - m_nOpFileByteLen % ; // round up to 4-byte offset
  2035. FileReadText( strRead );
  2036. x_StrInsertReplace( str, nKeepLength, , strRead );
  2037. m_nReadBufferStart = ; // next time just elongate/increase capacity
  2038. return true;
  2039. }
  2040. return false;
  2041. }
  2042.  
  2043. void FilePos::FileGatherStart( int nStart )
  2044. {
  2045. m_nReadGatherStart = nStart;
  2046. }
  2047.  
  2048. int FilePos::FileGatherEnd( MCD_STR& strMarkup )
  2049. {
  2050. int nStart = m_nReadGatherStart;
  2051. m_nReadGatherStart = -;
  2052. strMarkup = m_strReadGatherMarkup;
  2053. MCD_STRCLEAR( m_strReadGatherMarkup );
  2054. return nStart;
  2055. }
  2056.  
  2057. bool FilePos::FileFlush( MCD_STR& strBuffer, int nWriteStrLen/*=-1*/, bool bFflush/*=false*/ )
  2058. {
  2059. bool bSuccess = true;
  2060. MCD_STRCLEAR( m_strIOResult );
  2061. if ( nWriteStrLen == - )
  2062. nWriteStrLen = MCD_STRLENGTH( strBuffer );
  2063. if ( nWriteStrLen )
  2064. {
  2065. if ( (! m_nFileByteOffset) && MCD_STRISEMPTY(m_strEncoding) && ! MCD_STRISEMPTY(strBuffer) )
  2066. {
  2067. m_strEncoding = CMarkup::GetDeclaredEncoding( strBuffer );
  2068. if ( MCD_STRISEMPTY(m_strEncoding) )
  2069. m_strEncoding = MCD_T("UTF-8");
  2070. }
  2071. bSuccess = FileWriteText( strBuffer, nWriteStrLen );
  2072. if ( bSuccess )
  2073. x_StrInsertReplace( strBuffer, , nWriteStrLen, MCD_STR() );
  2074. }
  2075. if ( bFflush && bSuccess )
  2076. {
  2077. if ( fflush(m_fp) )
  2078. bSuccess = FileErrorAddResult();
  2079. }
  2080. return bSuccess;
  2081. }
  2082.  
  2083. //////////////////////////////////////////////////////////////////////
  2084. // PathPos encapsulates parsing of the path string used in Find methods
  2085. //
  2086. struct PathPos
  2087. {
  2088. PathPos( MCD_PCSZ pszPath, bool b ) { p=pszPath; bReader=b; i=; iPathAttribName=; iSave=; nPathType=; if (!ParsePath()) nPathType=-; };
  2089. int GetTypeAndInc() { i=-; if (p) { if (p[]=='/') { if (p[]=='/') i=; else i=; } else if (p[]) i=; } nPathType=i+; return nPathType; };
  2090. int GetNumAndInc() { int n=; while (p[i]>=''&&p[i]<='') n=n*+(int)p[i++]-(int)''; return n; };
  2091. MCD_PCSZ GetValAndInc() { ++i; MCD_CHAR cEnd=']'; if (p[i]=='\''||p[i]=='\"') cEnd=p[i++]; int iVal=i; IncWord(cEnd); nLen=i-iVal; if (cEnd!=']') ++i; return &p[iVal]; };
  2092. int GetValOrWordLen() { return nLen; };
  2093. MCD_CHAR GetChar() { return p[i]; };
  2094. bool IsAtPathEnd() { return ((!p[i])||(iPathAttribName&&i+>=iPathAttribName))?true:false; };
  2095. MCD_PCSZ GetPtr() { return &p[i]; };
  2096. void SaveOffset() { iSave=i; };
  2097. void RevertOffset() { i=iSave; };
  2098. void RevertOffsetAsName() { i=iSave; nPathType=; };
  2099. MCD_PCSZ GetWordAndInc() { int iWord=i; IncWord(); nLen=i-iWord; return &p[iWord]; };
  2100. void IncWord() { while (p[i]&&!x_ISENDPATHWORD(p[i])) i+=MCD_CLEN(&p[i]); };
  2101. void IncWord( MCD_CHAR c ) { while (p[i]&&p[i]!=c) i+=MCD_CLEN(&p[i]); };
  2102. void IncChar() { ++i; };
  2103. void Inc( int n ) { i+=n; };
  2104. bool IsAnywherePath() { return nPathType == ; };
  2105. bool IsAbsolutePath() { return nPathType == ; };
  2106. bool IsPath() { return nPathType > ; };
  2107. bool ValidPath() { return nPathType != -; };
  2108. MCD_PCSZ GetPathAttribName() { if (iPathAttribName) return &p[iPathAttribName]; return NULL; };
  2109. bool AttribPredicateMatch( TokenPos& token );
  2110. private:
  2111. bool ParsePath();
  2112. int nPathType; // -1 invalid, 0 empty, 1 name, 2 absolute path, 3 anywhere path
  2113. bool bReader;
  2114. MCD_PCSZ p;
  2115. int i;
  2116. int iPathAttribName;
  2117. int iSave;
  2118. int nLen;
  2119. };
  2120.  
  2121. bool PathPos::ParsePath()
  2122. {
  2123. // Determine if the path seems to be in a valid format before attempting to find
  2124. if ( GetTypeAndInc() )
  2125. {
  2126. SaveOffset();
  2127. while ( )
  2128. {
  2129. if ( ! GetChar() )
  2130. return false;
  2131. IncWord(); // Tag name
  2132. if ( GetChar() == '[' ) // predicate
  2133. {
  2134. IncChar(); // [
  2135. if ( GetChar() >= '' && GetChar() <= '' )
  2136. GetNumAndInc();
  2137. else // attrib or child tag name
  2138. {
  2139. if ( GetChar() == '@' )
  2140. {
  2141. IncChar(); // @
  2142. IncWord(); // attrib name
  2143. if ( GetChar() == '=' )
  2144. GetValAndInc();
  2145. }
  2146. else
  2147. {
  2148. if ( bReader )
  2149. return false;
  2150. IncWord();
  2151. }
  2152. }
  2153. if ( GetChar() != ']' )
  2154. return false;
  2155. IncChar(); // ]
  2156. }
  2157.  
  2158. // Another level of path
  2159. if ( GetChar() == '/' )
  2160. {
  2161. if ( IsAnywherePath() )
  2162. return false; // multiple levels not supported for // path
  2163. IncChar();
  2164. if ( GetChar() == '@' )
  2165. {
  2166. // FindGetData and FindSetData support paths ending in attribute
  2167. IncChar(); // @
  2168. iPathAttribName = i;
  2169. IncWord(); // attrib name
  2170. if ( GetChar() )
  2171. return false; // it should have ended with attribute name
  2172. break;
  2173. }
  2174. }
  2175. else
  2176. {
  2177. if ( GetChar() )
  2178. return false; // not a slash, so it should have ended here
  2179. break;
  2180. }
  2181. }
  2182. RevertOffset();
  2183. }
  2184. return true;
  2185. }
  2186.  
  2187. bool PathPos::AttribPredicateMatch( TokenPos& token )
  2188. {
  2189. // Support attribute predicate matching in regular and file read mode
  2190. // token.m_nNext must already be set to node.nStart + 1 or ELEM(i).nStart + 1
  2191. IncChar(); // @
  2192. if ( token.FindAttrib(GetPtr()) )
  2193. {
  2194. IncWord();
  2195. if ( GetChar() == '=' )
  2196. {
  2197. MCD_PCSZ pszVal = GetValAndInc();
  2198. MCD_STR strPathValue = CMarkup::UnescapeText( pszVal, GetValOrWordLen() );
  2199. MCD_STR strAttribValue = CMarkup::UnescapeText( token.GetTokenPtr(), token.Length(), token.m_nTokenFlags );
  2200. if ( strPathValue != strAttribValue )
  2201. return false;
  2202. }
  2203. return true;
  2204. }
  2205. return false;
  2206. }
  2207.  
  2208. //////////////////////////////////////////////////////////////////////
  2209. // A map is a table of SavedPos structs
  2210. //
  2211. struct SavedPos
  2212. {
  2213. // SavedPos is an entry in the SavedPosMap hash table
  2214. SavedPos() { nSavedPosFlags=; iPos=; };
  2215. MCD_STR strName;
  2216. int iPos;
  2217. enum { SPM_MAIN = , SPM_CHILD = , SPM_USED = , SPM_LAST = };
  2218. int nSavedPosFlags;
  2219. };
  2220.  
  2221. struct SavedPosMap
  2222. {
  2223. // SavedPosMap is only created if SavePos/RestorePos are used
  2224. SavedPosMap( int nSize ) { nMapSize=nSize; pTable = new SavedPos*[nSize]; memset(pTable,,nSize*sizeof(SavedPos*)); };
  2225. ~SavedPosMap() { if (pTable) { for (int n=;n<nMapSize;++n) if (pTable[n]) delete[] pTable[n]; delete[] pTable; } };
  2226. SavedPos** pTable;
  2227. int nMapSize;
  2228. };
  2229.  
  2230. struct SavedPosMapArray
  2231. {
  2232. // SavedPosMapArray keeps pointers to SavedPosMap instances
  2233. SavedPosMapArray() { m_pMaps = NULL; };
  2234. ~SavedPosMapArray() { ReleaseMaps(); };
  2235. void ReleaseMaps() { SavedPosMap**p = m_pMaps; if (p) { while (*p) delete *p++; delete[] m_pMaps; m_pMaps=NULL; } };
  2236. bool GetMap( SavedPosMap*& pMap, int nMap, int nMapSize = );
  2237. void CopySavedPosMaps( SavedPosMapArray* pOtherMaps );
  2238. SavedPosMap** m_pMaps; // NULL terminated array
  2239. };
  2240.  
  2241. bool SavedPosMapArray::GetMap( SavedPosMap*& pMap, int nMap, int nMapSize /*=7*/ )
  2242. {
  2243. // Find or create map, returns true if map(s) created
  2244. SavedPosMap** pMapsExisting = m_pMaps;
  2245. int nMapIndex = ;
  2246. if ( pMapsExisting )
  2247. {
  2248. // Length of array is unknown, so loop through maps
  2249. while ( nMapIndex <= nMap )
  2250. {
  2251. pMap = pMapsExisting[nMapIndex];
  2252. if ( ! pMap )
  2253. break;
  2254. if ( nMapIndex == nMap )
  2255. return false; // not created
  2256. ++nMapIndex;
  2257. }
  2258. nMapIndex = ;
  2259. }
  2260.  
  2261. // Create map(s)
  2262. // If you access map 1 before map 0 created, then 2 maps will be created
  2263. m_pMaps = new SavedPosMap*[nMap+];
  2264. if ( pMapsExisting )
  2265. {
  2266. while ( pMapsExisting[nMapIndex] )
  2267. {
  2268. m_pMaps[nMapIndex] = pMapsExisting[nMapIndex];
  2269. ++nMapIndex;
  2270. }
  2271. delete[] pMapsExisting;
  2272. }
  2273. while ( nMapIndex <= nMap )
  2274. {
  2275. m_pMaps[nMapIndex] = new SavedPosMap( nMapSize );
  2276. ++nMapIndex;
  2277. }
  2278. m_pMaps[nMapIndex] = NULL;
  2279. pMap = m_pMaps[nMap];
  2280. return true; // map(s) created
  2281. }
  2282.  
  2283. void SavedPosMapArray::CopySavedPosMaps( SavedPosMapArray* pOtherMaps )
  2284. {
  2285. ReleaseMaps();
  2286. if ( pOtherMaps->m_pMaps )
  2287. {
  2288. int nMap = ;
  2289. SavedPosMap* pMap = NULL;
  2290. while ( pOtherMaps->m_pMaps[nMap] )
  2291. {
  2292. SavedPosMap* pMapSrc = pOtherMaps->m_pMaps[nMap];
  2293. GetMap( pMap, nMap, pMapSrc->nMapSize );
  2294. for ( int nSlot=; nSlot < pMap->nMapSize; ++nSlot )
  2295. {
  2296. SavedPos* pCopySavedPos = pMapSrc->pTable[nSlot];
  2297. if ( pCopySavedPos )
  2298. {
  2299. int nCount = ;
  2300. while ( pCopySavedPos[nCount].nSavedPosFlags & SavedPos::SPM_USED )
  2301. {
  2302. ++nCount;
  2303. if ( pCopySavedPos[nCount-].nSavedPosFlags & SavedPos::SPM_LAST )
  2304. break;
  2305. }
  2306. if ( nCount )
  2307. {
  2308. SavedPos* pNewSavedPos = new SavedPos[nCount];
  2309. for ( int nCopy=; nCopy<nCount; ++nCopy )
  2310. pNewSavedPos[nCopy] = pCopySavedPos[nCopy];
  2311. pNewSavedPos[nCount-].nSavedPosFlags |= SavedPos::SPM_LAST;
  2312. pMap->pTable[nSlot] = pNewSavedPos;
  2313. }
  2314. }
  2315. }
  2316. ++nMap;
  2317. }
  2318. }
  2319. }
  2320.  
  2321. //////////////////////////////////////////////////////////////////////
  2322. // Core parser function
  2323. //
  2324. int TokenPos::ParseNode( NodePos& node )
  2325. {
  2326. // Call this with m_nNext set to the start of the node or tag
  2327. // Upon return m_nNext points to the char after the node or tag
  2328. // m_nL and m_nR are set to name location if it is a tag with a name
  2329. // node members set to node location, strMeta used for parse error
  2330. //
  2331. // <!--...--> comment
  2332. // <!DOCTYPE ...> dtd
  2333. // <?target ...?> processing instruction
  2334. // <![CDATA[...]]> cdata section
  2335. // <NAME ...> element start tag
  2336. // </NAME ...> element end tag
  2337. //
  2338. // returns the nodetype or
  2339. // 0 for end tag
  2340. // -1 for bad node
  2341. // -2 for end of document
  2342. //
  2343. enum ParseBits
  2344. {
  2345. PD_OPENTAG = ,
  2346. PD_BANG = ,
  2347. PD_DASH = ,
  2348. PD_BRACKET = ,
  2349. PD_TEXTORWS = ,
  2350. PD_DOCTYPE = ,
  2351. PD_INQUOTE_S = ,
  2352. PD_INQUOTE_D = ,
  2353. PD_EQUALS = ,
  2354. PD_NOQUOTEVAL =
  2355. };
  2356. int nParseFlags = ;
  2357.  
  2358. MCD_PCSZ pFindEnd = NULL;
  2359. int nNodeType = -;
  2360. int nEndLen = ;
  2361. int nName = ;
  2362. int nNameLen = ;
  2363. unsigned int cDminus1 = , cDminus2 = ;
  2364. #define FINDNODETYPE(e,t) { pFindEnd=e; nEndLen=(sizeof(e)-1)/sizeof(MCD_CHAR); nNodeType=t; }
  2365. #define FINDNODETYPENAME(e,t,n) { FINDNODETYPE(e,t) nName=(int)(pD-m_pDocText)+n; }
  2366. #define FINDNODEBAD(e) { pFindEnd=MCD_T(">"); nEndLen=1; x_AddResult(node.strMeta,e,NULL,0,m_nNext); nNodeType=-1; }
  2367.  
  2368. node.nStart = m_nNext;
  2369. node.nNodeFlags = ;
  2370.  
  2371. MCD_PCSZ pD = &m_pDocText[m_nNext];
  2372. unsigned int cD;
  2373. while ( )
  2374. {
  2375. cD = (unsigned int)*pD;
  2376. if ( ! cD )
  2377. {
  2378. m_nNext = (int)(pD - m_pDocText);
  2379. if ( m_pReaderFilePos ) // read file mode
  2380. {
  2381. // Read buffer may only be removed on the first FileReadNextBuffer in this node
  2382. int nRemovedAlready = m_pReaderFilePos->m_nReadBufferRemoved;
  2383. if ( m_pReaderFilePos->FileReadNextBuffer() ) // more text in file?
  2384. {
  2385. int nNodeLength = m_nNext - node.nStart;
  2386. int nRemove = m_pReaderFilePos->m_nReadBufferRemoved;
  2387. if ( nRemove )
  2388. {
  2389. node.nStart -= nRemove;
  2390. if ( nName )
  2391. nName -= nRemove;
  2392. else if ( nNameLen )
  2393. {
  2394. m_nL -= nRemove;
  2395. m_nR -= nRemove;
  2396. }
  2397. m_nNext -= nRemove;
  2398. }
  2399. int nNewOffset = node.nStart + nNodeLength;
  2400. MCD_STR& str = *m_pReaderFilePos->m_pstrBuffer;
  2401. m_pDocText = MCD_2PCSZ( str );
  2402. pD = &m_pDocText[nNewOffset];
  2403. cD = (unsigned int)*pD; // loaded char replaces null terminator
  2404. }
  2405. if (nRemovedAlready) // preserve m_nReadBufferRemoved for caller of ParseNode
  2406. m_pReaderFilePos->m_nReadBufferRemoved = nRemovedAlready;
  2407. }
  2408. if ( ! cD )
  2409. {
  2410. if ( m_nNext == node.nStart )
  2411. {
  2412. node.nLength = ;
  2413. node.nNodeType = ;
  2414. return -; // end of document
  2415. }
  2416. if ( nNodeType != CMarkup::MNT_WHITESPACE && nNodeType != CMarkup::MNT_TEXT )
  2417. {
  2418. MCD_PCSZ pType = MCD_T("tag");
  2419. if ( (nParseFlags & PD_DOCTYPE) || nNodeType == CMarkup::MNT_DOCUMENT_TYPE )
  2420. pType = MCD_T("document_type");
  2421. else if ( nNodeType == CMarkup::MNT_ELEMENT )
  2422. pType = MCD_T("start_tag");
  2423. else if ( nNodeType == )
  2424. pType = MCD_T("end_tag");
  2425. else if ( nNodeType == CMarkup::MNT_CDATA_SECTION )
  2426. pType = MCD_T("cdata_section");
  2427. else if ( nNodeType == CMarkup::MNT_PROCESSING_INSTRUCTION )
  2428. pType = MCD_T("processing_instruction");
  2429. else if ( nNodeType == CMarkup::MNT_COMMENT )
  2430. pType = MCD_T("comment");
  2431. nNodeType = -;
  2432. x_AddResult(node.strMeta,MCD_T("unterminated_tag_syntax"),pType,MRC_TYPE,node.nStart);
  2433. }
  2434. break;
  2435. }
  2436. }
  2437.  
  2438. if ( nName )
  2439. {
  2440. if ( x_ISENDNAME(cD) )
  2441. {
  2442. nNameLen = (int)(pD - m_pDocText) - nName;
  2443. m_nL = nName;
  2444. m_nR = nName + nNameLen - ;
  2445. nName = ;
  2446. cDminus2 = ;
  2447. cDminus1 = ;
  2448. }
  2449. else
  2450. {
  2451. pD += MCD_CLEN( pD );
  2452. continue;
  2453. }
  2454. }
  2455.  
  2456. if ( pFindEnd )
  2457. {
  2458. if ( cD == '>' && ! (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) )
  2459. {
  2460. m_nNext = (int)(pD - m_pDocText) + ;
  2461. if ( nEndLen == )
  2462. {
  2463. pFindEnd = NULL;
  2464. if ( nNodeType == CMarkup::MNT_ELEMENT && cDminus1 == '/' )
  2465. {
  2466. if ( (! cDminus2) || (!(nParseFlags&PD_NOQUOTEVAL)) || x_ISNOTSECONDLASTINVAL(cDminus2) )
  2467. node.nNodeFlags |= MNF_EMPTY;
  2468. }
  2469. }
  2470. else if ( m_nNext - > nEndLen )
  2471. {
  2472. // Test for end of PI or comment
  2473. MCD_PCSZ pEnd = pD - nEndLen + ;
  2474. MCD_PCSZ pInFindEnd = pFindEnd;
  2475. int nLen = nEndLen;
  2476. while ( --nLen && *pEnd++ == *pInFindEnd++ );
  2477. if ( nLen == )
  2478. pFindEnd = NULL;
  2479. }
  2480. nParseFlags &= ~PD_NOQUOTEVAL; // make sure PD_NOQUOTEVAL is off
  2481. if ( ! pFindEnd && ! (nParseFlags & PD_DOCTYPE) )
  2482. break;
  2483. }
  2484. else if ( cD == '<' && (nNodeType == CMarkup::MNT_TEXT || nNodeType == -) )
  2485. {
  2486. m_nNext = (int)(pD - m_pDocText);
  2487. break;
  2488. }
  2489. else if ( nNodeType & CMarkup::MNT_ELEMENT )
  2490. {
  2491. if ( (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D|PD_NOQUOTEVAL)) )
  2492. {
  2493. if ( cD == '\"' && (nParseFlags&PD_INQUOTE_D) )
  2494. nParseFlags ^= PD_INQUOTE_D; // off
  2495. else if ( cD == '\'' && (nParseFlags&PD_INQUOTE_S) )
  2496. nParseFlags ^= PD_INQUOTE_S; // off
  2497. else if ( (nParseFlags&PD_NOQUOTEVAL) && x_ISWHITESPACE(cD) )
  2498. nParseFlags ^= PD_NOQUOTEVAL; // off
  2499. }
  2500. else // not in attrib value
  2501. {
  2502. // Only set INQUOTE status when preceeded by equal sign
  2503. if ( cD == '\"' && (nParseFlags&PD_EQUALS) )
  2504. nParseFlags ^= PD_INQUOTE_D|PD_EQUALS; // D on, equals off
  2505. else if ( cD == '\'' && (nParseFlags&PD_EQUALS) )
  2506. nParseFlags ^= PD_INQUOTE_S|PD_EQUALS; // S on, equals off
  2507. else if ( cD == '=' && cDminus1 != '=' && ! (nParseFlags&PD_EQUALS) )
  2508. nParseFlags ^= PD_EQUALS; // on
  2509. else if ( (nParseFlags&PD_EQUALS) && ! x_ISWHITESPACE(cD) )
  2510. nParseFlags ^= PD_NOQUOTEVAL|PD_EQUALS; // no quote val on, equals off
  2511. }
  2512. cDminus2 = cDminus1;
  2513. cDminus1 = cD;
  2514. }
  2515. else if ( nNodeType & CMarkup::MNT_DOCUMENT_TYPE )
  2516. {
  2517. if ( cD == '\"' && ! (nParseFlags&PD_INQUOTE_S) )
  2518. nParseFlags ^= PD_INQUOTE_D; // toggle
  2519. else if ( cD == '\'' && ! (nParseFlags&PD_INQUOTE_D) )
  2520. nParseFlags ^= PD_INQUOTE_S; // toggle
  2521. }
  2522. }
  2523. else if ( nParseFlags )
  2524. {
  2525. if ( nParseFlags & PD_TEXTORWS )
  2526. {
  2527. if ( cD == '<' )
  2528. {
  2529. m_nNext = (int)(pD - m_pDocText);
  2530. nNodeType = CMarkup::MNT_WHITESPACE;
  2531. break;
  2532. }
  2533. else if ( ! x_ISWHITESPACE(cD) )
  2534. {
  2535. nParseFlags ^= PD_TEXTORWS;
  2536. FINDNODETYPE( MCD_T("<"), CMarkup::MNT_TEXT )
  2537. }
  2538. }
  2539. else if ( nParseFlags & PD_OPENTAG )
  2540. {
  2541. nParseFlags ^= PD_OPENTAG;
  2542. if ( cD > 0x60 || ( cD > 0x40 && cD < 0x5b ) || cD == 0x5f || cD == 0x3a )
  2543. FINDNODETYPENAME( MCD_T(">"), CMarkup::MNT_ELEMENT, )
  2544. else if ( cD == '/' )
  2545. FINDNODETYPENAME( MCD_T(">"), , )
  2546. else if ( cD == '!' )
  2547. nParseFlags |= PD_BANG;
  2548. else if ( cD == '?' )
  2549. FINDNODETYPENAME( MCD_T("?>"), CMarkup::MNT_PROCESSING_INSTRUCTION, )
  2550. else
  2551. FINDNODEBAD( MCD_T("first_tag_syntax") )
  2552. }
  2553. else if ( nParseFlags & PD_BANG )
  2554. {
  2555. nParseFlags ^= PD_BANG;
  2556. if ( cD == '-' )
  2557. nParseFlags |= PD_DASH;
  2558. else if ( nParseFlags & PD_DOCTYPE )
  2559. {
  2560. if ( x_ISDOCTYPESTART(cD) ) // <!ELEMENT ATTLIST ENTITY NOTATION
  2561. FINDNODETYPE( MCD_T(">"), CMarkup::MNT_DOCUMENT_TYPE )
  2562. else
  2563. FINDNODEBAD( MCD_T("doctype_tag_syntax") )
  2564. }
  2565. else
  2566. {
  2567. if ( cD == '[' )
  2568. nParseFlags |= PD_BRACKET;
  2569. else if ( cD == 'D' )
  2570. nParseFlags |= PD_DOCTYPE;
  2571. else
  2572. FINDNODEBAD( MCD_T("exclamation_tag_syntax") )
  2573. }
  2574. }
  2575. else if ( nParseFlags & PD_DASH )
  2576. {
  2577. nParseFlags ^= PD_DASH;
  2578. if ( cD == '-' )
  2579. FINDNODETYPE( MCD_T("-->"), CMarkup::MNT_COMMENT )
  2580. else
  2581. FINDNODEBAD( MCD_T("comment_tag_syntax") )
  2582. }
  2583. else if ( nParseFlags & PD_BRACKET )
  2584. {
  2585. nParseFlags ^= PD_BRACKET;
  2586. if ( cD == 'C' )
  2587. FINDNODETYPE( MCD_T("]]>"), CMarkup::MNT_CDATA_SECTION )
  2588. else
  2589. FINDNODEBAD( MCD_T("cdata_section_syntax") )
  2590. }
  2591. else if ( nParseFlags & PD_DOCTYPE )
  2592. {
  2593. if ( cD == '<' )
  2594. nParseFlags |= PD_OPENTAG;
  2595. else if ( cD == '>' )
  2596. {
  2597. m_nNext = (int)(pD - m_pDocText) + ;
  2598. nNodeType = CMarkup::MNT_DOCUMENT_TYPE;
  2599. break;
  2600. }
  2601. }
  2602. }
  2603. else if ( cD == '<' )
  2604. {
  2605. nParseFlags |= PD_OPENTAG;
  2606. }
  2607. else
  2608. {
  2609. nNodeType = CMarkup::MNT_WHITESPACE;
  2610. if ( x_ISWHITESPACE(cD) )
  2611. nParseFlags |= PD_TEXTORWS;
  2612. else
  2613. FINDNODETYPE( MCD_T("<"), CMarkup::MNT_TEXT )
  2614. }
  2615. pD += MCD_CLEN( pD );
  2616. }
  2617. node.nLength = m_nNext - node.nStart;
  2618. node.nNodeType = nNodeType;
  2619. return nNodeType;
  2620. }
  2621.  
  2622. //////////////////////////////////////////////////////////////////////
  2623. // CMarkup public methods
  2624. //
  2625. CMarkup::~CMarkup()
  2626. {
  2627. delete m_pSavedPosMaps;
  2628. delete m_pElemPosTree;
  2629. }
  2630.  
  2631. void CMarkup::operator=( const CMarkup& markup )
  2632. {
  2633. // Copying not supported during file mode because of file pointer
  2634. if ( (m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) || (markup.m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) )
  2635. return;
  2636. m_iPosParent = markup.m_iPosParent;
  2637. m_iPos = markup.m_iPos;
  2638. m_iPosChild = markup.m_iPosChild;
  2639. m_iPosFree = markup.m_iPosFree;
  2640. m_iPosDeleted = markup.m_iPosDeleted;
  2641. m_nNodeType = markup.m_nNodeType;
  2642. m_nNodeOffset = markup.m_nNodeOffset;
  2643. m_nNodeLength = markup.m_nNodeLength;
  2644. m_strDoc = markup.m_strDoc;
  2645. m_strResult = markup.m_strResult;
  2646. m_nDocFlags = markup.m_nDocFlags;
  2647. m_pElemPosTree->CopyElemPosTree( markup.m_pElemPosTree, m_iPosFree );
  2648. m_pSavedPosMaps->CopySavedPosMaps( markup.m_pSavedPosMaps );
  2649. MARKUP_SETDEBUGSTATE;
  2650. }
  2651.  
  2652. bool CMarkup::SetDoc( MCD_PCSZ pDoc )
  2653. {
  2654. // pDoc is markup text, not a filename!
  2655. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2656. return false;
  2657. // Set document text
  2658. if ( pDoc )
  2659. m_strDoc = pDoc;
  2660. else
  2661. {
  2662. MCD_STRCLEARSIZE( m_strDoc );
  2663. m_pElemPosTree->ReleaseElemPosTree();
  2664. }
  2665.  
  2666. MCD_STRCLEAR(m_strResult);
  2667. return x_ParseDoc();
  2668. }
  2669.  
  2670. bool CMarkup::SetDoc( const MCD_STR& strDoc )
  2671. {
  2672. // strDoc is markup text, not a filename!
  2673. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2674. return false;
  2675. m_strDoc = strDoc;
  2676. MCD_STRCLEAR(m_strResult);
  2677. return x_ParseDoc();
  2678. }
  2679.  
  2680. bool CMarkup::IsWellFormed()
  2681. {
  2682. if ( m_nDocFlags & MDF_WRITEFILE )
  2683. return true;
  2684. if ( m_nDocFlags & MDF_READFILE )
  2685. {
  2686. if ( ! (ELEM().nFlags & MNF_ILLFORMED) )
  2687. return true;
  2688. }
  2689. else if ( m_pElemPosTree->GetSize()
  2690. && ! (ELEM().nFlags & MNF_ILLFORMED)
  2691. && ELEM().iElemChild
  2692. && ! ELEM(ELEM().iElemChild).iElemNext )
  2693. return true;
  2694. return false;
  2695. }
  2696.  
  2697. MCD_STR CMarkup::GetError() const
  2698. {
  2699. // For backwards compatibility, return a readable English string built from m_strResult
  2700. // In release 11.0 you can use GetResult and examine result in XML format
  2701. CMarkup mResult( m_strResult );
  2702. MCD_STR strError;
  2703. int nSyntaxErrors = ;
  2704. while ( mResult.FindElem() )
  2705. {
  2706. MCD_STR strItem;
  2707. MCD_STR strID = mResult.GetTagName();
  2708.  
  2709. // Parse result
  2710. if ( strID == MCD_T("root_has_sibling") )
  2711. strItem = MCD_T("root element has sibling");
  2712. else if ( strID == MCD_T("no_root_element") )
  2713. strItem = MCD_T("no root element");
  2714. else if ( strID == MCD_T("lone_end_tag") )
  2715. strItem = MCD_T("lone end tag '") + mResult.GetAttrib(MCD_T("tagname")) + MCD_T("' at offset ")
  2716. + mResult.GetAttrib(MCD_T("offset"));
  2717. else if ( strID == MCD_T("unended_start_tag") )
  2718. strItem = MCD_T("start tag '") + mResult.GetAttrib(MCD_T("tagname")) + MCD_T("' at offset ")
  2719. + mResult.GetAttrib(MCD_T("offset")) + MCD_T(" expecting end tag at offset ") + mResult.GetAttrib(MCD_T("offset2"));
  2720. else if ( strID == MCD_T("first_tag_syntax") )
  2721. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2722. + MCD_T(" expecting tag name / ! or ?");
  2723. else if ( strID == MCD_T("exclamation_tag_syntax") )
  2724. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2725. + MCD_T(" expecting 'DOCTYPE' [ or -");
  2726. else if ( strID == MCD_T("doctype_tag_syntax") )
  2727. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2728. + MCD_T(" expecting markup declaration"); // ELEMENT ATTLIST ENTITY NOTATION
  2729. else if ( strID == MCD_T("comment_tag_syntax") )
  2730. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2731. + MCD_T(" expecting - to begin comment");
  2732. else if ( strID == MCD_T("cdata_section_syntax") )
  2733. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2734. + MCD_T(" expecting 'CDATA'");
  2735. else if ( strID == MCD_T("unterminated_tag_syntax") )
  2736. strItem = MCD_T("unterminated tag at offset ") + mResult.GetAttrib(MCD_T("offset"));
  2737.  
  2738. // Report only the first syntax or well-formedness error
  2739. if ( ! MCD_STRISEMPTY(strItem) )
  2740. {
  2741. ++nSyntaxErrors;
  2742. if ( nSyntaxErrors > )
  2743. continue;
  2744. }
  2745.  
  2746. // I/O results
  2747. if ( strID == MCD_T("file_error") )
  2748. strItem = mResult.GetAttrib(MCD_T("msg"));
  2749. else if ( strID == MCD_T("bom") )
  2750. strItem = MCD_T("BOM +");
  2751. else if ( strID == MCD_T("read") || strID == MCD_T("write") || strID == MCD_T("converted_to") || strID == MCD_T("converted_from") )
  2752. {
  2753. if ( strID == MCD_T("converted_to") )
  2754. strItem = MCD_T("to ");
  2755. MCD_STR strEncoding = mResult.GetAttrib( MCD_T("encoding") );
  2756. if ( ! MCD_STRISEMPTY(strEncoding) )
  2757. strItem += strEncoding + MCD_T(" ");
  2758. strItem += MCD_T("length ") + mResult.GetAttrib(MCD_T("length"));
  2759. if ( strID == MCD_T("converted_from") )
  2760. strItem += MCD_T(" to");
  2761. }
  2762. else if ( strID == MCD_T("nulls_removed") )
  2763. strItem = MCD_T("removed ") + mResult.GetAttrib(MCD_T("count")) + MCD_T(" nulls");
  2764. else if ( strID == MCD_T("conversion_loss") )
  2765. strItem = MCD_T("(chars lost in conversion!)");
  2766. else if ( strID == MCD_T("utf8_detection") )
  2767. strItem = MCD_T("(used UTF-8 detection)");
  2768. else if ( strID == MCD_T("endian_swap") )
  2769. strItem = MCD_T("endian swap");
  2770. else if ( strID == MCD_T("truncation_error") )
  2771. strItem = MCD_T("encoding ") + mResult.GetAttrib(MCD_T("encoding")) + MCD_T(" adjustment error");
  2772.  
  2773. // Concatenate result item to error string
  2774. if ( ! MCD_STRISEMPTY(strItem) )
  2775. {
  2776. if ( ! MCD_STRISEMPTY(strError) )
  2777. strError += MCD_T(" ");
  2778. strError += strItem;
  2779. }
  2780. }
  2781. return strError;
  2782. }
  2783.  
  2784. bool CMarkup::Load( MCD_CSTR_FILENAME szFileName )
  2785. {
  2786. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2787. return false;
  2788. if ( ! ReadTextFile(szFileName, m_strDoc, &m_strResult, &m_nDocFlags) )
  2789. return false;
  2790. return x_ParseDoc();
  2791. }
  2792.  
  2793. bool CMarkup::ReadTextFile( MCD_CSTR_FILENAME szFileName, MCD_STR& strDoc, MCD_STR* pstrResult, int* pnDocFlags, MCD_STR* pstrEncoding )
  2794. {
  2795. // Static utility method to load text file into strDoc
  2796. //
  2797. FilePos file;
  2798. file.m_nDocFlags = (pnDocFlags?*pnDocFlags:) | MDF_READFILE;
  2799. bool bSuccess = file.FileOpen( szFileName );
  2800. if ( pstrResult )
  2801. *pstrResult = file.m_strIOResult;
  2802. MCD_STRCLEAR(strDoc);
  2803. if ( bSuccess )
  2804. {
  2805. file.FileSpecifyEncoding( pstrEncoding );
  2806. file.m_nOpFileByteLen = (int)((MCD_INTFILEOFFSET)(file.m_nFileByteLen - file.m_nFileByteOffset));
  2807. bSuccess = file.FileReadText( strDoc );
  2808. file.FileClose();
  2809. if ( pstrResult )
  2810. *pstrResult += file.m_strIOResult;
  2811. if ( pnDocFlags )
  2812. *pnDocFlags = file.m_nDocFlags;
  2813. }
  2814. return bSuccess;
  2815. }
  2816.  
  2817. bool CMarkup::Save( MCD_CSTR_FILENAME szFileName )
  2818. {
  2819. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2820. return false;
  2821. return WriteTextFile( szFileName, m_strDoc, &m_strResult, &m_nDocFlags );
  2822. }
  2823.  
  2824. bool CMarkup::WriteTextFile( MCD_CSTR_FILENAME szFileName, const MCD_STR& strDoc, MCD_STR* pstrResult, int* pnDocFlags, MCD_STR* pstrEncoding )
  2825. {
  2826. // Static utility method to save strDoc to text file
  2827. //
  2828. FilePos file;
  2829. file.m_nDocFlags = (pnDocFlags?*pnDocFlags:) | MDF_WRITEFILE;
  2830. bool bSuccess = file.FileOpen( szFileName );
  2831. if ( pstrResult )
  2832. *pstrResult = file.m_strIOResult;
  2833. if ( bSuccess )
  2834. {
  2835. if ( MCD_STRISEMPTY(file.m_strEncoding) && ! MCD_STRISEMPTY(strDoc) )
  2836. {
  2837. file.m_strEncoding = GetDeclaredEncoding( strDoc );
  2838. if ( MCD_STRISEMPTY(file.m_strEncoding) )
  2839. file.m_strEncoding = MCD_T("UTF-8"); // to do: MDF_ANSIFILE
  2840. }
  2841. file.FileSpecifyEncoding( pstrEncoding );
  2842. bSuccess = file.FileWriteText( strDoc );
  2843. file.FileClose();
  2844. if ( pstrResult )
  2845. *pstrResult += file.m_strIOResult;
  2846. if ( pnDocFlags )
  2847. *pnDocFlags = file.m_nDocFlags;
  2848. }
  2849. return bSuccess;
  2850. }
  2851.  
  2852. bool CMarkup::FindElem( MCD_CSTR szName )
  2853. {
  2854. if ( m_nDocFlags & MDF_WRITEFILE )
  2855. return false;
  2856. if ( m_pElemPosTree->GetSize() )
  2857. {
  2858. // Change current position only if found
  2859. PathPos path( szName, false );
  2860. int iPos = x_FindElem( m_iPosParent, m_iPos, path );
  2861. if ( iPos )
  2862. {
  2863. // Assign new position
  2864. x_SetPos( ELEM(iPos).iElemParent, iPos, );
  2865. return true;
  2866. }
  2867. }
  2868. return false;
  2869. }
  2870.  
  2871. bool CMarkup::FindChildElem( MCD_CSTR szName )
  2872. {
  2873. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2874. return false;
  2875. // Shorthand: if no current main position, find first child under parent element
  2876. if ( ! m_iPos )
  2877. FindElem();
  2878. // Change current child position only if found
  2879. PathPos path( szName, false );
  2880. int iPosChild = x_FindElem( m_iPos, m_iPosChild, path );
  2881.  
  2882. if ( iPosChild )
  2883. {
  2884. // Assign new position
  2885. int iPos = ELEM(iPosChild).iElemParent;
  2886. x_SetPos( ELEM(iPos).iElemParent, iPos, iPosChild );
  2887. return true;
  2888. }
  2889. return false;
  2890. }
  2891.  
  2892. MCD_STR CMarkup::EscapeText( MCD_CSTR szText, int nFlags )
  2893. {
  2894. // Convert text as seen outside XML document to XML friendly
  2895. // replacing special characters with ampersand escape codes
  2896. // E.g. convert "6>7" to "6&gt;7"
  2897. //
  2898. // &lt; less than
  2899. // &amp; ampersand
  2900. // &gt; greater than
  2901. //
  2902. // and for attributes:
  2903. //
  2904. // &apos; apostrophe or single quote
  2905. // &quot; double quote
  2906. //
  2907. static MCD_PCSZ apReplace[] = { NULL,MCD_T("&lt;"),MCD_T("&amp;"),MCD_T("&gt;"),MCD_T("&quot;"),MCD_T("&apos;") };
  2908. MCD_STR strText;
  2909. MCD_PCSZ pSource = szText;
  2910. int nDestSize = MCD_PSZLEN(pSource);
  2911. nDestSize += nDestSize / + ;
  2912. MCD_BLDRESERVE(strText,nDestSize);
  2913. MCD_CHAR cSource = *pSource;
  2914. int nFound;
  2915. int nCharLen;
  2916. while ( cSource )
  2917. {
  2918. MCD_BLDCHECK(strText,nDestSize,);
  2919. nFound = ((nFlags&MNF_ESCAPEQUOTES)?x_ISATTRIBSPECIAL(cSource):x_ISSPECIAL(cSource));
  2920. if ( nFound )
  2921. {
  2922. bool bIgnoreAmpersand = false;
  2923. if ( (nFlags&MNF_WITHREFS) && cSource == '&' )
  2924. {
  2925. // Do not replace ampersand if it is start of any entity reference
  2926. // &[#_:A-Za-zU][_:-.A-Za-z0-9U]*; where U is > 0x7f
  2927. MCD_PCSZ pCheckEntity = pSource;
  2928. ++pCheckEntity;
  2929. MCD_CHAR c = *pCheckEntity;
  2930. if ( x_ISSTARTENTREF(c) || ((unsigned int)c)>0x7f )
  2931. {
  2932. while ( )
  2933. {
  2934. pCheckEntity += MCD_CLEN( pCheckEntity );
  2935. c = *pCheckEntity;
  2936. if ( c == ';' )
  2937. {
  2938. int nEntityLen = (int)(pCheckEntity - pSource) + ;
  2939. MCD_BLDAPPENDN(strText,pSource,nEntityLen);
  2940. pSource = pCheckEntity;
  2941. bIgnoreAmpersand = true;
  2942. }
  2943. else if ( x_ISINENTREF(c) || ((unsigned int)c)>0x7f )
  2944. continue;
  2945. break;
  2946. }
  2947. }
  2948. }
  2949. if ( ! bIgnoreAmpersand )
  2950. {
  2951. MCD_BLDAPPEND(strText,apReplace[nFound]);
  2952. }
  2953. ++pSource; // ASCII, so 1 byte
  2954. }
  2955. else
  2956. {
  2957. nCharLen = MCD_CLEN( pSource );
  2958. MCD_BLDAPPENDN(strText,pSource,nCharLen);
  2959. pSource += nCharLen;
  2960. }
  2961. cSource = *pSource;
  2962. }
  2963.  
  2964. MCD_BLDRELEASE(strText);
  2965. return strText;
  2966. }
  2967.  
  2968. // Predefined character entities
  2969. // By default UnescapeText will decode standard HTML entities as well as the 5 in XML
  2970. // To unescape only the 5 standard XML entities, use this short table instead:
  2971. // MCD_PCSZ PredefEntityTable[4] =
  2972. // { MCD_T("20060lt"),MCD_T("40034quot"),MCD_T("30038amp"),MCD_T("20062gt40039apos") };
  2973. //
  2974. // This is a precompiled ASCII hash table for speed and minimum memory requirement
  2975. // Each entry consists of a 1 digit code name length, 4 digit code point, and the code name
  2976. // Each table slot can have multiple entries, table size 130 was chosen for even distribution
  2977. //
  2978. MCD_PCSZ PredefEntityTable[] =
  2979. {
  2980. MCD_T("60216oslash60217ugrave60248oslash60249ugrave"),
  2981. MCD_T("50937omega60221yacute58968lceil50969omega60253yacute"),
  2982. MCD_T("50916delta50206icirc50948delta50238icirc68472weierp"),MCD_T("40185sup1"),
  2983. MCD_T("68970lfloor40178sup2"),
  2984. MCD_T("50922kappa60164curren50954kappa58212mdash40179sup3"),
  2985. MCD_T("59830diams58211ndash"),MCD_T("68855otimes58969rceil"),
  2986. MCD_T("50338oelig50212ocirc50244ocirc50339oelig58482trade"),
  2987. MCD_T("50197aring50931sigma50229aring50963sigma"),
  2988. MCD_T("50180acute68971rfloor50732tilde"),MCD_T("68249lsaquo"),
  2989. MCD_T("58734infin68201thinsp"),MCD_T("50161iexcl"),
  2990. MCD_T("50920theta50219ucirc50952theta50251ucirc"),MCD_T("58254oline"),
  2991. MCD_T("58260frasl68727lowast"),MCD_T("59827clubs60191iquest68250rsaquo"),
  2992. MCD_T("58629crarr50181micro"),MCD_T("58222bdquo"),MCD_T(""),
  2993. MCD_T("58243prime60177plusmn58242prime"),MCD_T("40914beta40946beta"),MCD_T(""),
  2994. MCD_T(""),MCD_T(""),MCD_T("50171laquo50215times"),MCD_T("40710circ"),
  2995. MCD_T("49001lang"),MCD_T("58220ldquo40175macr"),
  2996. MCD_T("40182para50163pound48476real"),MCD_T(""),MCD_T("58713notin50187raquo"),
  2997. MCD_T("48773cong50223szlig50978upsih"),
  2998. MCD_T("58776asymp58801equiv49002rang58218sbquo"),
  2999. MCD_T("50222thorn48659darr48595darr40402fnof58221rdquo50254thorn"),
  3000. MCD_T("40162cent58722minus"),MCD_T("58707exist40170ordf"),MCD_T(""),
  3001. MCD_T("40921iota58709empty48660harr48596harr40953iota"),MCD_T(""),
  3002. MCD_T("40196auml40228auml48226bull40167sect48838sube"),MCD_T(""),
  3003. MCD_T("48656larr48592larr58853oplus"),MCD_T("30176deg58216lsquo40186ordm"),
  3004. MCD_T("40203euml40039apos40235euml48712isin40160nbsp"),
  3005. MCD_T("40918zeta40950zeta"),MCD_T("38743and48195emsp48719prod"),
  3006. MCD_T("30935chi38745cap30967chi48194ensp"),
  3007. MCD_T("40207iuml40239iuml48706part48869perp48658rarr48594rarr"),
  3008. MCD_T("38736ang48836nsub58217rsquo"),MCD_T(""),
  3009. MCD_T("48901sdot48657uarr48593uarr"),MCD_T("40169copy48364euro"),
  3010. MCD_T("30919eta30951eta"),MCD_T("40214ouml40246ouml48839supe"),MCD_T(""),
  3011. MCD_T(""),MCD_T("30038amp30174reg"),MCD_T("48733prop"),MCD_T(""),
  3012. MCD_T("30208eth30934phi40220uuml30240eth30966phi40252uuml"),MCD_T(""),MCD_T(""),
  3013. MCD_T(""),MCD_T("40376yuml40255yuml"),MCD_T(""),MCD_T("40034quot48204zwnj"),
  3014. MCD_T("38746cup68756there4"),MCD_T("30929rho30961rho38764sim"),
  3015. MCD_T("30932tau38834sub30964tau"),MCD_T("38747int38206lrm38207rlm"),
  3016. MCD_T("30936psi30968psi30165yen"),MCD_T(""),MCD_T("28805ge30168uml"),
  3017. MCD_T("30982piv"),MCD_T(""),MCD_T("30172not"),MCD_T(""),MCD_T("28804le"),
  3018. MCD_T("30173shy"),MCD_T("39674loz28800ne38721sum"),MCD_T(""),MCD_T(""),
  3019. MCD_T("38835sup"),MCD_T("28715ni"),MCD_T(""),MCD_T("20928pi20960pi38205zwj"),
  3020. MCD_T(""),MCD_T("60923lambda20062gt60955lambda"),MCD_T(""),MCD_T(""),
  3021. MCD_T("60199ccedil60231ccedil"),MCD_T(""),MCD_T("20060lt"),
  3022. MCD_T("20926xi28744or20958xi"),MCD_T("20924mu20956mu"),MCD_T("20925nu20957nu"),
  3023. MCD_T("68225dagger68224dagger"),MCD_T("80977thetasym"),MCD_T(""),MCD_T(""),
  3024. MCD_T(""),MCD_T("78501alefsym"),MCD_T(""),MCD_T(""),MCD_T(""),
  3025. MCD_T("60193aacute60195atilde60225aacute60227atilde"),MCD_T(""),
  3026. MCD_T("70927omicron60247divide70959omicron"),MCD_T("60192agrave60224agrave"),
  3027. MCD_T("60201eacute60233eacute60962sigmaf"),MCD_T("70917epsilon70949epsilon"),
  3028. MCD_T(""),MCD_T("60200egrave60232egrave"),MCD_T("60205iacute60237iacute"),
  3029. MCD_T(""),MCD_T(""),MCD_T("60204igrave68230hellip60236igrave"),
  3030. MCD_T("60166brvbar"),
  3031. MCD_T("60209ntilde68704forall58711nabla60241ntilde69824spades"),
  3032. MCD_T("60211oacute60213otilde60189frac1260183middot60243oacute60245otilde"),
  3033. MCD_T(""),MCD_T("50184cedil60188frac14"),
  3034. MCD_T("50198aelig50194acirc60210ograve50226acirc50230aelig60242ograve"),
  3035. MCD_T("50915gamma60190frac3450947gamma58465image58730radic"),
  3036. MCD_T("60352scaron60353scaron"),MCD_T("60218uacute69829hearts60250uacute"),
  3037. MCD_T("50913alpha50202ecirc70933upsilon50945alpha50234ecirc70965upsilon"),
  3038. MCD_T("68240permil")
  3039. };
  3040.  
  3041. MCD_STR CMarkup::UnescapeText( MCD_CSTR szText, int nTextLength /*=-1*/, int nFlags /*=0*/ )
  3042. {
  3043. // Convert XML friendly text to text as seen outside XML document
  3044. // ampersand escape codes replaced with special characters e.g. convert "6&gt;7" to "6>7"
  3045. // ampersand numeric codes replaced with character e.g. convert < to <
  3046. // Conveniently the result is always the same or shorter in byte length
  3047. //
  3048. MCD_STR strText;
  3049. MCD_PCSZ pSource = szText;
  3050. if ( nTextLength == - )
  3051. nTextLength = MCD_PSZLEN(szText);
  3052. MCD_BLDRESERVE(strText,nTextLength);
  3053. MCD_CHAR szCodeName[];
  3054. bool bAlterWhitespace = (nFlags & (MDF_TRIMWHITESPACE|MDF_COLLAPSEWHITESPACE))?true:false;
  3055. bool bCollapseWhitespace = (nFlags & MDF_COLLAPSEWHITESPACE)?true:false;
  3056. int nCharWhitespace = -; // start of string
  3057. int nCharLen;
  3058. int nChar = ;
  3059. while ( nChar < nTextLength )
  3060. {
  3061. if ( pSource[nChar] == '&' )
  3062. {
  3063. if ( bAlterWhitespace )
  3064. nCharWhitespace = ;
  3065.  
  3066. // Get corresponding unicode code point
  3067. int nUnicode = ;
  3068.  
  3069. // Look for terminating semi-colon within 9 ASCII characters
  3070. int nCodeLen = ;
  3071. MCD_CHAR cCodeChar = pSource[nChar+];
  3072. while ( nCodeLen < && ((unsigned int)cCodeChar) < && cCodeChar != ';' )
  3073. {
  3074. if ( cCodeChar >= 'A' && cCodeChar <= 'Z') // upper case?
  3075. cCodeChar += ('a' - 'A'); // make lower case
  3076. szCodeName[nCodeLen] = cCodeChar;
  3077. ++nCodeLen;
  3078. cCodeChar = pSource[nChar++nCodeLen];
  3079. }
  3080. if ( cCodeChar == ';' ) // found semi-colon?
  3081. {
  3082. // Decode szCodeName
  3083. szCodeName[nCodeLen] = '\0';
  3084. if ( *szCodeName == '#' ) // numeric character reference?
  3085. {
  3086. // Is it a hex number?
  3087. int nBase = ; // decimal
  3088. int nNumberOffset = ; // after #
  3089. if ( szCodeName[] == 'x' )
  3090. {
  3091. nNumberOffset = ; // after #x
  3092. nBase = ; // hex
  3093. }
  3094. nUnicode = MCD_PSZTOL( &szCodeName[nNumberOffset], NULL, nBase );
  3095. }
  3096. else // does not start with #
  3097. {
  3098. // Look for matching code name in PredefEntityTable
  3099. MCD_PCSZ pEntry = PredefEntityTable[x_Hash(szCodeName,sizeof(PredefEntityTable)/sizeof(MCD_PCSZ))];
  3100. while ( *pEntry )
  3101. {
  3102. // e.g. entry: 40039apos means length 4, code point 0039, code name apos
  3103. int nEntryLen = (*pEntry - '');
  3104. ++pEntry;
  3105. MCD_PCSZ pCodePoint = pEntry;
  3106. pEntry += ;
  3107. if ( nEntryLen == nCodeLen && x_StrNCmp(szCodeName,pEntry,nEntryLen) == )
  3108. {
  3109. // Convert digits to integer up to code name which always starts with alpha
  3110. nUnicode = MCD_PSZTOL( pCodePoint, NULL, );
  3111. break;
  3112. }
  3113. pEntry += nEntryLen;
  3114. }
  3115. }
  3116. }
  3117.  
  3118. // If a code point found, encode it into text
  3119. if ( nUnicode )
  3120. {
  3121. MCD_CHAR szChar[];
  3122. nCharLen = ;
  3123. #if defined(MARKUP_WCHAR) // WCHAR
  3124. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  3125. szChar[] = (MCD_CHAR)nUnicode;
  3126. nCharLen = ;
  3127. #else // sizeof(wchar_t) == 2
  3128. EncodeCharUTF16( nUnicode, (unsigned short*)szChar, nCharLen );
  3129. #endif
  3130. #elif defined(MARKUP_MBCS) // MBCS/double byte
  3131. #if defined(MARKUP_WINCONV)
  3132. int nUsedDefaultChar = ;
  3133. wchar_t wszUTF16[];
  3134. EncodeCharUTF16( nUnicode, (unsigned short*)wszUTF16, nCharLen );
  3135. nCharLen = WideCharToMultiByte( CP_ACP, , wszUTF16, nCharLen, szChar, , NULL, &nUsedDefaultChar );
  3136. if ( nUsedDefaultChar || nCharLen <= )
  3137. nUnicode = ;
  3138. #else // not WINCONV
  3139. wchar_t wcUnicode = (wchar_t)nUnicode;
  3140. nCharLen = wctomb( szChar, wcUnicode );
  3141. if ( nCharLen <= )
  3142. nUnicode = ;
  3143. #endif // not WINCONV
  3144. #else // not WCHAR and not MBCS/double byte
  3145. EncodeCharUTF8( nUnicode, szChar, nCharLen );
  3146. #endif // not WCHAR and not MBCS/double byte
  3147. // Increment index past ampersand semi-colon
  3148. if ( nUnicode ) // must check since MBCS case can clear it
  3149. {
  3150. MCD_BLDAPPENDN(strText,szChar,nCharLen);
  3151. nChar += nCodeLen + ;
  3152. }
  3153. }
  3154. if ( ! nUnicode )
  3155. {
  3156. // If the code is not converted, leave it as is
  3157. MCD_BLDAPPEND1(strText,'&');
  3158. ++nChar;
  3159. }
  3160. }
  3161. else if ( bAlterWhitespace && x_ISWHITESPACE(pSource[nChar]) )
  3162. {
  3163. if ( nCharWhitespace == && bCollapseWhitespace )
  3164. {
  3165. nCharWhitespace = MCD_BLDLEN(strText);
  3166. MCD_BLDAPPEND1(strText,' ');
  3167. }
  3168. else if ( nCharWhitespace != - && ! bCollapseWhitespace )
  3169. {
  3170. if ( nCharWhitespace == )
  3171. nCharWhitespace = MCD_BLDLEN(strText);
  3172. MCD_BLDAPPEND1(strText,pSource[nChar]);
  3173. }
  3174. ++nChar;
  3175. }
  3176. else // not &
  3177. {
  3178. if ( bAlterWhitespace )
  3179. nCharWhitespace = ;
  3180. nCharLen = MCD_CLEN(&pSource[nChar]);
  3181. MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen);
  3182. nChar += nCharLen;
  3183. }
  3184. }
  3185. if ( bAlterWhitespace && nCharWhitespace > )
  3186. {
  3187. MCD_BLDTRUNC(strText,nCharWhitespace);
  3188. }
  3189. MCD_BLDRELEASE(strText);
  3190. return strText;
  3191. }
  3192.  
  3193. bool CMarkup::DetectUTF8( const char* pText, int nTextLen, int* pnNonASCII/*=NULL*/, bool* bErrorAtEnd/*=NULL*/ )
  3194. {
  3195. // return true if ASCII or all non-ASCII byte sequences are valid UTF-8 pattern:
  3196. // ASCII 0xxxxxxx
  3197. // 2-byte 110xxxxx 10xxxxxx
  3198. // 3-byte 1110xxxx 10xxxxxx 10xxxxxx
  3199. // 4-byte 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  3200. // *pnNonASCII is set (if pnNonASCII is not NULL) to the number of non-ASCII UTF-8 sequences
  3201. // or if an invalid UTF-8 sequence is found, to 1 + the valid non-ASCII sequences up to the invalid sequence
  3202. // *bErrorAtEnd is set (if bErrorAtEnd is not NULL) to true if the UTF-8 was cut off at the end in mid valid sequence
  3203. int nUChar;
  3204. if ( pnNonASCII )
  3205. *pnNonASCII = ;
  3206. const char* pTextEnd = pText + nTextLen;
  3207. while ( *pText && pText != pTextEnd )
  3208. {
  3209. if ( (unsigned char)(*pText) & 0x80 )
  3210. {
  3211. if ( pnNonASCII )
  3212. ++(*pnNonASCII);
  3213. nUChar = DecodeCharUTF8( pText, pTextEnd );
  3214. if ( nUChar == - )
  3215. {
  3216. if ( bErrorAtEnd )
  3217. *bErrorAtEnd = (pTextEnd == pText)? true:false;
  3218. return false;
  3219. }
  3220. }
  3221. else
  3222. ++pText;
  3223. }
  3224. if ( bErrorAtEnd )
  3225. *bErrorAtEnd = false;
  3226. return true;
  3227. }
  3228.  
  3229. int CMarkup::DecodeCharUTF8( const char*& pszUTF8, const char* pszUTF8End/*=NULL*/ )
  3230. {
  3231. // Return Unicode code point and increment pszUTF8 past 1-4 bytes
  3232. // pszUTF8End can be NULL if pszUTF8 is null terminated
  3233. int nUChar = (unsigned char)*pszUTF8;
  3234. ++pszUTF8;
  3235. if ( nUChar & 0x80 )
  3236. {
  3237. int nExtraChars;
  3238. if ( ! (nUChar & 0x20) )
  3239. {
  3240. nExtraChars = ;
  3241. nUChar &= 0x1f;
  3242. }
  3243. else if ( ! (nUChar & 0x10) )
  3244. {
  3245. nExtraChars = ;
  3246. nUChar &= 0x0f;
  3247. }
  3248. else if ( ! (nUChar & 0x08) )
  3249. {
  3250. nExtraChars = ;
  3251. nUChar &= 0x07;
  3252. }
  3253. else
  3254. return -;
  3255. while ( nExtraChars-- )
  3256. {
  3257. if ( pszUTF8 == pszUTF8End || ! (*pszUTF8 & 0x80) )
  3258. return -;
  3259. nUChar = nUChar<<;
  3260. nUChar |= *pszUTF8 & 0x3f;
  3261. ++pszUTF8;
  3262. }
  3263. }
  3264. return nUChar;
  3265. }
  3266.  
  3267. void CMarkup::EncodeCharUTF16( int nUChar, unsigned short* pwszUTF16, int& nUTF16Len )
  3268. {
  3269. // Write UTF-16 sequence to pwszUTF16 for Unicode code point nUChar and update nUTF16Len
  3270. // Be sure pwszUTF16 has room for up to 2 wide chars
  3271. if ( nUChar & ~0xffff )
  3272. {
  3273. if ( pwszUTF16 )
  3274. {
  3275. // Surrogate pair
  3276. nUChar -= 0x10000;
  3277. pwszUTF16[nUTF16Len++] = (unsigned short)(((nUChar>>) & 0x3ff) | 0xd800); // W1
  3278. pwszUTF16[nUTF16Len++] = (unsigned short)((nUChar & 0x3ff) | 0xdc00); // W2
  3279. }
  3280. else
  3281. nUTF16Len += ;
  3282. }
  3283. else
  3284. {
  3285. if ( pwszUTF16 )
  3286. pwszUTF16[nUTF16Len++] = (unsigned short)nUChar;
  3287. else
  3288. ++nUTF16Len;
  3289. }
  3290. }
  3291.  
  3292. int CMarkup::DecodeCharUTF16( const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/ )
  3293. {
  3294. // Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogrates) UTF-16 code points
  3295. // pszUTF16End can be NULL if pszUTF16 is zero terminated
  3296. int nUChar = *pwszUTF16;
  3297. ++pwszUTF16;
  3298. if ( (nUChar & ~0x000007ff) == 0xd800 ) // W1
  3299. {
  3300. if ( pwszUTF16 == pszUTF16End || ! (*pwszUTF16) ) // W2
  3301. return -; // incorrect UTF-16
  3302. nUChar = (((nUChar & 0x3ff) << ) | (*pwszUTF16 & 0x3ff)) + 0x10000;
  3303. ++pwszUTF16;
  3304. }
  3305. return nUChar;
  3306. }
  3307.  
  3308. void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len )
  3309. {
  3310. // Write UTF-8 sequence to pszUTF8 for Unicode code point nUChar and update nUTF8Len
  3311. // Be sure pszUTF8 has room for up to 4 bytes
  3312. if ( ! (nUChar & ~0x0000007f) ) // < 0x80
  3313. {
  3314. if ( pszUTF8 )
  3315. pszUTF8[nUTF8Len++] = (char)nUChar;
  3316. else
  3317. ++nUTF8Len;
  3318. }
  3319. else if ( ! (nUChar & ~0x000007ff) ) // < 0x800
  3320. {
  3321. if ( pszUTF8 )
  3322. {
  3323. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x7c0)>>)|0xc0);
  3324. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3325. }
  3326. else
  3327. nUTF8Len += ;
  3328. }
  3329. else if ( ! (nUChar & ~0x0000ffff) ) // < 0x10000
  3330. {
  3331. if ( pszUTF8 )
  3332. {
  3333. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xf000)>>)|0xe0);
  3334. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>)|0x80);
  3335. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3336. }
  3337. else
  3338. nUTF8Len += ;
  3339. }
  3340. else // < 0x110000
  3341. {
  3342. if ( pszUTF8 )
  3343. {
  3344. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x1c0000)>>)|0xf0);
  3345. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x3f000)>>)|0x80);
  3346. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>)|0x80);
  3347. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3348. }
  3349. else
  3350. nUTF8Len += ;
  3351. }
  3352. }
  3353.  
  3354. int CMarkup::UTF16To8( char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count )
  3355. {
  3356. // Supports the same arguments as wcstombs
  3357. // the pwszUTF16 source must be a NULL-terminated UTF-16 string
  3358. // if pszUTF8 is NULL, the number of bytes required is returned and nUTF8Count is ignored
  3359. // otherwise pszUTF8 is filled with the result string and NULL-terminated if nUTF8Count allows
  3360. // nUTF8Count is the byte size of pszUTF8 and must be large enough for the NULL if NULL desired
  3361. // and the number of bytes (excluding NULL) is returned
  3362. //
  3363. int nUChar, nUTF8Len = ;
  3364. while ( *pwszUTF16 )
  3365. {
  3366. // Decode UTF-16
  3367. nUChar = DecodeCharUTF16( pwszUTF16, NULL );
  3368. if ( nUChar == - )
  3369. nUChar = '?';
  3370.  
  3371. // Encode UTF-8
  3372. if ( pszUTF8 && nUTF8Len + > nUTF8Count )
  3373. {
  3374. int nUTF8LenSoFar = nUTF8Len;
  3375. EncodeCharUTF8( nUChar, NULL, nUTF8Len );
  3376. if ( nUTF8Len > nUTF8Count )
  3377. return nUTF8LenSoFar;
  3378. nUTF8Len = nUTF8LenSoFar;
  3379. }
  3380. EncodeCharUTF8( nUChar, pszUTF8, nUTF8Len );
  3381. }
  3382. if ( pszUTF8 && nUTF8Len < nUTF8Count )
  3383. pszUTF8[nUTF8Len] = ;
  3384. return nUTF8Len;
  3385. }
  3386.  
  3387. int CMarkup::UTF8To16( unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count )
  3388. {
  3389. // Supports the same arguments as mbstowcs
  3390. // the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count
  3391. // if pwszUTF16 is NULL, the number of UTF-16 chars required is returned
  3392. // nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result
  3393. // if pwszUTF16 is not NULL it is filled with the result string and it must be large enough
  3394. // result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count
  3395. // and the number of UTF-8 bytes converted is returned
  3396. //
  3397. const char* pszPosUTF8 = pszUTF8;
  3398. const char* pszUTF8End = pszUTF8 + nUTF8Count;
  3399. int nUChar, nUTF8Len = , nUTF16Len = ;
  3400. while ( pszPosUTF8 != pszUTF8End )
  3401. {
  3402. nUChar = DecodeCharUTF8( pszPosUTF8, pszUTF8End );
  3403. if ( ! nUChar )
  3404. {
  3405. if ( pwszUTF16 )
  3406. pwszUTF16[nUTF16Len] = ;
  3407. break;
  3408. }
  3409. else if ( nUChar == - )
  3410. nUChar = '?';
  3411.  
  3412. // Encode UTF-16
  3413. EncodeCharUTF16( nUChar, pwszUTF16, nUTF16Len );
  3414. }
  3415. nUTF8Len = (int)(pszPosUTF8 - pszUTF8);
  3416. if ( ! pwszUTF16 )
  3417. return nUTF16Len;
  3418. return nUTF8Len;
  3419. }
  3420.  
  3421. #if ! defined(MARKUP_WCHAR) // not WCHAR
  3422. MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ )
  3423. {
  3424. // Converts from UTF-8 to locale ANSI charset
  3425. MCD_STR strANSI;
  3426. int nMBLen = (int)MCD_PSZLEN( pszUTF8 );
  3427. if ( pnFailed )
  3428. *pnFailed = ;
  3429. if ( nMBLen )
  3430. {
  3431. TextEncoding textencoding( MCD_T("UTF-8"), (const void*)pszUTF8, nMBLen );
  3432. textencoding.m_nToCount = nMBLen;
  3433. MCD_CHAR* pANSIBuffer = MCD_GETBUFFER(strANSI,textencoding.m_nToCount);
  3434. nMBLen = textencoding.PerformConversion( (void*)pANSIBuffer );
  3435. MCD_RELEASEBUFFER(strANSI,pANSIBuffer,nMBLen);
  3436. if ( pnFailed )
  3437. *pnFailed = textencoding.m_nFailedChars;
  3438. }
  3439. return strANSI;
  3440. }
  3441.  
  3442. MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI )
  3443. {
  3444. // Converts locale ANSI charset to UTF-8
  3445. MCD_STR strUTF8;
  3446. int nMBLen = (int)MCD_PSZLEN( pszANSI );
  3447. if ( nMBLen )
  3448. {
  3449. TextEncoding textencoding( MCD_T(""), (const void*)pszANSI, nMBLen );
  3450. textencoding.m_nToCount = nMBLen * ;
  3451. MCD_CHAR* pUTF8Buffer = MCD_GETBUFFER(strUTF8,textencoding.m_nToCount);
  3452. nMBLen = textencoding.PerformConversion( (void*)pUTF8Buffer, MCD_T("UTF-8") );
  3453. MCD_RELEASEBUFFER(strUTF8,pUTF8Buffer,nMBLen);
  3454. }
  3455. return strUTF8;
  3456. }
  3457. #endif // not WCHAR
  3458.  
  3459. MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc )
  3460. {
  3461. // Extract encoding attribute from XML Declaration, or HTML meta charset
  3462. MCD_STR strEncoding;
  3463. TokenPos token( szDoc, MDF_IGNORECASE );
  3464. NodePos node;
  3465. bool bHtml = false;
  3466. int nTypeFound = ;
  3467. while ( nTypeFound >= )
  3468. {
  3469. nTypeFound = token.ParseNode( node );
  3470. int nNext = token.m_nNext;
  3471. if ( nTypeFound == MNT_PROCESSING_INSTRUCTION && node.nStart == )
  3472. {
  3473. token.m_nNext = node.nStart + ; // after <?
  3474. if ( token.FindName() && token.Match(MCD_T("xml")) )
  3475. {
  3476. // e.g. <?xml version="1.0" encoding="UTF-8"?>
  3477. if ( token.FindAttrib(MCD_T("encoding")) )
  3478. strEncoding = token.GetTokenText();
  3479. break;
  3480. }
  3481. }
  3482. else if ( nTypeFound == ) // end tag
  3483. {
  3484. // Check for end of HTML head
  3485. token.m_nNext = node.nStart + ; // after </
  3486. if ( token.FindName() && token.Match(MCD_T("head")) )
  3487. break;
  3488. }
  3489. else if ( nTypeFound == MNT_ELEMENT )
  3490. {
  3491. token.m_nNext = node.nStart + ; // after <
  3492. token.FindName();
  3493. if ( ! bHtml )
  3494. {
  3495. if ( ! token.Match(MCD_T("html")) )
  3496. break;
  3497. bHtml = true;
  3498. }
  3499. else if ( token.Match(MCD_T("meta")) )
  3500. {
  3501. // e.g. <META http-equiv=Content-Type content="text/html; charset=UTF-8">
  3502. int nAttribOffset = node.nStart + ;
  3503. token.m_nNext = nAttribOffset;
  3504. if ( token.FindAttrib(MCD_T("http-equiv")) && token.Match(MCD_T("Content-Type")) )
  3505. {
  3506. token.m_nNext = nAttribOffset;
  3507. if ( token.FindAttrib(MCD_T("content")) )
  3508. {
  3509. int nContentEndOffset = token.m_nNext;
  3510. token.m_nNext = token.m_nL;
  3511. while ( token.m_nNext < nContentEndOffset && token.FindName() )
  3512. {
  3513. if ( token.Match(MCD_T("charset")) && token.FindName() && token.Match(MCD_T("=")) )
  3514. {
  3515. token.FindName();
  3516. strEncoding = token.GetTokenText();
  3517. break;
  3518. }
  3519. }
  3520. }
  3521. break;
  3522. }
  3523. }
  3524. }
  3525. token.m_nNext = nNext;
  3526. }
  3527. return strEncoding;
  3528. }
  3529.  
  3530. int CMarkup::GetEncodingCodePage( MCD_CSTR pszEncoding )
  3531. {
  3532. return x_GetEncodingCodePage( pszEncoding );
  3533. }
  3534.  
  3535. int CMarkup::FindNode( int nType )
  3536. {
  3537. // Change current node position only if a node is found
  3538. // If nType is 0 find any node, otherwise find node of type nType
  3539. // Return type of node or 0 if not found
  3540.  
  3541. // Determine where in document to start scanning for node
  3542. int nNodeOffset = m_nNodeOffset;
  3543. if ( m_nNodeType > MNT_ELEMENT )
  3544. {
  3545. // By-pass current node
  3546. nNodeOffset += m_nNodeLength;
  3547. }
  3548. else // element or no current main position
  3549. {
  3550. // Set position to begin looking for node
  3551. if ( m_iPos )
  3552. {
  3553. // After element
  3554. nNodeOffset = ELEM(m_iPos).StartAfter();
  3555. }
  3556. else if ( m_iPosParent )
  3557. {
  3558. // Immediately after start tag of parent
  3559. if ( ELEM(m_iPosParent).IsEmptyElement() )
  3560. return ;
  3561. else
  3562. nNodeOffset = ELEM(m_iPosParent).StartContent();
  3563. }
  3564. }
  3565.  
  3566. // Get nodes until we find what we're looking for
  3567. int nTypeFound = ;
  3568. int iPosNew = m_iPos;
  3569. TokenPos token( m_strDoc, m_nDocFlags );
  3570. NodePos node;
  3571. token.m_nNext = nNodeOffset;
  3572. do
  3573. {
  3574. nNodeOffset = token.m_nNext;
  3575. nTypeFound = token.ParseNode( node );
  3576. if ( nTypeFound == )
  3577. {
  3578. // Check if we have reached the end of the parent element
  3579. if ( m_iPosParent && nNodeOffset == ELEM(m_iPosParent).StartContent()
  3580. + ELEM(m_iPosParent).ContentLen() )
  3581. return ;
  3582. nTypeFound = MNT_LONE_END_TAG; // otherwise it is a lone end tag
  3583. }
  3584. else if ( nTypeFound < )
  3585. {
  3586. if ( nTypeFound == - ) // end of document
  3587. return ;
  3588. // -1 is node error
  3589. nTypeFound = MNT_NODE_ERROR;
  3590. }
  3591. else if ( nTypeFound == MNT_ELEMENT )
  3592. {
  3593. if ( iPosNew )
  3594. iPosNew = ELEM(iPosNew).iElemNext;
  3595. else
  3596. iPosNew = ELEM(m_iPosParent).iElemChild;
  3597. if ( ! iPosNew )
  3598. return ;
  3599. if ( ! nType || (nType & nTypeFound) )
  3600. {
  3601. // Found element node, move position to this element
  3602. x_SetPos( m_iPosParent, iPosNew, );
  3603. return m_nNodeType;
  3604. }
  3605. token.m_nNext = ELEM(iPosNew).StartAfter();
  3606. }
  3607. }
  3608. while ( nType && ! (nType & nTypeFound) );
  3609.  
  3610. m_iPos = iPosNew;
  3611. m_iPosChild = ;
  3612. m_nNodeOffset = node.nStart;
  3613. m_nNodeLength = node.nLength;
  3614. m_nNodeType = nTypeFound;
  3615. MARKUP_SETDEBUGSTATE;
  3616. return m_nNodeType;
  3617. }
  3618.  
  3619. bool CMarkup::RemoveNode()
  3620. {
  3621. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3622. return false;
  3623. if ( m_iPos || m_nNodeLength )
  3624. {
  3625. x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength );
  3626. m_iPosChild = ;
  3627. MARKUP_SETDEBUGSTATE;
  3628. return true;
  3629. }
  3630. return false;
  3631. }
  3632.  
  3633. MCD_STR CMarkup::GetTagName() const
  3634. {
  3635. // Return the tag name at the current main position
  3636. MCD_STR strTagName;
  3637.  
  3638. // This method is primarily for elements, however
  3639. // it does return something for certain other nodes
  3640. if ( m_nNodeLength )
  3641. {
  3642. switch ( m_nNodeType )
  3643. {
  3644. case MNT_PROCESSING_INSTRUCTION:
  3645. case MNT_LONE_END_TAG:
  3646. {
  3647. // <?target or </tagname
  3648. TokenPos token( m_strDoc, m_nDocFlags );
  3649. token.m_nNext = m_nNodeOffset + ;
  3650. if ( token.FindName() )
  3651. strTagName = token.GetTokenText();
  3652. }
  3653. break;
  3654. case MNT_COMMENT:
  3655. strTagName = MCD_T("#comment");
  3656. break;
  3657. case MNT_CDATA_SECTION:
  3658. strTagName = MCD_T("#cdata-section");
  3659. break;
  3660. case MNT_DOCUMENT_TYPE:
  3661. {
  3662. // <!DOCTYPE name
  3663. TokenPos token( m_strDoc, m_nDocFlags );
  3664. token.m_nNext = m_nNodeOffset + ;
  3665. if ( token.FindName() && token.FindName() )
  3666. strTagName = token.GetTokenText();
  3667. }
  3668. break;
  3669. case MNT_TEXT:
  3670. case MNT_WHITESPACE:
  3671. strTagName = MCD_T("#text");
  3672. break;
  3673. }
  3674. return strTagName;
  3675. }
  3676.  
  3677. if ( m_iPos )
  3678. strTagName = x_GetTagName( m_iPos );
  3679. return strTagName;
  3680. }
  3681.  
  3682. bool CMarkup::IntoElem()
  3683. {
  3684. // Make current element the parent
  3685. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3686. {
  3687. x_SetPos( m_iPos, m_iPosChild, );
  3688. return true;
  3689. }
  3690. return false;
  3691. }
  3692.  
  3693. bool CMarkup::OutOfElem()
  3694. {
  3695. // Go to parent element
  3696. if ( m_iPosParent )
  3697. {
  3698. x_SetPos( ELEM(m_iPosParent).iElemParent, m_iPosParent, m_iPos );
  3699. return true;
  3700. }
  3701. return false;
  3702. }
  3703.  
  3704. bool CMarkup::GetNthAttrib( int n, MCD_STR& strAttrib, MCD_STR& strValue ) const
  3705. {
  3706. // Return nth attribute name and value from main position
  3707. TokenPos token( m_strDoc, m_nDocFlags );
  3708. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3709. token.m_nNext = ELEM(m_iPos).nStart + ;
  3710. else if ( m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  3711. token.m_nNext = m_nNodeOffset + ;
  3712. else
  3713. return false;
  3714. if ( token.FindAttrib(NULL,n,&strAttrib) )
  3715. {
  3716. strValue = UnescapeText( token.GetTokenPtr(), token.Length(), m_nDocFlags );
  3717. return true;
  3718. }
  3719. return false;
  3720. }
  3721.  
  3722. MCD_STR CMarkup::GetAttribName( int n ) const
  3723. {
  3724. // Return nth attribute name of main position
  3725. TokenPos token( m_strDoc, m_nDocFlags );
  3726. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3727. token.m_nNext = ELEM(m_iPos).nStart + ;
  3728. else if ( m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  3729. token.m_nNext = m_nNodeOffset + ;
  3730. else
  3731. return MCD_T("");
  3732. if ( token.FindAttrib(NULL,n) )
  3733. return token.GetTokenText();
  3734. return MCD_T("");
  3735. }
  3736.  
  3737. bool CMarkup::SavePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ )
  3738. {
  3739. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3740. return false;
  3741. // Save current element position in saved position map
  3742. if ( szPosName )
  3743. {
  3744. SavedPosMap* pMap;
  3745. m_pSavedPosMaps->GetMap( pMap, nMap );
  3746. SavedPos savedpos;
  3747. if ( szPosName )
  3748. savedpos.strName = szPosName;
  3749. if ( m_iPosChild )
  3750. {
  3751. savedpos.iPos = m_iPosChild;
  3752. savedpos.nSavedPosFlags |= SavedPos::SPM_CHILD;
  3753. }
  3754. else if ( m_iPos )
  3755. {
  3756. savedpos.iPos = m_iPos;
  3757. savedpos.nSavedPosFlags |= SavedPos::SPM_MAIN;
  3758. }
  3759. else
  3760. {
  3761. savedpos.iPos = m_iPosParent;
  3762. }
  3763. savedpos.nSavedPosFlags |= SavedPos::SPM_USED;
  3764.  
  3765. int nSlot = x_Hash( szPosName, pMap->nMapSize);
  3766. SavedPos* pSavedPos = pMap->pTable[nSlot];
  3767. int nOffset = ;
  3768. if ( ! pSavedPos )
  3769. {
  3770. pSavedPos = new SavedPos[];
  3771. pSavedPos[].nSavedPosFlags = SavedPos::SPM_LAST;
  3772. pMap->pTable[nSlot] = pSavedPos;
  3773. }
  3774. else
  3775. {
  3776. while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  3777. {
  3778. if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName )
  3779. break;
  3780. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3781. {
  3782. int nNewSize = (nOffset + ) * ;
  3783. SavedPos* pNewSavedPos = new SavedPos[nNewSize];
  3784. for ( int nCopy=; nCopy<=nOffset; ++nCopy )
  3785. pNewSavedPos[nCopy] = pSavedPos[nCopy];
  3786. pNewSavedPos[nOffset].nSavedPosFlags ^= SavedPos::SPM_LAST;
  3787. pNewSavedPos[nNewSize-].nSavedPosFlags = SavedPos::SPM_LAST;
  3788. delete [] pSavedPos;
  3789. pSavedPos = pNewSavedPos;
  3790. pMap->pTable[nSlot] = pSavedPos;
  3791. ++nOffset;
  3792. break;
  3793. }
  3794. ++nOffset;
  3795. }
  3796. }
  3797. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3798. savedpos.nSavedPosFlags |= SavedPos::SPM_LAST;
  3799. pSavedPos[nOffset] = savedpos;
  3800.  
  3801. /*
  3802. // To review hash table balance, uncomment and watch strBalance
  3803. MCD_STR strBalance, strSlot;
  3804. for ( nSlot=0; nSlot < pMap->nMapSize; ++nSlot )
  3805. {
  3806. pSavedPos = pMap->pTable[nSlot];
  3807. int nCount = 0;
  3808. while ( pSavedPos && pSavedPos->nSavedPosFlags & SavedPos::SPM_USED )
  3809. {
  3810. ++nCount;
  3811. if ( pSavedPos->nSavedPosFlags & SavedPos::SPM_LAST )
  3812. break;
  3813. ++pSavedPos;
  3814. }
  3815. strSlot.Format( MCD_T("%d "), nCount );
  3816. strBalance += strSlot;
  3817. }
  3818. */
  3819. return true;
  3820. }
  3821. return false;
  3822. }
  3823.  
  3824. bool CMarkup::RestorePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ )
  3825. {
  3826. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3827. return false;
  3828. // Restore element position if found in saved position map
  3829. if ( szPosName )
  3830. {
  3831. SavedPosMap* pMap;
  3832. m_pSavedPosMaps->GetMap( pMap, nMap );
  3833. int nSlot = x_Hash( szPosName, pMap->nMapSize );
  3834. SavedPos* pSavedPos = pMap->pTable[nSlot];
  3835. if ( pSavedPos )
  3836. {
  3837. int nOffset = ;
  3838. while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  3839. {
  3840. if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName )
  3841. {
  3842. int i = pSavedPos[nOffset].iPos;
  3843. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_CHILD )
  3844. x_SetPos( ELEM(ELEM(i).iElemParent).iElemParent, ELEM(i).iElemParent, i );
  3845. else if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_MAIN )
  3846. x_SetPos( ELEM(i).iElemParent, i, );
  3847. else
  3848. x_SetPos( i, , );
  3849. return true;
  3850. }
  3851. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3852. break;
  3853. ++nOffset;
  3854. }
  3855. }
  3856. }
  3857. return false;
  3858. }
  3859.  
  3860. bool CMarkup::SetMapSize( int nSize, int nMap /*=0*/ )
  3861. {
  3862. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3863. return false;
  3864. // Set saved position map hash table size before using it
  3865. // Returns false if map already exists
  3866. // Some prime numbers: 53, 101, 211, 503, 1009, 2003, 10007, 20011, 50021, 100003, 200003, 500009
  3867. SavedPosMap* pNewMap;
  3868. return m_pSavedPosMaps->GetMap( pNewMap, nMap, nSize );
  3869. }
  3870.  
  3871. bool CMarkup::RemoveElem()
  3872. {
  3873. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3874. return false;
  3875. // Remove current main position element
  3876. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3877. {
  3878. int iPos = x_RemoveElem( m_iPos );
  3879. x_SetPos( m_iPosParent, iPos, );
  3880. return true;
  3881. }
  3882. return false;
  3883. }
  3884.  
  3885. bool CMarkup::RemoveChildElem()
  3886. {
  3887. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3888. return false;
  3889. // Remove current child position element
  3890. if ( m_iPosChild )
  3891. {
  3892. int iPosChild = x_RemoveElem( m_iPosChild );
  3893. x_SetPos( m_iPosParent, m_iPos, iPosChild );
  3894. return true;
  3895. }
  3896. return false;
  3897. }
  3898.  
  3899. //////////////////////////////////////////////////////////////////////
  3900. // CMarkup private methods
  3901. //
  3902. void CMarkup::x_InitMarkup()
  3903. {
  3904. // Only called from CMarkup constructors
  3905. m_pFilePos = NULL;
  3906. m_pSavedPosMaps = new SavedPosMapArray;
  3907. m_pElemPosTree = new ElemPosTree;
  3908.  
  3909. // To always ignore case, define MARKUP_IGNORECASE
  3910. #if defined(MARKUP_IGNORECASE) // ignore case
  3911. m_nDocFlags = MDF_IGNORECASE;
  3912. #else // not ignore case
  3913. m_nDocFlags = ;
  3914. #endif // not ignore case
  3915. }
  3916.  
  3917. int CMarkup::x_GetParent( int i )
  3918. {
  3919. return ELEM(i).iElemParent;
  3920. }
  3921.  
  3922. void CMarkup::x_SetPos( int iPosParent, int iPos, int iPosChild )
  3923. {
  3924. m_iPosParent = iPosParent;
  3925. m_iPos = iPos;
  3926. m_iPosChild = iPosChild;
  3927. m_nNodeOffset = ;
  3928. m_nNodeLength = ;
  3929. m_nNodeType = iPos?MNT_ELEMENT:;
  3930. MARKUP_SETDEBUGSTATE;
  3931. }
  3932.  
  #if defined(_DEBUG) // DEBUG
  // Debug aid: set m_pDebugCur to a description of the current position and
  // m_pDebugPos to the matching location inside the document text.
  void CMarkup::x_SetDebugState()
  {
      MCD_PCSZ pD = MCD_2PCSZ(m_strDoc);

      // Node (non-element) position is determined differently in file mode
      if ( m_nNodeLength || (m_nNodeOffset && !m_pFilePos)
          || (m_pFilePos && (!m_iPos) && (!m_iPosParent) && ! m_pFilePos->FileAtTop()) )
      {
          if ( ! m_nNodeLength )
              m_pDebugCur = MCD_T("main position offset"); // file mode only
          else
              m_pDebugCur = MCD_T("main position node");
          m_pDebugPos = &pD[m_nNodeOffset];
      }
      else if ( m_iPosChild )
      {
          m_pDebugCur = MCD_T("child position element");
          m_pDebugPos = &pD[ELEM(m_iPosChild).nStart];
      }
      else if ( m_iPos )
      {
          m_pDebugCur = MCD_T("main position element");
          m_pDebugPos = &pD[ELEM(m_iPos).nStart];
      }
      else if ( m_iPosParent )
      {
          m_pDebugCur = MCD_T("parent position element");
          m_pDebugPos = &pD[ELEM(m_iPosParent).nStart];
      }
      else
      {
          m_pDebugCur = MCD_T("top of document");
          m_pDebugPos = pD;
      }
  }
  #endif // DEBUG
  3974.  
  3975. int CMarkup::x_GetFreePos()
  3976. {
  3977. if ( m_iPosFree == m_pElemPosTree->GetSize() )
  3978. x_AllocElemPos();
  3979. return m_iPosFree++;
  3980. }
  3981.  
  3982. bool CMarkup::x_AllocElemPos( int nNewSize /*=0*/ )
  3983. {
  3984. // Resize m_aPos when the document is created or the array is filled
  3985. if ( ! nNewSize )
  3986. nNewSize = m_iPosFree + (m_iPosFree>>); // Grow By: multiply size by 1.5
  3987. if ( m_pElemPosTree->GetSize() < nNewSize )
  3988. m_pElemPosTree->GrowElemPosTree( nNewSize );
  3989. return true;
  3990. }
  3991.  
  3992. bool CMarkup::x_ParseDoc()
  3993. {
  3994. // Reset indexes
  3995. ResetPos();
  3996. m_pSavedPosMaps->ReleaseMaps();
  3997.  
  3998. // Starting size of position array: 1 element per 64 bytes of document
  3999. // Tight fit when parsing small doc, only 0 to 2 reallocs when parsing large doc
  4000. // Start at 8 when creating new document
  4001. int nDocLen = MCD_STRLENGTH(m_strDoc);
  4002. m_iPosFree = ;
  4003. x_AllocElemPos( nDocLen / + );
  4004. m_iPosDeleted = ;
  4005.  
  4006. // Parse document
  4007. ELEM().ClearVirtualParent();
  4008. if ( nDocLen )
  4009. {
  4010. TokenPos token( m_strDoc, m_nDocFlags );
  4011. int iPos = x_ParseElem( , token );
  4012. ELEM().nLength = nDocLen;
  4013. if ( iPos > )
  4014. {
  4015. ELEM().iElemChild = iPos;
  4016. if ( ELEM(iPos).iElemNext )
  4017. x_AddResult( m_strResult, MCD_T("root_has_sibling") );
  4018. }
  4019. else
  4020. x_AddResult( m_strResult, MCD_T("no_root_element") );
  4021. }
  4022.  
  4023. ResetPos();
  4024. return IsWellFormed();
  4025. }
  4026.  
  4027. int CMarkup::x_ParseElem( int iPosParent, TokenPos& token )
  4028. {
  4029. // This is either called by x_ParseDoc or x_AddSubDoc or x_SetElemContent
  4030. // Returns index of the first element encountered or zero if no elements
  4031. //
  4032. int iPosRoot = ;
  4033. int iPos = iPosParent;
  4034. int iVirtualParent = iPosParent;
  4035. int nRootDepth = ELEM(iPos).Level();
  4036. int nMatchLevel;
  4037. int iPosMatch;
  4038. int iTag;
  4039. int nTypeFound;
  4040. int iPosFirst;
  4041. int iPosLast;
  4042. ElemPos* pElem;
  4043. ElemPos* pElemParent;
  4044. ElemPos* pElemChild;
  4045.  
  4046. // Loop through the nodes of the document
  4047. ElemStack elemstack;
  4048. NodePos node;
  4049. token.m_nNext = ;
  4050. while ( )
  4051. {
  4052. nTypeFound = token.ParseNode( node );
  4053. nMatchLevel = ;
  4054. if ( nTypeFound == MNT_ELEMENT ) // start tag
  4055. {
  4056. iPos = x_GetFreePos();
  4057. if ( ! iPosRoot )
  4058. iPosRoot = iPos;
  4059. pElem = &ELEM(iPos);
  4060. pElem->iElemParent = iPosParent;
  4061. pElem->iElemNext = ;
  4062. pElemParent = &ELEM(iPosParent);
  4063. if ( pElemParent->iElemChild )
  4064. {
  4065. iPosFirst = pElemParent->iElemChild;
  4066. pElemChild = &ELEM(iPosFirst);
  4067. iPosLast = pElemChild->iElemPrev;
  4068. ELEM(iPosLast).iElemNext = iPos;
  4069. pElem->iElemPrev = iPosLast;
  4070. pElemChild->iElemPrev = iPos;
  4071. pElem->nFlags = ;
  4072. }
  4073. else
  4074. {
  4075. pElemParent->iElemChild = iPos;
  4076. pElem->iElemPrev = iPos;
  4077. pElem->nFlags = MNF_FIRST;
  4078. }
  4079. pElem->SetLevel( nRootDepth + elemstack.iTop );
  4080. pElem->iElemChild = ;
  4081. pElem->nStart = node.nStart;
  4082. pElem->SetStartTagLen( node.nLength );
  4083. if ( node.nNodeFlags & MNF_EMPTY )
  4084. {
  4085. iPos = iPosParent;
  4086. pElem->SetEndTagLen( );
  4087. pElem->nLength = node.nLength;
  4088. }
  4089. else
  4090. {
  4091. iPosParent = iPos;
  4092. elemstack.PushIntoLevel( token.GetTokenPtr(), token.Length() );
  4093. }
  4094. }
  4095. else if ( nTypeFound == ) // end tag
  4096. {
  4097. iPosMatch = iPos;
  4098. iTag = elemstack.iTop;
  4099. nMatchLevel = iTag;
  4100. while ( nMatchLevel && ! token.Match(elemstack.GetRefTagPosAt(iTag--).strTagName) )
  4101. {
  4102. --nMatchLevel;
  4103. iPosMatch = ELEM(iPosMatch).iElemParent;
  4104. }
  4105. if ( nMatchLevel == )
  4106. {
  4107. // Not matched at all, it is a lone end tag, a non-element node
  4108. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  4109. ELEM(iPos).nFlags |= MNF_ILLDATA;
  4110. x_AddResult( m_strResult, MCD_T("lone_end_tag"), token.GetTokenText(), , node.nStart );
  4111. }
  4112. else
  4113. {
  4114. pElem = &ELEM(iPosMatch);
  4115. pElem->nLength = node.nStart - pElem->nStart + node.nLength;
  4116. pElem->SetEndTagLen( node.nLength );
  4117. }
  4118. }
  4119. else if ( nTypeFound == - )
  4120. {
  4121. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  4122. ELEM(iPos).nFlags |= MNF_ILLDATA;
  4123. m_strResult += node.strMeta;
  4124. }
  4125.  
  4126. // Matched end tag, or end of document
  4127. if ( nMatchLevel || nTypeFound == - )
  4128. {
  4129. if ( elemstack.iTop > nMatchLevel )
  4130. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  4131.  
  4132. // Process any non-ended elements
  4133. while ( elemstack.iTop > nMatchLevel )
  4134. {
  4135. // Element with no end tag
  4136. pElem = &ELEM(iPos);
  4137. int iPosChild = pElem->iElemChild;
  4138. iPosParent = pElem->iElemParent;
  4139. pElem->SetEndTagLen( );
  4140. pElem->nFlags |= MNF_NONENDED;
  4141. pElem->iElemChild = ;
  4142. pElem->nLength = pElem->StartTagLen();
  4143. if ( pElem->nFlags & MNF_ILLDATA )
  4144. {
  4145. pElem->nFlags ^= MNF_ILLDATA;
  4146. ELEM(iPosParent).nFlags |= MNF_ILLDATA;
  4147. }
  4148. while ( iPosChild )
  4149. {
  4150. ELEM(iPosChild).iElemParent = iPosParent;
  4151. ELEM(iPosChild).iElemPrev = iPos;
  4152. ELEM(iPos).iElemNext = iPosChild;
  4153. iPos = iPosChild;
  4154. iPosChild = ELEM(iPosChild).iElemNext;
  4155. }
  4156.  
  4157. // If end tag did not match, top node is end tag that did not match pElem
  4158. // if end of document, any nodes below top have no end tag
  4159. // second offset represents location where end tag was expected but end of document or other end tag was found
  4160. // end tag that was found is token.GetTokenText() but not reported in error
  4161. int nOffset2 = (nTypeFound==)? token.m_nL-: MCD_STRLENGTH(m_strDoc);
  4162. x_AddResult( m_strResult, MCD_T("unended_start_tag"), elemstack.Current().strTagName, , pElem->nStart, nOffset2 );
  4163.  
  4164. iPos = iPosParent;
  4165. elemstack.PopOutOfLevel();
  4166. }
  4167. if ( nTypeFound == - )
  4168. break;
  4169. iPosParent = ELEM(iPos).iElemParent;
  4170. iPos = iPosParent;
  4171. elemstack.PopOutOfLevel();
  4172. }
  4173. }
  4174. return iPosRoot;
  4175. }
  4176.  
  4177. int CMarkup::x_FindElem( int iPosParent, int iPos, PathPos& path ) const
  4178. {
  4179. // If pPath is NULL or empty, go to next sibling element
  4180. // Otherwise go to next sibling element with matching path
  4181. //
  4182. if ( ! path.ValidPath() )
  4183. return ;
  4184.  
  4185. // Paths other than simple tag name are only supported in the developer version
  4186. if ( path.IsAnywherePath() || path.IsAbsolutePath() )
  4187. return ;
  4188.  
  4189. if ( iPos )
  4190. iPos = ELEM(iPos).iElemNext;
  4191. else
  4192. iPos = ELEM(iPosParent).iElemChild;
  4193.  
  4194. // Finished here if pPath not specified
  4195. if ( ! path.IsPath() )
  4196. return iPos;
  4197.  
  4198. // Search
  4199. TokenPos token( m_strDoc, m_nDocFlags );
  4200. while ( iPos )
  4201. {
  4202. // Compare tag name
  4203. token.m_nNext = ELEM(iPos).nStart + ;
  4204. token.FindName(); // Locate tag name
  4205. if ( token.Match(path.GetPtr()) )
  4206. return iPos;
  4207. iPos = ELEM(iPos).iElemNext;
  4208. }
  4209. return ;
  4210.  
  4211. }
  4212.  
  4213. MCD_STR CMarkup::x_GetPath( int iPos ) const
  4214. {
  4215. // In file mode, iPos is an index into m_pFilePos->m_elemstack or zero
  4216. MCD_STR strPath;
  4217. while ( iPos )
  4218. {
  4219. MCD_STR strTagName;
  4220. int iPosParent;
  4221. int nCount = ;
  4222. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  4223. {
  4224. TagPos& tag = m_pFilePos->m_elemstack.GetRefTagPosAt(iPos);
  4225. strTagName = tag.strTagName;
  4226. nCount = tag.nCount;
  4227. iPosParent = tag.iParent;
  4228. }
  4229. else
  4230. {
  4231. strTagName = x_GetTagName( iPos );
  4232. PathPos path( MCD_2PCSZ(strTagName), false );
  4233. iPosParent = ELEM(iPos).iElemParent;
  4234. int iPosSib = ;
  4235. while ( iPosSib != iPos )
  4236. {
  4237. path.RevertOffset();
  4238. iPosSib = x_FindElem( iPosParent, iPosSib, path );
  4239. ++nCount;
  4240. }
  4241. }
  4242. if ( nCount == )
  4243. strPath = MCD_T("/") + strTagName + strPath;
  4244. else
  4245. {
  4246. MCD_CHAR szPred[];
  4247. MCD_SPRINTF( MCD_SSZ(szPred), MCD_T("[%d]"), nCount );
  4248. strPath = MCD_T("/") + strTagName + szPred + strPath;
  4249. }
  4250. iPos = iPosParent;
  4251. }
  4252. return strPath;
  4253. }
  4254.  
  4255. MCD_STR CMarkup::x_GetTagName( int iPos ) const
  4256. {
  4257. // Return the tag name at specified element
  4258. TokenPos token( m_strDoc, m_nDocFlags );
  4259. token.m_nNext = ELEM(iPos).nStart + ;
  4260. if ( ! iPos || ! token.FindName() )
  4261. return MCD_T("");
  4262.  
  4263. // Return substring of document
  4264. return token.GetTokenText();
  4265. }
  4266.  
  4267. MCD_STR CMarkup::x_GetAttrib( int iPos, MCD_PCSZ pAttrib ) const
  4268. {
  4269. // Return the value of the attrib
  4270. TokenPos token( m_strDoc, m_nDocFlags );
  4271. if ( iPos && m_nNodeType == MNT_ELEMENT )
  4272. token.m_nNext = ELEM(iPos).nStart + ;
  4273. else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4274. token.m_nNext = m_nNodeOffset + ;
  4275. else
  4276. return MCD_T("");
  4277.  
  4278. if ( pAttrib && token.FindAttrib(pAttrib) )
  4279. return UnescapeText( token.GetTokenPtr(), token.Length(), m_nDocFlags );
  4280. return MCD_T("");
  4281. }
  4282.  
  4283. bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, int nValue, int nFlags /*=0*/ )
  4284. {
  4285. // Convert integer to string
  4286. MCD_CHAR szVal[];
  4287. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4288. return x_SetAttrib( iPos, pAttrib, szVal, nFlags );
  4289. }
  4290.  
  4291. bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, MCD_PCSZ pValue, int nFlags /*=0*/ )
  4292. {
  4293. if ( m_nDocFlags & MDF_READFILE )
  4294. return false;
  4295. int nNodeStart = ;
  4296. if ( iPos && m_nNodeType == MNT_ELEMENT )
  4297. nNodeStart = ELEM(iPos).nStart;
  4298. else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4299. nNodeStart = m_nNodeOffset;
  4300. else
  4301. return false;
  4302.  
  4303. // Create insertion text depending on whether attribute already exists
  4304. // Decision: for empty value leaving attrib="" instead of removing attrib
  4305. TokenPos token( m_strDoc, m_nDocFlags );
  4306. token.m_nNext = nNodeStart + ((m_nNodeType == MNT_ELEMENT)?:);
  4307. int nReplace = ;
  4308. int nInsertAt;
  4309. MCD_STR strEscapedValue = EscapeText( pValue, MNF_ESCAPEQUOTES|nFlags );
  4310. int nEscapedValueLen = MCD_STRLENGTH( strEscapedValue );
  4311. MCD_STR strInsert;
  4312. if ( token.FindAttrib(pAttrib) )
  4313. {
  4314. // Replace value
  4315. MCD_BLDRESERVE( strInsert, nEscapedValueLen + );
  4316. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4317. MCD_BLDAPPENDN( strInsert, MCD_2PCSZ(strEscapedValue), nEscapedValueLen );
  4318. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4319. MCD_BLDRELEASE( strInsert );
  4320. nInsertAt = token.m_nL - ((token.m_nTokenFlags&MNF_QUOTED)?:);
  4321. nReplace = token.Length() + ((token.m_nTokenFlags&MNF_QUOTED)?:);
  4322. }
  4323. else
  4324. {
  4325. // Insert string name value pair
  4326. int nAttribNameLen = MCD_PSZLEN( pAttrib );
  4327. MCD_BLDRESERVE( strInsert, nAttribNameLen + nEscapedValueLen + );
  4328. MCD_BLDAPPEND1( strInsert, ' ' );
  4329. MCD_BLDAPPENDN( strInsert, pAttrib, nAttribNameLen );
  4330. MCD_BLDAPPEND1( strInsert, '=' );
  4331. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4332. MCD_BLDAPPENDN( strInsert, MCD_2PCSZ(strEscapedValue), nEscapedValueLen );
  4333. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4334. MCD_BLDRELEASE( strInsert );
  4335. nInsertAt = token.m_nNext;
  4336. }
  4337.  
  4338. int nAdjust = MCD_STRLENGTH(strInsert) - nReplace;
  4339. if ( m_nDocFlags & MDF_WRITEFILE )
  4340. {
  4341. int nNewDocLength = MCD_STRLENGTH(m_strDoc) + nAdjust;
  4342. MCD_STRCLEAR( m_strResult );
  4343. if ( nNodeStart && nNewDocLength > m_pFilePos->m_nBlockSizeBasis )
  4344. {
  4345. int nDocCapacity = MCD_STRCAPACITY(m_strDoc);
  4346. if ( nNewDocLength > nDocCapacity )
  4347. {
  4348. m_pFilePos->FileFlush( *m_pFilePos->m_pstrBuffer, nNodeStart );
  4349. m_strResult = m_pFilePos->m_strIOResult;
  4350. nInsertAt -= nNodeStart;
  4351. m_nNodeOffset = ;
  4352. if ( m_nNodeType == MNT_ELEMENT )
  4353. ELEM(iPos).nStart = ;
  4354. }
  4355. }
  4356. }
  4357. x_DocChange( nInsertAt, nReplace, strInsert );
  4358. if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4359. {
  4360. x_AdjustForNode( m_iPosParent, m_iPos, nAdjust );
  4361. m_nNodeLength += nAdjust;
  4362. }
  4363. else
  4364. {
  4365. ELEM(iPos).AdjustStartTagLen( nAdjust );
  4366. ELEM(iPos).nLength += nAdjust;
  4367. x_Adjust( iPos, nAdjust );
  4368. }
  4369. MARKUP_SETDEBUGSTATE;
  4370. return true;
  4371. }
  4372.  
  4373. bool CMarkup::x_CreateNode( MCD_STR& strNode, int nNodeType, MCD_PCSZ pText )
  4374. {
  4375. // Set strNode based on nNodeType and szData
  4376. // Return false if szData would jeopardize well-formed document
  4377. //
  4378. switch ( nNodeType )
  4379. {
  4380. case MNT_PROCESSING_INSTRUCTION:
  4381. strNode = MCD_T("<?");
  4382. strNode += pText;
  4383. strNode += MCD_T("?>");
  4384. break;
  4385. case MNT_COMMENT:
  4386. strNode = MCD_T("<!--");
  4387. strNode += pText;
  4388. strNode += MCD_T("-->");
  4389. break;
  4390. case MNT_ELEMENT:
  4391. strNode = MCD_T("<");
  4392. strNode += pText;
  4393. strNode += MCD_T("/>");
  4394. break;
  4395. case MNT_TEXT:
  4396. case MNT_WHITESPACE:
  4397. strNode = EscapeText( pText );
  4398. break;
  4399. case MNT_DOCUMENT_TYPE:
  4400. strNode = pText;
  4401. break;
  4402. case MNT_LONE_END_TAG:
  4403. strNode = MCD_T("</");
  4404. strNode += pText;
  4405. strNode += MCD_T(">");
  4406. break;
  4407. case MNT_CDATA_SECTION:
  4408. if ( MCD_PSZSTR(pText,MCD_T("]]>")) != NULL )
  4409. return false;
  4410. strNode = MCD_T("<![CDATA[");
  4411. strNode += pText;
  4412. strNode += MCD_T("]]>");
  4413. break;
  4414. }
  4415. return true;
  4416. }
  4417.  
  4418. MCD_STR CMarkup::x_EncodeCDATASection( MCD_PCSZ szData )
  4419. {
  4420. // Split CDATA Sections if there are any end delimiters
  4421. MCD_STR strData = MCD_T("<![CDATA[");
  4422. MCD_PCSZ pszNextStart = szData;
  4423. MCD_PCSZ pszEnd = MCD_PSZSTR( szData, MCD_T("]]>") );
  4424. while ( pszEnd )
  4425. {
  4426. strData += MCD_STR( pszNextStart, (int)(pszEnd - pszNextStart) );
  4427. strData += MCD_T("]]]]><![CDATA[>");
  4428. pszNextStart = pszEnd + ;
  4429. pszEnd = MCD_PSZSTR( pszNextStart, MCD_T("]]>") );
  4430. }
  4431. strData += pszNextStart;
  4432. strData += MCD_T("]]>");
  4433. return strData;
  4434. }
  4435.  
  4436. bool CMarkup::x_SetData( int iPos, int nValue )
  4437. {
  4438. // Convert integer to string
  4439. MCD_CHAR szVal[];
  4440. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4441. return x_SetData( iPos, szVal, );
  4442. }
  4443.  
  4444. bool CMarkup::x_SetData( int iPos, MCD_PCSZ szData, int nFlags )
  4445. {
  4446. if ( m_nDocFlags & MDF_READFILE )
  4447. return false;
  4448. MCD_STR strInsert;
  4449. if ( m_nDocFlags & MDF_WRITEFILE )
  4450. {
  4451. if ( ! iPos || m_nNodeType != || ! ELEM(iPos).IsEmptyElement() )
  4452. return false; // only set data on current empty element (no other kinds of nodes)
  4453. }
  4454. if ( iPos == m_iPos && m_nNodeLength )
  4455. {
  4456. // Not an element
  4457. if ( ! x_CreateNode(strInsert, m_nNodeType, szData) )
  4458. return false;
  4459. x_DocChange( m_nNodeOffset, m_nNodeLength, strInsert );
  4460. x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(strInsert) - m_nNodeLength );
  4461. m_nNodeLength = MCD_STRLENGTH(strInsert);
  4462. MARKUP_SETDEBUGSTATE;
  4463. return true;
  4464. }
  4465.  
  4466. // Set data in iPos element
  4467. if ( ! iPos || ELEM(iPos).iElemChild )
  4468. return false;
  4469.  
  4470. // Build strInsert from szData based on nFlags
  4471. if ( nFlags & MNF_WITHCDATA )
  4472. strInsert = x_EncodeCDATASection( szData );
  4473. else
  4474. strInsert = EscapeText( szData, nFlags );
  4475.  
  4476. // Insert
  4477. NodePos node( MNF_WITHNOLINES|MNF_REPLACE );
  4478. node.strMeta = strInsert;
  4479. int iPosBefore = ;
  4480. int nReplace = x_InsertNew( iPos, iPosBefore, node );
  4481. int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace;
  4482. x_Adjust( iPos, nAdjust );
  4483. ELEM(iPos).nLength += nAdjust;
  4484. if ( ELEM(iPos).nFlags & MNF_ILLDATA )
  4485. ELEM(iPos).nFlags &= ~MNF_ILLDATA;
  4486. MARKUP_SETDEBUGSTATE;
  4487. return true;
  4488. }
  4489.  
  4490. MCD_STR CMarkup::x_GetData( int iPos )
  4491. {
  4492. if ( iPos == m_iPos && m_nNodeLength )
  4493. {
  4494. if ( m_nNodeType == MNT_COMMENT )
  4495. return MCD_STRMID( m_strDoc, m_nNodeOffset+, m_nNodeLength- );
  4496. else if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4497. return MCD_STRMID( m_strDoc, m_nNodeOffset+, m_nNodeLength- );
  4498. else if ( m_nNodeType == MNT_CDATA_SECTION )
  4499. return MCD_STRMID( m_strDoc, m_nNodeOffset+, m_nNodeLength- );
  4500. else if ( m_nNodeType == MNT_TEXT )
  4501. return UnescapeText( &(MCD_2PCSZ(m_strDoc))[m_nNodeOffset], m_nNodeLength, m_nDocFlags );
  4502. else if ( m_nNodeType == MNT_LONE_END_TAG )
  4503. return MCD_STRMID( m_strDoc, m_nNodeOffset+, m_nNodeLength- );
  4504. return MCD_STRMID( m_strDoc, m_nNodeOffset, m_nNodeLength );
  4505. }
  4506.  
  4507. // Return a string representing data between start and end tag
  4508. // Return empty string if there are any children elements
  4509. MCD_STR strData;
  4510. if ( iPos && ! ELEM(iPos).IsEmptyElement() )
  4511. {
  4512. ElemPos* pElem = &ELEM(iPos);
  4513. int nStartContent = pElem->StartContent();
  4514. if ( pElem->IsUnparsed() )
  4515. {
  4516. TokenPos token( m_strDoc, m_nDocFlags, m_pFilePos );
  4517. token.m_nNext = nStartContent;
  4518. NodePos node;
  4519. m_pFilePos->m_nReadBufferStart = pElem->nStart;
  4520. while ( )
  4521. {
  4522. m_pFilePos->m_nReadBufferRemoved = ; // will be non-zero after ParseNode if read buffer shifted
  4523. token.ParseNode( node );
  4524. if ( m_pFilePos->m_nReadBufferRemoved )
  4525. {
  4526. pElem->nStart = ;
  4527. MARKUP_SETDEBUGSTATE;
  4528. }
  4529. if ( node.nNodeType == MNT_TEXT )
  4530. strData += UnescapeText( &token.m_pDocText[node.nStart], node.nLength, m_nDocFlags );
  4531. else if ( node.nNodeType == MNT_CDATA_SECTION )
  4532. strData += MCD_STRMID( m_strDoc, node.nStart+, node.nLength- );
  4533. else if ( node.nNodeType == MNT_ELEMENT )
  4534. {
  4535. MCD_STRCLEAR(strData);
  4536. break;
  4537. }
  4538. else if ( node.nNodeType == )
  4539. {
  4540. if ( token.Match(m_pFilePos->m_elemstack.Current().strTagName) )
  4541. {
  4542. pElem->SetEndTagLen( node.nLength );
  4543. pElem->nLength = node.nStart + node.nLength - pElem->nStart;
  4544. m_pFilePos->m_elemstack.OutOfLevel();
  4545. }
  4546. else
  4547. {
  4548. MCD_STRCLEAR(strData);
  4549. }
  4550. break;
  4551. }
  4552. }
  4553. }
  4554. else if ( ! pElem->iElemChild )
  4555. {
  4556. // Quick scan for any tags inside content
  4557. int nContentLen = pElem->ContentLen();
  4558. MCD_PCSZ pszContent = &(MCD_2PCSZ(m_strDoc))[nStartContent];
  4559. MCD_PCSZ pszTag = MCD_PSZCHR( pszContent, '<' );
  4560. if ( pszTag && ((int)(pszTag-pszContent) < nContentLen) )
  4561. {
  4562. // Concatenate all CDATA Sections and text nodes, ignore other nodes
  4563. TokenPos token( m_strDoc, m_nDocFlags );
  4564. token.m_nNext = nStartContent;
  4565. NodePos node;
  4566. while ( token.m_nNext < nStartContent + nContentLen )
  4567. {
  4568. token.ParseNode( node );
  4569. if ( node.nNodeType == MNT_TEXT )
  4570. strData += UnescapeText( &token.m_pDocText[node.nStart], node.nLength, m_nDocFlags );
  4571. else if ( node.nNodeType == MNT_CDATA_SECTION )
  4572. strData += MCD_STRMID( m_strDoc, node.nStart+, node.nLength- );
  4573. }
  4574. }
  4575. else // no tags
  4576. strData = UnescapeText( &(MCD_2PCSZ(m_strDoc))[nStartContent], nContentLen, m_nDocFlags );
  4577. }
  4578. }
  4579. return strData;
  4580. }
  4581.  
  4582. MCD_STR CMarkup::x_GetElemContent( int iPos ) const
  4583. {
  4584. if ( ! (m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) )
  4585. {
  4586. ElemPos* pElem = &ELEM(iPos);
  4587. if ( iPos && pElem->ContentLen() )
  4588. return MCD_STRMID( m_strDoc, pElem->StartContent(), pElem->ContentLen() );
  4589. }
  4590. return MCD_T("");
  4591. }
  4592.  
  4593. bool CMarkup::x_SetElemContent( MCD_PCSZ szContent )
  4594. {
  4595. MCD_STRCLEAR(m_strResult);
  4596. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  4597. return false;
  4598.  
  4599. // Set data in iPos element only
  4600. if ( ! m_iPos )
  4601. return false;
  4602.  
  4603. if ( m_nNodeLength )
  4604. return false; // not an element
  4605.  
  4606. // Unlink all children
  4607. int iPos = m_iPos;
  4608. int iPosChild = ELEM(iPos).iElemChild;
  4609. bool bHadChild = (iPosChild != );
  4610. while ( iPosChild )
  4611. iPosChild = x_ReleaseSubDoc( iPosChild );
  4612. if ( bHadChild )
  4613. x_CheckSavedPos();
  4614.  
  4615. // Parse content
  4616. bool bWellFormed = true;
  4617. TokenPos token( szContent, m_nDocFlags );
  4618. int iPosVirtual = x_GetFreePos();
  4619. ELEM(iPosVirtual).ClearVirtualParent();
  4620. ELEM(iPosVirtual).SetLevel( ELEM(iPos).Level() + );
  4621. iPosChild = x_ParseElem( iPosVirtual, token );
  4622. if ( ELEM(iPosVirtual).nFlags & MNF_ILLFORMED )
  4623. bWellFormed = false;
  4624. ELEM(iPos).nFlags = (ELEM(iPos).nFlags & ~MNF_ILLDATA) | (ELEM(iPosVirtual).nFlags & MNF_ILLDATA);
  4625.  
  4626. // Prepare insert and adjust offsets
  4627. NodePos node( MNF_WITHNOLINES|MNF_REPLACE );
  4628. node.strMeta = szContent;
  4629. int iPosBefore = ;
  4630. int nReplace = x_InsertNew( iPos, iPosBefore, node );
  4631.  
  4632. // Adjust and link in the inserted elements
  4633. x_Adjust( iPosChild, node.nStart );
  4634. ELEM(iPosChild).nStart += node.nStart;
  4635. ELEM(iPos).iElemChild = iPosChild;
  4636. while ( iPosChild )
  4637. {
  4638. ELEM(iPosChild).iElemParent = iPos;
  4639. iPosChild = ELEM(iPosChild).iElemNext;
  4640. }
  4641. x_ReleasePos( iPosVirtual );
  4642.  
  4643. int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace;
  4644. x_Adjust( iPos, nAdjust, true );
  4645. ELEM(iPos).nLength += nAdjust;
  4646.  
  4647. x_SetPos( m_iPosParent, m_iPos, );
  4648. return bWellFormed;
  4649. }
  4650.  
  4651. void CMarkup::x_DocChange( int nLeft, int nReplace, const MCD_STR& strInsert )
  4652. {
  4653. x_StrInsertReplace( m_strDoc, nLeft, nReplace, strInsert );
  4654. }
  4655.  
  4656. void CMarkup::x_Adjust( int iPos, int nShift, bool bAfterPos /*=false*/ )
  4657. {
  4658. // Loop through affected elements and adjust indexes
  4659. // Algorithm:
  4660. // 1. update children unless bAfterPos
  4661. // (if no children or bAfterPos is true, length of iPos not affected)
  4662. // 2. update starts of next siblings and their children
  4663. // 3. go up until there is a next sibling of a parent and update starts
  4664. // 4. step 2
  4665. int iPosTop = ELEM(iPos).iElemParent;
  4666. bool bPosFirst = bAfterPos; // mark as first to skip its children
  4667.  
  4668. // Stop when we've reached the virtual parent (which has no tags)
  4669. while ( ELEM(iPos).StartTagLen() )
  4670. {
  4671. // Were we at containing parent of affected position?
  4672. bool bPosTop = false;
  4673. if ( iPos == iPosTop )
  4674. {
  4675. // Move iPosTop up one towards root
  4676. iPosTop = ELEM(iPos).iElemParent;
  4677. bPosTop = true;
  4678. }
  4679.  
  4680. // Traverse to the next update position
  4681. if ( ! bPosTop && ! bPosFirst && ELEM(iPos).iElemChild )
  4682. {
  4683. // Depth first
  4684. iPos = ELEM(iPos).iElemChild;
  4685. }
  4686. else if ( ELEM(iPos).iElemNext )
  4687. {
  4688. iPos = ELEM(iPos).iElemNext;
  4689. }
  4690. else
  4691. {
  4692. // Look for next sibling of a parent of iPos
  4693. // When going back up, parents have already been done except iPosTop
  4694. while ( )
  4695. {
  4696. iPos = ELEM(iPos).iElemParent;
  4697. if ( iPos == iPosTop )
  4698. break;
  4699. if ( ELEM(iPos).iElemNext )
  4700. {
  4701. iPos = ELEM(iPos).iElemNext;
  4702. break;
  4703. }
  4704. }
  4705. }
  4706. bPosFirst = false;
  4707.  
  4708. // Shift indexes at iPos
  4709. if ( iPos != iPosTop )
  4710. ELEM(iPos).nStart += nShift;
  4711. else
  4712. ELEM(iPos).nLength += nShift;
  4713. }
  4714. }
  4715.  
  4716. int CMarkup::x_InsertNew( int iPosParent, int& iPosRel, NodePos& node )
  4717. {
  4718. // Parent empty tag or tags with no content?
  4719. bool bEmptyParentTag = iPosParent && ELEM(iPosParent).IsEmptyElement();
  4720. bool bNoContentParentTags = iPosParent && ! ELEM(iPosParent).ContentLen();
  4721. if ( iPosRel && ! node.nLength ) // current position element?
  4722. {
  4723. node.nStart = ELEM(iPosRel).nStart;
  4724. if ( ! (node.nNodeFlags & MNF_INSERT) ) // follow iPosRel
  4725. node.nStart += ELEM(iPosRel).nLength;
  4726. }
  4727. else if ( bEmptyParentTag ) // parent has no separate end tag?
  4728. {
  4729. // Split empty parent element
  4730. if ( ELEM(iPosParent).nFlags & MNF_NONENDED )
  4731. node.nStart = ELEM(iPosParent).StartContent();
  4732. else
  4733. node.nStart = ELEM(iPosParent).StartContent() - ;
  4734. }
  4735. else if ( node.nLength || (m_nDocFlags&MDF_WRITEFILE) ) // non-element node or a file mode zero length position?
  4736. {
  4737. if ( ! (node.nNodeFlags & MNF_INSERT) )
  4738. node.nStart += node.nLength; // after node or file mode position
  4739. }
  4740. else // no current node
  4741. {
  4742. // Insert relative to parent's content
  4743. if ( node.nNodeFlags & (MNF_INSERT|MNF_REPLACE) )
  4744. node.nStart = ELEM(iPosParent).StartContent(); // beginning of parent's content
  4745. else // in front of parent's end tag
  4746. node.nStart = ELEM(iPosParent).StartAfter() - ELEM(iPosParent).EndTagLen();
  4747. }
  4748.  
  4749. // Go up to start of next node, unless its splitting an empty element
  4750. if ( ! (node.nNodeFlags&(MNF_WITHNOLINES|MNF_REPLACE)) && ! bEmptyParentTag )
  4751. {
  4752. TokenPos token( m_strDoc, m_nDocFlags );
  4753. node.nStart = token.WhitespaceToTag( node.nStart );
  4754. }
  4755.  
  4756. // Is insert relative to element position? (i.e. not other kind of node)
  4757. if ( ! node.nLength )
  4758. {
  4759. // Modify iPosRel to reflect position before
  4760. if ( iPosRel )
  4761. {
  4762. if ( node.nNodeFlags & MNF_INSERT )
  4763. {
  4764. if ( ! (ELEM(iPosRel).nFlags & MNF_FIRST) )
  4765. iPosRel = ELEM(iPosRel).iElemPrev;
  4766. else
  4767. iPosRel = ;
  4768. }
  4769. }
  4770. else if ( ! (node.nNodeFlags & MNF_INSERT) )
  4771. {
  4772. // If parent has a child, add after last child
  4773. if ( ELEM(iPosParent).iElemChild )
  4774. iPosRel = ELEM(ELEM(iPosParent).iElemChild).iElemPrev;
  4775. }
  4776. }
  4777.  
  4778. // Get node length (needed for x_AddNode and x_AddSubDoc in file write mode)
  4779. node.nLength = MCD_STRLENGTH(node.strMeta);
  4780.  
  4781. // Prepare end of lines
  4782. if ( (! (node.nNodeFlags & MNF_WITHNOLINES)) && (bEmptyParentTag || bNoContentParentTags) )
  4783. node.nStart += MCD_EOLLEN;
  4784. if ( ! (node.nNodeFlags & MNF_WITHNOLINES) )
  4785. node.strMeta += MCD_EOL;
  4786.  
  4787. // Calculate insert offset and replace length
  4788. int nReplace = ;
  4789. int nInsertAt = node.nStart;
  4790. if ( bEmptyParentTag )
  4791. {
  4792. MCD_STR strTagName = x_GetTagName( iPosParent );
  4793. MCD_STR strFormat;
  4794. if ( node.nNodeFlags & MNF_WITHNOLINES )
  4795. strFormat = MCD_T(">");
  4796. else
  4797. strFormat = MCD_T(">") MCD_EOL;
  4798. strFormat += node.strMeta;
  4799. strFormat += MCD_T("</");
  4800. strFormat += strTagName;
  4801. node.strMeta = strFormat;
  4802. if ( ELEM(iPosParent).nFlags & MNF_NONENDED )
  4803. {
  4804. nInsertAt = ELEM(iPosParent).StartAfter() - ;
  4805. nReplace = ;
  4806. ELEM(iPosParent).nFlags ^= MNF_NONENDED;
  4807. }
  4808. else
  4809. {
  4810. nInsertAt = ELEM(iPosParent).StartAfter() - ;
  4811. nReplace = ;
  4812. ELEM(iPosParent).AdjustStartTagLen( - );
  4813. }
  4814. ELEM(iPosParent).SetEndTagLen( + MCD_STRLENGTH(strTagName) );
  4815. }
  4816. else
  4817. {
  4818. if ( node.nNodeFlags & MNF_REPLACE )
  4819. {
  4820. nInsertAt = ELEM(iPosParent).StartContent();
  4821. nReplace = ELEM(iPosParent).ContentLen();
  4822. }
  4823. else if ( bNoContentParentTags )
  4824. {
  4825. node.strMeta = MCD_EOL + node.strMeta;
  4826. nInsertAt = ELEM(iPosParent).StartContent();
  4827. }
  4828. }
  4829. if ( m_nDocFlags & MDF_WRITEFILE )
  4830. {
  4831. // Check if buffer is full
  4832. int nNewDocLength = MCD_STRLENGTH(m_strDoc) + MCD_STRLENGTH(node.strMeta) - nReplace;
  4833. int nFlushTo = node.nStart;
  4834. MCD_STRCLEAR( m_strResult );
  4835. if ( bEmptyParentTag )
  4836. nFlushTo = ELEM(iPosParent).nStart;
  4837. if ( nFlushTo && nNewDocLength > m_pFilePos->m_nBlockSizeBasis )
  4838. {
  4839. int nDocCapacity = MCD_STRCAPACITY(m_strDoc);
  4840. if ( nNewDocLength > nDocCapacity )
  4841. {
  4842. if ( bEmptyParentTag )
  4843. ELEM(iPosParent).nStart = ;
  4844. node.nStart -= nFlushTo;
  4845. nInsertAt -= nFlushTo;
  4846. m_pFilePos->FileFlush( m_strDoc, nFlushTo );
  4847. m_strResult = m_pFilePos->m_strIOResult;
  4848. }
  4849. }
  4850. }
  4851. x_DocChange( nInsertAt, nReplace, node.strMeta );
  4852. return nReplace;
  4853. }
  4854.  
  4855. bool CMarkup::x_AddElem( MCD_PCSZ pName, int nValue, int nFlags )
  4856. {
  4857. // Convert integer to string
  4858. MCD_CHAR szVal[];
  4859. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4860. return x_AddElem( pName, szVal, nFlags );
  4861. }
  4862.  
  4863. bool CMarkup::x_AddElem( MCD_PCSZ pName, MCD_PCSZ pValue, int nFlags )
  4864. {
  4865. if ( m_nDocFlags & MDF_READFILE )
  4866. return false;
  4867. if ( nFlags & MNF_CHILD )
  4868. {
  4869. // Adding a child element under main position
  4870. if ( ! m_iPos || (m_nDocFlags & MDF_WRITEFILE) )
  4871. return false;
  4872. }
  4873.  
  4874. // Cannot have data in non-ended element
  4875. if ( (nFlags&MNF_WITHNOEND) && pValue && pValue[] )
  4876. return false;
  4877.  
  4878. // Node and element structures
  4879. NodePos node( nFlags );
  4880. int iPosParent = , iPosBefore = ;
  4881. int iPos = x_GetFreePos();
  4882. ElemPos* pElem = &ELEM(iPos);
  4883.  
  4884. // Locate where to add element relative to current node
  4885. if ( nFlags & MNF_CHILD )
  4886. {
  4887. iPosParent = m_iPos;
  4888. iPosBefore = m_iPosChild;
  4889. }
  4890. else
  4891. {
  4892. iPosParent = m_iPosParent;
  4893. iPosBefore = m_iPos;
  4894. node.nStart = m_nNodeOffset;
  4895. node.nLength = m_nNodeLength;
  4896. }
  4897.  
  4898. // Create string for insert
  4899. // If no pValue is specified, an empty element is created
  4900. // i.e. either <NAME>value</NAME> or <NAME/>
  4901. //
  4902. int nLenName = MCD_PSZLEN(pName);
  4903. if ( ! pValue || ! pValue[] )
  4904. {
  4905. // <NAME/> empty element
  4906. MCD_BLDRESERVE( node.strMeta, nLenName + );
  4907. MCD_BLDAPPEND1( node.strMeta, '<' );
  4908. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4909. if ( nFlags & MNF_WITHNOEND )
  4910. {
  4911. MCD_BLDAPPEND1( node.strMeta, '>' );
  4912. }
  4913. else
  4914. {
  4915. if ( nFlags & MNF_WITHXHTMLSPACE )
  4916. {
  4917. MCD_BLDAPPENDN( node.strMeta, MCD_T(" />"), );
  4918. }
  4919. else
  4920. {
  4921. MCD_BLDAPPENDN( node.strMeta, MCD_T("/>"), );
  4922. }
  4923. }
  4924. MCD_BLDRELEASE( node.strMeta );
  4925. pElem->nLength = MCD_STRLENGTH( node.strMeta );
  4926. pElem->SetStartTagLen( pElem->nLength );
  4927. pElem->SetEndTagLen( );
  4928. }
  4929. else
  4930. {
  4931. // <NAME>value</NAME>
  4932. MCD_STR strValue;
  4933. if ( nFlags & MNF_WITHCDATA )
  4934. strValue = x_EncodeCDATASection( pValue );
  4935. else
  4936. strValue = EscapeText( pValue, nFlags );
  4937. int nLenValue = MCD_STRLENGTH(strValue);
  4938. pElem->nLength = nLenName * + nLenValue + ;
  4939. MCD_BLDRESERVE( node.strMeta, pElem->nLength );
  4940. MCD_BLDAPPEND1( node.strMeta, '<' );
  4941. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4942. MCD_BLDAPPEND1( node.strMeta, '>' );
  4943. MCD_BLDAPPENDN( node.strMeta, MCD_2PCSZ(strValue), nLenValue );
  4944. MCD_BLDAPPENDN( node.strMeta, MCD_T("</"), );
  4945. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4946. MCD_BLDAPPEND1( node.strMeta, '>' );
  4947. MCD_BLDRELEASE( node.strMeta );
  4948. pElem->SetEndTagLen( nLenName + );
  4949. pElem->SetStartTagLen( nLenName + );
  4950. }
  4951.  
  4952. // Insert
  4953. int nReplace = x_InsertNew( iPosParent, iPosBefore, node );
  4954. pElem->nStart = node.nStart;
  4955. pElem->iElemChild = ;
  4956. if ( nFlags & MNF_WITHNOEND )
  4957. pElem->nFlags = MNF_NONENDED;
  4958. else
  4959. pElem->nFlags = ;
  4960. if ( m_nDocFlags & MDF_WRITEFILE )
  4961. {
  4962. iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, iPos );
  4963. TokenPos token( m_strDoc, m_nDocFlags );
  4964. token.m_nL = pElem->nStart + ;
  4965. token.m_nR = pElem->nStart + nLenName;
  4966. m_pFilePos->m_elemstack.PushTagAndCount( token );
  4967. }
  4968. else
  4969. {
  4970. x_LinkElem( iPosParent, iPosBefore, iPos );
  4971. x_Adjust( iPos, MCD_STRLENGTH(node.strMeta) - nReplace );
  4972. }
  4973. if ( nFlags & MNF_CHILD )
  4974. x_SetPos( m_iPosParent, iPosParent, iPos );
  4975. else
  4976. x_SetPos( iPosParent, iPos, );
  4977. return true;
  4978. }
  4979.  
  4980. MCD_STR CMarkup::x_GetSubDoc( int iPos )
  4981. {
  4982. if ( iPos && ! (m_nDocFlags&MDF_WRITEFILE) )
  4983. {
  4984. if ( ! (m_nDocFlags&MDF_READFILE) )
  4985. {
  4986. TokenPos token( m_strDoc, m_nDocFlags );
  4987. token.WhitespaceToTag( ELEM(iPos).StartAfter() );
  4988. token.m_nL = ELEM(iPos).nStart;
  4989. return token.GetTokenText();
  4990. }
  4991. }
  4992. return MCD_T("");
  4993. }
  4994.  
  4995. bool CMarkup::x_AddSubDoc( MCD_PCSZ pSubDoc, int nFlags )
  4996. {
  4997. if ( m_nDocFlags & MDF_READFILE || ((nFlags & MNF_CHILD) && (m_nDocFlags & MDF_WRITEFILE)) )
  4998. return false;
  4999.  
  5000. MCD_STRCLEAR(m_strResult);
  5001. NodePos node( nFlags );
  5002. int iPosParent, iPosBefore;
  5003. if ( nFlags & MNF_CHILD )
  5004. {
  5005. // Add a subdocument under main position, before or after child
  5006. if ( ! m_iPos )
  5007. return false;
  5008. iPosParent = m_iPos;
  5009. iPosBefore = m_iPosChild;
  5010. }
  5011. else
  5012. {
  5013. // Add a subdocument under parent position, before or after main
  5014. iPosParent = m_iPosParent;
  5015. iPosBefore = m_iPos;
  5016. node.nStart = m_nNodeOffset;
  5017. node.nLength = m_nNodeLength;
  5018. }
  5019.  
  5020. // Parse subdocument, generating indexes based on the subdocument string to be offset later
  5021. bool bWellFormed = true;
  5022. TokenPos token( pSubDoc, m_nDocFlags );
  5023. int iPosVirtual = x_GetFreePos();
  5024. ELEM(iPosVirtual).ClearVirtualParent();
  5025. ELEM(iPosVirtual).SetLevel( ELEM(iPosParent).Level() + );
  5026. int iPos = x_ParseElem( iPosVirtual, token );
  5027. if ( (!iPos) || ELEM(iPosVirtual).nFlags & MNF_ILLFORMED )
  5028. bWellFormed = false;
  5029. if ( ELEM(iPosVirtual).nFlags & MNF_ILLDATA )
  5030. ELEM(iPosParent).nFlags |= MNF_ILLDATA;
  5031.  
  5032. // File write mode handling
  5033. bool bBypassSubDoc = false;
  5034. if ( m_nDocFlags & MDF_WRITEFILE )
  5035. {
  5036. // Current position will bypass subdoc unless well-formed single element
  5037. if ( (! bWellFormed) || ELEM(iPos).iElemChild || ELEM(iPos).iElemNext )
  5038. bBypassSubDoc = true;
  5039.  
  5040. // Count tag names of top level elements (usually one) in given markup
  5041. int iPosTop = iPos;
  5042. while ( iPosTop )
  5043. {
  5044. token.m_nNext = ELEM(iPosTop).nStart + ;
  5045. token.FindName();
  5046. m_pFilePos->m_elemstack.PushTagAndCount( token );
  5047. iPosTop = ELEM(iPosTop).iElemNext;
  5048. }
  5049. }
  5050.  
  5051. // Extract subdocument without leading/trailing nodes
  5052. int nExtractStart = ;
  5053. int iPosLast = ELEM(iPos).iElemPrev;
  5054. if ( bWellFormed )
  5055. {
  5056. nExtractStart = ELEM(iPos).nStart;
  5057. int nExtractLength = ELEM(iPos).nLength;
  5058. if ( iPos != iPosLast )
  5059. {
  5060. nExtractLength = ELEM(iPosLast).nStart - nExtractStart + ELEM(iPosLast).nLength;
  5061. bWellFormed = false; // treat as subdoc here, but return not well-formed
  5062. }
  5063. MCD_STRASSIGN(node.strMeta,&pSubDoc[nExtractStart],nExtractLength);
  5064. }
  5065. else
  5066. {
  5067. node.strMeta = pSubDoc;
  5068. node.nNodeFlags |= MNF_WITHNOLINES;
  5069. }
  5070.  
  5071. // Insert
  5072. int nReplace = x_InsertNew( iPosParent, iPosBefore, node );
  5073.  
  5074. // Clean up indexes
  5075. if ( m_nDocFlags & MDF_WRITEFILE )
  5076. {
  5077. if ( bBypassSubDoc )
  5078. {
  5079. // Release indexes used in parsing the subdocument
  5080. m_iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, );
  5081. m_iPosFree = ;
  5082. m_iPosDeleted = ;
  5083. m_iPos = ;
  5084. m_nNodeOffset = node.nStart + node.nLength;
  5085. m_nNodeLength = ;
  5086. m_nNodeType = ;
  5087. MARKUP_SETDEBUGSTATE;
  5088. return bWellFormed;
  5089. }
  5090. else // single element added
  5091. {
  5092. m_iPos = iPos;
  5093. ElemPos* pElem = &ELEM(iPos);
  5094. pElem->nStart = node.nStart;
  5095. m_iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, iPos );
  5096. x_ReleasePos( iPosVirtual );
  5097. }
  5098. }
  5099. else
  5100. {
  5101. // Adjust and link in the inserted elements
  5102. // iPosVirtual will stop it from affecting rest of document
  5103. int nAdjust = node.nStart - nExtractStart;
  5104. if ( iPos && nAdjust )
  5105. {
  5106. x_Adjust( iPos, nAdjust );
  5107. ELEM(iPos).nStart += nAdjust;
  5108. }
  5109. int iPosChild = iPos;
  5110. while ( iPosChild )
  5111. {
  5112. int iPosNext = ELEM(iPosChild).iElemNext;
  5113. x_LinkElem( iPosParent, iPosBefore, iPosChild );
  5114. iPosBefore = iPosChild;
  5115. iPosChild = iPosNext;
  5116. }
  5117. x_ReleasePos( iPosVirtual );
  5118.  
  5119. // Now adjust remainder of document
  5120. x_Adjust( iPosLast, MCD_STRLENGTH(node.strMeta) - nReplace, true );
  5121. }
  5122.  
  5123. // Set position to top element of subdocument
  5124. if ( nFlags & MNF_CHILD )
  5125. x_SetPos( m_iPosParent, iPosParent, iPos );
  5126. else // Main
  5127. x_SetPos( m_iPosParent, iPos, );
  5128. return bWellFormed;
  5129. }
  5130.  
  5131. int CMarkup::x_RemoveElem( int iPos )
  5132. {
  5133. // Determine whether any whitespace up to next tag
  5134. TokenPos token( m_strDoc, m_nDocFlags );
  5135. int nAfterEnd = token.WhitespaceToTag( ELEM(iPos).StartAfter() );
  5136.  
  5137. // Remove from document, adjust affected indexes, and unlink
  5138. int nLen = nAfterEnd - ELEM(iPos).nStart;
  5139. x_DocChange( ELEM(iPos).nStart, nLen, MCD_STR() );
  5140. x_Adjust( iPos, - nLen, true );
  5141. int iPosPrev = x_UnlinkElem( iPos );
  5142. x_CheckSavedPos();
  5143. return iPosPrev; // new position
  5144. }
  5145.  
  5146. void CMarkup::x_LinkElem( int iPosParent, int iPosBefore, int iPos )
  5147. {
  5148. // Update links between elements and initialize nFlags
  5149. ElemPos* pElem = &ELEM(iPos);
  5150. if ( m_nDocFlags & MDF_WRITEFILE )
  5151. {
  5152. // In file write mode, only keep virtual parent 0 plus one element
  5153. if ( iPosParent )
  5154. x_ReleasePos( iPosParent );
  5155. else if ( iPosBefore )
  5156. x_ReleasePos( iPosBefore );
  5157. iPosParent = ;
  5158. ELEM(iPosParent).iElemChild = iPos;
  5159. pElem->iElemParent = iPosParent;
  5160. pElem->iElemPrev = iPos;
  5161. pElem->iElemNext = ;
  5162. pElem->nFlags |= MNF_FIRST;
  5163. }
  5164. else
  5165. {
  5166. pElem->iElemParent = iPosParent;
  5167. if ( iPosBefore )
  5168. {
  5169. // Link in after iPosBefore
  5170. pElem->nFlags &= ~MNF_FIRST;
  5171. pElem->iElemNext = ELEM(iPosBefore).iElemNext;
  5172. if ( pElem->iElemNext )
  5173. ELEM(pElem->iElemNext).iElemPrev = iPos;
  5174. else
  5175. ELEM(ELEM(iPosParent).iElemChild).iElemPrev = iPos;
  5176. ELEM(iPosBefore).iElemNext = iPos;
  5177. pElem->iElemPrev = iPosBefore;
  5178. }
  5179. else
  5180. {
  5181. // Link in as first child
  5182. pElem->nFlags |= MNF_FIRST;
  5183. if ( ELEM(iPosParent).iElemChild )
  5184. {
  5185. pElem->iElemNext = ELEM(iPosParent).iElemChild;
  5186. pElem->iElemPrev = ELEM(pElem->iElemNext).iElemPrev;
  5187. ELEM(pElem->iElemNext).iElemPrev = iPos;
  5188. ELEM(pElem->iElemNext).nFlags ^= MNF_FIRST;
  5189. }
  5190. else
  5191. {
  5192. pElem->iElemNext = ;
  5193. pElem->iElemPrev = iPos;
  5194. }
  5195. ELEM(iPosParent).iElemChild = iPos;
  5196. }
  5197. if ( iPosParent )
  5198. pElem->SetLevel( ELEM(iPosParent).Level() + );
  5199. }
  5200. }
  5201.  
  5202. int CMarkup::x_UnlinkElem( int iPos )
  5203. {
  5204. // Fix links to remove element and mark as deleted
  5205. // return previous position or zero if none
  5206. ElemPos* pElem = &ELEM(iPos);
  5207.  
  5208. // Find previous sibling and bypass removed element
  5209. int iPosPrev = ;
  5210. if ( pElem->nFlags & MNF_FIRST )
  5211. {
  5212. if ( pElem->iElemNext ) // set next as first child
  5213. {
  5214. ELEM(pElem->iElemParent).iElemChild = pElem->iElemNext;
  5215. ELEM(pElem->iElemNext).iElemPrev = pElem->iElemPrev;
  5216. ELEM(pElem->iElemNext).nFlags |= MNF_FIRST;
  5217. }
  5218. else // no children remaining
  5219. ELEM(pElem->iElemParent).iElemChild = ;
  5220. }
  5221. else
  5222. {
  5223. iPosPrev = pElem->iElemPrev;
  5224. ELEM(iPosPrev).iElemNext = pElem->iElemNext;
  5225. if ( pElem->iElemNext )
  5226. ELEM(pElem->iElemNext).iElemPrev = iPosPrev;
  5227. else
  5228. ELEM(ELEM(pElem->iElemParent).iElemChild).iElemPrev = iPosPrev;
  5229. }
  5230. x_ReleaseSubDoc( iPos );
  5231. return iPosPrev;
  5232. }
  5233.  
  5234. int CMarkup::x_UnlinkPrevElem( int iPosParent, int iPosBefore, int iPos )
  5235. {
  5236. // In file write mode, only keep virtual parent 0 plus one element if currently at element
  5237. if ( iPosParent )
  5238. {
  5239. x_ReleasePos( iPosParent );
  5240. iPosParent = ;
  5241. }
  5242. else if ( iPosBefore )
  5243. x_ReleasePos( iPosBefore );
  5244. ELEM(iPosParent).iElemChild = iPos;
  5245. ELEM(iPosParent).nLength = MCD_STRLENGTH(m_strDoc);
  5246. if ( iPos )
  5247. {
  5248. ElemPos* pElem = &ELEM(iPos);
  5249. pElem->iElemParent = iPosParent;
  5250. pElem->iElemPrev = iPos;
  5251. pElem->iElemNext = ;
  5252. pElem->nFlags |= MNF_FIRST;
  5253. }
  5254. return iPosParent;
  5255. }
  5256.  
  5257. int CMarkup::x_ReleasePos( int iPos )
  5258. {
  5259. int iPosNext = ELEM(iPos).iElemNext;
  5260. ELEM(iPos).iElemNext = m_iPosDeleted;
  5261. ELEM(iPos).nFlags = MNF_DELETED;
  5262. m_iPosDeleted = iPos;
  5263. return iPosNext;
  5264. }
  5265.  
  5266. int CMarkup::x_ReleaseSubDoc( int iPos )
  5267. {
  5268. // Mark position structures as deleted by depth first traversal
  5269. // Tricky because iElemNext used in traversal is overwritten for linked list of deleted
  5270. // Return value is what iElemNext was before being overwritten
  5271. //
  5272. int iPosNext = , iPosTop = iPos;
  5273. while ( )
  5274. {
  5275. if ( ELEM(iPos).iElemChild )
  5276. iPos = ELEM(iPos).iElemChild;
  5277. else
  5278. {
  5279. while ( )
  5280. {
  5281. iPosNext = x_ReleasePos( iPos );
  5282. if ( iPosNext || iPos == iPosTop )
  5283. break;
  5284. iPos = ELEM(iPos).iElemParent;
  5285. }
  5286. if ( iPos == iPosTop )
  5287. break;
  5288. iPos = iPosNext;
  5289. }
  5290. }
  5291. return iPosNext;
  5292. }
  5293.  
  5294. void CMarkup::x_CheckSavedPos()
  5295. {
  5296. // Remove any saved positions now pointing to deleted elements
  5297. // Must be done as part of element removal before position reassigned
  5298. if ( m_pSavedPosMaps->m_pMaps )
  5299. {
  5300. int nMap = ;
  5301. while ( m_pSavedPosMaps->m_pMaps[nMap] )
  5302. {
  5303. SavedPosMap* pMap = m_pSavedPosMaps->m_pMaps[nMap];
  5304. for ( int nSlot = ; nSlot < pMap->nMapSize; ++nSlot )
  5305. {
  5306. SavedPos* pSavedPos = pMap->pTable[nSlot];
  5307. if ( pSavedPos )
  5308. {
  5309. int nOffset = ;
  5310. int nSavedPosCount = ;
  5311. while ( )
  5312. {
  5313. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  5314. {
  5315. int iPos = pSavedPos[nOffset].iPos;
  5316. if ( ! (ELEM(iPos).nFlags & MNF_DELETED) )
  5317. {
  5318. if ( nSavedPosCount < nOffset )
  5319. {
  5320. pSavedPos[nSavedPosCount] = pSavedPos[nOffset];
  5321. pSavedPos[nSavedPosCount].nSavedPosFlags &= ~SavedPos::SPM_LAST;
  5322. }
  5323. ++nSavedPosCount;
  5324. }
  5325. }
  5326. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  5327. {
  5328. while ( nSavedPosCount <= nOffset )
  5329. pSavedPos[nSavedPosCount++].nSavedPosFlags &= ~SavedPos::SPM_USED;
  5330. break;
  5331. }
  5332. ++nOffset;
  5333. }
  5334. }
  5335. }
  5336. ++nMap;
  5337. }
  5338. }
  5339. }
  5340.  
  5341. void CMarkup::x_AdjustForNode( int iPosParent, int iPos, int nShift )
  5342. {
  5343. // Adjust affected indexes
  5344. bool bAfterPos = true;
  5345. if ( ! iPos )
  5346. {
  5347. // Change happened before or at first element under iPosParent
  5348. // If there are any children of iPosParent, adjust from there
  5349. // otherwise start at parent and adjust from there
  5350. iPos = ELEM(iPosParent).iElemChild;
  5351. if ( iPos )
  5352. {
  5353. ELEM(iPos).nStart += nShift;
  5354. bAfterPos = false;
  5355. }
  5356. else
  5357. {
  5358. iPos = iPosParent;
  5359. ELEM(iPos).nLength += nShift;
  5360. }
  5361. }
  5362. x_Adjust( iPos, nShift, bAfterPos );
  5363. }
  5364.  
  5365. bool CMarkup::x_AddNode( int nNodeType, MCD_PCSZ pText, int nNodeFlags )
  5366. {
  5367. if ( m_nDocFlags & MDF_READFILE )
  5368. return false;
  5369.  
  5370. // Comments, DTDs, and processing instructions are followed by CRLF
  5371. // Other nodes are usually concerned with mixed content, so no CRLF
  5372. if ( ! (nNodeType & (MNT_PROCESSING_INSTRUCTION|MNT_COMMENT|MNT_DOCUMENT_TYPE)) )
  5373. nNodeFlags |= MNF_WITHNOLINES;
  5374.  
  5375. // Add node of nNodeType after current node position
  5376. NodePos node( nNodeFlags );
  5377. if ( ! x_CreateNode(node.strMeta, nNodeType, pText) )
  5378. return false;
  5379.  
  5380. // Insert the new node relative to current node
  5381. node.nStart = m_nNodeOffset;
  5382. node.nLength = m_nNodeLength;
  5383. node.nNodeType = nNodeType;
  5384. int iPosBefore = m_iPos;
  5385. int nReplace = x_InsertNew( m_iPosParent, iPosBefore, node );
  5386.  
  5387. // If its a new element, create an ElemPos
  5388. int iPos = iPosBefore;
  5389. ElemPos* pElem = NULL;
  5390. if ( nNodeType == MNT_ELEMENT )
  5391. {
  5392. // Set indexes
  5393. iPos = x_GetFreePos();
  5394. pElem = &ELEM(iPos);
  5395. pElem->nStart = node.nStart;
  5396. pElem->SetStartTagLen( node.nLength );
  5397. pElem->SetEndTagLen( );
  5398. pElem->nLength = node.nLength;
  5399. node.nStart = ;
  5400. node.nLength = ;
  5401. pElem->iElemChild = ;
  5402. pElem->nFlags = ;
  5403. x_LinkElem( m_iPosParent, iPosBefore, iPos );
  5404. }
  5405. if ( m_nDocFlags & MDF_WRITEFILE )
  5406. {
  5407. m_iPosParent = x_UnlinkPrevElem( m_iPosParent, iPosBefore, iPos );
  5408. if ( nNodeType == MNT_ELEMENT )
  5409. {
  5410. TokenPos token( m_strDoc, m_nDocFlags );
  5411. token.m_nL = pElem->nStart + ;
  5412. token.m_nR = pElem->nStart + pElem->nLength - ;
  5413. m_pFilePos->m_elemstack.PushTagAndCount( token );
  5414. }
  5415. }
  5416. else // need to adjust element positions after iPos
  5417. x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(node.strMeta) - nReplace );
  5418.  
  5419. // Store current position
  5420. m_iPos = iPos;
  5421. m_iPosChild = ;
  5422. m_nNodeOffset = node.nStart;
  5423. m_nNodeLength = node.nLength;
  5424. m_nNodeType = nNodeType;
  5425. MARKUP_SETDEBUGSTATE;
  5426. return true;
  5427. }
  5428.  
  5429. // Removes a node from the document text. For MNT_ELEMENT the span is taken
  5430. // from the element at iPos, which is also unlinked; otherwise the caller's
  5431. // nNodeOffset/nNodeLength are used. On return iPos, nNodeType, nNodeOffset
  5432. // and nNodeLength describe the node immediately before the removed one.
  5433. // If no text lies between the removed node and the previous element (or the
  5434. // start of the parent content), offset and length are reported as 0.
  5435. void CMarkup::x_RemoveNode( int iPosParent, int& iPos, int& nNodeType, int& nNodeOffset, int& nNodeLength )
  5436. {
  5437.     int iPosPrev = iPos;
  5438. 
  5439.     // Removing an element?
  5440.     if ( nNodeType == MNT_ELEMENT )
  5441.     {
  5442.         nNodeOffset = ELEM(iPos).nStart;
  5443.         nNodeLength = ELEM(iPos).nLength;
  5444.         iPosPrev = x_UnlinkElem( iPos );
  5445.         x_CheckSavedPos();
  5446.     }
  5447. 
  5448.     // Find previous node type, offset and length
  5449.     int nPrevOffset = 0;
  5450.     if ( iPosPrev )
  5451.         nPrevOffset = ELEM(iPosPrev).StartAfter();
  5452.     else if ( iPosParent )
  5453.         nPrevOffset = ELEM(iPosParent).StartContent();
  5454.     TokenPos token( m_strDoc, m_nDocFlags );
  5455.     NodePos node;
  5456.     token.m_nNext = nPrevOffset;
  5457.     int nPrevType = 0;
  5458.     // Parse forward until the removed node is reached; the last node parsed
  5459.     // before it supplies the previous offset and type.
  5460.     while ( token.m_nNext < nNodeOffset )
  5461.     {
  5462.         nPrevOffset = token.m_nNext;
  5463.         nPrevType = token.ParseNode( node );
  5464.     }
  5465.     int nPrevLength = nNodeOffset - nPrevOffset;
  5466.     if ( ! nPrevLength )
  5467.     {
  5468.         // Previous node is iPosPrev element
  5469.         nPrevOffset = 0;
  5470.         if ( iPosPrev )
  5471.             nPrevType = MNT_ELEMENT;
  5472.     }
  5473. 
  5474.     // Remove node from document
  5475.     x_DocChange( nNodeOffset, nNodeLength, MCD_STR() );
  5476.     x_AdjustForNode( iPosParent, iPosPrev, - nNodeLength );
  5477. 
  5478.     // Was removed node a lone end tag?
  5479.     if ( nNodeType == MNT_LONE_END_TAG )
  5480.     {
  5481.         // See if we can unset parent MNF_ILLDATA flag
  5482.         token.m_nNext = ELEM(iPosParent).StartContent();
  5483.         int nEndOfContent = token.m_nNext + ELEM(iPosParent).ContentLen();
  5484.         int iPosChild = ELEM(iPosParent).iElemChild;
  5485.         while ( token.m_nNext < nEndOfContent )
  5486.         {
  5487.             if ( token.ParseNode(node) <= 0 )
  5488.                 break;
  5489.             if ( node.nNodeType == MNT_ELEMENT )
  5490.             {
  5491.                 // Jump past the whole child element using the index
  5492.                 // rather than parsing its content
  5493.                 token.m_nNext = ELEM(iPosChild).StartAfter();
  5494.                 iPosChild = ELEM(iPosChild).iElemNext;
  5495.             }
  5496.         }
  5497.         // Clear the flag only if the scan reached the end of the content
  5498.         // without breaking out
  5499.         if ( token.m_nNext == nEndOfContent )
  5500.             ELEM(iPosParent).nFlags &= ~MNF_ILLDATA;
  5501.     }
  5502. 
  5503.     // Hand the previous node back through the reference parameters
  5504.     nNodeType = nPrevType;
  5505.     nNodeOffset = nPrevOffset;
  5506.     nNodeLength = nPrevLength;
  5507.     iPos = iPosPrev;
  5508. }

2:生成UserInfo.xml

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Builds a document with a UserInfo root holding two UserID children and
  10. // saves it as UserInfo.xml.
  11. int main()
  12. {
  13.     CMarkup xml;
  14. 
  15.     xml.SetDoc("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n");
  16.     xml.AddElem((MCD_CSTR)"UserInfo"); // add an element after the current main position or the last sibling
  17.     xml.IntoElem(); // go down one level; the current main position becomes the parent
  18.     xml.AddElem((MCD_CSTR)"UserID",(MCD_CSTR)"Jason");
  19.     xml.AddElem((MCD_CSTR)"UserID",(MCD_CSTR)"evil");
  20.     xml.OutOfElem(); // make the current parent position the current position again
  21.     xml.Save((MCD_CSTR)"UserInfo.xml"); // write the XML to a file; the path may include a directory
  22. 
  23.     return 0;
  24. }

运行结果:会在本地程序目录下生成一个UserInfo.xml

3:浏览特定元素

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Loads UserInfo.xml and prints the tag name and data of every UserID child.
  10. int main()
  11. {
  12.     CMarkup xml;
  13. 
  14.     xml.Load((MCD_CSTR)"UserInfo.xml");
  15.     xml.ResetMainPos(); // reset the main position to before the first sibling
  16.     while (xml.FindChildElem((MCD_CSTR)"UserID")) // move to the next child element matching the name or path
  17.     {
  18.         xml.IntoElem(); // go down one level; the current main position becomes the parent
  19.         CString strTagName = xml.GetTagName(); // tag name of the main position element
  20.         CString strData = xml.GetData(); // string value of the main position element or node
  21. 
  22.         // Convert the CString values to char arrays so printf can show them.
  23.         // The real array size is passed as the destination size, so a value
  24.         // that does not fit makes the call return 0 instead of overrunning
  25.         // the array; in that case an empty string is printed.
  26.         char ptagName[256] = {0};
  27.         if (!WideCharToMultiByte(CP_ACP,0,strTagName,-1,ptagName,sizeof(ptagName),NULL,NULL))
  28.             ptagName[0] = '\0';
  29. 
  30.         char pData[256] = {0};
  31.         if (!WideCharToMultiByte(CP_ACP,0,strData,-1,pData,sizeof(pData),NULL,NULL))
  32.             pData[0] = '\0';
  33. 
  34.         printf("\n-----tagName:%s,Data:%s\n",ptagName,pData);
  35.         xml.OutOfElem(); // make the current parent position the current position again
  36.     }
  37. 
  38.     system("pause");
  39.     return 0;
  40. }

输出结果为:

4:修改,把值为Jason的UserID修改为snake

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Loads UserInfo.xml, changes the first UserID whose data is "Jason" to
  10. // "snake", and saves the file.
  11. int main()
  12. {
  13.     CMarkup xml;
  14. 
  15.     BOOL bLoadXml = xml.Load((MCD_CSTR)"UserInfo.xml");
  16. 
  17.     if( bLoadXml )
  18.     {
  19.         CString strUserID = _T("");
  20.         xml.ResetMainPos();
  21.         // With no name given, FindElem moves to the next element at this
  22.         // level whatever its tag. For the file generated in section 2 that
  23.         // should be the UserInfo root, not a UserID.
  24.         xml.FindElem();
  25.         xml.IntoElem(); // the element just found becomes the parent for the search below
  26.         while (xml.FindElem((MCD_CSTR)"UserID"))
  27.         {
  28.             strUserID = xml.GetData();
  29.             if (strUserID=="Jason")
  30.             {
  31.                 xml.SetData(CString("snake"));
  32.                 xml.Save((MCD_CSTR)"UserInfo.xml");
  33.                 break; // only the first match is changed
  34.             }
  35.         }
  36.     }
  37. 
  38.     system("pause");
  39.     return 0;
  40. }

结果为:

5:添加元素,添加在最后面

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Loads UserInfo.xml, adds a UserID element with data "Jason" via AddElem
  10. // (the article uses this to append at the end), and saves the file.
  11. int main()
  12. {
  13.     CMarkup xml;
  14.     BOOL bLoadXml = xml.Load((MCD_CSTR)"UserInfo.xml");
  15.     if (bLoadXml)
  16.     {
  17.         xml.ResetMainPos();
  18.         xml.FindElem(); // move to the first element at this level
  19.         xml.IntoElem(); // that element becomes the parent of the new child
  20.         xml.AddElem((MCD_CSTR)"UserID",(MCD_CSTR)"Jason");
  21.         xml.OutOfElem();
  22.         xml.Save((MCD_CSTR)"UserInfo.xml");
  23.     }
  24. 
  25.     system("pause");
  26.     return 0;
  27. }

结果为:

6:添加元素到最前面,使用的是InsertElem()函数

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Loads UserInfo.xml, adds a UserID element with data "AddUserIDHead" via
  10. // InsertElem (the article uses this to add at the front), and saves the file.
  11. int main()
  12. {
  13.     CMarkup xml;
  14.     BOOL bLoadXml = xml.Load((MCD_CSTR)"UserInfo.xml");
  15.     if (bLoadXml)
  16.     {
  17.         xml.ResetMainPos();
  18.         xml.FindElem(); // move to the first element at this level
  19.         xml.IntoElem(); // that element becomes the parent of the new child
  20.         xml.InsertElem((MCD_CSTR)"UserID",(MCD_CSTR)"AddUserIDHead");
  21.         xml.OutOfElem();
  22.         xml.Save((MCD_CSTR)"UserInfo.xml");
  23.     }
  24. 
  25.     system("pause");
  26.     return 0;
  27. }

结果为:

7:删除元素 使用的是RemoveChildElem

  1. #include "stdafx.h"
  2. #include<stdlib.h>
  3. #include<iostream>
  4. #include<string.h>
  5. #include"Markup.h"
  6.  
  7. using namespace std;
  8.  
  9. // Loads UserInfo.xml, removes the first UserID child whose data is "snake",
  10. // and saves the file.
  11. int main()
  12. {
  13.     CMarkup xml;
  14.     xml.Load((MCD_CSTR)"UserInfo.xml");
  15.     xml.ResetMainPos();
  16. 
  17.     // Visit each UserID child in turn until one matches or none are left
  18.     while (xml.FindChildElem((MCD_CSTR)"UserID"))
  19.     {
  20.         CString strData = xml.GetChildData();
  21. 
  22.         if (strData==(MCD_CSTR)"snake")
  23.         {
  24.             xml.RemoveChildElem();
  25.             xml.Save((MCD_CSTR)"UserInfo.xml");
  26.             break; // only the first match is removed
  27.         }
  28.     }
  29. 
  30.     system("pause");
  31.     return 0;
  32. }

结果为:

使用Markup解析xml文件的更多相关文章

  1. 解析xml文件的四种方式

    什么是 XML? XML 指可扩展标记语言(EXtensible Markup Language) XML 是一种标记语言,很类似 HTML XML 的设计宗旨是传输数据,而非显示数据 XML 标签没 ...

  2. C++生成和解析XML文件

    1.xml 指可扩展标记语言(EXtensible Markup Language) 2.xml 是一种标记语言,类似html 3.xml 的设计宗旨是传输数据,而非显示数据 4.xml 标签没有被预 ...

  3. Java解析XML文件的方式

    在项目里,我们往往会把一些配置信息放到xml文件里,或者各部门间会通过xml文件来交换业务数据,所以有时候我们会遇到“解析xml文件”的需求.一般来讲,有基于DOM树和SAX的两种解析xml文件的方式 ...

  4. Android 解析XML文件和生成XML文件

    解析XML文件 public static void initXML(Context context) { //can't create in /data/media/0 because permis ...

  5. JAVA使用SAX解析XML文件

    在我的另一篇文章(http://www.cnblogs.com/anivia/p/5849712.html)中,通过一个例子介绍了使用DOM来解析XML文件,那么本篇文章通过相同的XML文件介绍如何使 ...

  6. JAVA中使用DOM解析XML文件

    XML是一种方便快捷高效的数据保存传输的格式,在JSON广泛使用之前,XML是服务器和客户端之间数据传输的主要方式.因此,需要使用各种方式,解析服务器传送过来的信息,以供使用者查看. JAVA作为一种 ...

  7. java中采用dom4j解析xml文件

    一.前言 在最近的开发中用到了dom4j来解析xml文件,以前听说过来解析xml文件的几种标准方式:但是从来的没有应用过来,所以可以在google中搜索dmo4j解析xml文件的方式,学习一下dom4 ...

  8. XML:使用DOM技术解析xML文件中的城市,实现select级联选择

    中国的城市xml格式:cities.xml <?xml version="1.0" encoding="utf-8"?> <china> ...

  9. dom4j如何解析XML文件

    最近在 一些对xml文件的操作,下面简单写一个dom4j解析xml文件并将其封装到一个javabean中的例子,只是具有针对性的,不是通用的,仅供参考哦~~ 首先说:dom4j是一个java的XML ...

随机推荐

  1. 关于Unity中如何代码动态修改天空盒

    在Unity中动态修改天空盒有两种方法: 一.为每个Texture建立天空盒材质球,需要更换时直接将对应材质球作为天空盒,缺点是建立的材质球太多 private void ChangeSkybox(M ...

  2. 移动前端的html5 head 头标签

    DOCTYPE DOCTYPE(Document Type),该声明位于文档中最前面的位置,处于 html 标签之前,此标签告知浏览器文档使用哪种 HTML 或者 XHTML 规范. 使用 HTML5 ...

  3. Java基础学习笔记九 Java基础语法之this和super

    构造方法 我们对封装已经有了基本的了解,接下来我们来看一个新的问题,依然以Person为例,由于Person中的属性都被private了,外界无法直接访问属性,必须对外提供相应的set和get方法.当 ...

  4. JavaScript(第二十天)【DOM操作表格及样式】

    DOM在操作生成HTML上,还是比较简明的.不过,由于浏览器总是存在兼容和陷阱,导致最终的操作就不是那么简单方便了.本章主要了解一下DOM操作表格和样式的一些知识. 一.操作表格 <table& ...

  5. 课后练习:C语言实现Linux命令——od

    课后练习:C语言实现Linux命令--od --------CONTENTS-------- 题目详情与分析 设计思路 遇到的问题及解决 待实现的设想与思考 学习反思与感悟 附1:myod.c「1.0 ...

  6. 简单的C语言编译器--语义制导翻译

      语法分析是最难写的,而这部分确实最伤脑的.大量的语义动作分析差点把我逼疯.   简而言之,这部分的作用就是在每次归约之后,都进行一些语义动作,最终让我们得到测试程序的三地址码,即中间代码. 1. ...

  7. 《高级软件测试》11.15.全组完成jira安装,开始任务的部分书写

    今日任务完成情况如下: 小段:完成linux环境上jira的安装,并将jira的安装过程录制下来 小费:完成linux环境下jira的安装,开始部分任务的书写 小高:完成了jira的安装,并进一步熟悉 ...

  8. Scala Option类型

    转载自: Scala 初学者指南, 这里有一系列很棒的文章 类型 Option 可能你已经见过它在 Map API 中的使用:在实现自己的提取器时,我们也用过它, 然而,它还需要更多的解释. 你可能会 ...

  9. django + nginx + uwsgi + websocket

    最近使用django框架做了一个简单的聊天机器人demo, 开发的过程中使用了django自带的websocket模块,当使用django框架自带的wsgi服务去启动的话,没有什么问题.如果要使用uw ...

  10. js实现短暂提示框

    业务场景:当鼠标移入某元素时,显示提示框进行介绍.当鼠标移除时,会自动消失.引入ToolTip.js和ToolTip.css 主方法:ToolTip.show(需要提示的元素id, 随意不重复即可, ...