validator.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. 'use strict';
  2. const util = require('./util');
  3. const defaultOptions = {
  4. allowBooleanAttributes: false, //A tag can have attributes without any value
  5. unpairedTags: []
  6. };
  7. //const tagsPattern = new RegExp("<\\/?([\\w:\\-_\.]+)\\s*\/?>","g");
  8. exports.validate = function (xmlData, options) {
  9. options = Object.assign({}, defaultOptions, options);
  10. //xmlData = xmlData.replace(/(\r\n|\n|\r)/gm,"");//make it single line
  11. //xmlData = xmlData.replace(/(^\s*<\?xml.*?\?>)/g,"");//Remove XML starting tag
  12. //xmlData = xmlData.replace(/(<!DOCTYPE[\s\w\"\.\/\-\:]+(\[.*\])*\s*>)/g,"");//Remove DOCTYPE
  13. const tags = [];
  14. let tagFound = false;
  15. //indicates that the root tag has been closed (aka. depth 0 has been reached)
  16. let reachedRoot = false;
  17. if (xmlData[0] === '\ufeff') {
  18. // check for byte order mark (BOM)
  19. xmlData = xmlData.substr(1);
  20. }
  21. for (let i = 0; i < xmlData.length; i++) {
  22. if (xmlData[i] === '<' && xmlData[i+1] === '?') {
  23. i+=2;
  24. i = readPI(xmlData,i);
  25. if (i.err) return i;
  26. }else if (xmlData[i] === '<') {
  27. //starting of tag
  28. //read until you reach to '>' avoiding any '>' in attribute value
  29. let tagStartPos = i;
  30. i++;
  31. if (xmlData[i] === '!') {
  32. i = readCommentAndCDATA(xmlData, i);
  33. continue;
  34. } else {
  35. let closingTag = false;
  36. if (xmlData[i] === '/') {
  37. //closing tag
  38. closingTag = true;
  39. i++;
  40. }
  41. //read tagname
  42. let tagName = '';
  43. for (; i < xmlData.length &&
  44. xmlData[i] !== '>' &&
  45. xmlData[i] !== ' ' &&
  46. xmlData[i] !== '\t' &&
  47. xmlData[i] !== '\n' &&
  48. xmlData[i] !== '\r'; i++
  49. ) {
  50. tagName += xmlData[i];
  51. }
  52. tagName = tagName.trim();
  53. //console.log(tagName);
  54. if (tagName[tagName.length - 1] === '/') {
  55. //self closing tag without attributes
  56. tagName = tagName.substring(0, tagName.length - 1);
  57. //continue;
  58. i--;
  59. }
  60. if (!validateTagName(tagName)) {
  61. let msg;
  62. if (tagName.trim().length === 0) {
  63. msg = "Invalid space after '<'.";
  64. } else {
  65. msg = "Tag '"+tagName+"' is an invalid name.";
  66. }
  67. return getErrorObject('InvalidTag', msg, getLineNumberForPosition(xmlData, i));
  68. }
  69. const result = readAttributeStr(xmlData, i);
  70. if (result === false) {
  71. return getErrorObject('InvalidAttr', "Attributes for '"+tagName+"' have open quote.", getLineNumberForPosition(xmlData, i));
  72. }
  73. let attrStr = result.value;
  74. i = result.index;
  75. if (attrStr[attrStr.length - 1] === '/') {
  76. //self closing tag
  77. const attrStrStart = i - attrStr.length;
  78. attrStr = attrStr.substring(0, attrStr.length - 1);
  79. const isValid = validateAttributeString(attrStr, options);
  80. if (isValid === true) {
  81. tagFound = true;
  82. //continue; //text may presents after self closing tag
  83. } else {
  84. //the result from the nested function returns the position of the error within the attribute
  85. //in order to get the 'true' error line, we need to calculate the position where the attribute begins (i - attrStr.length) and then add the position within the attribute
  86. //this gives us the absolute index in the entire xml, which we can use to find the line at last
  87. return getErrorObject(isValid.err.code, isValid.err.msg, getLineNumberForPosition(xmlData, attrStrStart + isValid.err.line));
  88. }
  89. } else if (closingTag) {
  90. if (!result.tagClosed) {
  91. return getErrorObject('InvalidTag', "Closing tag '"+tagName+"' doesn't have proper closing.", getLineNumberForPosition(xmlData, i));
  92. } else if (attrStr.trim().length > 0) {
  93. return getErrorObject('InvalidTag', "Closing tag '"+tagName+"' can't have attributes or invalid starting.", getLineNumberForPosition(xmlData, tagStartPos));
  94. } else if (tags.length === 0) {
  95. return getErrorObject('InvalidTag', "Closing tag '"+tagName+"' has not been opened.", getLineNumberForPosition(xmlData, tagStartPos));
  96. } else {
  97. const otg = tags.pop();
  98. if (tagName !== otg.tagName) {
  99. let openPos = getLineNumberForPosition(xmlData, otg.tagStartPos);
  100. return getErrorObject('InvalidTag',
  101. "Expected closing tag '"+otg.tagName+"' (opened in line "+openPos.line+", col "+openPos.col+") instead of closing tag '"+tagName+"'.",
  102. getLineNumberForPosition(xmlData, tagStartPos));
  103. }
  104. //when there are no more tags, we reached the root level.
  105. if (tags.length == 0) {
  106. reachedRoot = true;
  107. }
  108. }
  109. } else {
  110. const isValid = validateAttributeString(attrStr, options);
  111. if (isValid !== true) {
  112. //the result from the nested function returns the position of the error within the attribute
  113. //in order to get the 'true' error line, we need to calculate the position where the attribute begins (i - attrStr.length) and then add the position within the attribute
  114. //this gives us the absolute index in the entire xml, which we can use to find the line at last
  115. return getErrorObject(isValid.err.code, isValid.err.msg, getLineNumberForPosition(xmlData, i - attrStr.length + isValid.err.line));
  116. }
  117. //if the root level has been reached before ...
  118. if (reachedRoot === true) {
  119. return getErrorObject('InvalidXml', 'Multiple possible root nodes found.', getLineNumberForPosition(xmlData, i));
  120. } else if(options.unpairedTags.indexOf(tagName) !== -1){
  121. //don't push into stack
  122. } else {
  123. tags.push({tagName, tagStartPos});
  124. }
  125. tagFound = true;
  126. }
  127. //skip tag text value
  128. //It may include comments and CDATA value
  129. for (i++; i < xmlData.length; i++) {
  130. if (xmlData[i] === '<') {
  131. if (xmlData[i + 1] === '!') {
  132. //comment or CADATA
  133. i++;
  134. i = readCommentAndCDATA(xmlData, i);
  135. continue;
  136. } else if (xmlData[i+1] === '?') {
  137. i = readPI(xmlData, ++i);
  138. if (i.err) return i;
  139. } else{
  140. break;
  141. }
  142. } else if (xmlData[i] === '&') {
  143. const afterAmp = validateAmpersand(xmlData, i);
  144. if (afterAmp == -1)
  145. return getErrorObject('InvalidChar', "char '&' is not expected.", getLineNumberForPosition(xmlData, i));
  146. i = afterAmp;
  147. }else{
  148. if (reachedRoot === true && !isWhiteSpace(xmlData[i])) {
  149. return getErrorObject('InvalidXml', "Extra text at the end", getLineNumberForPosition(xmlData, i));
  150. }
  151. }
  152. } //end of reading tag text value
  153. if (xmlData[i] === '<') {
  154. i--;
  155. }
  156. }
  157. } else {
  158. if ( isWhiteSpace(xmlData[i])) {
  159. continue;
  160. }
  161. return getErrorObject('InvalidChar', "char '"+xmlData[i]+"' is not expected.", getLineNumberForPosition(xmlData, i));
  162. }
  163. }
  164. if (!tagFound) {
  165. return getErrorObject('InvalidXml', 'Start tag expected.', 1);
  166. }else if (tags.length == 1) {
  167. return getErrorObject('InvalidTag', "Unclosed tag '"+tags[0].tagName+"'.", getLineNumberForPosition(xmlData, tags[0].tagStartPos));
  168. }else if (tags.length > 0) {
  169. return getErrorObject('InvalidXml', "Invalid '"+
  170. JSON.stringify(tags.map(t => t.tagName), null, 4).replace(/\r?\n/g, '')+
  171. "' found.", {line: 1, col: 1});
  172. }
  173. return true;
  174. };
  175. function isWhiteSpace(char){
  176. return char === ' ' || char === '\t' || char === '\n' || char === '\r';
  177. }
  178. /**
  179. * Read Processing insstructions and skip
  180. * @param {*} xmlData
  181. * @param {*} i
  182. */
  183. function readPI(xmlData, i) {
  184. const start = i;
  185. for (; i < xmlData.length; i++) {
  186. if (xmlData[i] == '?' || xmlData[i] == ' ') {
  187. //tagname
  188. const tagname = xmlData.substr(start, i - start);
  189. if (i > 5 && tagname === 'xml') {
  190. return getErrorObject('InvalidXml', 'XML declaration allowed only at the start of the document.', getLineNumberForPosition(xmlData, i));
  191. } else if (xmlData[i] == '?' && xmlData[i + 1] == '>') {
  192. //check if valid attribut string
  193. i++;
  194. break;
  195. } else {
  196. continue;
  197. }
  198. }
  199. }
  200. return i;
  201. }
  202. function readCommentAndCDATA(xmlData, i) {
  203. if (xmlData.length > i + 5 && xmlData[i + 1] === '-' && xmlData[i + 2] === '-') {
  204. //comment
  205. for (i += 3; i < xmlData.length; i++) {
  206. if (xmlData[i] === '-' && xmlData[i + 1] === '-' && xmlData[i + 2] === '>') {
  207. i += 2;
  208. break;
  209. }
  210. }
  211. } else if (
  212. xmlData.length > i + 8 &&
  213. xmlData[i + 1] === 'D' &&
  214. xmlData[i + 2] === 'O' &&
  215. xmlData[i + 3] === 'C' &&
  216. xmlData[i + 4] === 'T' &&
  217. xmlData[i + 5] === 'Y' &&
  218. xmlData[i + 6] === 'P' &&
  219. xmlData[i + 7] === 'E'
  220. ) {
  221. let angleBracketsCount = 1;
  222. for (i += 8; i < xmlData.length; i++) {
  223. if (xmlData[i] === '<') {
  224. angleBracketsCount++;
  225. } else if (xmlData[i] === '>') {
  226. angleBracketsCount--;
  227. if (angleBracketsCount === 0) {
  228. break;
  229. }
  230. }
  231. }
  232. } else if (
  233. xmlData.length > i + 9 &&
  234. xmlData[i + 1] === '[' &&
  235. xmlData[i + 2] === 'C' &&
  236. xmlData[i + 3] === 'D' &&
  237. xmlData[i + 4] === 'A' &&
  238. xmlData[i + 5] === 'T' &&
  239. xmlData[i + 6] === 'A' &&
  240. xmlData[i + 7] === '['
  241. ) {
  242. for (i += 8; i < xmlData.length; i++) {
  243. if (xmlData[i] === ']' && xmlData[i + 1] === ']' && xmlData[i + 2] === '>') {
  244. i += 2;
  245. break;
  246. }
  247. }
  248. }
  249. return i;
  250. }
  251. const doubleQuote = '"';
  252. const singleQuote = "'";
  253. /**
  254. * Keep reading xmlData until '<' is found outside the attribute value.
  255. * @param {string} xmlData
  256. * @param {number} i
  257. */
  258. function readAttributeStr(xmlData, i) {
  259. let attrStr = '';
  260. let startChar = '';
  261. let tagClosed = false;
  262. for (; i < xmlData.length; i++) {
  263. if (xmlData[i] === doubleQuote || xmlData[i] === singleQuote) {
  264. if (startChar === '') {
  265. startChar = xmlData[i];
  266. } else if (startChar !== xmlData[i]) {
  267. //if vaue is enclosed with double quote then single quotes are allowed inside the value and vice versa
  268. } else {
  269. startChar = '';
  270. }
  271. } else if (xmlData[i] === '>') {
  272. if (startChar === '') {
  273. tagClosed = true;
  274. break;
  275. }
  276. }
  277. attrStr += xmlData[i];
  278. }
  279. if (startChar !== '') {
  280. return false;
  281. }
  282. return {
  283. value: attrStr,
  284. index: i,
  285. tagClosed: tagClosed
  286. };
  287. }
  288. /**
  289. * Select all the attributes whether valid or invalid.
  290. */
  291. const validAttrStrRegxp = new RegExp('(\\s*)([^\\s=]+)(\\s*=)?(\\s*([\'"])(([\\s\\S])*?)\\5)?', 'g');
  292. //attr, ="sd", a="amit's", a="sd"b="saf", ab cd=""
  293. function validateAttributeString(attrStr, options) {
  294. //console.log("start:"+attrStr+":end");
  295. //if(attrStr.trim().length === 0) return true; //empty string
  296. const matches = util.getAllMatches(attrStr, validAttrStrRegxp);
  297. const attrNames = {};
  298. for (let i = 0; i < matches.length; i++) {
  299. if (matches[i][1].length === 0) {
  300. //nospace before attribute name: a="sd"b="saf"
  301. return getErrorObject('InvalidAttr', "Attribute '"+matches[i][2]+"' has no space in starting.", getPositionFromMatch(matches[i]))
  302. } else if (matches[i][3] !== undefined && matches[i][4] === undefined) {
  303. return getErrorObject('InvalidAttr', "Attribute '"+matches[i][2]+"' is without value.", getPositionFromMatch(matches[i]));
  304. } else if (matches[i][3] === undefined && !options.allowBooleanAttributes) {
  305. //independent attribute: ab
  306. return getErrorObject('InvalidAttr', "boolean attribute '"+matches[i][2]+"' is not allowed.", getPositionFromMatch(matches[i]));
  307. }
  308. /* else if(matches[i][6] === undefined){//attribute without value: ab=
  309. return { err: { code:"InvalidAttr",msg:"attribute " + matches[i][2] + " has no value assigned."}};
  310. } */
  311. const attrName = matches[i][2];
  312. if (!validateAttrName(attrName)) {
  313. return getErrorObject('InvalidAttr', "Attribute '"+attrName+"' is an invalid name.", getPositionFromMatch(matches[i]));
  314. }
  315. if (!attrNames.hasOwnProperty(attrName)) {
  316. //check for duplicate attribute.
  317. attrNames[attrName] = 1;
  318. } else {
  319. return getErrorObject('InvalidAttr', "Attribute '"+attrName+"' is repeated.", getPositionFromMatch(matches[i]));
  320. }
  321. }
  322. return true;
  323. }
  324. function validateNumberAmpersand(xmlData, i) {
  325. let re = /\d/;
  326. if (xmlData[i] === 'x') {
  327. i++;
  328. re = /[\da-fA-F]/;
  329. }
  330. for (; i < xmlData.length; i++) {
  331. if (xmlData[i] === ';')
  332. return i;
  333. if (!xmlData[i].match(re))
  334. break;
  335. }
  336. return -1;
  337. }
  338. function validateAmpersand(xmlData, i) {
  339. // https://www.w3.org/TR/xml/#dt-charref
  340. i++;
  341. if (xmlData[i] === ';')
  342. return -1;
  343. if (xmlData[i] === '#') {
  344. i++;
  345. return validateNumberAmpersand(xmlData, i);
  346. }
  347. let count = 0;
  348. for (; i < xmlData.length; i++, count++) {
  349. if (xmlData[i].match(/\w/) && count < 20)
  350. continue;
  351. if (xmlData[i] === ';')
  352. break;
  353. return -1;
  354. }
  355. return i;
  356. }
  357. function getErrorObject(code, message, lineNumber) {
  358. return {
  359. err: {
  360. code: code,
  361. msg: message,
  362. line: lineNumber.line || lineNumber,
  363. col: lineNumber.col,
  364. },
  365. };
  366. }
  367. function validateAttrName(attrName) {
  368. return util.isName(attrName);
  369. }
  370. // const startsWithXML = /^xml/i;
  371. function validateTagName(tagname) {
  372. return util.isName(tagname) /* && !tagname.match(startsWithXML) */;
  373. }
  374. //this function returns the line number for the character at the given index
  375. function getLineNumberForPosition(xmlData, index) {
  376. const lines = xmlData.substring(0, index).split(/\r?\n/);
  377. return {
  378. line: lines.length,
  379. // column number is last line's length + 1, because column numbering starts at 1:
  380. col: lines[lines.length - 1].length + 1
  381. };
  382. }
  383. //this function returns the position of the first character of match within attrStr
  384. function getPositionFromMatch(match) {
  385. return match.startIndex + match[1].length;
  386. }