第三方XML解析器(xpath.js)给出错误“未被捕获的结束标记名称:div与当前的起始标记名称不匹配”

问题描述:

我在使用 parse.com 的云代码(Cloud Code)时,试图从网页上抓取数据并发送给我的 iOS 应用。我已经在 iOS 端本地实现了网页抓取代码,但现在想把这项任务移到后端。我使用的是第三方 XML 解析器 xpath.js,它给出错误“未被捕获的结束标记名称:div与当前的起始标记名称不匹配”。代码如下:

// Cloud function "test": downloads the UCLA dining menu page, parses it with the
// bundled DOM parser and reports how many menu grid cells were found.
Parse.Cloud.define("test", function(request, response) {
    Parse.Cloud.httpRequest({
        url: "http://menu.ha.ucla.edu/foodpro/default.asp",
        success: function(httpResponse) {
            try {
                var text = httpResponse.text;
                var xpath = require("cloud/xpath.js");
                var dom = require("cloud/dom-parser.js").DOMParser;
                var doc = new dom().parseFromString(text);
                var cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);

                // Only search inside the first cell when there is one; the lookup
                // also happens before the response is sent so a failure is reported.
                var listNode = cells.length > 0 ? xpath.select("//ul", cells[0])[0] : undefined;

                // xpath.select returns a plain array: its size is `length`, not `count`.
                response.success("test " + cells.length);
            } catch (e) {
                // The parser's fatalError handler throws; report it instead of
                // leaving the cloud function without a response.
                console.error('Parsing failed: ' + e);
                response.error('Parsing failed: ' + e);
            }
        },
        error: function(httpResponse) {
            console.error('Request failed with response code ' + httpResponse.status);
            // Complete the cloud function so the caller is not left waiting.
            response.error('Request failed with response code ' + httpResponse.status);
        }
    });
});

xpath.js 是一个 node.js 库。但是,当我运行这段代码时,收到了如下错误:

"Uncaught end tag name: div is not match the current start tagName:script" 

正如我前面提到的,我已经能够用另一个 Objective-C 库成功抓取该网页的数据,所以标签是匹配的,问题不在网页源代码本身。

至于源代码,这里是我要抓取的网页。*(网站不允许我直接链接到页面源代码,否则我会给出直接链接。)

编辑:

以下是 dom-parser.js 的内容:

/**
 * Creates a parser and stores its configuration on `this.options`.
 * When `options` is missing (or otherwise falsy) a default object holding
 * an empty `locator` is used instead.
 *
 * @param {Object} [options] parser configuration, kept by reference
 */
function DOMParser(options){
    if(!options){
        options = {locator:{}};
    }
    this.options = options;
}
/**
 * Feeds `source` to a SAX reader and returns the document assembled by the DOM builder.
 *
 * The builder, error handler, locator and default namespace map are taken from
 * `this.options`. When `mimeType` ends in "/html", "/xhtml" (or the "htm"
 * variants) the `nbsp` and `copy` entities are added and the default namespace
 * is set to XHTML. Note that the namespace is written into `options.xmlns`
 * itself when the caller supplied one.
 *
 * @param {string} source   markup to parse; a falsy value is reported through the error handler
 * @param {string} mimeType MIME type of the source
 * @returns the `document` property of the DOM builder
 */
DOMParser.prototype.parseFromString = function(source,mimeType){
    var settings = this.options;
    var reader = new XMLReader();
    // the builder acts as contentHandler and LexicalHandler
    var builder = settings.domBuilder || new DOMHandler();
    var locator = settings.locator;
    var namespaces = settings.xmlns || {};
    var entities = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"};

    if(locator){
        builder.setDocumentLocator(locator);
    }

    reader.errorHandler = buildErrorHandler(settings.errorHandler,builder,locator);
    reader.domBuilder = settings.domBuilder || builder;

    var isHtml = /\/x?html?$/.test(mimeType);
    if(isHtml){
        entities.nbsp = '\xa0';
        entities.copy = '\xa9';
        namespaces[''] = 'http://www.w3.org/1999/xhtml';
    }

    if(!source){
        reader.errorHandler.error("invalid document source");
    }else{
        reader.parse(source,namespaces,entities);
    }
    return builder.document;
};
/**
 * Builds the object whose `warning`, `error` and `fatalError` functions receive
 * parser diagnostics.
 *
 * - Without `errorImpl`, a DOMHandler `domBuilder` is returned unchanged;
 *   any other `domBuilder` is used as the implementation.
 * - `errorImpl` may be an object with per-level functions or a single callback.
 *   A callback declaring exactly two parameters is called as `(level, message)`,
 *   any other callback as `(message)`.
 * - A level missing from an object implementation falls back to the other
 *   listed levels, tried from last to first; with no match it becomes a no-op.
 *
 * Every message gets the locator description appended before it is forwarded.
 *
 * @param errorImpl  object, function, or falsy
 * @param domBuilder builder used when `errorImpl` is falsy
 * @param locator    locator passed to `_locator`; defaults to an empty object
 * @returns object with `warning`, `error` and `fatalError` functions (or `domBuilder` itself)
 */
function buildErrorHandler(errorImpl,domBuilder,locator){
    if(!errorImpl){
        if(domBuilder instanceof DOMHandler){
            return domBuilder;
        }
        errorImpl = domBuilder;
    }

    var handlers = {};
    var implIsFunction = errorImpl instanceof Function;
    locator = locator || {};

    // names[0] is the level being registered, the remaining entries are fallbacks.
    function resolve(names){
        var level = names[0];
        var found = errorImpl[level];
        if(found){
            return found;
        }
        if(implIsFunction){
            if(errorImpl.length == 2){
                return function(msg){errorImpl(level,msg)};
            }
            return errorImpl;
        }
        for(var idx = names.length - 1; idx > 0; idx--){
            found = errorImpl[names[idx]];
            if(found){
                break;
            }
        }
        return found;
    }

    function register(level){
        var target = resolve(arguments);
        if(target){
            handlers[level] = function(msg){
                target(msg+_locator(locator));
            };
        }else{
            handlers[level] = function(){};
        }
    }

    register('warning','warn');
    register('error','warn','warning');
    register('fatalError','warn','warning','error');
    return handlers;
}
/**
 * Handler object that receives SAX-style parser callbacks.
 *
 * Interface summary kept from the original header (the +/- markers presumably
 * mean "covered" / "not covered" -- confirm against the methods on the prototype):
 * +ContentHandler+ErrorHandler
 * +LexicalHandler+EntityResolver2
 * -DeclHandler-DTDHandler
 *
 * DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
 * DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
 * @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
 */
function DOMHandler() {
    // Set by startCDATA()/endCDATA(); characters() reads it to pick the node type.
    this.cdata = false;
}
/**
 * Copies the locator's current line and column onto a node.
 *
 * @param locator object exposing `lineNumber` and `columnNumber`
 * @param node    object that receives both values under the same names
 */
function position(locator,node){
    var line = locator.lineNumber;
    node.lineNumber = line;
    var column = locator.columnNumber;
    node.columnNumber = column;
}
/**
 * Parser callbacks of DOMHandler. Each callback turns one parser event into
 * DOM nodes on `this.document`, tracking the open element in `this.currentElement`.
 *
 * @see org.xml.sax.ContentHandler#startDocument
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
 */
DOMHandler.prototype = {
    // Creates the empty document; copies the locator's systemId when a locator is set.
    startDocument : function() {
        var created = new DOMImplementation().createDocument(null, null, null);
        this.document = created;
        if (this.locator) {
            created.documentURI = this.locator.systemId;
        }
    },
    // Creates the element, makes it the current element and copies its attributes.
    startElement:function(namespaceURI, localName, qName, attrs) {
        var doc = this.document;
        var el = doc.createElementNS(namespaceURI, qName||localName);
        var attrCount = attrs.length;
        appendElement(this, el);
        this.currentElement = el;

        if (this.locator) {
            position(this.locator,el);
        }
        for (var idx = 0; idx < attrCount; idx++) {
            var attrURI = attrs.getURI(idx);
            var attrValue = attrs.getValue(idx);
            var attrQName = attrs.getQName(idx);
            var attr = doc.createAttributeNS(attrURI, attrQName);
            // NOTE(review): getOffset is looked up on the new attribute node, not on attrs -- confirm intent.
            if(attr.getOffset){
                position(attr.getOffset(1),attr);
            }
            attr.nodeValue = attrValue;
            attr.value = attrValue;
            el.setAttributeNode(attr);
        }
    },
    // Closes the current element by stepping back to its parent.
    endElement:function(namespaceURI, localName, qName) {
        var closing = this.currentElement;
        var tagName = closing.tagName; // read but not used further
        this.currentElement = closing.parentNode;
    },
    startPrefixMapping:function(prefix, uri) {
    },
    endPrefixMapping:function(prefix) {
    },
    // Appends a processing-instruction node at the current position.
    processingInstruction:function(target, data) {
        var ins = this.document.createProcessingInstruction(target, data);
        if (this.locator) {
            position(this.locator,ins);
        }
        appendElement(this, ins);
    },
    ignorableWhitespace:function(ch, start, length) {
    },
    // Appends non-empty character data to the current element, as a CDATA
    // section while inside startCDATA()/endCDATA(), otherwise as a text node.
    characters:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        if (!this.currentElement || !text) {
            return;
        }
        var charNode = this.cdata
            ? this.document.createCDATASection(text)
            : this.document.createTextNode(text);
        this.currentElement.appendChild(charNode);
        if (this.locator) {
            position(this.locator,charNode);
        }
    },
    skippedEntity:function(name) {
    },
    endDocument:function() {
        this.document.normalize();
    },
    // Stores the locator; a truthy locator always has its lineNumber reset to 0.
    setDocumentLocator:function (locator) {
        this.locator = locator;
        if (locator) {
            locator.lineNumber = 0;
        }
    },
    //LexicalHandler
    // Appends a comment node at the current position.
    comment:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        var comm = this.document.createComment(text);
        if (this.locator) {
            position(this.locator,comm);
        }
        appendElement(this, comm);
    },

    startCDATA:function() {
        // consulted by characters()
        this.cdata = true;
    },
    endCDATA:function() {
        this.cdata = false;
    },

    // Appends a doctype node when the document's implementation can create one.
    startDTD:function(name, publicId, systemId) {
        var impl = this.document.implementation;
        if (!impl || !impl.createDocumentType) {
            return;
        }
        var dt = impl.createDocumentType(name, publicId, systemId);
        if (this.locator) {
            position(this.locator,dt);
        }
        appendElement(this, dt);
    },
    /**
    * Diagnostics: warning and error only log; fatalError logs and then throws
    * the value it was given.
    * @see org.xml.sax.ErrorHandler
    * @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
    */
    warning:function(error) {
        console.warn(error,_locator(this.locator));
    },
    error:function(error) {
        console.error(error,_locator(this.locator));
    },
    fatalError:function(error) {
        console.error(error,_locator(this.locator));
        throw error;
    }
};
/**
 * Formats a locator as a suffix for diagnostic messages.
 *
 * @param l locator with an optional `systemId` plus `lineNumber` and `columnNumber`
 * @returns {string|undefined} "\n@<systemId>#[line:<line>,col:<col>]", or
 *          undefined when no locator is given
 */
function _locator(l){
    if(l){
        // The prefix is a newline followed by '@'. The pasted copy of this file
        // carried an e-mail-obfuscation placeholder here instead, which would
        // have been emitted verbatim in every diagnostic.
        return '\n@'+(l.systemId ||'')+'#[line:'+l.lineNumber+',col:'+l.columnNumber+']';
    }
}
/**
 * Extracts the text handed to a character-data callback.
 *
 * @param chars  a JavaScript string, or a character buffer supplied by a Java SAX parser
 * @param start  offset of the first character
 * @param length number of characters
 * @returns the selected substring for string input; for other input a string
 *          built via java.lang.String when the buffer covers the requested
 *          range or `start` is non-zero, otherwise `chars` unchanged
 */
function _toString(chars,start,length){
    if(typeof chars == 'string'){
        return chars.substr(start,length);
    }
    // java sax connected with xmldom on rhino (what about: "? && !(chars instanceof String)")
    var coversRange = chars.length >= start+length;
    if(coversRange || start){
        return new java.lang.String(chars,start,length)+'';
    }
    return chars;
}

/*
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
 * used method of org.xml.sax.ext.LexicalHandler:
 *  #comment(chars, start, length)
 *  #startCDATA()
 *  #endCDATA()
 *  #startDTD(name, publicId, systemId)
 *
 *
 * IGNORED method of org.xml.sax.ext.LexicalHandler:
 *  #endDTD()
 *  #startEntity(name)
 *  #endEntity(name)
 *
 *
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
 * IGNORED method of org.xml.sax.ext.DeclHandler
 *  #attributeDecl(eName, aName, type, mode, value)
 *  #elementDecl(name, model)
 *  #externalEntityDecl(name, publicId, systemId)
 *  #internalEntityDecl(name, value)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
 * IGNORED method of org.xml.sax.EntityResolver2
 *  #resolveEntity(String name,String publicId,String baseURI,String systemId)
 *  #resolveEntity(publicId, systemId)
 *  #getExternalSubset(name, baseURI)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
 * IGNORED method of org.xml.sax.DTDHandler
 *  #notationDecl(name, publicId, systemId) {};
 *  #unparsedEntityDecl(name, publicId, systemId, notationName) {};
 */
// Install a stub returning null on DOMHandler.prototype for every ignored callback listed above.
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".split(',').forEach(function(key){
    DOMHandler.prototype[key] = function(){
        return null;
    };
});

/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */ 
/**
 * Appends `node` to the handler's current element, or to the document itself
 * when no element is currently open.
 *
 * @param hander handler exposing `currentElement` and `document`
 * @param node   node to append
 */
function appendElement (hander,node) {
    var parent = hander.currentElement || hander.document;
    parent.appendChild(node);
}//appendChild and setAttributeNS are performance key

if(typeof require == 'function'){ 
    var XMLReader = require('cloud/sax').XMLReader; 
    var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation; 
    exports.XMLSerializer = require('cloud/dom').XMLSerializer ; 
    exports.DOMParser = DOMParser; 
} 
+0

在这种情况下,'cloud/dom-parser.js' 是哪个库?它可能把内容当作 XML 而不是 HTML 来解析。如果你改用 `.parseFromString(text, 'text/html');` 会怎样? – loganfsmyth 2014-09-27 17:59:03

+0

不行——还是同样的错误。我不知道 dom-parser.js 属于哪个库,但我已把该文件中的代码添加到问题里 – Mahir 2014-09-28 06:58:19

+0

您可以创建一个jsfiddle来演示此问题吗? – 2014-10-08 21:59:27

该页面的 HTML 脚本(<script>)中包含一些 XML 标记。其中的开始标签可能被忽略了,因为它们前面带有引号(quotation marks)。解析器在脚本的某个字符串里找到 </div>,试图让它与开头的 <script> 匹配,结果失败。你的解析器按 XML 读取文档,却不知道 XHTML 的脚本区域应当作为 CDATA 处理。

您必须让解析器忽略脚本标签(或将其内容作为 CDATA 读取)。抱歉,我不知道具体该怎么做。

最好的问候Majo