大家好,又见面了,我是你们的朋友全栈君。
本文整理汇总了Python中lxml.etree.ParserError异常的典型用法代码示例。如果您正苦于以下问题:Python etree.ParserError异常的具体用法?Python etree.ParserError怎么用?Python etree.ParserError使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该异常所在模块lxml.etree的用法示例。
在下文中一共展示了etree.ParserError异常的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: feed
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def feed(self, markup):
    """Feed ``markup`` to a freshly created parser, one chunk at a time.

    ``bytes`` and text input are wrapped in an in-memory stream; any other
    object is used as-is and must provide ``read(size)``.

    Raises:
        ParserRejectedMarkup: if creating, feeding or closing the parser
            raises ``UnicodeDecodeError``, ``LookupError`` or
            ``etree.ParserError``.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` so that the
    # text branch also works on Python 3.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    if isinstance(markup, bytes):
        markup = BytesIO(markup)
    elif isinstance(markup, text_type):
        markup = StringIO(markup)

    # Call feed() at least once, even if the markup is empty,
    # or the parser won't be initialized.
    data = markup.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(data)
        while len(data) != 0:
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if len(data) != 0:
                self.parser.feed(data)
        self.parser.close()
    # ``except ... as e`` is valid on Python 2.6+ and Python 3, unlike the
    # Python-2-only ``except ..., e`` spelling.
    except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
        raise ParserRejectedMarkup(str(e))
开发者ID:MarcelloLins,项目名称:ServerlessCrawler-VancouverRealState,代码行数:22,
示例2: feed
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def feed(self, markup):
    """Feed ``markup`` to a freshly created parser, one chunk at a time.

    ``bytes`` and ``str`` input are wrapped in an in-memory stream; any
    other object is used as-is and must provide ``read(size)``.

    Raises:
        ParserRejectedMarkup: if creating, feeding or closing the parser
            raises ``UnicodeDecodeError``, ``LookupError`` or
            ``etree.ParserError``. The message is ``str()`` of the
            original exception.
    """
    if isinstance(markup, bytes):
        markup = BytesIO(markup)
    elif isinstance(markup, str):
        markup = StringIO(markup)

    # Call feed() at least once, even if the markup is empty,
    # or the parser won't be initialized.
    data = markup.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(data)
        while len(data) != 0:
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if len(data) != 0:
                self.parser.feed(data)
        self.parser.close()
    except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
        raise ParserRejectedMarkup(str(e))
开发者ID:the-ethan-hunt,项目名称:B.E.N.J.I.,代码行数:22,
示例3: extract_html_content
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def extract_html_content(self, html_body, fix_html=True):
    """Ingestor implementation.

    Parse ``html_body``, then extract the header, run the cleaner, extract
    the text and emit both the HTML body and the text on ``self.result``.
    Returns ``None`` immediately when ``html_body`` is ``None``.

    ``fix_html`` is accepted but not used inside this method.

    Raises:
        ProcessingException: if the HTML cannot be parsed, even after the
            first ``RE_XML_ENCODING`` match has been removed.
    """
    if html_body is None:
        return
    try:
        try:
            doc = html.fromstring(html_body)
        except ValueError:
            # Ship around encoding declarations.
            # https://stackoverflow.com/questions/3402520
            html_body = self.RE_XML_ENCODING.sub("", html_body, count=1)
            doc = html.fromstring(html_body)
    except (ParserError, ParseError, ValueError):
        raise ProcessingException("HTML could not be parsed.")
    self.extract_html_header(doc)
    self.cleaner(doc)
    text = self.extract_html_text(doc)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
开发者ID:occrp-attic,项目名称:ingestors,代码行数:22,
示例4: ingest
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def ingest(self, file_path):
    """Ingestor implementation.

    Parse the XML file at ``file_path``, extract its text from the root
    element, transform the document with ``self.XSLT`` and emit the
    pretty-printed HTML together with the text on ``self.result``.

    Raises:
        ProcessingException: if the file is larger than ``self.MAX_SIZE``
            or cannot be parsed as XML.
    """
    # Prefer the size already recorded on the result; otherwise ask the OS.
    file_size = self.result.size or os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")
    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")
    text = self.extract_html_text(doc.getroot())
    transform = etree.XSLT(self.XSLT)
    html_doc = transform(doc)
    html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
开发者ID:occrp-attic,项目名称:ingestors,代码行数:19,
示例5: _retrieve_html_page
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def _retrieve_html_page(self):
    """
    Download the requested player's stats page.

    Download the requested page and strip all of the comment tags before
    returning a PyQuery object which will be used to parse the data.
    Oftentimes, important data is contained in tables which are hidden in
    HTML comments and not accessible via PyQuery.

    Returns
    -------
    PyQuery object
        The requested page is returned as a queriable PyQuery object with
        the comment tags removed. ``None`` is returned instead when the
        download raises ``HTTPError`` or ``ParserError``, or when the page
        text contains the "Page Not Found (404 error)" marker.
    """
    url = self._build_url()
    try:
        url_data = pq(url)
    except (HTTPError, ParserError):
        return None
    # For NFL, a 404 page doesn't actually raise a 404 error, so it needs
    # to be manually checked.
    if "Page Not Found (404 error)" in str(url_data):
        return None
    return pq(utils._remove_html_comment_tags(url_data))
开发者ID:roclark,项目名称:sportsreference,代码行数:27,
示例6: _retrieve_html_page
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def _retrieve_html_page(self):
    """
    Download the requested player's stats page.

    Download the requested page and strip all of the comment tags before
    returning a pyquery object which will be used to parse the data.

    Returns
    -------
    PyQuery object
        The requested page is returned as a queriable PyQuery object with
        the comment tags removed. ``None`` is returned instead when the
        download raises ``HTTPError`` or ``ParserError``.
    """
    url = self._build_url()
    try:
        url_data = pq(url)
    except (HTTPError, ParserError):
        return None
    return pq(utils._remove_html_comment_tags(url_data))
开发者ID:roclark,项目名称:sportsreference,代码行数:21,
示例7: _retrieve_html_page
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def _retrieve_html_page(self):
    """
    Download the requested player's stats page.

    Download the requested page and strip all of the comment tags before
    returning a pyquery object which will be used to parse the data.
    The URL is built by interpolating ``self._player_id`` into
    ``PLAYER_URL``.

    Returns
    -------
    PyQuery object
        The requested page is returned as a queriable PyQuery object with
        the comment tags removed. ``None`` is returned instead when the
        download raises ``HTTPError`` or ``ParserError``.
    """
    url = PLAYER_URL % self._player_id
    try:
        url_data = pq(url)
    except (HTTPError, ParserError):
        return None
    return pq(utils._remove_html_comment_tags(url_data))
开发者ID:roclark,项目名称:sportsreference,代码行数:21,
示例8: _pull_conference_page
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def _pull_conference_page(self, conference_abbreviation, year):
    """
    Download the conference page.

    Download the conference page for the requested conference and season
    and create a PyQuery object.

    Parameters
    ----------
    conference_abbreviation : string
        A string of the requested conference's abbreviation, such as
        'big-12'.
    year : string
        A string of the requested year to pull conference information from.

    Returns
    -------
    PyQuery object
        The downloaded conference page, or ``None`` when the download
        raises ``HTTPError`` or ``ParserError``.
    """
    try:
        return pq(CONFERENCE_URL % (conference_abbreviation, year))
    except (HTTPError, ParserError):
        return None
开发者ID:roclark,项目名称:sportsreference,代码行数:21,
示例9: feed
点赞 6
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import ParserError [as 别名]
def feed(self, markup):
    """Feed ``markup`` to a freshly created parser, one chunk at a time.

    ``bytes`` and ``str`` input are wrapped in an in-memory stream; any
    other object is used as-is and must provide ``read(size)``.

    Raises:
        ParserRejectedMarkup: if creating, feeding or closing the parser
            raises ``UnicodeDecodeError``, ``LookupError`` or
            ``etree.ParserError``. The original exception object is passed
            as the argument.
    """
    if isinstance(markup, bytes):
        markup = BytesIO(markup)
    elif isinstance(markup, str):
        markup = StringIO(markup)

    # Call feed() at least once, even if the markup is empty,
    # or the parser won't be initialized.
    data = markup.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(data)
        while len(data) != 0:
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if len(data) != 0:
                self.parser.feed(data)
        self.parser.close()
    except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
        raise ParserRejectedMarkup(e)
开发者ID:Tautulli,项目名称:Tautulli,代码行数:22,
注:本文中的lxml.etree.ParserError异常示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
发布者:全栈程序员-用户IM,转载请注明出处:https://javaforall.cn/149151.html ,原文链接:https://javaforall.cn
【正版授权,激活自己账号】: Jetbrains全家桶Ide使用,1年售后保障,每天仅需1毛
【官方授权 正版激活】: 官方授权 正版激活 支持Jetbrains家族下所有IDE 使用个人JB账号...