对Python3 解析html的几种操作方式小结

yipeiwu_com6年前Python基础

解析html是爬虫抓取数据之后一个重要的数据处理环节。以下记录解析html的几种方式。

先介绍基础的辅助函数,主要用于获取html并输出解析后的结果。

# The parsing function is passed in as an argument so it can be swapped easily below.
def get_html(url, paraser=bs4_paraser):
    """Download ``url`` and return the result of running ``paraser`` on its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the response body (decompressed when
            the server sent it gzip-compressed) and returns the parsed value.
            Defaults to ``bs4_paraser``.

    Returns:
        Whatever ``paraser`` returns, or ``None`` when the response status
        code is not 200.
    """
    # No explicit 'Host' header: urllib2 derives it from the URL, so the
    # function also works for other hosts and across redirects.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        body = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # Only gunzip when the payload really is gzip (magic bytes 1f 8b);
    # a server may ignore Accept-Encoding and answer with plain HTML.
    if body[:2] == b'\x1f\x8b':
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    return paraser(body)
 
 
# Fetch a sample page with the lxml-based parser and print every extracted row.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response; treat that as "no rows".
for row in value or []:
    # Parenthesised form works both as a Python 2 statement and a Python 3 call.
    print(row)

1,使用lxml.html的方式进行解析:

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官网](http://lxml.de/)

def lxml_parser(page):
    """Extract the review entries from ``page`` using lxml XPath queries.

    Every ``div`` with class ``item`` found below a ``div`` with class
    ``yingping-list-wrap`` yields one dict with the keys ``title``,
    ``title_href``, ``score``, ``time`` and ``people``.

    Args:
        page: HTML markup handed to ``etree.HTML``.

    Returns:
        A list of dicts, one per review item, in document order.

    Raises:
        IndexError: if an item lacks one of the queried nodes.
        AttributeError: if the style or like-count text contains no digits.
    """
    reviews = []
    tree = etree.HTML(page)
    for wrapper in tree.xpath('//div[@class="yingping-list-wrap"]'):
        # Each "item" div is a single review.
        for item in wrapper.xpath('.//div[@class="item"]'):
            # Title block of the review.
            header = item.xpath('.//div[@class="g-clear title-wrap"][1]')[0]
            title_text = header.xpath('./a/text()')[0]
            link = header.xpath('./a/@href')[0]
            # The score is the first number in the inline style, divided by 20.
            style = header.xpath('./div/span/span/@style')[0]
            score = int(re.search(r'\d+', style).group()) / 20
            # Publication time.
            posted = header.xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review.
            likes_text = header.xpath('./div[@class="num"]/span/text()')[0]
            likes = int(re.search(r'\d+', likes_text).group())
            reviews.append({
                'title': title_text,
                'title_href': link,
                'score': score,
                'time': posted,
                'people': likes,
            })
    return reviews

2,使用BeautifulSoup,不多说了,大家网上找资料看看

def bs4_paraser(html):
    """Extract the review entries from ``html`` using BeautifulSoup.

    Only the first ``div`` with class ``yingping-list-wrap`` is inspected.
    Each ``div`` with class ``item`` inside it contributes one dict; when the
    item has a ``g-clear title-wrap`` block the dict holds ``title``,
    ``title_href``, ``score``, ``time`` and ``people``, otherwise the dict is
    left empty.

    Args:
        html: HTML markup handed to ``BeautifulSoup`` with ``html.parser``.

    Returns:
        A list of dicts, one per review item, in document order.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    # Review section of the page.
    wrappers = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for wrapper in wrappers:
        # Each "item" div is a single review.
        for item in wrapper.find_all('div', attrs={'class': 'item'}):
            entry = {}
            # Title block of the review.
            headers = item.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if headers:
                header = headers[0]
                entry['title'] = header.a.string
                entry['title_href'] = header.a['href']
                # The score is the first number in the inline style, divided by 20.
                style = header.div.span.span['style']
                digits = re.search(r'\d+', style).group()
                entry['score'] = int(digits) / 20
                # Publication time.
                time_spans = header.div.find_all('span', attrs={'class': 'time'})
                entry['time'] = time_spans[0].string
                # Number of people who liked the review.
                num_divs = header.find_all('div', attrs={'class': 'num'})
                likes_text = num_divs[0].span.string
                entry['people'] = int(re.search(r'\d+', likes_text).group())
            results.append(entry)
    return results

3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法(哈哈)。

class CommentParaser(SGMLParser):
    """Collect review entries through SGMLParser start/end tag callbacks.

    Boolean flags record which of the relevant ``div`` elements (selected by
    their ``class`` attribute) are currently open; a small integer records
    which kind of ``span`` is open. Finished entries are appended to
    ``self.data`` as dicts that may hold the keys ``href``, ``title``,
    ``score``, ``time`` and ``people``.

    NOTE: the link is stored under ``href``, whereas ``lxml_parser`` and
    ``bs4_paraser`` in this file store it under ``title_href``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # True while the div with the corresponding class is open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while an <a> inside the title block is open.
        self.__start_a = False
        # Span state: 0 = none, 1 = span.rating opened, 2 = span.time opened,
        # 3 = nested score span handled, 4 = span inside div.num opened.
        self.__span_state = 0
        # Entry currently being built, and the finished entries.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Raise the flag that matches the ``class`` of an opening div."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Unwind the innermost raised div flag; store the entry when an item closes.

        End tags carry no class, so the flags are cleared from the innermost
        level outwards.
        """
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    # The item div itself is closing: the entry is complete.
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def start_a(self, attrs):
        """Record the ``href`` of an anchor opened inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        """Clear the anchor flag when the title anchor closes."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state and read the score from the nested rating span."""
        if not (self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested inside span.rating: its style carries the score.
                score_text = None
                for k, v in attrs:
                    if k == 'style':
                        match = re.search(r'\d+', v or '')
                        if match:
                            score_text = match.group()
                # Skip the score instead of failing when the style attribute
                # is missing or holds no number.
                if score_text is not None:
                    self.__value['score'] = int(score_text) / 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Reset the span state when any span closes."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the current state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Ignore text without digits rather than raising on a failed match.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def sgl_parser(html):
    """Feed ``html`` to a fresh ``CommentParaser`` and return its ``data`` list."""
    extractor = CommentParaser()
    extractor.feed(html)
    collected = extractor.data
    return collected

4,HTMLParser,与3原理相似,就是调用的方法不太一样,基本上可以共用。

class CommentHTMLParser(HTMLParser.HTMLParser):
    """Collect review entries through HTMLParser tag callbacks.

    Boolean flags record which of the relevant ``div`` elements (selected by
    their ``class`` attribute) are currently open; a small integer records
    which kind of ``span`` is open. Finished entries are appended to
    ``self.data`` as dicts that may hold the keys ``href``, ``title``,
    ``score``, ``time`` and ``people``.

    NOTE: the link is stored under ``href``, whereas ``lxml_parser`` and
    ``bs4_paraser`` in this file store it under ``title_href``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # True while the div with the corresponding class is open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while an <a> inside the title block is open.
        self.__start_a = False
        # Span state: 0 = none, 1 = span.rating opened, 2 = span.time opened,
        # 3 = nested score span handled, 4 = span inside div.num opened.
        self.__span_state = 0
        # Entry currently being built, and the finished entries.
        self.__value = {}
        self.data = []

    def _in_title_block(self):
        """Return whether the list, item and title-wrap divs are all open."""
        return self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear

    def _open_div(self, attrs):
        """Raise the flag that matches the ``class`` of an opening div."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def _open_anchor(self, attrs):
        """Record the ``href`` of an anchor opened inside the title block."""
        if self._in_title_block():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def _open_span(self, attrs):
        """Update the span state and read the score from the nested rating span."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested inside span.rating: its style carries the score.
                score_text = None
                for k, v in attrs:
                    if k == 'style':
                        # v is None for an attribute written without a value.
                        match = re.search(r'\d+', v or '')
                        if match:
                            score_text = match.group()
                # Skip the score instead of failing when the style attribute
                # is missing or holds no number.
                if score_text is not None:
                    self.__value['score'] = int(score_text) / 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def _close_div(self):
        """Unwind the innermost raised div flag; store the entry when an item closes.

        End tags carry no class, so the flags are cleared from the innermost
        level outwards.
        """
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    # The item div itself is closing: the entry is complete.
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening ``div``, ``a`` or ``span`` tag to its handler."""
        if tag == 'div':
            self._open_div(attrs)
        elif tag == 'a':
            self._open_anchor(attrs)
        elif tag == 'span':
            self._open_span(attrs)

    def handle_endtag(self, tag):
        """Update the flags for a closing ``div``, ``a`` or ``span`` tag."""
        if tag == 'div':
            self._close_div()
        elif tag == 'a':
            if self._in_title_block() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the current state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Ignore text without digits rather than raising on a failed match.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def html_parser(html):
    """Feed ``html`` to a fresh ``CommentHTMLParser`` and return its ``data`` list."""
    extractor = CommentHTMLParser()
    extractor.feed(html)
    collected = extractor.data
    return collected

3、4对于该案例来说确实不太适合,趁现在有空记录下来,供学习使用!

以上这篇对Python3 解析html的几种操作方式小结就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持【听图阁-专注于Python设计】。

相关文章

python basemap 画出经纬度并标定的实例

如下所示: 两个函数:Basemap.drawparallels ##纬度 Basemap.drawmeridians ##经度 from mpl_toolkits.bas...

利用Python如何生成hash值示例详解

一、介绍 如果在Python中需要对用户输入的密码或者其他内容进行加密,首选的方法是生成hash值。 在Python中可以利用二个模块来进行:...

python脚本实现验证码识别

python脚本实现验证码识别

最近在折腾验证码识别。最终的脚本的识别率在92%左右,9000张验证码大概能识别出八千三四百张左右。好吧,其实是验证码太简单。下面就是要识别的验证码。 我主要用的是Python中的P...

Python 多线程搜索txt文件的内容,并写入搜到的内容(Lock)方法

废话不多说,直接上代码吧! import threading import os class Find(threading.Thread): #搜索数据的线程类 def __i...

浅谈Python的list中的选取范围

序列是Python中最基本的数据结构。序列中的每个元素都分配一个数字 - 它的位置,或索引,第一个索引是0,第二个索引是1,依此类推。 Python有6个序列的内置类型,但最常见的是列表...