pyquary库

初始化

  • 字符串初始化

    • from pyquery import PyQuery as pq doc = pq(html) #生命pq对象 print(doc('li')) # 获取li标签
  • URL初始化

  • 文件初始化

    • doc = pq(filename='demo.html') #跟本文件在同一个目录下 print(doc('li'))

基本CSS选择器

  • from pyquery import PyQuery as pq doc = pq(html) print(doc('#container .list li')) #传递 id ,类,标签之类的

查找元素

  • 子元素

    • from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') #先选中了 .list 中的内容 print(type(items)) print(items) lis = items.find('li') #查找items中的li的标签内容 print(type(lis)) print(lis)
  • 查找直接的子元素

    • lis = items.children()
      print(type(lis)) print(lis)
  • 父元素

    • from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') #获取.list的pq对象 container = items.parent() #获取其父元素 print(type(container)) #打印其类型 print(container) #打印内容
  • 祖先元素

    • from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') parents = items.parents() #查找所有的祖先节点 print(type(parents)) print(parents) #这里返回了两个,一个是最大的,一个是直接的父元素
  • 兄弟元素

    • from pyquery import PyQuery as pq doc = pq(html) #首先获取pq对象 li = doc('.list .item-0.active') #加空格后代选择器 没有加空格是并列选择器 print(li.siblings()) #兄弟元素

遍历

  • 单个元素

    • from pyquery import PyQuery as pq doc = pq(html) lis = doc('li').items() #生成一个迭代器 print(type(lis)) for li in lis: #for循环依次遍历 print(li)

获取信息

  • 获取属性

    • from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.attr('href')) #使用的是函数 print(a.attr.href) #使用 .
  • 获取文本

    • from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.text()) #只要其中包含的文本信息
  • 获取HTML

    • from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) print(li.html()) #获取了li里面的html

DOM操作

  • addClass、removeClass

    • from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.removeClass('active') #删除类 print(li) li.addClass('active') #添加类 print(li)
  • attr、css

    • from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.attr('name', 'link') #添加属性name = link的属性 print(li) li.css('font-size', '14px') #添加css style属性 print(li)
  • remove

    • from pyquery import PyQuery as pq doc = pq(html) wrap = doc('.wrap') print(wrap.text()) wrap.find('p').remove() #把p标签进行删除 print(wrap.text())

其他DOM方法

  • 伪类选择器

    • from pyquery import PyQuery as pq doc = pq(html) li = doc('li:first-child') #li中的第一个li print(li) li = doc('li:last-child') print(li) li = doc('li:nth-child(2)') print(li) li = doc('li:gt(2)') print(li) li = doc('li:nth-child(2n)') print(li) li = doc('li:contains(second)') print(li)

官方文档

pyquery

初始化

字符串初始化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html) #声明pq对象
print(doc('li')) #获取li标签的信息
1
2
3
4
5
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

URL初始化

1
2
3
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com') #url地址
print(doc('head'))
1
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>ç™¾åº¦ä¸€ä¸‹ï¼Œä½ å°±çŸ¥é“</title></head>

文件初始化

1
2
3
from pyquery import PyQuery as pq
doc = pq(filename='demo.html') #传入本地的html文件
print(doc('li'))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

<ipython-input-3-0bb1b77a760d> in <module>
      1 from pyquery import PyQuery as pq
----> 2 doc = pq(filename='demo.html') #传入本地的html文件
      3 print(doc('li'))


D:\Anaconda3\envs\CPU\lib\site-packages\pyquery\pyquery.py in __init__(self, *args, **kwargs)
    215             # specific case to get the dom
    216             if 'filename' in kwargs:
--> 217                 html = open(kwargs['filename'])
    218             elif 'url' in kwargs:
    219                 url = kwargs.pop('url')


FileNotFoundError: [Errno 2] No such file or directory: 'demo.html'

基本CSS选择器

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li'))  #传递 id ,类,标签之类的
1
2
3
4
5
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

查找元素

子元素

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #先选中了 .list 中的内容
print(type(items))
print(items)
lis = items.find('li') #查找items中的li的标签内容
print(type(lis)) #每一个都是pq对象,可以链式的查找
print(lis)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>

<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

1
2
3
lis = items.children()  
print(type(lis))
print(lis)
1
2
3
4
5
6
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

1
2
lis = items.children('.active')
print(lis)
1
2
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>

父元素

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')  #获取.list的pq对象
container = items.parent()   #获取其父元素
print(type(container))  #打印其类型
print(container)  #打印内容
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents() #查找所有的祖先节点
print(type(parents))
print(parents)  #这里返回了两个,一个是最大的,一个是直接的父元素
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div><div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>

1
2
parent = items.parents('.wrap')
print(parent)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>

兄弟元素

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)  #首先获取pq对象
li = doc('.list .item-0.active')  #加空格后代选择器    没有加空格是并列选择器
print(li.siblings())  #兄弟元素
1
2
3
4
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0">first item</li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))
1
<li class="item-1 active"><a href="link4.html">fourth item</a></li>

遍历

单个元素

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
1
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()  #生成一个迭代器
print(type(lis))
for li in lis: #for循环依次遍历
    print(li)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
<class 'generator'>
<li class="item-0">first item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

获取信息

获取属性

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr('href')) #使用的是函数
print(a.attr.href)  #使用 .
1
2
3
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html

获取文本

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())  #只要其中包含的文本信息
1
2
<a href="link3.html"><span class="bold">third item</span></a>
third item

获取HTML

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html()) #获取了li里面的html
1
2
3
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<a href="link3.html"><span class="bold">third item</span></a>

DOM操作

addClass、removeClass

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') #删除类
print(li)
li.addClass('active') #添加类
print(li)
1
2
3
4
5
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

attr、css

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link')   #添加属性name = link的属性
print(li)
li.css('font-size', '14px') #添加css  style属性
print(li)
1
2
3
4
5
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

remove

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #把p标签进行删除
print(wrap.text())
1
2
3
Hello, World
This is a paragraph.
Hello, World

其他DOM方法

http://pyquery.readthedocs.io/en/latest/api.html

伪类选择器

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #li中的第一个li
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)')
print(li)
li = doc('li:contains(second)')
print(li)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
<li class="item-0">first item</li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>

更多CSS选择器可以查看 http://www.w3school.com.cn/css/index.asp

官方文档

http://pyquery.readthedocs.io/

1

1