<PYTHON>[BeautifulSoup]
설치
pip install BeautifulSoup4
파싱방법
# Navigating a parsed document by tag name and attribute.
# NOTE: `soup` is not created in this snippet; it is assumed to be a
# BeautifulSoup object built from the sample HTML document.
# The comment under each expression shows the value it evaluates to.
soup.title
# <title>The Dormouse's story</title>
soup.title.name
# u'title'
soup.title.string
# u'The Dormouse's story'
soup.title.parent.name
# u'head'
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']
# [u'title']  (class is a multi-valued attribute, so bs4 returns a list)
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
URL 추출
# Print the href attribute of every <a> tag found in the document.
# NOTE: `soup` is assumed to be an already-built BeautifulSoup object.
for link in soup.find_all('a'):
    # Tag.get() returns None instead of raising when the attribute is missing.
    print(link.get('href'))
샘플
웹페이지 파싱
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 |
#-*- coding:utf-8 -*-
# Fetch a web page, locate one tag, and map the fourth '/'-separated segment
# of one of its attribute values to a status code string.
# URL, keyword, 'key' and "value" are placeholders from the original sample
# and must be replaced with real values before this can run.
import urllib
from bs4 import BeautifulSoup

# NOTE: urllib.urlopen is the Python 2 API; on Python 3 the equivalent call
# is urllib.request.urlopen.
html = urllib.urlopen(URL)
try:
    fSoup = BeautifulSoup(html, "lxml")
finally:
    # Release the connection once the markup has been parsed.
    html.close()

# find() returns the first matching tag, or None when nothing matches.
statusImg = fSoup.find(keyword)
s = statusImg.get('key')
ss = s.split('/')

# All three branches compare against the same placeholder literal, so as
# written only the first one can ever match.
if ss[3] == "value":
    statusNum = "1"
elif ss[3] == "value":
    statusNum = "2"
elif ss[3] == "value":
    statusNum = "3"
else:
    # No known value matched: statusNum is deliberately left unassigned.
    pass
cs |
xml 파일 파싱
1
2
3
4
5
6
7 |
# Parse a local XML file and print the 'value' attribute of every
# <keyword> element it contains.
from bs4 import BeautifulSoup

# Read the whole document; the with-block closes the file even on error.
with open('B.xml') as f:
    xml = f.read()

# The 'xml' feature asks bs4 for an XML parser instead of an HTML one.
fSoup = BeautifulSoup(xml, 'xml')
for meas in fSoup.find_all('keyword'):
    # The parenthesised form is valid as both a Python 2 print statement
    # and a Python 3 print() call.
    print(meas.get('value'))
cs |
# Navigating a parsed document by tag name and attribute.
# NOTE: `soup` is not created in this snippet; it is assumed to be a
# BeautifulSoup object built from the sample HTML document.
# The comment under each expression shows the value it evaluates to.
soup.title
# <title>The Dormouse's story</title>
soup.title.name
# u'title'
soup.title.string
# u'The Dormouse's story'
soup.title.parent.name
# u'head'
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']
# [u'title']  (class is a multi-valued attribute, so bs4 returns a list)
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>