-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_tag.py
42 lines (31 loc) · 1.16 KB
/
remove_tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Question 4: Remove Tags
# When we add our words to the index, we don't really want to include
# html tags such as <body>, <head>, <table>, <a href="..."> and so on.
# Write a procedure, remove_tags, that takes as input a string and returns
# a list of words, in order, with the tags removed. Tags are defined to be
# strings surrounded by < >. Words are separated by whitespace or tags.
# You may assume the input does not include any unclosed tags, that is,
# there will be no '<' without a following '>'.
def remove_tags(string):
    """Return the words of ``string``, in order, with ``<...>`` tags removed.

    A tag is the run of characters from a ``<`` up to and including the next
    ``>``. Every tag acts as a word separator, just like whitespace, so
    ``'a<b>c'`` yields ``['a', 'c']``. A ``>`` that appears outside of a tag
    is ordinary text.

    Args:
        string: The text (typically HTML) to split into words.

    Returns:
        A list of the words delimited by whitespace or tags. The list is
        empty when the input has no text outside of tags.

    Note:
        A ``<`` with no later ``>`` does not form a tag, so it and the text
        after it are kept as ordinary text.
    """
    pieces = []  # text found between tags, in order
    pos = 0      # index of the first character not yet consumed
    while True:
        start = string.find('<', pos)
        if start == -1:
            break
        end = string.find('>', start)
        if end == -1:
            # Unclosed '<': stop looking for tags. Feeding find()'s -1 back
            # into the scan index would restart the scan from the beginning
            # and never terminate.
            break
        pieces.append(string[pos:start])
        pos = end + 1
    pieces.append(string[pos:])
    # Joining with a space makes each removed tag behave as a separator;
    # split() then collapses any run of whitespace.
    return ' '.join(pieces).split()
# Example calls. The "#>>>" comment after each call shows the expected output.
# print(...) with a single argument prints the same text on Python 2 and 3.
print(remove_tags('''<h1>Title</h1><p>This is a
<a href="http://www.udacity.com">link</a>.<p>'''))
#>>> ['Title','This','is','a','link','.']
print(remove_tags('''<table cellpadding='3'>
<tr><td>Hello</td><td>World!</td></tr>
</table>'''))
#>>> ['Hello','World!']
print(remove_tags("<hello><goodbye>"))
#>>> []
print(remove_tags("<br/>This line starts with a tag"))
#>>> ['This', 'line', 'starts', 'with', 'a', 'tag']
print(remove_tags("This is plain text."))
#>>> ['This', 'is', 'plain', 'text.']