"No one is harder on a talented person than the person themselves" - Linda Wilkinson ; "Trust your guts and don't follow the herd" ; "Validate direction not destination" ;

June 01, 2018

Day #112 - Web Scraping

For topic modelling, I had to scrape a few websites to obtain the data.


#Python 3
import httplib2
import urllib.request
import re
from nltk.corpus import wordnet
from bs4 import BeautifulSoup, SoupStrainer
BASE_URL = 'https://www.abc.com'
LISTING_URL = 'https://www.abc.com/india-car-news.htm/2'
# Raw string: otherwise "\S" and "\A" are deprecated escape sequences.
PATH_TO_ARTICLE = r"D:\Scrap\Articles.txt"


def _collect_article_links(html):
    """Return absolute URLs for the site-relative '.htm' links in *html*.

    Only anchor tags are parsed (SoupStrainer('a') keeps parsing cheap);
    absolute links (containing 'http') are skipped so only articles on
    the same site are collected.
    """
    links = []
    # bs4 spells the keyword parse_only; parseOnlyThese is the old BS3 name.
    for anchor in BeautifulSoup(html, 'html.parser',
                                parse_only=SoupStrainer('a')):
        if anchor.has_attr('href'):
            href = anchor['href']
            if '.htm' in href and 'http' not in href:
                print(BASE_URL + href)
                links.append(BASE_URL + href)
    return links


def _clean_text(txt):
    """Strip punctuation, then keep only tokens WordNet recognizes.

    Returns the kept words joined by single spaces with a trailing space
    (empty string when nothing is kept), matching the original output.
    """
    nstr = re.sub(r'[?|$|.|!]', r'', txt)
    nestr = re.sub(r'[^a-zA-Z0-9 ]', r'', nstr)
    print(nestr)
    # str.join instead of quadratic += concatenation in a loop.
    kept = [word for word in nestr.split(" ") if wordnet.synsets(word)]
    return ' '.join(kept) + ' ' if kept else ''


def main():
    """Scrape article pages and write one cleaned article per line."""
    http = httplib2.Http()
    status, response = http.request(LISTING_URL)
    urls = _collect_article_links(response)
    # 'with' guarantees the output file is closed even if a fetch fails.
    with open(PATH_TO_ARTICLE, 'w') as filehandle:
        for url in urls:
            print('Article - Content')
            print(url)
            # Bind the response to its own name; the original shadowed
            # the loop variable 'url' with the response object.
            with urllib.request.urlopen(url) as resp:
                page = str(resp.read())
            # Name the parser explicitly to avoid bs4's "no parser
            # specified" warning and platform-dependent parser choice.
            soup = BeautifulSoup(page, 'html.parser')
            news = _clean_text(soup.body.getText())
            print(news)
            filehandle.write(news + "\n")


if __name__ == "__main__":
    main()
Happy Learning!!!

No comments: