# Sentiment analysis on a dataset

In [None]:
# Only install VADER if you have not already done so in a previous exercise/activity
#!pip install vaderSentiment

In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Show up to 400 characters per cell so long tweets are not cut off when displayed
pd.set_option("display.max_colwidth", 400)

In [3]:
# Location of the vaccination tweets sample used in this notebook
VACCINATION_CSV_URL = "https://csc10800.github.io/assets/datasets/vaccination_sample.csv"

# Read the CSV into a DataFrame, then summarise its columns, dtypes and non-null counts
vac_df = pd.read_csv(VACCINATION_CSV_URL)
vac_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     10000 non-null  int64 
 1   id             10000 non-null  int64 
 2   user_location  7101 non-null   object
 3   date           10000 non-null  object
 4   text           10000 non-null  object
 5   hashtags       7874 non-null   object
 6   source         9999 non-null   object
 7   retweets       10000 non-null  int64 
 8   favorites      10000 non-null  int64 
 9   is_retweet     10000 non-null  bool  
dtypes: bool(1), int64(4), object(5)
memory usage: 713.0+ KB


In [4]:
# Preview the first two rows of the freshly loaded data
vac_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,user_location,date,text,hashtags,source,retweets,favorites,is_retweet
0,120350,1409806858089418758,"Colombo, Sri Lanka.",2021-06-29 09:32:02,ðŸ”º Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those whoâ€¦ https://t.co/DvWxmn2jRv,"['Pfizer', 'Moderna']",Twitter Web App,1,1,False
1,85865,1398689762567880705,"Chennai, India",2021-05-29 17:16:40,Got Vaccinated. First dose #Covaxin \n#CovidVaccine,"['Covaxin', 'CovidVaccine']",Twitter for Android,0,2,False


## Removing unused columns

In [5]:
# Columns that are not needed for the rest of this notebook
unused_columns = ['Unnamed: 0', 'id', 'hashtags', 'source', 'is_retweet']
vac_df = vac_df.drop(columns=unused_columns)

In [6]:
# Preview the first two rows again to confirm the unused columns are gone
vac_df.head(2)

Unnamed: 0,user_location,date,text,retweets,favorites
0,"Colombo, Sri Lanka.",2021-06-29 09:32:02,ðŸ”º Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those whoâ€¦ https://t.co/DvWxmn2jRv,1,1
1,"Chennai, India",2021-05-29 17:16:40,Got Vaccinated. First dose #Covaxin \n#CovidVaccine,0,2


## Create a function to automate calculating sentiment scores for a data frame

In [None]:
# Shared VADER analyser; it is built the first time a score is requested
_default_analyser = None


def calculate_sentiment(text, analyser=None):
    """Return the VADER compound sentiment score for a piece of text.

    Parameters
    ----------
    text : str
        The text to score (for example, one tweet).
    analyser : object, optional
        Any object with a ``polarity_scores(text)`` method that returns a
        dictionary containing a ``'compound'`` key. When omitted, a single
        ``SentimentIntensityAnalyzer`` is created on first use and reused
        by every later call.

    Returns
    -------
    The value stored under ``'compound'`` in the analyser's scores.
    """
    global _default_analyser
    if analyser is None:
        if _default_analyser is None:
            # Build the analyser once instead of once per row when used with .apply()
            _default_analyser = SentimentIntensityAnalyzer()
        analyser = _default_analyser
    # Run VADER on the text and keep only the compound score
    return analyser.polarity_scores(text)['compound']

## Apply function on a column

In [None]:
# Score every tweet, then store the scores in a new 'sentiment_score' column
tweet_scores = vac_df['text'].apply(calculate_sentiment)
vac_df['sentiment_score'] = tweet_scores

In [None]:
# Check that the new sentiment_score column was added
vac_df.info()

## Sort by sentiment score

In [None]:
# Highest scores first, then keep the first 10 rows
vac_df.sort_values('sentiment_score', ascending=False).head(10)

## Convert date column to datetime type

In [None]:
# Parse the date strings into datetime values, then use the date as the DataFrame's index
vac_df = (
    vac_df
    .assign(date=pd.to_datetime(vac_df['date'], format="%Y-%m-%d %H:%M:%S"))
    .set_index('date')
)

## Plot by sentiment score

In [None]:
# Average the sentiment scores month by month, then draw the result as a line graph
monthly_sentiment = vac_df.resample('M')['sentiment_score'].mean()
monthly_sentiment.plot(title="Tweet Sentiment by Month")

## Zoom to locate specific row(s)

In [None]:
# Average the sentiment scores for each day of June 2021
june_daily_sentiment = vac_df.loc["2021-06"].resample('D')['sentiment_score'].mean()

# Draw the graph once and keep the Axes that .plot() returns.
# Plotting the same Series a second time in this cell would draw a duplicate
# line on the same graph, so the saved image is taken from this single plot.
ax = june_daily_sentiment.plot(title="Tweet Sentiment by Day in June 2021")

# To save a graph, call .get_figure() and .savefig('name_of_your_graph') on it.
# Graph will be saved in the same folder as your notebook
ax.get_figure().savefig('tweetsentiment_jun21')

### Compare most negative sentiment on 06-27 and most negative overall for the month of June

In [None]:
# Top 5 most negative sentiments on a single day (lowest scores first)
# NOTE(review): the section heading mentions 06-27 but this filters 2021-06-26 - confirm which date is intended
vac_df.loc["2021-06-26"].sort_values('sentiment_score').head(5)

In [None]:
# Top 5 most negative sentiments across the whole of June 2021 (lowest scores first)
vac_df.loc["2021-06"].sort_values('sentiment_score').head(5)

## Save csv file

In [None]:
# The date was moved into the index earlier in the notebook, so the index must be
# written out; with index=False the saved file would lose every tweet's date.
vac_df.to_csv("vac_df_sentiment.csv", encoding='utf-8', index=True)