{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "10e8ea93-0ad3-4df6-9244-fa8bfcc66f54",
   "metadata": {},
   "source": [
    "# Sentiment analysis on a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf75d65f-78e6-41d3-8ed0-c5dae4018043",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Only install VADER if you have not done so from previous exercise/activities\n",
    "#!pip install vaderSentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "544444a5-b036-4f4e-bc70-5dce8eb93bbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
    "\n",
    "pd.options.display.max_colwidth = 400"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e6b7e765-8f9c-4a25-9ea5-79448ae80f02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10000 entries, 0 to 9999\n",
      "Data columns (total 10 columns):\n",
      " #   Column         Non-Null Count  Dtype \n",
      "---  ------         --------------  ----- \n",
      " 0   Unnamed: 0     10000 non-null  int64 \n",
      " 1   id             10000 non-null  int64 \n",
      " 2   user_location  7101 non-null   object\n",
      " 3   date           10000 non-null  object\n",
      " 4   text           10000 non-null  object\n",
      " 5   hashtags       7874 non-null   object\n",
      " 6   source         9999 non-null   object\n",
      " 7   retweets       10000 non-null  int64 \n",
      " 8   favorites      10000 non-null  int64 \n",
      " 9   is_retweet     10000 non-null  bool  \n",
      "dtypes: bool(1), int64(4), object(5)\n",
      "memory usage: 713.0+ KB\n"
     ]
    }
   ],
   "source": [
    "vac_df = pd.read_csv(\"https://csc10800.github.io/assets/datasets/vaccination_sample.csv\")\n",
    "vac_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "35b2d8d8-f580-4756-b2e1-23a908fbd04c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>id</th>\n",
       "      <th>user_location</th>\n",
       "      <th>date</th>\n",
       "      <th>text</th>\n",
       "      <th>hashtags</th>\n",
       "      <th>source</th>\n",
       "      <th>retweets</th>\n",
       "      <th>favorites</th>\n",
       "      <th>is_retweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>120350</td>\n",
       "      <td>1409806858089418758</td>\n",
       "      <td>Colombo, Sri Lanka.</td>\n",
       "      <td>2021-06-29 09:32:02</td>\n",
       "      <td>🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv</td>\n",
       "      <td>['Pfizer', 'Moderna']</td>\n",
       "      <td>Twitter Web App</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>85865</td>\n",
       "      <td>1398689762567880705</td>\n",
       "      <td>Chennai, India</td>\n",
       "      <td>2021-05-29 17:16:40</td>\n",
       "      <td>Got Vaccinated. First dose #Covaxin \\n#CovidVaccine</td>\n",
       "      <td>['Covaxin', 'CovidVaccine']</td>\n",
       "      <td>Twitter for Android</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                   id        user_location                 date  \\\n",
       "0      120350  1409806858089418758  Colombo, Sri Lanka.  2021-06-29 09:32:02   \n",
       "1       85865  1398689762567880705       Chennai, India  2021-05-29 17:16:40   \n",
       "\n",
       "                                                                                                                                          text  \\\n",
       "0  🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv   \n",
       "1                                                                                          Got Vaccinated. First dose #Covaxin \\n#CovidVaccine   \n",
       "\n",
       "                      hashtags               source  retweets  favorites  \\\n",
       "0        ['Pfizer', 'Moderna']      Twitter Web App         1          1   \n",
       "1  ['Covaxin', 'CovidVaccine']  Twitter for Android         0          2   \n",
       "\n",
       "   is_retweet  \n",
       "0       False  \n",
       "1       False  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vac_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14c6541d-fe80-4dbe-8def-e0cb3c2bfd94",
   "metadata": {},
   "source": [
    "## Removing unused columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dc65252a-bf43-4054-adbf-34a7c937577d",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df = vac_df.drop(columns = ['Unnamed: 0','id','hashtags','source','is_retweet'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f292ae6f-72eb-4f1f-864e-99703208aefc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_location</th>\n",
       "      <th>date</th>\n",
       "      <th>text</th>\n",
       "      <th>retweets</th>\n",
       "      <th>favorites</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Colombo, Sri Lanka.</td>\n",
       "      <td>2021-06-29 09:32:02</td>\n",
       "      <td>🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Chennai, India</td>\n",
       "      <td>2021-05-29 17:16:40</td>\n",
       "      <td>Got Vaccinated. First dose #Covaxin \\n#CovidVaccine</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         user_location                 date  \\\n",
       "0  Colombo, Sri Lanka.  2021-06-29 09:32:02   \n",
       "1       Chennai, India  2021-05-29 17:16:40   \n",
       "\n",
       "                                                                                                                                          text  \\\n",
       "0  🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv   \n",
       "1                                                                                          Got Vaccinated. First dose #Covaxin \\n#CovidVaccine   \n",
       "\n",
       "   retweets  favorites  \n",
       "0         1          1  \n",
       "1         0          2  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vac_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a10805d8-b7dd-4cbc-bd0a-2490b5313dce",
   "metadata": {},
   "source": [
    "## Create function to automate the process of running sentiment scores in data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e06a53f9-6a64-4b24-8342-1606ffb50f3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_sentiment(text):\n",
    "    # Run VADER on the text\n",
    "    scores = sentimentAnalyser.polarity_scores(text)\n",
    "    # Extract the compound score\n",
    "    compound_score = scores['compound']\n",
    "    # Return compound score\n",
    "    return compound_score"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b17c971f-b2ac-437c-8524-52d97f7db8e2",
   "metadata": {},
   "source": [
    "## Apply function on a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7eca891-bd9f-49b6-97b2-d8668a6a4c4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df['sentiment_score'] = vac_df['text'].apply(calculate_sentiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7625e45e-f8be-44fb-9806-7f0149e5fa7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab6e0f5a-e2cc-4a43-8242-13c631f50d48",
   "metadata": {},
   "source": [
    "## Sort by sentiment score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3744e12c-2cb8-4b21-8a61-36f9b5f7a013",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.sort_values(by='sentiment_score', ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b56b639e-db66-46d2-b988-ee707893002e",
   "metadata": {},
   "source": [
    "## Convert date column to datetime type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfc9fd5a-6527-4df4-884a-ab087cbad210",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the date column to datetime values\n",
    "vac_df['date'] = pd.to_datetime(vac_df['date'], format = \"%Y-%m-%d %H:%M:%S\")\n",
    "\n",
    "# Make date the index of the DataFrame\n",
    "vac_df = vac_df.set_index('date')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93557cbf-0670-458c-bee4-376051727454",
   "metadata": {},
   "source": [
    "## Plot by sentiment score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33a6874d-aadb-43f7-bbf0-876b53470766",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "vac_df.resample('M')['sentiment_score'].mean().plot(\n",
    "    title=\"Tweet Sentiment by Month\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e18b58e-e7b2-4933-b05f-54bd56310646",
   "metadata": {},
   "source": [
    "## Zoom to locate specific row(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae125d84-89b1-4b4b-8c91-98db96cff0aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.loc[\"2021-06\"].resample('D')['sentiment_score'].mean().plot(\n",
    "    title=\"Tweet Sentiment by Day in June 2021\");\n",
    "\n",
    "#Add the .get_figure() and .savefig('name_of_your_graph') methods to the end of your graph. \n",
    "#Graph will be saved in the same folder as your notebook\n",
    "vac_df.loc[\"2021-06\"].resample('D')['sentiment_score'].mean().plot(\n",
    "    title=\"Tweet Sentiment by Day in June 2021\").get_figure().savefig('tweetsentiment_jun21')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65dc1fbe-d15f-4804-8d46-21da45d08ad9",
   "metadata": {},
   "source": [
    "### Compare most negative sentiment on 06-27 and most negative overall for the month of June"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6586147e-9a79-40a7-b275-6b8adc4b1c6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.loc[\"2021-06-26\"].sort_values(by='sentiment_score')[:5] #Top 5 most negative sentiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e5cf01f-c81b-4d10-a7a2-28cf1176d42d",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.loc[\"2021-06\"].sort_values(by='sentiment_score')[:5] #Top 5 most negative sentiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f12b37fb-7043-407b-aaac-5dbea71d1a71",
   "metadata": {},
   "source": [
    "## Save csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5132641d-89da-446f-b2f1-783468cd93df",
   "metadata": {},
   "outputs": [],
   "source": [
    "vac_df.to_csv(\"vac_df_sentiment.csv\", encoding='utf-8', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}