{ "cells": [ { "cell_type": "markdown", "id": "10e8ea93-0ad3-4df6-9244-fa8bfcc66f54", "metadata": {}, "source": [ "# Sentiment analysis on a dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "bf75d65f-78e6-41d3-8ed0-c5dae4018043", "metadata": {}, "outputs": [], "source": [ "#Only install VADER if you have not done so from previous exercise/activities\n", "#!pip install vaderSentiment" ] }, { "cell_type": "code", "execution_count": 1, "id": "544444a5-b036-4f4e-bc70-5dce8eb93bbe", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n", "\n", "pd.options.display.max_colwidth = 400" ] }, { "cell_type": "code", "execution_count": 3, "id": "e6b7e765-8f9c-4a25-9ea5-79448ae80f02", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 10000 entries, 0 to 9999\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 10000 non-null int64 \n", " 1 id 10000 non-null int64 \n", " 2 user_location 7101 non-null object\n", " 3 date 10000 non-null object\n", " 4 text 10000 non-null object\n", " 5 hashtags 7874 non-null object\n", " 6 source 9999 non-null object\n", " 7 retweets 10000 non-null int64 \n", " 8 favorites 10000 non-null int64 \n", " 9 is_retweet 10000 non-null bool \n", "dtypes: bool(1), int64(4), object(5)\n", "memory usage: 713.0+ KB\n" ] } ], "source": [ "vac_df = pd.read_csv(\"https://csc10800.github.io/assets/datasets/vaccination_sample.csv\")\n", "vac_df.info()" ] }, { "cell_type": "code", "execution_count": 4, "id": "35b2d8d8-f580-4756-b2e1-23a908fbd04c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0iduser_locationdatetexthashtagssourceretweetsfavoritesis_retweet
01203501409806858089418758Colombo, Sri Lanka.2021-06-29 09:32:02🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv['Pfizer', 'Moderna']Twitter Web App11False
1858651398689762567880705Chennai, India2021-05-29 17:16:40Got Vaccinated. First dose #Covaxin \\n#CovidVaccine['Covaxin', 'CovidVaccine']Twitter for Android02False
\n", "
" ], "text/plain": [ " Unnamed: 0 id user_location date \\\n", "0 120350 1409806858089418758 Colombo, Sri Lanka. 2021-06-29 09:32:02 \n", "1 85865 1398689762567880705 Chennai, India 2021-05-29 17:16:40 \n", "\n", " text \\\n", "0 🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv \n", "1 Got Vaccinated. First dose #Covaxin \\n#CovidVaccine \n", "\n", " hashtags source retweets favorites \\\n", "0 ['Pfizer', 'Moderna'] Twitter Web App 1 1 \n", "1 ['Covaxin', 'CovidVaccine'] Twitter for Android 0 2 \n", "\n", " is_retweet \n", "0 False \n", "1 False " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vac_df.head(2)" ] }, { "cell_type": "markdown", "id": "14c6541d-fe80-4dbe-8def-e0cb3c2bfd94", "metadata": {}, "source": [ "## Removing unused columns" ] }, { "cell_type": "code", "execution_count": 5, "id": "dc65252a-bf43-4054-adbf-34a7c937577d", "metadata": {}, "outputs": [], "source": [ "vac_df = vac_df.drop(columns = ['Unnamed: 0','id','hashtags','source','is_retweet'])" ] }, { "cell_type": "code", "execution_count": 6, "id": "f292ae6f-72eb-4f1f-864e-99703208aefc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_locationdatetextretweetsfavorites
0Colombo, Sri Lanka.2021-06-29 09:32:02🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv11
1Chennai, India2021-05-29 17:16:40Got Vaccinated. First dose #Covaxin \\n#CovidVaccine02
\n", "
" ], "text/plain": [ " user_location date \\\n", "0 Colombo, Sri Lanka. 2021-06-29 09:32:02 \n", "1 Chennai, India 2021-05-29 17:16:40 \n", "\n", " text \\\n", "0 🔺 Health Ministry's Advisory Committee gives green light to use #Pfizer or #Moderna as the 02nd dose for those who… https://t.co/DvWxmn2jRv \n", "1 Got Vaccinated. First dose #Covaxin \\n#CovidVaccine \n", "\n", " retweets favorites \n", "0 1 1 \n", "1 0 2 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vac_df.head(2)" ] }, { "cell_type": "markdown", "id": "a10805d8-b7dd-4cbc-bd0a-2490b5313dce", "metadata": {}, "source": [ "## Create function to automate the process of running sentiment scores in data frame" ] }, { "cell_type": "code", "execution_count": null, "id": "e06a53f9-6a64-4b24-8342-1606ffb50f3a", "metadata": {}, "outputs": [], "source": [ "def calculate_sentiment(text):\n", " # Run VADER on the text\n", " scores = sentimentAnalyser.polarity_scores(text)\n", " # Extract the compound score\n", " compound_score = scores['compound']\n", " # Return compound score\n", " return compound_score" ] }, { "cell_type": "markdown", "id": "b17c971f-b2ac-437c-8524-52d97f7db8e2", "metadata": {}, "source": [ "## Apply function on a column" ] }, { "cell_type": "code", "execution_count": null, "id": "d7eca891-bd9f-49b6-97b2-d8668a6a4c4c", "metadata": {}, "outputs": [], "source": [ "vac_df['sentiment_score'] = vac_df['text'].apply(calculate_sentiment)" ] }, { "cell_type": "code", "execution_count": null, "id": "7625e45e-f8be-44fb-9806-7f0149e5fa7b", "metadata": {}, "outputs": [], "source": [ "vac_df.info()" ] }, { "cell_type": "markdown", "id": "ab6e0f5a-e2cc-4a43-8242-13c631f50d48", "metadata": {}, "source": [ "## Sort by sentiment score" ] }, { "cell_type": "code", "execution_count": null, "id": "3744e12c-2cb8-4b21-8a61-36f9b5f7a013", "metadata": {}, "outputs": [], "source": [ "vac_df.sort_values(by='sentiment_score', ascending=False)[:10]" ] }, { "cell_type": "markdown", "id": "b56b639e-db66-46d2-b988-ee707893002e", "metadata": {}, "source": [ "## Convert date column to datetime type" ] }, { "cell_type": "code", "execution_count": null, "id": "bfc9fd5a-6527-4df4-884a-ab087cbad210", "metadata": {}, "outputs": [], "source": [ "# Convert the date column to datetime values\n", "vac_df['date'] = pd.to_datetime(vac_df['date'], format = \"%Y-%m-%d %H:%M:%S\")\n", "\n", "# Make date the index of the DataFrame\n", "vac_df = vac_df.set_index('date')" ] }, { "cell_type": "markdown", "id": "93557cbf-0670-458c-bee4-376051727454", "metadata": {}, "source": [ "## Plot by sentiment score" ] }, { "cell_type": "code", "execution_count": null, "id": "33a6874d-aadb-43f7-bbf0-876b53470766", "metadata": { "scrolled": true }, "outputs": [], "source": [ "vac_df.resample('M')['sentiment_score'].mean().plot(\n", " title=\"Tweet Sentiment by Month\")" ] }, { "cell_type": "markdown", "id": "1e18b58e-e7b2-4933-b05f-54bd56310646", "metadata": {}, "source": [ "## Zoom to locate specific row(s)" ] }, { "cell_type": "code", "execution_count": null, "id": "ae125d84-89b1-4b4b-8c91-98db96cff0aa", "metadata": {}, "outputs": [], "source": [ "vac_df.loc[\"2021-06\"].resample('D')['sentiment_score'].mean().plot(\n", " title=\"Tweet Sentiment by Day in June 2021\");\n", "\n", "#Add the .get_figure() and .savefig('name_of_your_graph') methods to the end of your graph. \n", "#Graph will be saved in the same folder as your notebook\n", "vac_df.loc[\"2021-06\"].resample('D')['sentiment_score'].mean().plot(\n", " title=\"Tweet Sentiment by Day in June 2021\").get_figure().savefig('tweetsentiment_jun21')" ] }, { "cell_type": "markdown", "id": "65dc1fbe-d15f-4804-8d46-21da45d08ad9", "metadata": {}, "source": [ "### Compare most negative sentiment on 06-27 and most negative overall for the month of June" ] }, { "cell_type": "code", "execution_count": null, "id": "6586147e-9a79-40a7-b275-6b8adc4b1c6e", "metadata": {}, "outputs": [], "source": [ "vac_df.loc[\"2021-06-26\"].sort_values(by='sentiment_score')[:5] #Top 5 most negative sentiments" ] }, { "cell_type": "code", "execution_count": null, "id": "4e5cf01f-c81b-4d10-a7a2-28cf1176d42d", "metadata": {}, "outputs": [], "source": [ "vac_df.loc[\"2021-06\"].sort_values(by='sentiment_score')[:5] #Top 5 most negative sentiments" ] }, { "cell_type": "markdown", "id": "f12b37fb-7043-407b-aaac-5dbea71d1a71", "metadata": {}, "source": [ "## Save csv file" ] }, { "cell_type": "code", "execution_count": null, "id": "5132641d-89da-446f-b2f1-783468cd93df", "metadata": {}, "outputs": [], "source": [ "vac_df.to_csv(\"vac_df_sentiment.csv\", encoding='utf-8', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }