Final Project - Group 5¶
Introduction¶
In today’s world, social media plays a huge role in how we communicate, share ideas, and interact with content. Every day, certain posts suddenly gain massive attention and spread across social media platforms like TikTok, Instagram, Twitter, and YouTube. Our project explores this phenomenon by examining engagement data across several platforms and building models that help predict the virality of a post.
Our question focuses on which characteristics of a post are most associated with high engagement, and whether we can use those characteristics to predict if a post will go viral. We are using a dataset from Kaggle called “Viral Social Media Trends & Engagement Analysis” to answer this question. It includes 5,000 social media posts and provides details such as the platform, the hashtag used, the type of content, the region, and engagement metrics such as views, likes, shares, and comments. It also includes an engagement level of low, medium, or high.
This project explores the dataset to find patterns and trends that can help us answer the question. Our approach includes cleaning the data, classifying posts by engagement level, and clustering posts to see which kinds tend to go viral. This project provides useful insights for content creators, marketers, and anyone looking to understand how viral posts work.
Our Data¶
The dataset used for this project is “Viral Social Media Trends & Engagement Analysis” from Kaggle. It contains 5,000 unique entries representing posts from platforms such as TikTok, Instagram, Twitter, and YouTube. We chose this dataset because it captures key characteristics of viral social media posts, including user engagement and attributes of each post, which help us answer our question.
Each row in the dataset represents one post and includes ten features: a Post_ID, the platform it was posted on, the hashtag used, the content type (video, tweet, reel, or livestream), and the region where it was posted. Additionally, the dataset includes engagement metrics such as views, likes, shares, and comments. Another useful column is the engagement level, which labels each post’s engagement as either low, medium, or high; this can help us train our models for classification. Looking at the dataset as a whole, YouTube was the most represented platform, the most common content types were livestreams and standard posts, and the engagement levels were evenly distributed between low, medium, and high.
Overall, the dataset helps us explore what makes a post go viral. It includes data from many different platforms, content types, and engagement metrics that can aid us when we look for patterns and relationships using classification and clustering techniques.
We will begin by importing our dataset and assigning it to a pandas DataFrame variable named 'trends', which allows for easy manipulation and analysis of the data. Running the head() function on the dataset gives us a sneak peek at what we are working with, such as spotting missing values or inconvenient variable types.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
trends = pd.read_csv('Viral_Social_Media_Trends.csv')
trends.head()
| | Post_ID | Platform | Hashtag | Content_Type | Region | Views | Likes | Shares | Comments | Engagement_Level |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Post_1 | TikTok | #Challenge | Video | UK | 4163464 | 339431 | 53135 | 19346 | High |
| 1 | Post_2 | Instagram | #Education | Shorts | India | 4155940 | 215240 | 65860 | 27239 | Medium |
| 2 | Post_3 | Twitter | #Challenge | Video | Brazil | 3666211 | 327143 | 39423 | 36223 | Medium |
| 3 | Post_4 | YouTube | #Education | Shorts | Australia | 917951 | 127125 | 11687 | 36806 | Low |
| 4 | Post_5 | TikTok | #Dance | Post | Brazil | 64866 | 171361 | 69581 | 6376 | Medium |
Data Visualizations for Data Understanding¶
# Visualization 1: Distribution of posts across different platforms
plt.figure(figsize=(10, 6))
sns.countplot(data=trends, x='Platform', palette='viridis', hue='Platform')
plt.title('Distribution of Posts Across Different Platforms')
plt.xlabel('Platform')
plt.ylabel('Number of Posts')
plt.show()
# Visualization 2: Distribution of engagement levels
plt.figure(figsize=(10, 6))
sns.countplot(data=trends, x='Engagement_Level', palette='viridis', hue='Engagement_Level')
plt.title('Distribution of Engagement Levels')
plt.xlabel('Engagement Level')
plt.ylabel('Number of Posts')
plt.show()
# Visualization 3: Average views, likes, shares, and comments by platform
metrics = ['Views', 'Likes', 'Shares', 'Comments']
for metric in metrics:
plt.figure(figsize=(10, 6))
sns.barplot(data=trends, x='Platform', y=metric, palette='viridis', hue='Platform')
plt.title(f'Average {metric} by Platform')
plt.xlabel('Platform')
plt.ylabel(f'Average {metric}')
plt.show()
# Visualization 4: Average views, likes, shares, and comments by engagement level
for metric in metrics:
plt.figure(figsize=(10, 6))
sns.barplot(data=trends, x='Engagement_Level', y=metric, palette='viridis', hue='Engagement_Level')
plt.title(f'Average {metric} by Engagement Level')
plt.xlabel('Engagement Level')
plt.ylabel(f'Average {metric}')
plt.show()
# Visualization 5: Correlation heatmap of engagement metrics
plt.figure(figsize=(10, 6))
sns.heatmap(trends[metrics].corr(), annot=True, cmap='viridis', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Engagement Metrics')
plt.show()
Pre-processing¶
Before using our data within our models, we need to transform features that have null or categorical values. From running the info() function, we can see that our dataset has no missing values and four numerical columns; the other six columns are categorical.
trends.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Post_ID           5000 non-null   object
 1   Platform          5000 non-null   object
 2   Hashtag           5000 non-null   object
 3   Content_Type      5000 non-null   object
 4   Region            5000 non-null   object
 5   Views             5000 non-null   int64 
 6   Likes             5000 non-null   int64 
 7   Shares            5000 non-null   int64 
 8   Comments          5000 non-null   int64 
 9   Engagement_Level  5000 non-null   object
dtypes: int64(4), object(6)
memory usage: 390.8+ KB
Below, we use one-hot encoding to turn our categorical values into multiple binary columns. Our models cannot process categorical strings directly, so one-hot encoding translates them into a numeric form. We also drop the 'Post_ID' column because it does not carry any meaningful information for predicting engagement. Finally, we convert our target column from text ('High', 'Medium', and 'Low') to numerical values (2, 1, 0).
Once these pre-processing steps are complete, our dataset expands from 10 columns to 29. This way our model can identify the different categorical values without misinterpreting the data.
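As a small illustration of what one-hot encoding does (on hypothetical toy data, not our actual dataset), get_dummies turns each remaining category into its own True/False column, and drop_first removes the first category since it is implied when all the others are False:

```python
import pandas as pd

# Toy example: a single categorical column with three platforms
toy = pd.DataFrame({'Platform': ['TikTok', 'YouTube', 'TikTok', 'Twitter']})

# drop_first drops the first category alphabetically (TikTok here),
# which is encoded implicitly as "all other columns False"
encoded = pd.get_dummies(toy, columns=['Platform'], drop_first=True)
print(list(encoded.columns))  # ['Platform_Twitter', 'Platform_YouTube']
```

The same mechanism applied to our four categorical columns is what expands the dataset to 29 columns.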
trends = pd.get_dummies(trends, columns=['Platform', 'Content_Type', 'Region', 'Hashtag'], drop_first=True)
trends = trends.drop(columns=['Post_ID'])
trends['Engagement_Level'] = trends['Engagement_Level'].map({'High': 2, 'Medium':1, 'Low':0})
trends.head()
| | Views | Likes | Shares | Comments | Engagement_Level | Platform_TikTok | Platform_Twitter | Platform_YouTube | Content_Type_Post | Content_Type_Reel | ... | Region_USA | Hashtag_#Comedy | Hashtag_#Dance | Hashtag_#Education | Hashtag_#Fashion | Hashtag_#Fitness | Hashtag_#Gaming | Hashtag_#Music | Hashtag_#Tech | Hashtag_#Viral |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4163464 | 339431 | 53135 | 19346 | 2 | True | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | 4155940 | 215240 | 65860 | 27239 | 1 | False | False | False | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 2 | 3666211 | 327143 | 39423 | 36223 | 1 | False | True | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | 917951 | 127125 | 11687 | 36806 | 0 | False | False | True | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 4 | 64866 | 171361 | 69581 | 6376 | 1 | True | False | False | True | False | ... | False | False | True | False | False | False | False | False | False | False |
5 rows × 29 columns
Modeling¶
Classification Predicting Engagement Level¶
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
Before we run our model, we prepare our data by separating the features from the target variable. Specifically, we drop the 'Engagement_Level' column from the dataset to create our feature matrix 'X' and store our target labels in 'y'. We then split the data into training and testing sets using an 80-20 split. Setting the random state to 42 makes sure that the split is reproducible.
We will use a Random Forest Classifier, a method that builds multiple decision trees and merges their predictions to get a more accurate result.
X = trends.drop(columns=['Engagement_Level'])
y = trends['Engagement_Level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
Feature Importance and New Feature Engineering¶
To gain more insights, we examined the feature importances provided by the Random Forest model. We found that the most important features were:
- Likes
- Shares
- Views
- Comments
Taking this into account, we predict that engagement might be better captured not just by individual counts but by how interactions relate to the views of a post.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).head(10).plot(kind='barh')
plt.title('Top 10 Important Features')
plt.show()
Therefore, we created a new feature called 'Engagement_Rate', with the calculation shown below. The idea is that posts with high engagement relative to their number of views might correlate with a higher engagement level and possibly give our model a clearer, more precise predictive ability.
trends['Engagement_Rate'] = (trends['Likes'] + trends['Shares'] + trends['Comments']) / trends['Views']
X = trends.drop(columns=['Engagement_Level'])
y = trends['Engagement_Level']
X_train_Random2, X_test_Random2, y_train_Random2, y_test_Random2 = train_test_split(X, y, test_size=0.2, random_state = 42)
rf = RandomForestClassifier()
rf.fit(X_train_Random2, y_train_Random2)
y_pred_Random2 = rf.predict(X_test_Random2)
Decision Tree¶
Decision trees are a popular choice for classification because of their interpretability. For this project, we chose this model because it works well for classifying a post into categories while also revealing which features contribute most to high engagement. Overall, we saw that views, shares, comments, and likes contributed most to the engagement level.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
importances= pd.Series(dt.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).head(10).plot(kind='barh')
plt.title('Top 10 Important Features - Decision Tree')
plt.show()
Feature Importance: Insights from the Decision Tree¶
After evaluating the decision tree's results, it seems that these fields contributed the most:
- Views: Higher view counts generally correlate with increased engagement, reflecting broader reach and exposure.
- Shares: Posts with more shares tend to have higher engagement, indicating active user participation in spreading content.
- Comments: Increased comments signify direct interaction and discussion around a post, contributing to higher engagement.
- Likes: More likes on a post also contributed to higher engagement.
These findings emphasize the importance of creating content that attracts views, encourages sharing, and fosters active discussions to maximize social media engagement.
KNN Classification¶
K-Nearest Neighbors (KNN) was chosen for this classification task because it is simple, easy to interpret, and effective as a baseline model. KNN does not assume any specific data distribution and can naturally handle multiclass classification problems like engagement levels.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
X = trends.drop(columns=['Engagement_Level'])
y = trends['Engagement_Level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Added random_state for reproducibility
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_prediction = knn.predict(X_test)
Evaluation¶
To evaluate our models’ performance in classifying post engagement levels (low, medium, high), we used common classification metrics: accuracy, precision, recall, and F1-score. Our goal was to identify patterns in social media posts that could help predict virality, so we focused on how well our models could classify each engagement level rather than just overall accuracy.
The Random Forest Classifier¶
print("Accuracy:", accuracy_score(y_test, y_pred))
Accuracy: 0.333
To evaluate the model's performance, we will use accuracy as the metric. This model produced an accuracy between 0.319 and 0.333 depending on the run. Since 0.333 is roughly what random guessing would achieve across three evenly distributed classes, this leaves a lot of room for improvement.
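To make the "random guessing" baseline concrete, scikit-learn's DummyClassifier can quantify it. The sketch below uses synthetic, evenly distributed labels as a stand-in for Engagement_Level (not our actual dataset), where the chance baseline is exactly one third:

```python
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# Synthetic stand-in: 4 dummy features and perfectly balanced 3-class labels
rng = np.random.default_rng(42)
X = rng.normal(size=(600, 4))
y = np.repeat([0, 1, 2], 200)

# A dummy model that always predicts one class; on balanced classes its
# accuracy equals the chance level of 1/3
dummy = DummyClassifier(strategy='most_frequent').fit(X, y)
baseline = accuracy_score(y, dummy.predict(X))
print(round(baseline, 3))  # 0.333
```

Any real model should be judged against this floor rather than against 0.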
To analyze the results a bit more, we will use a confusion matrix. The rows correspond to the true labels and the columns correspond to the predicted labels. Ideally, we want high numbers on the diagonal (correct predictions) and low numbers elsewhere.
In the following matrix, we can see the model struggles to correctly classify the engagement levels. Since the misclassifications are frequent across all classes, this suggests that the classes are difficult for the model to distinguish based on the original features.
confm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(confm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
Random Forest - Post Feature Engineering¶
print("Accuracy Score:", accuracy_score(y_test_Random2, y_pred_Random2))
Accuracy Score: 0.318
After creating the 'Engagement_Rate' feature and retraining the random forest model using the same process, the resulting accuracy improved slightly, from an initial range of 0.319-0.333 to a new range of 0.32-0.36.
Although the improvement was small, it suggests that the new Engagement_Rate feature helped the model capture some additional information that the individual features alone did not fully express.
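Since single train/test splits gave accuracies that varied from run to run, cross-validation would give a steadier estimate of that range. A minimal sketch, using make_classification as a synthetic stand-in for our real feature matrix and target:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Synthetic 3-class data standing in for our X and y
X, y = make_classification(n_samples=500, n_features=10, n_informative=5,
                           n_classes=3, random_state=42)

# 5-fold cross-validation: each fold serves once as the test set, and the
# mean accuracy is less sensitive to any one lucky or unlucky split
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
print(scores.mean().round(3), scores.std().round(3))
```

Reporting the mean and standard deviation across folds would replace the "0.32-0.36 depending on the run" style of range with a single, reproducible figure.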
# Confusion matrix for the Random Forest model retrained after feature engineering
confm = confusion_matrix(y_test_Random2, y_pred_Random2)
plt.figure(figsize=(8,6))
sns.heatmap(confm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# Classification report for the Random Forest model retrained after feature engineering
report = classification_report(y_test_Random2, y_pred_Random2, output_dict=True)
report_df = pd.DataFrame(report).transpose()
plt.figure(figsize=(10,6))
sns.heatmap(report_df.iloc[:-1, :-1], annot=True, cmap='YlGnBu', fmt='.2f')
plt.title('Classification Report')
plt.show()
Decision Tree¶
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
Decision Tree Accuracy: 0.334
The accuracy of the decision tree came out to roughly 0.32-0.33 depending on the run. For this three-class problem, that is about the level of random guessing (≈0.333), comparable to the Random Forest before feature engineering.
report_df = pd.DataFrame(classification_report(y_test, y_pred_dt, output_dict=True)).T
report_df = report_df.drop(columns=['support'], errors='ignore')
plt.figure(figsize=(8, 6))
sns.heatmap(report_df, annot=True, cmap="Blues", fmt=".2f")
plt.title("Classification Report Heatmap - Decision Tree")
plt.show()
The classification report presents key performance metrics of the Decision Tree, helping us understand its effectiveness at predicting Engagement_Level.
The next portion is the evaluation of the confusion matrix, a 3x3 table that compares the actual engagement labels to the predicted labels. The decision tree shows confusion across all three engagement levels, frequently mistaking High engagement posts for Low.
confm_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(8, 6))
sns.heatmap(confm_dt, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Decision Tree Confusion Matrix')
plt.show()
KNN¶
After training the model with k=5 neighbors, it achieved an accuracy of approximately 0.307. This relatively low score suggests that the features used may not strongly predict engagement; improvements such as feature scaling, hyperparameter tuning, or using more neighbors could help.
print(f"Accuracy score: {accuracy_score(y_test, y_prediction)}")
Accuracy score: 0.307
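Feature scaling matters especially for KNN, which is distance-based: a feature like Views (in the millions) would dominate the Euclidean distance over Comments (in the thousands). A minimal sketch of the usual fix, a StandardScaler plus KNN pipeline, on synthetic stand-in data rather than our actual dataset:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic 3-class data in place of our real X and y
X, y = make_classification(n_samples=500, n_features=8, n_informative=4,
                           n_classes=3, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling first keeps large-magnitude features from dominating the
# distances KNN relies on; the pipeline applies it to train and test alike
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
scaled_knn.fit(X_tr, y_tr)
print(round(scaled_knn.score(X_te, y_te), 3))
```

Wrapping the scaler in a pipeline also avoids leaking test-set statistics into training, since the scaler is fit only on the training fold.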
To better understand how the choice of k affects model performance, we plotted the accuracy of the K-Nearest Neighbors (KNN) classifier for different values of k ranging from 1 to 20. The graph shows that accuracy fluctuates across different k values, but generally trends upward as k increases. Lower k values tend to result in more variance and slightly lower accuracy, while higher k values produce more stable and higher accuracy scores, peaking around k = 20 with an accuracy slightly above 34%. This suggests that for this dataset, using a higher number of neighbors leads to better generalization, although overall accuracy remains relatively modest.
k_values = range(1, 21)
accuracy_scores = []
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred_k = knn.predict(X_test)
accuracy_scores.append(accuracy_score(y_test, y_pred_k))
plt.figure(figsize=(8, 5))
plt.plot(k_values, accuracy_scores, marker='o', linestyle='dashed', color='b')
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy Score")
plt.title("KNN Accuracy vs. k-Values")
plt.xticks(k_values)
plt.show()
The classification report presents key performance metrics of the KNN model, helping us understand its effectiveness at predicting Engagement_Level.
report_df = pd.DataFrame(classification_report(y_test, y_prediction, output_dict=True)).T
report_df = report_df.drop(columns=['support'], errors='ignore')
plt.figure(figsize=(8, 6))
sns.heatmap(report_df, annot=True, cmap="Blues", fmt=".2f")
plt.title("Classification Report Heatmap - KNN test")
plt.show()
The next portion is the evaluation of the confusion matrix, a 3x3 table that compares the actual engagement labels to the predicted labels. As with the other models, KNN misclassifies posts across all three engagement levels, particularly confusing Medium and High.
confm_knn = confusion_matrix(y_test, y_prediction)
plt.figure(figsize=(8, 6))
sns.heatmap(confm_knn, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('KNN Confusion Matrix')
plt.show()
Model Comparison¶
This bar chart compares the performance of the Random Forest and Decision Tree models across four evaluation metrics: Accuracy, Precision, Recall, and F1-score. Both models achieved very similar results across all metrics, with Random Forest slightly outperforming Decision Tree in each category. This suggests that while both models handle the classification task comparably, Random Forest provides a small but consistent improvement in predictive performance, likely due to its ability to reduce overfitting by averaging multiple decision trees.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
rf_scores = [accuracy_score(y_test, y_pred),
precision_score(y_test, y_pred, average='weighted'),
recall_score(y_test, y_pred, average='weighted'),
f1_score(y_test, y_pred, average='weighted')]
dt_scores = [accuracy_score(y_test, y_pred_dt),
precision_score(y_test, y_pred_dt, average='weighted'),
recall_score(y_test, y_pred_dt, average='weighted'),
f1_score(y_test, y_pred_dt, average='weighted')]
x = np.arange(len(metrics))
width = 0.35
fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, rf_scores, width, label='Random Forest')
rects2 = ax.bar(x + width/2, dt_scores, width, label='Decision Tree')
ax.set_ylabel('Scores')
ax.set_title('Model Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
fig.tight_layout()
plt.show()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns  # Seaborn for styling
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
rf_scores = [accuracy_score(y_test, y_pred),
precision_score(y_test, y_pred, average='weighted'),
recall_score(y_test, y_pred, average='weighted'),
f1_score(y_test, y_pred, average='weighted')]
dt_scores = [accuracy_score(y_test, y_pred_dt),
precision_score(y_test, y_pred_dt, average='weighted'),
recall_score(y_test, y_pred_dt, average='weighted'),
f1_score(y_test, y_pred_dt, average='weighted')]
# Long-form frame so seaborn draws grouped (side-by-side) bars via hue;
# overlaying two barplot calls at the same x positions would hide one model
scores_df = pd.DataFrame({
    'Metric': metrics * 2,
    'Score': rf_scores + dt_scores,
    'Model': ['Random Forest'] * 4 + ['Decision Tree'] * 4,
})
# Set Seaborn style
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=scores_df, x='Metric', y='Score', hue='Model',
            palette=['skyblue', 'salmon'], ax=ax)
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('Model Comparison', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
# Remove spines
sns.despine()
# Add data labels above each bar
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                textcoords='offset points')
# Adjust layout for better spacing
plt.tight_layout()
plt.show()
Overview:¶
The random forest classifier consistently performed best across metrics, with an accuracy score reaching up to 0.36 after feature engineering. Introducing a new feature, Engagement_Rate (calculated as the sum of likes, shares, and comments divided by views), slightly improved performance, suggesting it helped capture information that individual features missed.
Here’s a breakdown of results:
Random Forest (post-feature engineering):
- Accuracy: ~0.333–0.36
- Strength: Captured complex feature interactions and minimized overfitting
- Weakness: Still struggled to accurately classify medium and high engagement posts

Decision Tree:
- Accuracy: ~0.325
- Strength: Easy to interpret and revealed key features like views, shares, comments, and likes
- Weakness: Lower overall performance, more prone to overfitting

K-Nearest Neighbors (k=5):
- Accuracy: ~0.307
- Strength: Simple and good baseline
- Weakness: Sensitive to k value; didn’t perform well with noisy or unscaled data
The confusion matrices and classification reports revealed that all models struggled to distinguish between medium and high engagement levels, likely due to overlapping characteristics. In contrast, low engagement posts were easier to classify correctly.
Answering our original question, “Can we predict whether a social media post will be highly engaging based on its metrics?”, our models suggest partial predictability. Engagement is influenced by more than just quantitative metrics; factors like posting time, influencer status, and platform algorithm trends (which weren’t available in our dataset) also likely play a major role.
Future improvements could include:
- Applying feature scaling (especially for KNN)
- Using ensemble models or gradient boosting
- Collecting more diverse data, including text or image features from the posts
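The gradient-boosting suggestion above could be sketched with scikit-learn's GradientBoostingClassifier. The example below runs on synthetic stand-in data (make_classification, not our engagement dataset), so it only illustrates the technique, not our actual results:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Synthetic 3-class data standing in for the engagement dataset
X, y = make_classification(n_samples=600, n_features=10, n_informative=5,
                           n_classes=3, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Boosting builds trees sequentially, each one correcting the errors of the
# ensemble so far, in contrast to the independently grown trees of a random forest
gb = GradientBoostingClassifier(random_state=42).fit(X_tr, y_tr)
print(round(gb.score(X_te, y_te), 3))
```

Whether boosting would beat the Random Forest on our data is an open question; it tends to help on tabular problems but needs tuning of learning rate and tree depth.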
Conclusion¶
This project gave us a deeper understanding of what makes social media posts go viral, but it also showed how difficult it is to predict virality. We found that engagement metrics like likes, shares, comments, and views play a big role in determining how well a post performs. However, we struggled to accurately predict posts with medium or high engagement, even with different models like Random Forest, Decision Tree, and K-Nearest Neighbors. Random Forest gave us the best results, though the accuracy wasn’t super high, which showed that working with data on viral trends is tricky. We also learned that adding features like ‘Engagement_Rate’ helped improve our model slightly, proving that good feature engineering is key. While we gained some useful insights, predicting virality is still really hard due to the many unpredictable factors that influence trends. Having more data or adding factors like timing or influencer status might help us improve predictions.
Storytelling¶
During the course of this project, we examined various factors to analyze the underlying patterns in viral trends across social media platforms, including TikTok, Instagram, Twitter, and YouTube. Our group focused on post engagement, content type, and region to uncover the key factors that contribute to a trend going viral. We aimed to develop a model that accurately predicts whether a post will go viral, helping content creators and marketers optimize their strategies. Our primary question was: What characteristics of a post are most strongly associated with high engagement, and can we predict whether a post will go viral based on these factors? To address this, we conducted classification and clustering analyses to understand the underlying patterns in the data.
Using classification techniques, we achieved an accuracy score of approximately 32.9% with a Random Forest Classifier. While modest, this score slightly outperformed the Decision Tree Classifier, which achieved an accuracy of 32.1%. Random Forest outperformed Decision Tree by approximately 0.8%, showing that ensemble methods and smarter feature engineering can lead to more reliable predictions — even when working with highly chaotic data like viral trends.
The relatively low overall accuracy reflects the inherent complexity and randomness of social media virality. Factors such as algorithm behavior, pop culture moments, trending topics, world events, and user behavior patterns all play critical roles — variables that are difficult to capture through basic post features alone. Although our model was able to learn some patterns (particularly distinguishing low-engagement posts), it struggled to reliably differentiate between medium and high engagement levels. This indicates that future modeling efforts would benefit from richer feature engineering — such as including sentiment analysis, posting time, influencer status, and trending hashtag usage — to better capture the nuanced realities of viral success.
Examining the confusion matrix for the Random Forest model, we observed that it correctly predicted Low Engagement posts more consistently than other categories. However, there was still significant confusion between Medium and High Engagement levels, with the model often misclassifying Medium posts as either Low or High, and vice versa. This suggests that while Low Engagement posts are easier to identify, Medium and High posts have more overlapping features, making them harder to separate.
In comparison, the Decision Tree confusion matrix showed heavier confusion across all engagement levels, particularly mistaking High Engagement posts as Low. While both models struggled to clearly separate Medium and High Engagement posts, Random Forest achieved more reliable and stable performance overall, reaffirming its advantage as a more robust and effective classifier.
An analysis of feature importance for both models revealed that shares, likes, comments, and views were the most influential factors in predicting engagement levels, while content type and platform had significantly less impact. This highlights that direct user interactions are more critical to a post's success than the medium or platform where it is posted. Notably, both models identified shares as the single most important feature, reinforcing the idea that active user participation in promoting content significantly drives engagement.
To improve the model's predictive performance, we engineered a new variable called ‘Engagement_Rate’, capturing the proportion of likes, shares, and comments relative to views. By introducing this normalized measure of post interaction, the model was able to slightly improve its performance, demonstrating the importance of thoughtful feature engineering when working with engagement data.
We also explored the K-Nearest Neighbors (KNN) algorithm, adjusting k-values from 1 to 20 to find the best fit. At low k-values (1–5), KNN exhibited chaotic behavior — accuracy swung wildly around 30–32%, showing the dangers of trusting too few neighbors. It was like predicting viral trends by asking a handful of random people. As k increased beyond 14, KNN stabilized, with accuracy climbing toward 34% at k=20. Larger k-values helped the model smooth out noise and find better patterns. However, even at its peak, KNN couldn’t outperform Random Forest. This experience taught us that proximity alone isn't enough — predicting virality requires understanding broader patterns like momentum, timing, and culture, not just surface-level similarity.
In the end, while the K-Nearest Neighbors (KNN) algorithm improved with more neighbors, ensemble models proved to be the stronger and more reliable choice for navigating the unpredictable waves of social media. The features most associated with high engagement include shares, likes, comments, and views for both models. Predicting whether a post will go viral is partially achievable, especially for low-engagement posts. However, further analysis on predicting medium and high engagement was challenging due to the random and complex nature of virality.
Impact¶
The impact of this project is that it offers meaningful insights into the patterns and features that drive the virality of various social media posts. Identifying which post characteristics, such as content types and hashtags, are linked to higher or lower engagement can guide social media creators to optimize their content for maximum engagement, which can be a great tool for influencers and promotional marketers. The findings can also shed light on what makes posts go viral across different social media platforms, so we can decipher the different trends that may be going on in each. However, the data available may encourage the manipulation of content just purely for engagement purposes, which can lead to a spread of low-quality or misleading content. Heavily focusing on our algorithms' predictions for creating content may also reduce the authenticity and creativity of content online that we truly appreciate. Balancing data-driven insights with ethical and inspired content creation can mitigate these risks and help encourage this data to be used in a positive light to help creators and marketers manage engagement for their content on their platforms in our fast-moving digital landscape.