import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA

# Read the college-majors CSV into a DataFrame and echo it so the columns
# and a sample of rows are visible before any processing.
df = pd.read_csv(filepath_or_buffer='data/all-ages.csv')
print(df)

     Major_code                                            Major  \
0          1100                              GENERAL AGRICULTURE   
1          1101            AGRICULTURE PRODUCTION AND MANAGEMENT   
2          1102                           AGRICULTURAL ECONOMICS   
3          1103                                  ANIMAL SCIENCES   
4          1104                                     FOOD SCIENCE   
..          ...                                              ...   
168        6211                           HOSPITALITY MANAGEMENT   
169        6212    MANAGEMENT INFORMATION SYSTEMS AND STATISTICS   
170        6299  MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION   
171        6402                                          HISTORY   
172        6403                            UNITED STATES HISTORY   

                      Major_category   Total  Employed  \
0    Agriculture & Natural Resources  128148     90245   
1    Agriculture & Natural Resources   95326     76865   
2    Agriculture & Natural Resources   33955     26321   
3    Agriculture & Natural Resources  103549     81177   
4    Agriculture & Natural Resources   24280     17281   
..                               ...     ...       ...   
168                         Business  200854    163393   
169                         Business  156673    134478   
170                         Business  102753     77471   
171        Humanities & Liberal Arts  712509    478416   
172        Humanities & Liberal Arts   17746     11887   

     Employed_full_time_year_round  Unemployed  Unemployment_rate  Median  \
0                            74078        2423           0.026147   50000   
1                            64240        2266           0.028636   54000   
2                            22810         821           0.030248   63000   
3                            64937        3619           0.042679   46000   
4                            12722         894           0.049188   62000   
..                             ...         ...                ...     ...   
168                         122499        8862           0.051447   49000   
169                         118249        6186           0.043977   72000   
170                          61603        4308           0.052679   53000   
171                         354163       33725           0.065851   50000   
172                           8204         943           0.073500   50000   

     P25th     P75th  
0    34000   80000.0  
1    36000   80000.0  
2    40000   98000.0  
3    30000   72000.0  
4    38500   90000.0  
..     ...       ...  
168  33000   70000.0  
169  50000  100000.0  
170  36000   83000.0  
171  35000   80000.0  
172  39000   81000.0  

[173 rows x 11 columns]

# Report how many missing entries each column has.
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Numeric columns used as clustering inputs.
features = [
    "Employed",
    "Unemployed",
    "Total",
    "Employed_full_time_year_round",
    "Median",
    "P25th",
    "P75th",
]

# Keep only the selected columns and discard any row with a missing value.
df_cluster = df[features].dropna()

# Fit the scaler on the selected columns, then transform the same data.
scaler = StandardScaler()
scaled_data = scaler.fit(df_cluster).transform(df_cluster)

# Wrap the scaled array in a DataFrame so the column names are kept.
df_scaled = pd.DataFrame(data=scaled_data, columns=features)

# Notebook display of the first rows of the unscaled selection.
df_cluster.head()

Missing values in each column:
Major_code                       0
Major                            0
Major_category                   0
Total                            0
Employed                         0
Employed_full_time_year_round    0
Unemployed                       0
Unemployment_rate                0
Median                           0
P25th                            0
P75th                            0
dtype: int64

# Histogram grid of every scaled feature.
# DataFrame.hist creates its own figure (sized via `figsize`), so no separate
# plt.figure() call is made here; a preceding one only produces an extra,
# empty figure.
df_scaled.hist(bins=20, figsize=(12, 8), layout=(3, 3))
# Title the whole figure rather than a single subplot of the grid.
plt.suptitle("Distribution of Scaled Features")
# Reserve a strip at the top so the figure title does not overlap the subplots.
plt.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()

<Figure size 1200x800 with 0 Axes>

# Pairwise correlations between the scaled features, drawn as an annotated
# heatmap with two-decimal labels.
corr_matrix = df_scaled.corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=corr_ax,
)
corr_ax.set_title("Correlation Matrix of College Majors Features (Scaled)")
plt.show()

# Elbow analysis: fit K-Means for k = 2..9 and record the inertia (SSE) of
# each fit so the curve can be inspected for a bend.
K_range = range(2, 10)
sse = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(df_scaled)
    sse.append(kmeans.inertia_)

elbow_fig, elbow_ax = plt.subplots(figsize=(6, 4))
elbow_ax.plot(K_range, sse, marker='x')
elbow_ax.set_title("K-Means Elbow Plot")
elbow_ax.set_xlabel("Number of Clusters (k)")
elbow_ax.set_ylabel("SSE (Sum of Squared Errors)")
plt.show()

# Final K-Means run with the chosen number of clusters.
k_opt = 4
kmeans = KMeans(n_clusters=k_opt, random_state=42).fit(df_scaled)
kmeans_labels = kmeans.labels_

# Store each row's cluster label next to the unscaled feature values.
df_cluster['KMeans_Cluster'] = kmeans_labels

# Silhouette score of the labelling, computed on the scaled features.
kmeans_silhouette = silhouette_score(df_scaled, kmeans_labels)
print(f"K-Means Silhouette Score (k={k_opt}): {kmeans_silhouette:.3f}")

K-Means Silhouette Score (k=4): 0.499

# Ward-linkage dendrogram of the scaled features, truncated to the last
# 12 merged clusters.
truncated_opts = dict(
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90,
    leaf_font_size=12,
    show_contracted=True,
)

plt.figure(figsize=(12, 6))
shc.dendrogram(shc.linkage(df_scaled, method='ward'), **truncated_opts)
plt.title("Truncated Dendrogram - Last 12")
plt.xlabel("Cluster Size")
plt.ylabel("Distance")
plt.show()

# Untruncated Ward-linkage dendrogram. Branches are coloured using a
# distance threshold of 6, and a dashed red line marks that same height.
ddata = shc.linkage(df_scaled, method='ward')
cut_height = 6

plt.figure(figsize=(12, 6))
shc.dendrogram(
    ddata,
    color_threshold=cut_height,
    leaf_rotation=90,
    leaf_font_size=12,
    show_contracted=True,
)
plt.axhline(y=cut_height, color='r', linestyle='--', label="Threshold")
plt.title("Dendrogram with Distance Threshold")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.legend()
plt.show()

# Box plot of the (unscaled) Median column, one box per K-Means cluster.
box_fig, box_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_cluster, x='KMeans_Cluster', y='Median', ax=box_ax)
box_ax.set_title("Box Plot of Median Salary by Cluster")
box_ax.set_xlabel("Cluster")
box_ax.set_ylabel("Median Salary")
plt.show()

# Project the scaled features onto two principal components and report the
# share of variance each component explains.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)

component_names = ['PC1', 'PC2']
pca_df = pd.DataFrame(pca_result, columns=component_names)

print(f"Explained Variance Ratio: PC1 = {pca.explained_variance_ratio_[0]*100:.1f}%, PC2 = {pca.explained_variance_ratio_[1]*100:.1f}%")

Explained Variance Ratio: PC1 = 56.3%, PC2 = 40.7%

# Scatter of the two principal components (no cluster colouring); the title
# repeats the explained-variance percentages.
pca_fig, pca_ax = plt.subplots(figsize=(8, 6))
pca_ax.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.7)
pca_ax.set_title(f"PCA Scatter Plot (PC1 vs PC2)\nExplained Variance: {pca.explained_variance_ratio_[0]*100:.2f}% and {pca.explained_variance_ratio_[1]*100:.2f}%")
pca_ax.set_xlabel("Principal Component 1")
pca_ax.set_ylabel("Principal Component 2")
pca_ax.grid(True)
plt.show()

# PCA scatter coloured by K-Means cluster.
# pca_df has a fresh 0..n-1 index, while df_cluster keeps the index it
# inherited from df after dropna(). Passing them to seaborn as separate
# Series aligns them by index label, so any dropped row would mislabel or
# drop points. Attach the labels by position instead; a length mismatch then
# raises immediately rather than plotting wrong colours.
pca_plot_df = pca_df.assign(
    KMeans_Cluster=df_cluster['KMeans_Cluster'].to_numpy()
)

plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=pca_plot_df,
    x='PC1',
    y='PC2',
    hue='KMeans_Cluster',
    palette='viridis',
    alpha=0.7,
)
plt.title("Clustered Scatter Plot - PCA Reduced Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title='Cluster')
plt.grid(True)
plt.show()

# Map the K-Means centroids back through the scaler so they are expressed in
# the original feature units, then show one heatmap row per cluster.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers_df = pd.DataFrame(data=cluster_centers, columns=features)

centers_fig, centers_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    cluster_centers_df,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=centers_ax,
)
centers_ax.set_title("Heatmap of Cluster Centers")
centers_ax.set_xlabel("Features")
centers_ax.set_ylabel("Cluster")
plt.show()

# Repeat the clustering while carrying the major name and category along, so
# cluster membership can be listed by name.
# Note: this rebinds features, scaler, scaled_data, df_scaled, k_opt, kmeans
# and kmeans_labels that earlier cells also define.
features = [
    "Major",
    "Employed",
    "Unemployed",
    "Total",
    "Employed_full_time_year_round",
    "Median",
    "P25th",
    "P75th",
    "Major_category",
]

df_cluster2 = df[features].dropna()

# Only the numeric columns are scaled; the two text columns are set aside.
numeric_part = df_cluster2.drop(columns=['Major', 'Major_category'])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_part)

df_scaled = pd.DataFrame(scaled_data, columns=numeric_part.columns)
df_scaled['Major'] = df_cluster2['Major'].values

k_opt = 4
kmeans = KMeans(n_clusters=k_opt, random_state=42)
# The name column is dropped again so K-Means only sees numeric inputs.
kmeans_labels = kmeans.fit_predict(df_scaled.drop(columns=['Major']))

df_cluster2['KMeans_Cluster'] = kmeans_labels

# One row per cluster, holding the list of majors assigned to it.
clusters = (
    df_cluster2
    .groupby('KMeans_Cluster')['Major']
    .apply(list)
    .reset_index()
)

for _row_pos, row in clusters.iterrows():
    print(f"Cluster {row['KMeans_Cluster']}:")
    for major in row['Major']:
        print(f" - {major}")
    print("\n")

Cluster 0:
 - COMPUTER AND INFORMATION SYSTEMS
 - INFORMATION SCIENCES
 - GENERAL ENGINEERING
 - AEROSPACE ENGINEERING
 - ARCHITECTURAL ENGINEERING
 - CHEMICAL ENGINEERING
 - CIVIL ENGINEERING
 - COMPUTER ENGINEERING
 - ELECTRICAL ENGINEERING
 - ENGINEERING MECHANICS PHYSICS AND SCIENCE
 - ENVIRONMENTAL ENGINEERING
 - GEOLOGICAL AND GEOPHYSICAL ENGINEERING
 - INDUSTRIAL AND MANUFACTURING ENGINEERING
 - MATERIALS ENGINEERING AND MATERIALS SCIENCE
 - MECHANICAL ENGINEERING
 - METALLURGICAL ENGINEERING
 - MINING AND MINERAL ENGINEERING
 - NAVAL ARCHITECTURE AND MARINE ENGINEERING
 - NUCLEAR ENGINEERING
 - PETROLEUM ENGINEERING
 - MISCELLANEOUS ENGINEERING
 - ENGINEERING AND INDUSTRIAL MANAGEMENT
 - ELECTRICAL ENGINEERING TECHNOLOGY
 - INDUSTRIAL PRODUCTION TECHNOLOGIES
 - MATHEMATICS
 - APPLIED MATHEMATICS
 - STATISTICS AND DECISION SCIENCE
 - MATHEMATICS AND COMPUTER SCIENCE
 - ASTRONOMY AND ASTROPHYSICS
 - GEOLOGY AND EARTH SCIENCE
 - PHYSICS
 - MATERIALS SCIENCE
 - CONSTRUCTION SERVICES
 - TRANSPORTATION SCIENCES AND TECHNOLOGIES
 - PHARMACY PHARMACEUTICAL SCIENCES AND ADMINISTRATION
 - ACTUARIAL SCIENCE
 - OPERATIONS LOGISTICS AND E-COMMERCE
 - BUSINESS ECONOMICS
 - MANAGEMENT INFORMATION SYSTEMS AND STATISTICS


Cluster 1:
 - GENERAL AGRICULTURE
 - AGRICULTURE PRODUCTION AND MANAGEMENT
 - AGRICULTURAL ECONOMICS
 - ANIMAL SCIENCES
 - FOOD SCIENCE
 - PLANT SCIENCE AND AGRONOMY
 - SOIL SCIENCE
 - MISCELLANEOUS AGRICULTURE
 - ENVIRONMENTAL SCIENCE
 - FORESTRY
 - NATURAL RESOURCES MANAGEMENT
 - ARCHITECTURE
 - AREA ETHNIC AND CIVILIZATION STUDIES
 - JOURNALISM
 - MASS MEDIA
 - ADVERTISING AND PUBLIC RELATIONS
 - COMMUNICATION TECHNOLOGIES
 - COMPUTER PROGRAMMING AND DATA PROCESSING
 - COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY
 - COMPUTER NETWORKING AND TELECOMMUNICATIONS
 - COSMETOLOGY SERVICES AND CULINARY ARTS
 - EDUCATIONAL ADMINISTRATION AND SUPERVISION
 - SCHOOL STUDENT COUNSELING
 - MATHEMATICS TEACHER EDUCATION
 - PHYSICAL AND HEALTH EDUCATION TEACHING
 - EARLY CHILDHOOD EDUCATION
 - SCIENCE AND COMPUTER TEACHER EDUCATION
 - SECONDARY TEACHER EDUCATION
 - SPECIAL NEEDS EDUCATION
 - SOCIAL SCIENCE OR HISTORY TEACHER EDUCATION
 - TEACHER EDUCATION: MULTIPLE LEVELS
 - LANGUAGE AND DRAMA EDUCATION
 - ART AND MUSIC EDUCATION
 - MISCELLANEOUS EDUCATION
 - BIOLOGICAL ENGINEERING
 - BIOMEDICAL ENGINEERING
 - ENGINEERING TECHNOLOGIES
 - MECHANICAL ENGINEERING RELATED TECHNOLOGIES
 - MISCELLANEOUS ENGINEERING TECHNOLOGIES
 - LINGUISTICS AND COMPARATIVE LANGUAGE AND LITERATURE
 - FRENCH GERMAN LATIN AND OTHER COMMON FOREIGN LANGUAGE STUDIES
 - OTHER FOREIGN LANGUAGES
 - FAMILY AND CONSUMER SCIENCES
 - COURT REPORTING
 - PRE-LAW AND LEGAL STUDIES
 - COMPOSITION AND RHETORIC
 - HUMANITIES
 - LIBRARY SCIENCE
 - BIOCHEMICAL SCIENCES
 - BOTANY
 - MOLECULAR BIOLOGY
 - ECOLOGY
 - GENETICS
 - MICROBIOLOGY
 - PHARMACOLOGY
 - PHYSIOLOGY
 - ZOOLOGY
 - NEUROSCIENCE
 - MISCELLANEOUS BIOLOGY
 - MILITARY TECHNOLOGIES
 - MULTI/INTERDISCIPLINARY STUDIES
 - INTERCULTURAL AND INTERNATIONAL STUDIES
 - NUTRITION SCIENCES
 - COGNITIVE SCIENCE AND BIOPSYCHOLOGY
 - INTERDISCIPLINARY SOCIAL SCIENCES
 - PHYSICAL FITNESS PARKS RECREATION AND LEISURE
 - PHILOSOPHY AND RELIGIOUS STUDIES
 - THEOLOGY AND RELIGIOUS VOCATIONS
 - PHYSICAL SCIENCES
 - ATMOSPHERIC SCIENCES AND METEOROLOGY
 - CHEMISTRY
 - GEOSCIENCES
 - OCEANOGRAPHY
 - MULTI-DISCIPLINARY OR GENERAL SCIENCE
 - NUCLEAR, INDUSTRIAL RADIOLOGY, AND BIOLOGICAL TECHNOLOGIES
 - EDUCATIONAL PSYCHOLOGY
 - CLINICAL PSYCHOLOGY
 - COUNSELING PSYCHOLOGY
 - INDUSTRIAL AND ORGANIZATIONAL PSYCHOLOGY
 - SOCIAL PSYCHOLOGY
 - MISCELLANEOUS PSYCHOLOGY
 - PUBLIC ADMINISTRATION
 - PUBLIC POLICY
 - HUMAN SERVICES AND COMMUNITY ORGANIZATION
 - SOCIAL WORK
 - GENERAL SOCIAL SCIENCES
 - ANTHROPOLOGY AND ARCHEOLOGY
 - CRIMINOLOGY
 - GEOGRAPHY
 - INTERNATIONAL RELATIONS
 - MISCELLANEOUS SOCIAL SCIENCES
 - ELECTRICAL, MECHANICAL, AND PRECISION TECHNOLOGIES AND PRODUCTION
 - DRAMA AND THEATER ARTS
 - MUSIC
 - VISUAL AND PERFORMING ARTS
 - FILM VIDEO AND PHOTOGRAPHIC ARTS
 - ART HISTORY AND CRITICISM
 - STUDIO ARTS
 - MISCELLANEOUS FINE ARTS
 - GENERAL MEDICAL AND HEALTH SERVICES
 - COMMUNICATION DISORDERS SCIENCES AND SERVICES
 - HEALTH AND MEDICAL ADMINISTRATIVE SERVICES
 - MEDICAL ASSISTING SERVICES
 - MEDICAL TECHNOLOGIES TECHNICIANS
 - HEALTH AND MEDICAL PREPARATORY PROGRAMS
 - TREATMENT THERAPY PROFESSIONS
 - COMMUNITY AND PUBLIC HEALTH
 - MISCELLANEOUS HEALTH MEDICAL PROFESSIONS
 - HUMAN RESOURCES AND PERSONNEL MANAGEMENT
 - INTERNATIONAL BUSINESS
 - HOSPITALITY MANAGEMENT
 - MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION
 - UNITED STATES HISTORY


Cluster 2:
 - COMMUNICATIONS
 - COMPUTER SCIENCE
 - GENERAL EDUCATION
 - ELEMENTARY EDUCATION
 - ENGLISH LANGUAGE AND LITERATURE
 - LIBERAL ARTS
 - BIOLOGY
 - CRIMINAL JUSTICE AND FIRE PROTECTION
 - ECONOMICS
 - POLITICAL SCIENCE AND GOVERNMENT
 - SOCIOLOGY
 - FINE ARTS
 - COMMERCIAL ART AND GRAPHIC DESIGN
 - MARKETING AND MARKETING RESEARCH
 - FINANCE
 - HISTORY


Cluster 3:
 - PSYCHOLOGY
 - NURSING
 - GENERAL BUSINESS
 - ACCOUNTING
 - BUSINESS MANAGEMENT AND ADMINISTRATION

def _top_major(majors):
    """Return the first entry of value_counts() for a cluster's majors."""
    return majors.value_counts().index[0]


# Pick one representative major per cluster and plot it against the cluster id.
# NOTE(review): if every major name occurs only once, all counts tie and the
# "most common" major is just whichever value_counts() lists first; the bar
# position also encodes the cluster id rather than a frequency -- confirm this
# chart conveys what is intended.
most_common_majors = (
    df_cluster2
    .groupby('KMeans_Cluster')['Major']
    .agg(_top_major)
    .reset_index()
)

common_fig, common_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=most_common_majors, x='KMeans_Cluster', y='Major', ax=common_ax)
common_ax.set_title("Most Common Majors within Each Cluster")
common_ax.set_xlabel("Cluster")
common_ax.set_ylabel("Most Common Major")
plt.show()

# Count of majors per category, split by cluster. Categories are ordered by
# their overall frequency (value_counts order).
category_order = df_cluster2['Major_category'].value_counts().index

category_fig, category_ax = plt.subplots(figsize=(14, 8))
sns.countplot(
    data=df_cluster2,
    y='Major_category',
    hue='KMeans_Cluster',
    order=category_order,
    palette='viridis',
    ax=category_ax,
)
category_ax.set_title("Distribution of Majors within Each Category Group by Cluster")
category_ax.set_xlabel("Count")
category_ax.set_ylabel("Major Category")
category_ax.legend(title='Cluster')
plt.show()

Project 4 College Majors and Clustering¶

Overall Problem¶

Importing Libraries and Dependencies¶

Loading the Dataset¶

Data Preparation and Preprocessing¶

Explanation of Clustering¶

The Data¶

Data Understanding and Visualization¶

Distribution of Scaled Features¶

Correlation Matrix¶

K-Means Clustering¶

K-Means Clustering with Optimal Clusters¶

Truncated Dendrogram¶

Full Dendrogram with Distance Threshold¶

Salary Box Plot¶

PCA (Principal Component Analysis)¶

PCA Scatter Plot¶

PCA Scatter Plot with reduced PCA data¶

Heatmap of Cluster Centers¶

Information Regarding the Majors¶

Most Common Majors within Each Cluster¶

Distribution of Majors within Each Category Group by Cluster¶

Clustering Analysis¶

Impact¶

Ethical Impact¶

Economic Impact¶

Negative Impacts¶

References¶

Code¶

	Employed	Unemployed	Total	Employed_full_time_year_round	Median	P25th	P75th
0	90245	2423	128148	74078	50000	34000	80000.0
1	76865	2266	95326	64240	54000	36000	80000.0
2	26321	821	33955	22810	63000	40000	98000.0
3	81177	3619	103549	64937	46000	30000	72000.0
4	17281	894	24280	12722	62000	38500	90000.0

Project 4 College Majors and Clustering¶

Overall Problem¶

Importing Libraries and Dependencies¶

Loading the Dataset¶

Data Preparation and Preprocessing¶

Explanation of Clustering¶

The Data¶

Data Understanding and Visualization¶

Distribution of Scaled Features¶

Correlation Matrix¶

K-Means Clustering¶

K-Means Clustering with Optimal Clusters¶

Truncated Dendrogram¶

Full Dendrogram with Distance Threshold¶

Salary Box Plot¶

PCA (Principal Component Analysis)¶

PCA Scatter Plot¶

PCA Scatter Plot with reduced PCA data¶

Heatmap of Cluster Centers¶

Information Regarding the Majors¶

Most Common Majors within Each Cluster¶

Distribution of Majors within Each Category Group by Cluster¶

Clustering Analysis¶

Impact¶

Social Impact¶

Ethical Impact¶

Economic Impact¶

Negative Impacts¶

References¶

Code¶