Intro to Machine Learning Tools & Applications for Geoscience

import pandas as pd
import geopandas as gpd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

Data Preprocessing
#

Data cleaning
Data transformation
Feature selection
Data normalization/standardization & encoding
Train/test split

Download volcano data from GVP
#

Holocene & Pleistocene volcanoes from the Smithsonian Global Volcanism Program (GVP) databases.

https://volcano.si.edu/database/webservices.cfm

# request data from Smithsonian database
server = 'https://webservices.volcano.si.edu/geoserver/GVP-VOTW/ows?'
queries = {
    'holocene': 'service=WFS&request=GetFeature&typeName=GVP-VOTW:Smithsonian_VOTW_Holocene_Volcanoes&outputFormat=json',
    'pleistocene': 'service=WFS&request=GetFeature&typeName=GVP-VOTW:Smithsonian_VOTW_Pleistocene_Volcanoes&outputFormat=json'
}

# download data using geopandas
holocene_volcanoes = gpd.read_file(server + queries['holocene'])
pleistocene_volcanoes = gpd.read_file(server + queries['pleistocene'])

# combine the two dataframes
volcanoes = gpd.GeoDataFrame(pd.concat([holocene_volcanoes, pleistocene_volcanoes], ignore_index=True))

Data cleaning & transformation
#

# print the counts of the major rock types
volcanoes['Major_Rock_Type'].value_counts()

volcanoes

	id	Volcano_Number	Volcano_Name	Volcanic_Landform	Primary_Volcano_Type	Last_Eruption_Year	Country	Region	Subregion	Geological_Summary	...	Longitude	Elevation	Tectonic_Setting	Geologic_Epoch	Evidence_Category	Primary_Photo_Link	Primary_Photo_Caption	Primary_Photo_Credit	Major_Rock_Type	geometry
0	Smithsonian_VOTW_Holocene_Volcanoes.fid--71e2f...	210010	West Eifel Volcanic Field	Cluster	Volcanic field	-8300.0	Germany	European Volcanic Regions	Central European Volcanic Province	The West Eifel Volcanic Field of western Germa...	...	6.8500	600.0	Rift zone / Continental crust (> 25 km)	Holocene	Eruption Dated	https://volcano.si.edu/gallery/photos/GVP-0150...	The lake-filled Weinfelder maar is one of abou...	Photo by Richard Waitt, 1990 (U.S. Geological ...	Foidite	POINT (6.85 50.17)
1	Smithsonian_VOTW_Holocene_Volcanoes.fid--71e2f...	210020	Chaine des Puys	Cluster	Lava dome(s)	-4040.0	France	European Volcanic Regions	Western European Volcanic Province	The Chaîne des Puys, prominent in the history ...	...	2.9810	1464.0	Rift zone / Continental crust (> 25 km)	Holocene	Eruption Dated	https://volcano.si.edu/gallery/photos/GVP-0880...	The central part of the Chaîne des Puys volcan...	Photo by Ichio Moriya (Kanazawa University).	Basalt / Picro-Basalt	POINT (2.981 45.786)
2	Smithsonian_VOTW_Holocene_Volcanoes.fid--71e2f...	210030	Olot Volcanic Field	Cluster	Volcanic field	NaN	Spain	European Volcanic Regions	Western European Volcanic Province	The Olot volcanic field (also known as the Gar...	...	2.5300	893.0	Intraplate / Continental crust (> 25 km)	Holocene	Evidence Credible	https://volcano.si.edu/gallery/photos/GVP-1199...	The forested Volcà Montolivet scoria cone rise...	Photo by Puigalder (Wikimedia Commons).	Trachybasalt / Tephrite Basanite	POINT (2.53 42.17)
3	Smithsonian_VOTW_Holocene_Volcanoes.fid--71e2f...	210040	Calatrava Volcanic Field	Cluster	Volcanic field	-3600.0	Spain	European Volcanic Regions	Western European Volcanic Province	The Calatrava volcanic field lies in central S...	...	-4.0200	1117.0	Intraplate / Continental crust (> 25 km)	Holocene	Eruption Dated	https://volcano.si.edu/gallery/photos/GVP-1185...	Columba volcano, the youngest known vent of th...	Photo by Rafael Becerra Ramírez, 2006 (Univers...	Basalt / Picro-Basalt	POINT (-4.02 38.87)
4	Smithsonian_VOTW_Holocene_Volcanoes.fid--71e2f...	211004	Colli Albani	Caldera	Caldera	NaN	Italy	European Volcanic Regions	Italian Peninsula Volcanic Provinces	The Colli Albani (Alban Hills) complex immedia...	...	12.7251	949.0	Subduction zone / Continental crust (> 25 km)	Holocene	Evidence Uncertain	https://volcano.si.edu/gallery/photos/GVP-0881...	The lake-filled Albano maar is part of the Alb...	Photo by Ichio Moriya (Kanazawa University).	Foidite	POINT (12.7251 41.7569)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2655	Smithsonian_VOTW_Pleistocene_Volcanoes.fid--71...	274080	Y'Ami-North	Composite	Compound	NaN	Philippines	Western Pacific Volcanic Regions	Taiwan-Luzon Volcanic Arc	Submarine volcanic edifice with the islands of...	...	121.9420	213.0	NaN	Pleistocene	NaN	NaN	NaN	NaN	NaN	POINT (121.942 21.0868)
2656	Smithsonian_VOTW_Pleistocene_Volcanoes.fid--71...	274085	Hsiaolanyu	Composite	Compound	NaN	Taiwan	Western Pacific Volcanic Regions	Taiwan-Luzon Volcanic Arc	None	...	121.6123	148.0	NaN	Pleistocene	NaN	NaN	NaN	NaN	NaN	POINT (121.6123 21.9529)
2657	Smithsonian_VOTW_Pleistocene_Volcanoes.fid--71...	274090	Lutao	Composite	Compound	NaN	Taiwan	Western Pacific Volcanic Regions	Taiwan-Luzon Volcanic Arc	None	...	121.4927	268.0	NaN	Pleistocene	NaN	NaN	NaN	NaN	NaN	POINT (121.4927 22.658)
2658	Smithsonian_VOTW_Pleistocene_Volcanoes.fid--71...	315055	Addington Volcanic Field	Cluster	Volcanic field	NaN	United States	North America Volcanic Regions	Queen Charlotte Volcano Group	The submarine Addington Volcanic Field is abou...	...	-134.1700	-74.0	NaN	Pleistocene	NaN	NaN	NaN	NaN	NaN	POINT (-134.17 55.44)
2659	Smithsonian_VOTW_Pleistocene_Volcanoes.fid--71...	315100	Maclaren River Volcanic Field	Minor	Volcanic field	NaN	United States	North America Volcanic Regions	Northern Cordilleran Volcanic Province	As described by the Alaska Volcano Observatory...	...	-146.3200	1470.0	NaN	Pleistocene	NaN	NaN	NaN	NaN	NaN	POINT (-146.32 63.1369)

2660 rows × 21 columns

Feature selection
#

# filter for only andesite, basalt
volcanoes = volcanoes[volcanoes['Major_Rock_Type'].isin(['Andesite / Basaltic Andesite', 'Basalt / Picro-Basalt'])]

# select just the feature cols of interest
volcanoes = volcanoes[['Major_Rock_Type', 'Tectonic_Setting', 'Primary_Volcano_Type', 'Latitude', 'Longitude', 'Volcanic_Landform']]

# drop any that have missing data in the rows
volcanoes.dropna(inplace=True)

# view dataframe
volcanoes

	Major_Rock_Type	Tectonic_Setting	Primary_Volcano_Type	Latitude	Longitude	Volcanic_Landform
1	Basalt / Picro-Basalt	Rift zone / Continental crust (> 25 km)	Lava dome(s)	45.7860	2.9810	Cluster
3	Basalt / Picro-Basalt	Intraplate / Continental crust (> 25 km)	Volcanic field	38.8700	-4.0200	Cluster
9	Andesite / Basaltic Andesite	Subduction zone / Continental crust (> 25 km)	Stratovolcano	38.6380	15.0640	Composite
16	Andesite / Basaltic Andesite	Subduction zone / Continental crust (> 25 km)	Lava dome(s)	37.6186	23.3331	Minor (Basaltic)
22	Basalt / Picro-Basalt	Intraplate / Continental crust (> 25 km)	Shield	37.6700	39.8300	Shield
...	...	...	...	...	...	...
1252	Basalt / Picro-Basalt	Subduction zone / Oceanic crust (< 15 km)	Stratovolcano	-56.7120	-27.1760	Composite
1253	Andesite / Basaltic Andesite	Subduction zone / Oceanic crust (< 15 km)	Stratovolcano	-56.6560	-28.1400	Composite
1254	Basalt / Picro-Basalt	Subduction zone / Oceanic crust (< 15 km)	Stratovolcano	-56.3000	-27.5700	Composite
1257	Basalt / Picro-Basalt	Intraplate / Continental crust (> 25 km)	Shield	-64.1500	-57.7500	Shield
1268	Basalt / Picro-Basalt	Subduction zone / Continental crust (> 25 km)	Volcanic field	-45.2200	-73.0500	Cluster

933 rows × 6 columns

Data visualization
#

# plot volcanoes on map with plotly
fig = px.scatter_geo(volcanoes, lat='Latitude', lon='Longitude', hover_name='Major_Rock_Type', color='Major_Rock_Type')
fig.update_geos(projection_type='natural earth')
fig.update_layout(title='Pleistocene & Holocene Volcanoes by Rock Type')
fig.show()

Data normalization/standardization & encoding
#

# Preprocess the features and encode the target
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Latitude', 'Longitude']),
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         ['Tectonic_Setting', 'Volcanic_Landform', 'Primary_Volcano_Type'])
    ])

# separate features (X) and target (y)
X = volcanoes[['Latitude', 'Longitude', 'Tectonic_Setting', 'Volcanic_Landform', 'Primary_Volcano_Type']]
y = volcanoes['Major_Rock_Type']

# encode the target variable (Major_Rock_Type)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

Train/test split
#

# use 20% of the data for testing, shuffle to reduce bias
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, shuffle=True)

Model selection
#

preprocessor: A preprocessing step to transform the data before classification.
classifier: A Gradient Boosting classifier with the following hyperparameters:
- n_estimators: The number of boosting stages to be run (default=100).
- verbose: Controls the verbosity of the output (default=1).
- validation_fraction: The proportion of training data to set aside as validation set for early stopping (default=0.15).
- learning_rate: The learning rate shrinks the contribution of each tree by learning_rate (default=0.075).
- min_impurity_decrease: A node will be split if this split induces a decrease of the impurity greater than or equal to this value (default=0.01).
- max_depth: The maximum depth of the individual regression estimators (default=3).

# create a pipeline with preprocessor and classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, verbose=1, validation_fraction=0.15, learning_rate=0.075, min_impurity_decrease=0.01, max_depth=3))
])

Model training
#

# fit the model
model.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time
         1           1.3147            0.13s
         2           1.2710            0.11s
         3           1.2332            0.10s
         4           1.2006            0.09s
         5           1.1719            0.08s
         6           1.1471            0.08s
         7           1.1225            0.08s
         8           1.1028            0.08s
         9           1.0833            0.07s
        10           1.0657            0.07s
        20           0.9339            0.07s
        30           0.8741            0.06s
        40           0.8296            0.05s
        50           0.8015            0.04s
        60           0.7746            0.03s
        70           0.7547            0.02s
        80           0.7262            0.02s
        90           0.7085            0.01s
       100           0.6926            0.00s

Pipeline(steps=[('preprocessor',
             ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),
                                              [&#x27;Latitude&#x27;, &#x27;Longitude&#x27;]),
                                             (&#x27;cat&#x27;,
                                              OneHotEncoder(handle_unknown=&#x27;ignore&#x27;),
                                              [&#x27;Tectonic_Setting&#x27;,
                                               &#x27;Volcanic_Landform&#x27;,
                                               &#x27;Primary_Volcano_Type&#x27;])])),
            (&#x27;classifier&#x27;,
             GradientBoostingClassifier(learning_rate=0.075,
                                        min_impurity_decrease=0.01,
                                        validation_fraction=0.15,
                                        verbose=1))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-43" type="checkbox" ><label for="sk-estimator-id-43" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>Pipeline</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://scikit-learn.org/1.6/modules/generated/sklearn.pipeline.Pipeline.html">?<span>Documentation for Pipeline</span></a><span class="sk-estimator-doc-link fitted">i<span>Fitted</span></span></div></label><div class="sk-toggleable__content fitted"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,
             ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),
                                              [&#x27;Latitude&#x27;, &#x27;Longitude&#x27;]),
                                             (&#x27;cat&#x27;,
                                              OneHotEncoder(handle_unknown=&#x27;ignore&#x27;),
                                              [&#x27;Tectonic_Setting&#x27;,
                                               &#x27;Volcanic_Landform&#x27;,
                                               &#x27;Primary_Volcano_Type&#x27;])])),
            (&#x27;classifier&#x27;,
             GradientBoostingClassifier(learning_rate=0.075,
                                        min_impurity_decrease=0.01,
                                        validation_fraction=0.15,
                                        verbose=1))])</pre></div> </div></div><div class="sk-serial"><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-44" type="checkbox" ><label for="sk-estimator-id-44" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>preprocessor: ColumnTransformer</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://scikit-learn.org/1.6/modules/generated/sklearn.compose.ColumnTransformer.html">?<span>Documentation for preprocessor: ColumnTransformer</span></a></div></label><div class="sk-toggleable__content fitted"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),
                             [&#x27;Latitude&#x27;, &#x27;Longitude&#x27;]),
                            (&#x27;cat&#x27;, OneHotEncoder(handle_unknown=&#x27;ignore&#x27;),
                             [&#x27;Tectonic_Setting&#x27;, &#x27;Volcanic_Landform&#x27;,
                              &#x27;Primary_Volcano_Type&#x27;])])</pre></div> </div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-45" type="checkbox" ><label for="sk-estimator-id-45" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>num</div></div></label><div class="sk-toggleable__content fitted"><pre>[&#x27;Latitude&#x27;, &#x27;Longitude&#x27;]</pre></div> </div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-46" type="checkbox" ><label for="sk-estimator-id-46" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>StandardScaler</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.StandardScaler.html">?<span>Documentation for StandardScaler</span></a></div></label><div class="sk-toggleable__content fitted"><pre>StandardScaler()</pre></div> </div></div></div></div></div><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-47" type="checkbox" ><label for="sk-estimator-id-47" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>cat</div></div></label><div class="sk-toggleable__content fitted"><pre>[&#x27;Tectonic_Setting&#x27;, &#x27;Volcanic_Landform&#x27;, &#x27;Primary_Volcano_Type&#x27;]</pre></div> </div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-48" type="checkbox" ><label for="sk-estimator-id-48" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>OneHotEncoder</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.OneHotEncoder.html">?<span>Documentation for OneHotEncoder</span></a></div></label><div class="sk-toggleable__content fitted"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;)</pre></div> </div></div></div></div></div></div></div><div class="sk-item"><div class="sk-estimator fitted sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-49" type="checkbox" ><label for="sk-estimator-id-49" class="sk-toggleable__label fitted sk-toggleable__label-arrow"><div><div>GradientBoostingClassifier</div></div><div><a class="sk-estimator-doc-link fitted" rel="noreferrer" target="_blank" href="https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html">?<span>Documentation for GradientBoostingClassifier</span></a></div></label><div class="sk-toggleable__content fitted"><pre>GradientBoostingClassifier(learning_rate=0.075, min_impurity_decrease=0.01,
                       validation_fraction=0.15, verbose=1)</pre></div> </div></div></div></div></div></div>
Model evaluation
#
# eval the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# get predicted probabilities (needed for log loss)
y_pred_proba = model.predict_proba(X_test)

# calc log loss
loss = log_loss(y_test, y_pred_proba)

# print test metrics
print(f'Accuracy on Test Data: {accuracy * 100:.2f}%')
print(f'Log Loss on Test Data: {loss:.4f}')

# model info
print("\nModel details:", model.named_steps['classifier'])
Accuracy on Test Data: 80.75%
Log Loss on Test Data: 0.3957

Model details: GradientBoostingClassifier(learning_rate=0.075, min_impurity_decrease=0.01,
                           validation_fraction=0.15, verbose=1)
Confusion matrix
#
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Feature importance
#
# compute the feature importance
feature_names = model.named_steps['preprocessor'].transformers_[0][2] + list(model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out())

importance_scores = model.named_steps['classifier'].feature_importances_

# filter for features with importance > 5%
important_features = [(name, score) for name, score in zip(feature_names, importance_scores) if score > 0.05]

feature_names, importance_scores = zip(*important_features)

# Assuming you have feature_names and importance_scores
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importance_scores)
plt.title('Feature Importance')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

Data Preprocessing#

Download volcano data from GVP#

Data cleaning & transformation#

Feature selection#

Data visualization#

Data normalization/standardization & encoding#

Train/test split#

Model selection#

Model training#

Model evaluation#

Confusion matrix#

Feature importance#

Data Preprocessing
#

Download volcano data from GVP
#

Data cleaning & transformation
#

Feature selection
#

Data visualization
#

Data normalization/standardization & encoding
#

Train/test split
#

Model selection
#

Model training
#

Model evaluation
#

Confusion matrix
#

Feature importance
#