Home Gender Gap in Nature History Books (Method: Preferred)
Post
Cancel
an image of a line chart

Gender Gap in Nature History Books (Method: Preferred)

1 About the project

COMING SOON 🤓

2 Code

1
2
3
4
5
6
# Source: https://www.geeksforgeeks.org/python-gender-identification-by-name-using-nltk/
# importing libraries
import random
from nltk.corpus import names
import nltk
import pandas as pd

Book Depository

This is where I load the natural history writers' data. Here is the code for getting this data.

1
2
3
4
5
# Load the full Book Depository export, asking pandas to read every column as
# a string (dtype=str).
# To work with the 2021-only export instead, point the path at:
#   /Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Bookdepository/NaturalHistory-Bookdepository-2021.csv
bookdepository_csv = "/Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Bookdepository/NaturalHistory-Bookdepository-All.csv"
authors = pd.read_csv(bookdepository_csv, dtype=str)

# Preview the first five rows
authors.head(n=5)
Unnamed: 0authorstitlesdateyear
00Stephen HawkingA Brief History Of Time20 Jan 20152015
11James BowenA Street Cat Named Bob23 Jan 20132013
22Peter WohllebenThe Hidden Life of Trees22 Nov 20192019
33Raynor WinnThe Salt Path31 Jan 20192019
44Catherine D. HughesLittle Kids First Big Book of Dinosaurs16 Aug 20182018
1
2
3
4
5
6
7
8
9
10
11
12
13
# Split every author string on single spaces. Despite its name, the
# "FirstName" column holds the whole list of name parts; the given name is
# element 0 of that list.
name_parts = authors['authors'].str.split(" ")
authors['FirstName'] = name_parts
#authors['FirstName'][1][0] # 'Jeremy'

# Placeholder column for the predicted gender (empty string for now)
authors["Gender"] = ""

# Remove the "Unnamed: 0" column, which is not needed for the analysis
authors.drop(columns=['Unnamed: 0'], inplace=True)

# Preview the first five rows
authors.head(n=5)
authorstitlesdateyearFirstNameGender
0Stephen HawkingA Brief History Of Time20 Jan 20152015[Stephen, Hawking]
1James BowenA Street Cat Named Bob23 Jan 20132013[James, Bowen]
2Peter WohllebenThe Hidden Life of Trees22 Nov 20192019[Peter, Wohlleben]
3Raynor WinnThe Salt Path31 Jan 20192019[Raynor, Winn]
4Catherine D. HughesLittle Kids First Big Book of Dinosaurs16 Aug 20182018[Catherine, D., Hughes]

NLTK Prediction

I used the name dataset from here

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def gender_features(word):
    """Build the feature dict used by the gender classifier.

    The only feature is the final character of ``word``.

    Args:
        word: The name to extract features from.

    Returns:
        A dict ``{'last_letter': <last character of word>}``. For an empty
        string the value is ``''`` rather than raising ``IndexError``.
    """
    # Slicing (instead of indexing) keeps the result identical for non-empty
    # names while tolerating an empty string.
    return {'last_letter': word[-1:]}
  
# Locations of the two labelled name lists used as training examples.
female_names_path = '/Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Names/Dataset2/Female.txt'
male_names_path = '/Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Names/Dataset2/Male.txt'

# Pair every name with its class label: the 'Female' entries first, then 'Male'.
labeled_names = [(name, 'Female') for name in names.words(female_names_path)]
labeled_names += [(name, 'Male') for name in names.words(male_names_path)]

# Randomise the order so the split below is not grouped by label.
random.shuffle(labeled_names)

# Run each name through the feature extractor, keeping its label alongside.
featuresets = [
    (gender_features(example), label)
    for example, label in labeled_names
]

# The first 500 feature sets are held out as the test set;
# everything after them is used for training.
test_set = featuresets[:500]
train_set = featuresets[500:]

# Fit a naive Bayes classifier on the training portion.
# The author noted roughly 76% accuracy for this model.
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Accuracy check the author ran (note: scored on train_set); recorded value 0.7461467177257799
#print(nltk.classify.accuracy(classifier, train_set))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Fill authors["Gender"] with the classifier's prediction for each author's
# first name, or "Unknown" when the name is missing or not in the reference
# name list.

# Read the reference name list once and keep it as a set of whole names, so
# that membership is an exact match (a substring test against the raw file
# text would accept fragments such as "An" for "Anna") and the file is closed
# as soon as it has been read.
# NOTE(review): assumes AllNames.txt separates names with whitespace and/or
# commas - confirm against the actual file.
with open('/Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Names/Dataset2/AllNames.txt') as names_file:
    known_names = set(names_file.read().replace(",", " ").split())

predicted_genders = []
for name_parts in authors['FirstName']:  # iterate over rows
    # The first element of the split author string is taken as the first name.
    # A missing author is not a list (not subscriptable) and an empty list has
    # no element 0; both are treated as "no name".
    try:
        name = name_parts[0]
    except (TypeError, IndexError):
        name = ""

    if name and name in known_names:
        # Known name: ask the classifier
        gender = classifier.classify(gender_features(name))
        predicted_genders.append(str(gender))
    else:
        # Empty name, or name not in the reference list
        predicted_genders.append("Unknown")

# Assign the whole column at once instead of chained per-cell assignment
# (authors["Gender"][i] = ...), which pandas does not guarantee to write
# through to the frame.
authors["Gender"] = predicted_genders

print('Done')
1
2
# Preview the first five rows; the "Gender" column should now carry a
# predicted label (or "Unknown") for each author.
authors.head(n=5)
authorstitlesdateyearFirstNameGender
0Stephen HawkingA Brief History Of Time20 Jan 20152015[Stephen, Hawking]Male
1James BowenA Street Cat Named Bob23 Jan 20132013[James, Bowen]Male
2Peter WohllebenThe Hidden Life of Trees22 Nov 20192019[Peter, Wohlleben]Male
3Raynor WinnThe Salt Path31 Jan 20192019[Raynor, Winn]Male
4Catherine D. HughesLittle Kids First Big Book of Dinosaurs16 Aug 20182018[Catherine, D., Hughes]Female

Save the Result

1
2
# Write the labelled table to disk. With pandas' default index=True the row
# index is written as an unnamed first column.
gender_csv = '/Users/nat/Desktop/Code/Code Projects/Book-Gender/Data/Bookdepository/NaturalHistory-All-Gender.csv'
authors.to_csv(path_or_buf=gender_csv)

Create Stat & Plot All The Data

1
2
# You can observe the data with these commands
#authors.describe()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import plotly.express as px

# Number of rows per publication year. count() on the "Gender" column yields
# a Series indexed by year.
years_no = authors.groupby('year')['Gender'].count()

# Turn the Series into a two-column frame: "year" and "Gender" (the count).
df_years = years_no.reset_index()

# Exclude books whose publishing year is 2022, 2023 or 2024: the later two
# were still in the future, and 2022 had only just begun when this was written.
excluded_years = df_years["year"].str.contains("2022|2023|2024")
df_years = df_years[~excluded_years]

# Draw the yearly counts as a line chart
fig = px.line(
    df_years,
    x=df_years['year'],
    y=df_years['Gender'],
    title='Number of Published Book',
)
fig.show()

Graph1

Gender gap across all (retrieved) nature books published

1
2
3
4
5
6
7
8
9
10
# Tally the predicted labels once. Series.get(..., 0) returns 0 for a label
# that does not occur at all, where indexing with [...] would raise KeyError.
gender_counts = authors['Gender'].value_counts()
f = gender_counts.get('Female', 0)
m = gender_counts.get('Male', 0)
u = gender_counts.get('Unknown', 0)  # computed for reference; not part of `stat`

# One-row summary table; the "Unknown" count is deliberately left out.
stat = pd.DataFrame({
    'Female': [f],
    'Male': [m],
})
stat
FemaleMale
034685833
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from matplotlib import pyplot as plt

# Pie chart of the overall Female/Male split stored in `stat`.

# Wedge labels and colours, in the same order as the columns of `stat`.
mylabels = ["Female", "Male"]
mycolors = ["#34595a", "#a0c293"]

# controls default text size
plt.rc('font', size=15)

# plot size
#plt.rcParams["figure.figsize"] = [10, 15]

# set title font to size 50
plt.rc('axes', titlesize=50)

# plt.pie() needs a 1-D sequence of wedge sizes. `stat` is a one-row
# DataFrame (2-D), so pass its single row as a Series.
plt.pie(stat.iloc[0],
        labels=mylabels,
        autopct='%1.1f%%',
        colors=mycolors,
        wedgeprops={"edgecolor": "black",
                    'linewidth': 2,
                    'antialiased': True})
#plt.legend(loc='upper left')
#plt.title('Gender Gap')

# Save figure
#plt.savefig('/Users/nat/Desktop/gender-gap-All.png', dpi = 100)

# Display the graph onto the screen
plt.show()

Graph2

Gender Gap Yearly Analysis

1
2
3
4
5
6
7
8
# One sub-table per sampled year. The "year" column holds strings, so the
# comparison values are strings too.
year_column = authors['year']
df_2020 = authors.loc[year_column.eq('2020')]
df_2015 = authors.loc[year_column.eq('2015')]
df_2010 = authors.loc[year_column.eq('2010')]
df_2005 = authors.loc[year_column.eq('2005')]
df_2000 = authors.loc[year_column.eq('2000')]
df_1995 = authors.loc[year_column.eq('1995')]
df_1990 = authors.loc[year_column.eq('1990')]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def _gender_counts(frame):
    """Return the (Female, Male, Unknown) row counts of ``frame['Gender']``.

    Args:
        frame: A DataFrame with a "Gender" column.

    Returns:
        A 3-tuple of counts in the order Female, Male, Unknown. A label that
        does not occur in ``frame`` is reported as 0 instead of raising
        ``KeyError``.
    """
    counts = frame['Gender'].value_counts()
    return tuple(counts.get(label, 0) for label in ('Female', 'Male', 'Unknown'))


# Female / Male / Unknown counts for each sampled year.
f2, m2, u2 = _gender_counts(df_2020)

f7, m7, u7 = _gender_counts(df_2015)

f8, m8, u8 = _gender_counts(df_2010)

f9, m9, u9 = _gender_counts(df_2005)

f10, m10, u10 = _gender_counts(df_2000)

f11, m11, u11 = _gender_counts(df_1995)
1
2
3
4
5
6
7
8
# Collect the per-year counts into one table, newest year first.
sampled_years = [2020, 2015, 2010, 2005, 2000, 1995]
female_counts = [f2, f7, f8, f9, f10, f11]
male_counts = [m2, m7, m8, m9, m10, m11]

# The "Unknown" counts are not included in this table.
every_5 = pd.DataFrame(dict(
    Year=sampled_years,
    Female=female_counts,
    Male=male_counts,
))
every_5
YearFemaleMale
02020358465
12015263383
22010107225
320055899
420003159
519951219
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import plotly.graph_objects as go

# Start from an empty figure and add one line per gender.
fig = go.Figure()

# (column in every_5, marker colour) for each trace, in drawing order.
# The column name doubles as the legend entry.
for column, colour in (('Female', 'black'), ('Male', 'green')):
    fig.add_trace(go.Scatter(
        x=every_5['Year'],
        y=every_5[column],
        # Show only the count itself when hovering over a point
        hovertext=every_5[column],
        hoverinfo="text",
        marker={"color": colour},
        showlegend=True,
        line_width=3,
        name=column,
    ))

# Render as a static PNG image
fig.show(renderer="png")

Graph3

1
2
3
4
5
6
7
8
9
10
11
12
13
import plotly.express as px
import pandas as pd
import plotly.graph_objs as go

# Bar colours keyed by series name
bar_colours = {"Male": "RebeccaPurple",
               "Female": "MediumPurple"}

# The two count columns to draw, Male first
bar_series = [every_5['Male'], every_5['Female']]

# Bar chart of the Male and Female counts for each sampled year
fig = px.bar(
    every_5,
    x=every_5['Year'],
    y=bar_series,
    height=400,
    width=700,
    color_discrete_map=bar_colours,
    template="simple_white",
)

# Render as a static PNG image
fig.show(renderer="png")

Graph4

This post is licensed under CC BY 4.0 by the author.