# Importing libraries

# Data wrangling and analysis
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# Visualization
import matplotlib.pyplot as plt
import squarify as sq
import seaborn as sns

# Appendix
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load the raw export once and keep it untouched in `original_data`.
# `data` is an independent copy, so in-place edits applied to it later
# (such as `rename(..., inplace=True)`) do not leak into the original frame.
original_data = pd.read_csv('./fediverse-client-alt-text-data-2024-05-13.csv')
data = original_data.copy()
data

# Dataset dimensions: `shape` is (rows, columns) and `size` is rows * columns
data_size = original_data.size
data_rows = original_data.shape[0]
data_columns = original_data.shape[1]

dimensions_report = f'''
Number of rows:			{data_rows}
Number of columns:		{data_columns}
Data size (rows×columns):	{data_size}
'''
print(dimensions_report)

Number of rows:			266
Number of columns:		8
Data size (rows×columns):	2128

# Shorter column names used by the rest of the notebook (atxt = alt text)
column_aliases = {
	'status_count': 'posts',
	'descriptions_all_count': 'atxt_yes',
	'descriptions_all_percent': 'atxt_yes_pct',
	'descriptions_some_count': 'atxt_some',
	'descriptions_some_percent': 'atxt_some_pct',
	'descriptions_none_count': 'atxt_no',
	'descriptions_none_percent': 'atxt_no_pct',
}
data.rename(columns=column_aliases, inplace=True)

## Utilities

# Set theme colors
# Three shades of the same blue, as hex RGB strings, reused by the plots below.
blue = '#355766'  # darkest shade
light_blue = '#8EB7CA'  # middle shade
lighter_blue = '#D0E7F1'  # lightest shade

# Running counter behind the "Figure N" captions
figure_count = 0
def figure_caption_generator():
	"""Stamp the current axes with the next sequential "Figure N" label.

	Increments the module-level ``figure_count`` and draws the label at
	axes-fraction coordinates (1, 1), anchored by its top-right corner,
	in white text on a black background.

	Returns:
		The text artist returned by ``plt.text``.
	"""
	global figure_count
	figure_count = figure_count + 1
	label_style = {
		'ha': 'right',
		'va': 'top',
		'transform': plt.gca().transAxes,
		'backgroundcolor': 'black',
		'color': 'white',
	}
	return plt.text(1, 1, f'Figure {figure_count}', **label_style)

# Data type and presence of missing values, one row per column
summary = pd.DataFrame({
	'Data Type': original_data.dtypes,
	# Per-column flag. A bare `any(...)` over the null counts collapses the
	# whole dataset into a single boolean that is broadcast to every row.
	'Missing values?': original_data.isnull().any()
})

# Descriptive statistics for numerical columns, joined on the column name.
# Non-numerical columns keep NaN in the statistics because of the left join.
numerical_summary = original_data.describe()
summary = summary.merge(numerical_summary.T, left_index=True, right_index=True, how='left')
summary

# Double quotes inside the single-quoted f-string: reusing the outer quote
# character is a SyntaxError on Python versions before 3.12.
print(f'Are there any duplicated clients? {original_data["client"].duplicated().any()}')

Are there any duplicated clients? False

# Order the clients from the highest to the lowest number of posts
data = data.sort_values(by='posts', ascending=False)

# Show the dtype pandas assigned to the client names
print(f'Client data type: {data["client"].dtype}')

Client data type: object

# Store the client names with the dedicated pandas string dtype (they were objects)
data = data.astype({'client': 'string'})
print(f'Client data type: {data["client"].dtype}')

Client data type: string

# Keep one decimal place in every percentage column
pct_columns = ['atxt_yes_pct', 'atxt_some_pct', 'atxt_no_pct']
data[pct_columns] = data[pct_columns].round(1)

# Remove, in place, the rows whose client name is exactly 'unknown'
is_unknown = data['client'] == 'unknown'
data.drop(index=data[is_unknown].index, inplace=True)

# Number of rows currently in `data`, used below as the total number of clients
clients = len(data)
clients

265

# Dataset-wide totals: all posts, and posts per alt-text category
posts_tot, atxt_yes_tot, atxt_some_tot, atxt_no_tot = (
	data[column].sum() for column in ('posts', 'atxt_yes', 'atxt_some', 'atxt_no')
)

# Share of each category over all posts, as a percentage with one decimal
atxt_yes_tot_pct, atxt_some_tot_pct, atxt_no_tot_pct = (
	round(category_total / posts_tot * 100, 1)
	for category_total in (atxt_yes_tot, atxt_some_tot, atxt_no_tot)
)

# The three categories are expected to add up to the total number of posts
if atxt_yes_tot + atxt_some_tot + atxt_no_tot != posts_tot:
	print('Something is wrong in the dataset. Please revise the number of posts!')
else:
	print('Posts count is matching!')

Posts count is matching!

# The dataset already includes percentages. Recompute each of them from the
# raw counts and compare the result with the stored (already rounded) value.
# All three percentage columns are checked, and a mismatch is reported
# instead of being silently ignored.
pct_checks = {
	'atxt_yes_pct': 'atxt_yes',
	'atxt_some_pct': 'atxt_some',
	'atxt_no_pct': 'atxt_no',
}
pct_mismatches = [
	pct_column
	for pct_column, count_column in pct_checks.items()
	if not data[pct_column].equals(round(data[count_column] / data['posts'] * 100, 1))
]

if not pct_mismatches:
	print('The percentages in the dataset are accurate!')
else:
	print(f'The percentages in these columns do not match the counts: {", ".join(pct_mismatches)}')

The percentages in the dataset are accurate!

# Headline numbers for the whole dataset: totals and their share of all posts
totals_report = f'''
Total number of posts under analysis: {posts_tot}
	of which {atxt_yes_tot} ({atxt_yes_tot_pct}%) contain an alt-text for all images,
	{atxt_some_tot} ({atxt_some_tot_pct}%) for some,
	and {atxt_no_tot} ({atxt_no_tot_pct}%) have no alt-text at all.
'''
print(totals_report)

Total number of posts under analysis: 28428
	of which 5346 (18.8%) contain an alt-text for all images,
	24 (0.1%) for some,
	and 23058 (81.1%) have no alt-text at all.

# Posts with alt text on only some of their images are left out from here on:
# take them out of each client's post count...
data['posts'] = data['posts'].sub(data['atxt_some'])

# ...and drop the two columns that describe them
data.drop(['atxt_some', 'atxt_some_pct'], axis=1, inplace=True)

# Labels and colors shared by every alt-text pie chart in the notebook
pie_labels = ['Alt-text for all images', 'No alt-text']
pie_colors = [lighter_blue, light_blue]

# Dataset-wide split between posts with alt text on all images and posts with none
plt.figure(figsize=(15, 5), dpi=300)
plt.pie([atxt_yes_tot, atxt_no_tot], labels=pie_labels, colors=pie_colors, autopct='%1.1f%%')
plt.title('Alt text usage in posts', fontweight='bold')
figure_caption_generator()
plt.show()

# Distribution of the per-client post counts, drawn on a logarithmic x axis
plt.figure(figsize=(10, 5), dpi=300)
boxen_ax = sns.boxenplot(x=data['posts'], color=lighter_blue)
boxen_ax.set_xscale('log')
boxen_ax.set_title('Distribution of Posts Across Clients (Log Scale)', fontsize=14, fontweight='bold')
boxen_ax.set_xlabel('Log of Posts Across Clients')
boxen_ax.set_yticks([])  # only one distribution is drawn, so the y axis has no ticks
plt.tight_layout()
figure_caption_generator()
plt.show()

# Summary statistics of the per-client post counts
data['posts'].describe()

count	 265.000000
mean		107.184906
std		 639.324886
min		 1.000000
25%		 2.000000
50%		 8.000000
75%		39.000000
max		8265.000000
Name: posts, dtype: float64

# One point per client: number of posts against share of posts with alt text
plt.figure(figsize=(10, 5), dpi=300)
plt.scatter(
	x = 'posts',
	y = 'atxt_yes_pct',
	data = data,
	# Use the shared theme constant rather than a hand-typed hex value, so
	# this plot matches the palette of the other figures.
	color = blue,
	alpha = 0.5
)
plt.title('Posts count per client vs percentage of posts with alt text per client', fontweight='bold')
plt.xlabel('Number of posts per client')
plt.ylabel('Percentage of posts with alt text per client')
plt.tight_layout()
figure_caption_generator()
plt.show()

# Pearson and Spearman correlation between post volume and alt-text share.
# Each SciPy call returns the coefficient first; the second value is the p-value.
posts_per_client = data['posts']
alt_text_share = data['atxt_yes_pct']
pcc, pearson_pct = pearsonr(posts_per_client, alt_text_share)
scc, spearman_pct = spearmanr(posts_per_client, alt_text_share)

print(f'Pearson correlation coefficient: {round(pcc,2)}')
print(f'Spearman correlation coefficient: {round(scc,2)}')

Pearson correlation coefficient: -0.07
Spearman correlation coefficient: 0.02

# How many clients share each post count:
# index = posts per client, value = number of clients with that many posts
posts_count = data['posts'].value_counts().sort_index(ascending=False)

plt.figure(figsize=(20, 20), dpi=300)
bars = posts_count.plot(
	kind = 'barh',
	color = blue
)

# Write the number of clients at the end of every bar
for idx, count in enumerate(posts_count):
	plt.text(
		count,
		idx,
		str(count),
		ha = 'left',
		va = 'center',
	)

# Add labels and title.
# In this horizontal bar chart the bar length (x axis) is the number of
# clients and the categories (y axis) are the post counts.
plt.xlabel('Number of clients', fontsize=18)
plt.ylabel('Number of posts', fontsize=18)
plt.title('Frequency of total posts counted by client', fontsize=24, fontweight='bold')
figure_caption_generator()
plt.show()

# `mode()` can return several values; keep only the first one
posts_modes = data['posts'].mode()
mode = posts_modes.iloc[0]
print(f'Mode: {mode}')

Mode: 1

# Determining arbitrary post numbers thresholds, based on previous graphs
top = 1000
worse = 4

# Least popular clients: `worse` posts or fewer (the threshold is inclusive)
data_clients_least = data[data['posts'] <= worse]
posts_clients_least = data_clients_least['posts'].sum()
clients_least = len(data_clients_least)
posts_clients_least_pct = posts_clients_least/posts_tot*100

# The wording matches the inclusive `<=` filter above
print(f'''
Total of clients: {clients}.	Clients containing {worse} posts or fewer: {clients_least}.
Total of posts: {posts_tot}.	Total of posts from the {clients_least} less popular clients: {posts_clients_least}.
Around {round(posts_clients_least_pct,1)}% of the posts come from the {round(clients_least/clients*100)}% least popular clients.
''')

Total of clients: 265.	Clients containing less than 4 posts: 115.
Total of posts: 28428.	Total of posts from the 115 less popular clients: 224.
Around 0.8% of the posts comes from the 43% least popular clients.

# Most popular clients: `top` posts or more (the threshold is inclusive)
data_clients_top = data[data['posts'] >= top]
posts_clients_top = data_clients_top['posts'].sum()
clients_top = len(data_clients_top)
posts_clients_top_pct = posts_clients_top/posts_tot*100

# The wording matches the inclusive `>=` filter above
print(f'''
Total of clients: {clients}.	Clients containing at least {top} posts: {clients_top}.
Total of posts: {posts_tot}.	Total of posts from top {clients_top} clients: {posts_clients_top}.
Around {round(posts_clients_top_pct)}% of the posts come from the top {round(clients_top/clients*100)}% of the clients.
''')

Total of clients: 265.	Clients containing more than 1000 posts: 5.
Total of posts: 28428.	Total of posts from top 5 clients: 18352.
Around 65% of the posts come from the top 2% of the clients.

# Double quotes inside the single-quoted f-string: reusing the outer quote
# character is a SyntaxError on Python versions before 3.12.
print(f'There are {len(data[data["atxt_yes"] == 0])} clients whose posts do not have any alt text.')

There are 138 clients whose posts do not have any alt text.

# Alt-text totals inside the least popular and the most popular client groups
clients_least_atxt_yes_tot = data_clients_least['atxt_yes'].sum()
clients_least_atxt_no_tot = data_clients_least['atxt_no'].sum()
clients_top_atxt_yes_tot = data_clients_top['atxt_yes'].sum()
clients_top_atxt_no_tot = data_clients_top['atxt_no'].sum()

# One pie per group: (title, posts with alt text, posts without alt text)
pie_panels = [
	('Whole dataset', atxt_yes_tot, atxt_no_tot),
	(f'Least popular clients (≤{worse} posts)', clients_least_atxt_yes_tot, clients_least_atxt_no_tot),
	(f'Most popular clients (≥{top} posts)', clients_top_atxt_yes_tot, clients_top_atxt_no_tot),
]

plt.figure(figsize=(15,5), dpi=300)
for panel_position, (panel_title, with_alt, without_alt) in enumerate(pie_panels, start=1):
	plt.subplot(1, 3, panel_position)
	plt.pie([with_alt, without_alt], labels=pie_labels, colors=pie_colors, autopct='%1.1f%%')
	plt.title(panel_title, fontweight='bold')

plt.suptitle('Alt text usage, grouped by client popularity', fontweight='bold', fontsize=22)
figure_caption_generator()
plt.show()

# Stacked horizontal bars for the top clients, in the row order of
# `data_clients_top`: posts with alt text first, posts without stacked after them.
top_client_names = data_clients_top['client']
top_with_alt = data_clients_top['atxt_yes']
top_without_alt = data_clients_top['atxt_no']

plt.figure(figsize=(10, 5), dpi=300)
plt.barh(top_client_names, top_with_alt, color=blue, label='With Alt Text')
plt.barh(top_client_names, top_without_alt, left=top_with_alt, color=light_blue, label='Without Alt Text')

plt.xlabel('Number of posts')
plt.ylabel('Clients')
plt.title('Number of posts with and without alt text in top 5 clients',fontweight='bold')
plt.legend()
figure_caption_generator()
plt.show()

# Top clients ordered by their share of posts with alt text
data_clients_top_sortpct = data_clients_top.sort_values(by='atxt_yes_pct')

plt.figure(figsize=(10, 5), dpi=300)
data_clients_top_barchart = plt.barh(
	data_clients_top_sortpct['client'],
	data_clients_top_sortpct['atxt_yes_pct'],
	label = 'Percentage of alt text use by client',
	color = blue
)
plt.xlim(0, 100)

# Reference line at the dataset-wide share of posts with alt text
plt.axvline(
	x = atxt_yes_tot_pct,
	color = light_blue,
	linestyle = ':',
	label = f'{atxt_yes_tot_pct}%, Average alt text use'
)

# Write each percentage near the left edge of its bar, vertically centered
for bar in data_clients_top_barchart:
	vertical_position = bar.get_y() + bar.get_height() / 2
	plt.text(
		2,
		vertical_position,
		f'{bar.get_width()}%',
		color = '#FFF',
		ha = 'left',
		va = 'center'
	)

# The x axis shows percentages (0-100), not post counts
plt.xlabel('Percentage of posts with alt text')
plt.ylabel('Clients')
plt.title('Alt text usage percentage in top 5 clients', fontweight='bold')
plt.legend()
figure_caption_generator()
plt.show()

# Keep only the top clients with at least one post with alt text (presumably
# because a zero-sized tile cannot be drawn in the treemap - TODO confirm).
# `.copy()` makes the filtered frame independent, so adding the `label`
# column below does not trigger pandas' SettingWithCopyWarning.
data_clients_top = data_clients_top[data_clients_top['atxt_yes'] > 0].copy()

# Text shown inside each tile: client name, counts and percentage
data_clients_top['label'] = data_clients_top.apply(
	lambda row: f"{row['client']}\n{row['atxt_yes']} of {row['posts']} posts have alt text\n({row['atxt_yes_pct']:.1f}%)", axis=1
)

# Tile area is proportional to the number of posts with alt text
plt.figure(figsize=(15, 10), dpi=300)
sq.plot(
	sizes = data_clients_top['atxt_yes'],
	label = data_clients_top['label'],
	color = [blue,blue,lighter_blue,light_blue],
	alpha = 0.8
)

plt.title('Proportion of clients using alt text description in their posts', fontweight='bold')
plt.xticks([])
plt.yticks([])
figure_caption_generator()
plt.show()

# Features used for clustering: post volume and alt-text share per client
X = data[['posts', 'atxt_yes_pct']].copy()

# Standardize the features before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-means with three clusters and a fixed seed; store each client's cluster label
kmeans = KMeans(n_clusters=3, random_state=42)
data['cluster'] = kmeans.fit_predict(X_scaled)

# Scatter plot of the clients colored by cluster, with post counts on a log axis
plt.figure(figsize=(10, 5), dpi=300)
cluster_ax = sns.scatterplot(
	x=data['posts'],
	y=data['atxt_yes_pct'],
	hue=data['cluster'],
	palette='viridis',
	s=100,
)
cluster_ax.set_xscale('log')
cluster_ax.set_xlabel('Status count (log scale)')
cluster_ax.set_ylabel('Percentage of alt text usage')
cluster_ax.set_title('K-means Clustering of Clients by post count and alt text usage')
cluster_ax.legend(title='Clusters')
plt.tight_layout()
figure_caption_generator()
plt.show()

# Keep only the clients that K-means assigned to the cluster labelled 2.
# NOTE: this overwrites `data` with the filtered frame.
in_cluster_two = data['cluster'].eq(2)
data = data[in_cluster_two]
data

# Silhouette score of the K-means labels, computed on the standardized features
sil_score = silhouette_score(X_scaled, kmeans.labels_)
print(f'Silhouette Score: {sil_score:.2f}')

Silhouette Score: 0.85

	client	status_count	descriptions_all_count	descriptions_all_percent	descriptions_some_count	descriptions_some_percent	descriptions_none_count	descriptions_none_percent
0	Web	8272	1438	17.383946	7	0.084623	6827	82.531431
1	dlvr.it	5806	1	0.017224	0	0.000000	5805	99.982776
2	Mastodon for Android	1894	270	14.255544	3	0.158395	1621	85.586061
3	unknown	1428	366	25.630252	1	0.070028	1061	74.299720
4	AboveMaidstoneBot	1339	0	0.000000	0	0.000000	1339	100.000000
...	...	...	...	...	...	...	...	...
261	socialbot	1	1	100.000000	0	0.000000	0	0.000000
262	PhonocasterMusicShare	1	0	0.000000	0	0.000000	1	100.000000
263	openvibe	1	0	0.000000	0	0.000000	1	100.000000
264	iflaapp	1	0	0.000000	0	0.000000	1	100.000000
265	Today's Dérive app task	1	1	100.000000	0	0.000000	0	0.000000

variable	meaning
`client`	Name of the client the posts come from.
`posts`	Total number of posts containing images for that client.
`atxt_yes`	Number of posts containing alt text in all images.
`atxt_yes_pct`	The percentage of `atxt_yes` in relation to `posts`.
`atxt_some`	Number of posts containing alt text in some images.
`atxt_some_pct`	The percentage of `atxt_some` in relation to `posts`.
`atxt_no`	Number of posts containing no alt text in any images.
`atxt_no_pct`	The percentage of `atxt_no` in relation to `posts`.

	Data Type	Missing values?	count	mean	std	min	25%	50%	75%	max
client	object	False	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
posts	int64	False	266.0	112.240602	643.625189	1.0	2.0	8.0	39.75	8272.000000
atxt_yes	int64	False	266.0	21.473684	105.282555	0.0	0.0	0.0	4.00	1438.000000
atxt_yes_pct	float64	False	266.0	36.290411	45.079671	0.0	0.0	0.0	100.00	100.000000
atxt_some	int64	False	266.0	0.093985	0.622382	0.0	0.0	0.0	0.00	7.000000
atxt_some_pct	float64	False	266.0	0.060461	0.527118	0.0	0.0	0.0	0.00	7.142857
atxt_no	int64	False	266.0	90.672932	568.477040	0.0	0.0	2.0	20.00	6827.000000
atxt_no_pct	float64	False	266.0	63.649128	45.113843	0.0	0.0	100.0	100.00	100.000000

Alterism – Analyzing image description usage in mastodon.social

Abstract

Project Domain

Definitions

Contextual information

Why accessibility?

Why the Fediverse?

Why focusing on clients?

Mastodon users’ information and growth

Project scope and objectives

Methods

Exploratory Data Analysis (EDA)

Descriptive Statistics

Visualizations

Data Preprocessing and Feature Engineering

Dataset structure and description

Dataset import and preview

Dataset dictionary

Data analysis and results

Data exploration

Refine the dataset for analysis

Exclude posts from unknown client(s)

Count total clients

Checking values

Global analysis

Analyzing client data

Correlation coefficients

Find the mode of posts per client

Grouping by client popularity

Client-specific alt text analysis

Alt text usage in top 5 clients

Exploring alt text features of the top 5 clients

Web

Official Mastodon apps

Bots

Note on Analytical Methods

Conclusion and future development

Summary

Challenges

Future development

Appendix

References