Social Vulnerability Milan - Census Data

Social Vulnerability Milan - Census Data#

Environment#

R Libraries#

Any required R libraries are imported into the kernel:

# Load R libraries
## none required

Output directory#

# Folder that receives the outputs of this notebook. It is created on first
# use, together with any missing parent folders (recursive = TRUE), and a
# confirmation message is printed only when the folder had to be created.
pipeline_dir <- file.path(
  "../..", "2_pipeline", "Italy", "Milan", "1a_CensusData", "2021"
)
if (!dir.exists(pipeline_dir)) {
  dir.create(pipeline_dir, recursive = TRUE)
  print(paste0(pipeline_dir, " created"))
}

Load Data#

Import the csv data#

# Load the comma-separated census indicator file for Milan into a data frame
census_data <- read.csv(
  file = '../../0_data/census/Italy/Milan/2021/Milano_indicatori_2021_sezioni.csv',
  sep = ","
)

# Preview the first rows to check that the file was parsed as expected
head(census_data)

A data.frame: 6 × 136
	CODREG	REGIONE	CODPRO	PROVINCIA	CODCOM	COMUNE	PROCOM	SEZ2011	ASC_1LIV_COD	ASC_1LIV_NOME	⋯	CIT_1_EGY	CIT_2_PHL	CIT_3_CHN	CIT_4_LKA	CIT_5_PER	CIT_6_ROU	CIT_7_BGD	CIT_8_ECU	CIT_9_MAR	CIT_10_UKR
	<int>	<chr>	<int>	<chr>	<int>	<chr>	<int>	<dbl>	<int>	<chr>	⋯	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
1	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	0	0	0	0	0	0	0	0	1
2	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	4	0	0	0	0	0	0	0	0
3	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	0	0	0	0	0	0	0	0	0
4	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	0	0	0	0	0	0	0	0	0
5	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	0	0	0	0	0	0	0	0	0
6	3	Lombardia	15	Milano	146	Milano	15146	1.5146e+11	15146001	Centro Storico	⋯	0	0	0	0	0	0	0	0	0	0

Prepare data#

We only require a subset of the census data for our purposes. We therefore need to extract the relevant data, then combine these to create our vulnerability indicators.

In addition, the raw data is not suitable for use within the vulnerability assessment. It needs to be normalised based on the number of people/households within each small area. Therefore, the data is converted to percentages based on the total persons/households within each small area.

Supporting data#

Code that uniquely identifies the census area#

# Name of the column whose code identifies each census area
identifier <- "SEZ2011"

# Keep the identifier as a one-column data frame (not a bare vector) so that
# it can be column-bound to the indicator tables further down
census_area_id <- census_data[identifier]

# number of rows in the census data
nrow(census_data)

5740

Population total#

# Population total per census area (census field P1), held as a one-column
# data frame and relabelled with a descriptive column name. It is the
# denominator for the person-based percentage indicators.
population_total <- census_data["P1"]
colnames(population_total) <- "population_total"
head(population_total)

A data.frame: 6 × 1
	population_total
	<int>
1	13
2	16
3	17
4	11
5	1
6	6

Households / families total#

# Households / families total per census area (census field PF1), held as a
# one-column data frame and relabelled with a descriptive column name. It is
# the denominator for the household-based percentage indicators.
households_total <- census_data["PF1"]
colnames(households_total) <- "households_total"
head(households_total)

A data.frame: 6 × 1
	households_total
	<int>
1	9
2	7
3	17
4	7
5	1
6	3

Domain data#

Age domain#

### Domain:Age ###
# Every indicator in this domain follows the same recipe: pick the census
# field(s), sum them per row (ignoring missing values), then express the sum
# as a percentage of the total population of the census area. Dividing by the
# one-column population_total data frame yields a one-column data frame, which
# is then given the indicator's name.

# Age - early childhood boy (under 5 years old)
early_childhood_boy_fields <- c(
  "P30" # Age 0 to 4 boys
)
early_childhood_boy_data <- census_data[early_childhood_boy_fields]
early_childhood_boy <- rowSums(early_childhood_boy_data, na.rm = TRUE)
early_childhood_boy_pct <- early_childhood_boy / population_total * 100.0
colnames(early_childhood_boy_pct) <- "early_childhood_boy_pct"

# Age - early childhood girl (under 5 years old)
early_childhood_girl_fields <- c(
  "P67" # Age 0 to 4 girls
)
early_childhood_girl_data <- census_data[early_childhood_girl_fields]
early_childhood_girl <- rowSums(early_childhood_girl_data, na.rm = TRUE)
early_childhood_girl_pct <- early_childhood_girl / population_total * 100.0
colnames(early_childhood_girl_pct) <- "early_childhood_girl_pct"

# Age - middle to oldest old male (75+ years old)
age_middle_to_oldest_old_male_fields <- c(
  "P45" # Age 75+ males
)
age_middle_to_oldest_old_male_data <-
  census_data[age_middle_to_oldest_old_male_fields]
age_middle_to_oldest_old_male <-
  rowSums(age_middle_to_oldest_old_male_data, na.rm = TRUE)
age_middle_to_oldest_old_male_pct <-
  age_middle_to_oldest_old_male / population_total * 100.0
colnames(age_middle_to_oldest_old_male_pct) <-
  "age_middle_to_oldest_old_male_pct"

# Age - middle to oldest old female (75+ years old)
age_middle_to_oldest_old_female_fields <- c(
  "P82" # Age 75+ females
)
age_middle_to_oldest_old_female_data <-
  census_data[age_middle_to_oldest_old_female_fields]
age_middle_to_oldest_old_female <-
  rowSums(age_middle_to_oldest_old_female_data, na.rm = TRUE)
age_middle_to_oldest_old_female_pct <-
  age_middle_to_oldest_old_female / population_total * 100.0
colnames(age_middle_to_oldest_old_female_pct) <-
  "age_middle_to_oldest_old_female_pct"

# Gather the four age indicators side by side into one table for this domain
age_domain_pct <- cbind(
  early_childhood_boy_pct,
  early_childhood_girl_pct,
  age_middle_to_oldest_old_male_pct,
  age_middle_to_oldest_old_female_pct
)

# Show the first six rows as a visual sanity check
head(age_domain_pct)

A data.frame: 6 × 4
	early_childhood_boy_pct	early_childhood_girl_pct	age_middle_to_oldest_old_male_pct	age_middle_to_oldest_old_female_pct
	<dbl>	<dbl>	<dbl>	<dbl>
1	0	0	15.38462	0.00
2	0	0	6.25000	6.25
3	0	0	23.52941	0.00
4	0	0	18.18182	0.00
5	0	0	100.00000	0.00
6	0	0	0.00000	0.00

Income domain#

### Domain:Income ###

# Dependants - using under 15 years old for the 2021 Italian census data
# (fields P14, P15 and P16 summed per row, as a percentage of total population)
dependants_fields <- c("P14", "P15", "P16")
dependants_data <- census_data[dependants_fields]
dependants <- rowSums(dependants_data, na.rm = TRUE)
dependants_pct <- dependants / population_total * 100.0
colnames(dependants_pct) <- "dependants_pct"

# Unemployment
## Calculate total population for ages 15-64.
## Note: P101 = total employed people aged 15-64.
## Note: Sum of P17, P18, P19, P20, P21, P22, P23, P24, P25 and P26 is total
##       population for ages 15-64.
## Note: to calculate unemployed we use:
##       total population for ages 15-64 - P101.
## Note: this unemployed figure might also indicate students not working
## Note: the percentage is taken relative to the total population of the
##       census area (all ages), like the other person-based indicators.
population_15to64_fields <- c(
  "P17", "P18", "P19", "P20", "P21",
  "P22", "P23", "P24", "P25", "P26"
)
population_15to64_data <- census_data[population_15to64_fields]
population_15to64 <- rowSums(population_15to64_data, na.rm = TRUE)
unemployment <- population_15to64 - census_data$P101
unemployment_pct <- unemployment / population_total * 100.0
colnames(unemployment_pct) <- "unemployment_pct"

# Gather the income indicators side by side into one table for this domain
income_domain_pct <- cbind(
  dependants_pct,
  unemployment_pct
)

# Show the first six rows as a visual sanity check
head(income_domain_pct)

A data.frame: 6 × 2
	dependants_pct	unemployment_pct
	<dbl>	<dbl>
1	7.692308	15.384615
2	12.500000	31.250000
3	0.000000	17.647059
4	9.090909	9.090909
5	0.000000	0.000000
6	0.000000	0.000000

Information Access/Use domain#

### Domain:Information Access/Use ###

# No higher education
### Note: calculation test1 for P83 (total population age 9+) = P86 + P87 + P88 + P89 + P90 (without qualification + all primary to highest education levels)
### Note: calculation test2 for P83 (total population age 9+) = P16 + P17+ P18 + P19 + P20 + P21 + P22 + P23 + P24 + P25 + P26 + P27 + P28 + P29 (age 10+)
### Note: using without qualification (P86) + primary/elementary schools qualification only (P87)
no_higher_education_fields <- c("P86", "P87")
no_higher_education_data <- census_data[no_higher_education_fields]
no_higher_education <- rowSums(no_higher_education_data, na.rm = TRUE)
# Expressed as a percentage of the total population of the census area
no_higher_education_pct <- no_higher_education / population_total * 100.0
colnames(no_higher_education_pct) <- "no_higher_education_pct"

# Single-indicator domain: cbind() keeps the one-column data frame as it is
info_domain_pct <- cbind(no_higher_education_pct)

# Show the first six rows as a visual sanity check
head(info_domain_pct)

A data.frame: 6 × 1
	no_higher_education_pct
	<dbl>
1	7.692308
2	18.750000
3	0.000000
4	0.000000
5	0.000000
6	0.000000

Local knowledge domain#

### Domain:Local Knowledge ###

# Foreign nationals: field ST1 is converted here to a percentage of the total
# population of the census area.
# NOTE(review): this calculation treats ST1 as a count, not as a ready-made
# percentage; the sample output (e.g. 23.07692 for a population of 13, i.e.
# 3 / 13 * 100) is consistent with a count - confirm against the census
# metadata.
foreign_nationals_fields <- c("ST1")
foreign_nationals_data <- census_data[foreign_nationals_fields]
foreign_nationals <- rowSums(foreign_nationals_data, na.rm = TRUE)
foreign_nationals_pct <- foreign_nationals / population_total * 100.0
colnames(foreign_nationals_pct) <- "foreign_nationals_pct"

# Single-indicator domain: cbind() keeps the one-column data frame as it is
local_knowledge_domain_pct <- cbind(foreign_nationals_pct)

# Show the first six rows as a visual sanity check
head(local_knowledge_domain_pct)

A data.frame: 6 × 1
	foreign_nationals_pct
	<dbl>
1	23.07692
2	31.25000
3	0.00000
4	18.18182
5	0.00000
6	66.66667

Social Network domain data#

### Domain:Social Network ###

# Primary school age children, as a percentage of the total population
primary_school_age_fields <- c(
  "P15" # Primary School Age Children (Age 5-9) (Indicator P15)
)
primary_school_age_data <- census_data[primary_school_age_fields]
primary_school_age <- rowSums(primary_school_age_data, na.rm = TRUE)
primary_school_age_pct <- primary_school_age / population_total * 100.0
colnames(primary_school_age_pct) <- "primary_school_age_pct"

# One person households, as a percentage of the total households (this is the
# only indicator that uses households_total rather than population_total)
one_person_households_fields <- c(
  "PF3" # Households with one person (Indicator PF3)
)
one_person_households_data <- census_data[one_person_households_fields]
one_person_households <- rowSums(one_person_households_data, na.rm = TRUE)
one_person_households_pct <- one_person_households / households_total * 100.0
colnames(one_person_households_pct) <- "one_person_households_pct"

# Gather the social network indicators side by side into one table
social_network_domain_pct <- cbind(
  primary_school_age_pct,
  one_person_households_pct
)

# Show the first six rows as a visual sanity check
head(social_network_domain_pct)

A data.frame: 6 × 2
	primary_school_age_pct	one_person_households_pct
	<dbl>	<dbl>
1	0.000000	66.66667
2	0.000000	42.85714
3	0.000000	100.00000
4	9.090909	57.14286
5	0.000000	100.00000
6	0.000000	66.66667

Combine all data into one table#

# Build the full indicator table: the census area identifier comes first,
# followed by the percentage indicators of each domain, bound column-wise
# (all inputs are derived from census_data, so their rows line up)
indicator_domains_pct <- cbind(
  census_area_id,
  age_domain_pct,
  income_domain_pct,
  info_domain_pct,
  local_knowledge_domain_pct,
  social_network_domain_pct
)

# Show the first six rows as a visual sanity check
head(indicator_domains_pct)

A data.frame: 6 × 11
	SEZ2011	early_childhood_boy_pct	early_childhood_girl_pct	age_middle_to_oldest_old_male_pct	age_middle_to_oldest_old_female_pct	dependants_pct	unemployment_pct	no_higher_education_pct	foreign_nationals_pct	primary_school_age_pct	one_person_households_pct
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	1.5146e+11	0	0	15.38462	0.00	7.692308	15.384615	7.692308	23.07692	0.000000	66.66667
2	1.5146e+11	0	0	6.25000	6.25	12.500000	31.250000	18.750000	31.25000	0.000000	42.85714
3	1.5146e+11	0	0	23.52941	0.00	0.000000	17.647059	0.000000	0.00000	0.000000	100.00000
4	1.5146e+11	0	0	18.18182	0.00	9.090909	9.090909	0.000000	18.18182	9.090909	57.14286
5	1.5146e+11	0	0	100.00000	0.00	0.000000	0.000000	0.000000	0.00000	0.000000	100.00000
6	1.5146e+11	0	0	0.00000	0.00	0.000000	0.000000	0.000000	66.66667	0.000000	66.66667

Calculate Z-Score#

The raw data is not suitable for use within the vulnerability assessment. It needs to be standardised. Therefore, the data is converted to z-scores. Z-scores are:

“A statistical measurement of a score’s relationship to the mean (average value) in a group of scores. A Z-score of 0 means the score is the same as the mean (average value). A Z-score can be positive or negative, indicating whether it is above or below the mean and by how many standard deviations. Z-score standardisation represents the deviation of a raw score from its mean in standard deviation units.” (Kazmierczak et al., 2015)

Calculate the Z-score#

# Standardise every indicator to a z-score:
#   (value - column mean) / column standard deviation.
# Column 1 holds the census area identifier and is carried over unchanged;
# all remaining columns of indicator_domains_pct are standardised.

# Number of columns in the percentage table (identifier + indicators); kept
# as a variable because earlier versions of this cell defined it
num_cols <- ncol(indicator_domains_pct)

# Names of the indicator columns (everything except the identifier)
pct_cols <- names(indicator_domains_pct)[-1]

# Z-score columns use the indicator name without its trailing "_pct";
# anchoring the pattern with "$" leaves any other "_pct" in a name untouched
z_col_names <- sub("_pct$", "", pct_cols)

# Start from the identifier column only, then add one z-score column per
# indicator. scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() reduces it to a plain numeric
# vector so that each result is an ordinary data frame column.
indicator_z_scores <- indicator_domains_pct[1]
indicator_z_scores[z_col_names] <- lapply(
  indicator_domains_pct[pct_cols],
  function(values) as.numeric(scale(values))
)

# Show the first six rows as a visual sanity check
head(indicator_z_scores)

A data.frame: 6 × 11
	SEZ2011	early_childhood_boy	early_childhood_girl	age_middle_to_oldest_old_male	age_middle_to_oldest_old_female	dependants	unemployment	no_higher_education	foreign_nationals	primary_school_age	one_person_households
	<dbl>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>	<dbl[,1]>
1	1.5146e+11	-1.230411	-1.064144	2.1196292	-1.3062152	-0.7801407	-0.5060886	-0.6075338	0.284861877	-1.458348	0.9333249
2	1.5146e+11	-1.230411	-1.064144	0.2302043	-0.3189938	0.1197864	1.3419462	0.8849880	0.768744204	-1.458348	-0.7026211
3	1.5146e+11	-1.230411	-1.064144	3.8043177	-1.3062152	-2.2200240	-0.2425543	-1.6458099	-1.081394102	-1.458348	3.2236494
4	1.5146e+11	-1.230411	-1.064144	2.6982091	-1.3062152	-0.5183437	-1.2391934	-1.6458099	-0.004949997	1.865648	0.2789465
5	1.5146e+11	-1.230411	-1.064144	19.6216703	-1.3062152	-2.2200240	-2.2981224	-1.6458099	-1.081394102	-1.458348	3.2236494
6	1.5146e+11	-1.230411	-1.064144	-1.0625601	-1.3062152	-2.2200240	-2.2981224	-1.6458099	2.865567617	-1.458348	0.9333249

Output the Z-score data#

# Write the z-score table to the pipeline folder as a csv file, without the
# data frame's row names
output_file <- file.path(pipeline_dir, "censusDataZ.csv")
write.csv(x = indicator_z_scores, file = output_file, row.names = FALSE)

END