Social Vulnerability Milan - Census Data#

Environment#

R Libraries#

Any required R libraries are imported into the kernel:

# Load R libraries
## none required

Output directory#

# Resolve the output folder for this step and create it (together with any
# missing parent folders) when it is not there yet
pipeline_dir <- file.path(
    "../..", "2_pipeline", "Italy", "Milan", "1a_CensusData", "2021"
)
if (!dir.exists(pipeline_dir)) {
    dir.create(pipeline_dir, recursive = TRUE)
    print(paste0(pipeline_dir, " created"))
}

Load Data#

Import the csv data#

Ireland census data from: https://www.cso.ie/en/census/census2022/census2022smallareapopulationstatistics

# Load the comma-separated census indicator file into a data frame and
# preview its first rows
census_data <- read.csv(
    file = "../../0_data/census/Italy/Milan/2021/Milano_indicatori_2021_sezioni.csv",
    sep = ","
)
head(census_data)
A data.frame: 6 × 136
CODREGREGIONECODPROPROVINCIACODCOMCOMUNEPROCOMSEZ2011ASC_1LIV_CODASC_1LIV_NOMECIT_1_EGYCIT_2_PHLCIT_3_CHNCIT_4_LKACIT_5_PERCIT_6_ROUCIT_7_BGDCIT_8_ECUCIT_9_MARCIT_10_UKR
<int><chr><int><chr><int><chr><int><dbl><int><chr><int><int><int><int><int><int><int><int><int><int>
13Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0000000001
23Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0400000000
33Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0000000000
43Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0000000000
53Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0000000000
63Lombardia15Milano146Milano151461.5146e+1115146001Centro Storico0000000000

Prepare data#

We only require a subset of the census data for our purposes. We therefore need to extract the relevant data, then combine these to create our vulnerability indicators.

In addition, the raw data is not suitable for use within the vulnerability assessment. It needs to be normalised based on the number of people/households within each small area. Therefore, the data is converted to percentages based on the total persons/households within each small area.

Supporting data#

Code that uniquely identifies the census area#

# Name of the column used as the census-area identifier
identifier <- "SEZ2011"
# Keep the identifier as a one-column data frame so it can be column-bound
# with the indicator tables later on
census_area_id <- census_data[identifier]

# Show how many rows (census areas) were read
nrow(census_data)
5740

Population total#

# Population total per census area: column P1, kept as a one-column data
# frame and renamed so the column label is self-describing
population_total <- setNames(census_data["P1"], "population_total")
head(population_total)
A data.frame: 6 × 1
population_total
<int>
113
216
317
411
5 1
6 6

Households / families total#

# Households / families total per census area: column PF1, kept as a
# one-column data frame and renamed so the column label is self-describing
households_total <- setNames(census_data["PF1"], "households_total")
head(households_total)
A data.frame: 6 × 1
households_total
<int>
1 9
2 7
317
4 7
5 1
6 3

Domain data#

Age domain#

### Domain:Age ###
# Each indicator below is a census count for one age/sex group, expressed as
# a percentage of the total population of the census area.
# rowSums(..., na.rm = TRUE) means a missing count is treated as zero.
# Dividing by the one-column `population_total` data frame yields a
# one-column data frame, which is then renamed.

# Age - early childhood boys (under 5 years old)
early_childhood_boy_fields <- c(
    "P30" # Age 0 to 4 boys
)
early_childhood_boy_data <- census_data[early_childhood_boy_fields]
early_childhood_boy <- rowSums(early_childhood_boy_data, na.rm = TRUE)
early_childhood_boy_pct <- 100.0 * (early_childhood_boy / population_total)
names(early_childhood_boy_pct) <- "early_childhood_boy_pct"

# Age - early childhood girls (under 5 years old)
early_childhood_girl_fields <- c(
    "P67" # Age 0 to 4 girls
)
early_childhood_girl_data <- census_data[early_childhood_girl_fields]
early_childhood_girl <- rowSums(early_childhood_girl_data, na.rm = TRUE)
early_childhood_girl_pct <- 100.0 * (early_childhood_girl / population_total)
names(early_childhood_girl_pct) <- "early_childhood_girl_pct"

# Age - middle to oldest old male (75+ years old)
age_middle_to_oldest_old_male_fields <- c(
    "P45" # Age 75+ males
)
age_middle_to_oldest_old_male_data <- census_data[age_middle_to_oldest_old_male_fields]
age_middle_to_oldest_old_male <- rowSums(age_middle_to_oldest_old_male_data, na.rm = TRUE)
age_middle_to_oldest_old_male_pct <- 100.0 * (age_middle_to_oldest_old_male / population_total)
names(age_middle_to_oldest_old_male_pct) <- "age_middle_to_oldest_old_male_pct"

# Age - middle to oldest old female (75+ years old)
age_middle_to_oldest_old_female_fields <- c(
    "P82" # Age 75+ females
)
age_middle_to_oldest_old_female_data <- census_data[age_middle_to_oldest_old_female_fields]
age_middle_to_oldest_old_female <- rowSums(age_middle_to_oldest_old_female_data, na.rm = TRUE)
age_middle_to_oldest_old_female_pct <- 100.0 * (age_middle_to_oldest_old_female / population_total)
names(age_middle_to_oldest_old_female_pct) <- "age_middle_to_oldest_old_female_pct"

# Combine all these indicators into one table for this domain
age_domain_pct <- cbind(
    early_childhood_boy_pct,
    early_childhood_girl_pct,
    age_middle_to_oldest_old_male_pct,
    age_middle_to_oldest_old_female_pct
)

# Print the first six rows of the data to visually check it looks OK
head(age_domain_pct)
A data.frame: 6 × 4
early_childhood_boy_pctearly_childhood_girl_pctage_middle_to_oldest_old_male_pctage_middle_to_oldest_old_female_pct
<dbl><dbl><dbl><dbl>
100 15.384620.00
200 6.250006.25
300 23.529410.00
400 18.181820.00
500100.000000.00
600 0.000000.00

Income domain#

### Domain:Income ###
# Both indicators are expressed as a percentage of the total population of
# the census area (`population_total`).

# Dependants - using under 15 years old (fields P14, P15 and P16) for the
# 2021 Italian census data
dependants_fields <- c(
    "P14",
    "P15",
    "P16"
)
dependants_data <- census_data[dependants_fields]
# na.rm = TRUE: a missing count is treated as zero
dependants <- rowSums(dependants_data, na.rm = TRUE)
dependants_pct <- 100.0 * (dependants / population_total)
names(dependants_pct) <- "dependants_pct"

# Unemployment
## Note: P101 = total employed people aged 15-64.
## Note: the sum of P17, P18, P19, P20, P21, P22, P23, P24, P25 and P26 is
##       the total population for ages 15-64.
## Note: unemployed is calculated as:
##       (total population for ages 15-64) - P101.
## Note: this unemployed figure might also indicate students not working.
## Note: the percentage uses the total population of the census area (all
##       ages) as the denominator, not the population aged 15-64.
population_15to64_fields <- c(
    "P17",
    "P18",
    "P19",
    "P20",
    "P21",
    "P22",
    "P23",
    "P24",
    "P25",
    "P26"
)
population_15to64_data <- census_data[population_15to64_fields]
population_15to64 <- rowSums(population_15to64_data, na.rm = TRUE)
# [["P101"]] matches the column name exactly (`$` allows partial matching).
# A missing P101 value is not zero-filled here, so it propagates as NA.
unemployment <- population_15to64 - census_data[["P101"]]
unemployment_pct <- 100.0 * (unemployment / population_total)
names(unemployment_pct) <- "unemployment_pct"

# Combine all these indicators into one table for this domain
income_domain_pct <- cbind(
    dependants_pct,
    unemployment_pct
)

# Print the first six rows of the data to visually check it looks OK
head(income_domain_pct)
A data.frame: 6 × 2
dependants_pctunemployment_pct
<dbl><dbl>
1 7.69230815.384615
212.50000031.250000
3 0.00000017.647059
4 9.090909 9.090909
5 0.000000 0.000000
6 0.000000 0.000000

Information Access/Use domain#

### Domain:Information Access/Use ###

# No higher education
### Indicator: people without qualification (P86) plus people with a
### primary/elementary school qualification only (P87), as a percentage of
### the total population of the census area.
### Cross-check 1 for P83 (total population age 9+):
###   P86 + P87 + P88 + P89 + P90
###   (without qualification + all primary to highest education levels)
### Cross-check 2 for P83 (total population age 9+):
###   P16 + P17 + P18 + P19 + P20 + P21 + P22 + P23 + P24 + P25 + P26 +
###   P27 + P28 + P29 (age 10+)
no_higher_education_fields <- c("P86", "P87")
no_higher_education_data <- census_data[no_higher_education_fields]
# na.rm = TRUE: a missing count is treated as zero in the row total
no_higher_education <- rowSums(no_higher_education_data, na.rm = TRUE)
no_higher_education_pct <- 100.0 * (no_higher_education / population_total)
names(no_higher_education_pct) <- "no_higher_education_pct"

# Collect the indicators of this domain into one table
info_domain_pct <- cbind(no_higher_education_pct)

# Show the first six rows as a visual sanity check
head(info_domain_pct)
A data.frame: 6 × 1
no_higher_education_pct
<dbl>
1 7.692308
218.750000
3 0.000000
4 0.000000
5 0.000000
6 0.000000

Local knowledge domain#

### Domain:Local Knowledge ###

# Foreign nationals
# ST1 is treated as a count here: it is divided by the total population of
# the census area and multiplied by 100 to obtain a percentage.
foreign_nationals_fields <- c(
    "ST1"
)
foreign_nationals_data <- census_data[foreign_nationals_fields]
# na.rm = TRUE: a missing count is treated as zero
foreign_nationals <- rowSums(foreign_nationals_data, na.rm = TRUE)
foreign_nationals_pct <- 100.0 * (foreign_nationals / population_total)
names(foreign_nationals_pct) <- "foreign_nationals_pct"

# Combine all these indicators into one table for this domain
local_knowledge_domain_pct <- cbind(foreign_nationals_pct)

# Print the first six rows of the data to visually check it looks OK
head(local_knowledge_domain_pct)
A data.frame: 6 × 1
foreign_nationals_pct
<dbl>
123.07692
231.25000
3 0.00000
418.18182
5 0.00000
666.66667

Social Network domain data#

### Domain:Social Network ###

# Primary school age children (age 5-9, indicator P15), as a percentage of
# the total population of the census area
primary_school_age_fields <- "P15"
primary_school_age_data <- census_data[primary_school_age_fields]
# na.rm = TRUE: a missing count is treated as zero
primary_school_age <- rowSums(primary_school_age_data, na.rm = TRUE)
primary_school_age_pct <- 100.0 * (primary_school_age / population_total)
names(primary_school_age_pct) <- "primary_school_age_pct"

# One person households (indicator PF3), as a percentage of the total
# number of households in the census area
one_person_households_fields <- "PF3"
one_person_households_data <- census_data[one_person_households_fields]
one_person_households <- rowSums(one_person_households_data, na.rm = TRUE)
one_person_households_pct <- 100.0 * (one_person_households / households_total)
names(one_person_households_pct) <- "one_person_households_pct"

# Collect the indicators of this domain into one table
social_network_domain_pct <- cbind(
    primary_school_age_pct,
    one_person_households_pct
)

# Show the first six rows as a visual sanity check
head(social_network_domain_pct)
A data.frame: 6 × 2
primary_school_age_pctone_person_households_pct
<dbl><dbl>
10.000000 66.66667
20.000000 42.85714
30.000000100.00000
49.090909 57.14286
50.000000100.00000
60.000000 66.66667

Combine all data into one table#

# Column-bind the area identifier with the percentage tables of every
# domain; the identifier stays in the first column
indicator_domains_pct <- cbind(
    census_area_id,
    age_domain_pct,
    income_domain_pct,
    info_domain_pct,
    local_knowledge_domain_pct,
    social_network_domain_pct
)
head(indicator_domains_pct)
A data.frame: 6 × 11
SEZ2011early_childhood_boy_pctearly_childhood_girl_pctage_middle_to_oldest_old_male_pctage_middle_to_oldest_old_female_pctdependants_pctunemployment_pctno_higher_education_pctforeign_nationals_pctprimary_school_age_pctone_person_households_pct
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
11.5146e+1100 15.384620.00 7.69230815.384615 7.69230823.076920.000000 66.66667
21.5146e+1100 6.250006.2512.50000031.25000018.75000031.250000.000000 42.85714
31.5146e+1100 23.529410.00 0.00000017.647059 0.000000 0.000000.000000100.00000
41.5146e+1100 18.181820.00 9.090909 9.090909 0.00000018.181829.090909 57.14286
51.5146e+1100100.000000.00 0.000000 0.000000 0.000000 0.000000.000000100.00000
61.5146e+1100 0.000000.00 0.000000 0.000000 0.00000066.666670.000000 66.66667

Calculate Z-Score#

The raw data is not suitable for use within the vulnerability assessment. It needs to be standardised. Therefore, the data is converted to z-scores. Z-scores are:

“A statistical measurement of a score’s relationship to the mean (average value) in a group of scores. A Z-score of 0 means the score is the same as the mean (average value). A Z-score can be positive or negative, indicating whether it is above or below the mean and by how many standard deviations. Z-score standardisation represents the deviation of a raw score from its mean in standard deviation units.” (Kazmierczak et al., 2015)

Calculate the Z-score#

# Copy the data so the percentage table itself is left untouched
indicator_z_scores <- indicator_domains_pct

# Get the number of columns in the data
num_cols <- ncol(indicator_z_scores)

# Positions of the percentage columns: everything after the first column,
# which holds the area identifier. seq_len(...)[-1] is empty when there is
# only one column, whereas 2:num_cols would count backwards (2, 1).
pct_col_idx <- seq_len(num_cols)[-1]

# Calculate the z-score of each percentage column and append it under the
# same name without the trailing '_pct'
for (col in names(indicator_z_scores)[pct_col_idx]) {
  # the anchored pattern only strips a '_pct' suffix, not an inner match
  new_col_name <- sub("_pct$", "", col)
  # scale() returns a one-column matrix; as.numeric() stores the z-scores
  # as a plain numeric column instead of a matrix column
  indicator_z_scores[[new_col_name]] <-
    as.numeric(scale(indicator_z_scores[[col]]))
}

# Remove the original percentage columns to leave only the area identifier
# and the z-scores (setdiff keeps every column when there is nothing to drop)
indicator_z_scores <- indicator_z_scores[
  setdiff(seq_along(indicator_z_scores), pct_col_idx)
]

head(indicator_z_scores)
A data.frame: 6 × 11
SEZ2011early_childhood_boyearly_childhood_girlage_middle_to_oldest_old_maleage_middle_to_oldest_old_femaledependantsunemploymentno_higher_educationforeign_nationalsprimary_school_ageone_person_households
<dbl><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]>
11.5146e+11-1.230411-1.064144 2.1196292-1.3062152-0.7801407-0.5060886-0.6075338 0.284861877-1.458348 0.9333249
21.5146e+11-1.230411-1.064144 0.2302043-0.3189938 0.1197864 1.3419462 0.8849880 0.768744204-1.458348-0.7026211
31.5146e+11-1.230411-1.064144 3.8043177-1.3062152-2.2200240-0.2425543-1.6458099-1.081394102-1.458348 3.2236494
41.5146e+11-1.230411-1.064144 2.6982091-1.3062152-0.5183437-1.2391934-1.6458099-0.004949997 1.865648 0.2789465
51.5146e+11-1.230411-1.06414419.6216703-1.3062152-2.2200240-2.2981224-1.6458099-1.081394102-1.458348 3.2236494
61.5146e+11-1.230411-1.064144-1.0625601-1.3062152-2.2200240-2.2981224-1.6458099 2.865567617-1.458348 0.9333249

Output the Z-score data#

# Save the z-score table as a csv file in the pipeline directory, without a
# row-name column
output_file <- file.path(pipeline_dir, "censusDataZ.csv")
write.csv(
    indicator_z_scores,
    file = output_file,
    row.names = FALSE
)

END