Social Vulnerability Logroño - Census Data#

Environment#

R Libraries#

Any required R libraries are imported into the kernal:

# Load R libraries
## none required

Output directory#

# create the pipeline directory if it does not exist
pipeline_dir <- file.path("../..","2_pipeline","Spain","Logrono","1a_CensusData","2021")
if(!dir.exists(pipeline_dir)){
    dir.create(pipeline_dir, recursive = TRUE)
    print(paste0(pipeline_dir, " created"))
}

Load Data#

Import the csv data#

# Read the census data
# Current Spanish 2021 Census is a mixture of:
#  1) a static CSV or Excel downloaded from the INE Census website
#  2) extra data from missing indicators appended into this CSV or Excel
# in our case we read the CSV version
census_data <- read.csv("../../0_data/census/Spain/2021/Census_Data_2024-07-03_Census.csv", sep=",")
head(census_data)

census_homes_year_of_construction_data <- read.csv("../../0_data/census/Spain/2021/Censo_2021_custom_query__Family_homes__Year_of_construction__17_La_Rioja.csv", sep=",")
head(census_homes_year_of_construction_data)

census_household_structure_data <- read.csv("../../0_data/census/Spain/2021/Censo_2021_custom_query__People_living_in_family_homes__Household_structure__17_La_Rioja.csv", sep=",")
head(census_household_structure_data)
A data.frame: 6 × 48
ccaaCPROCMUNdistsecct1_1t2_1t2_2t3_1t4_1t22_3t22_4t22_5c_ab0t4c_ag0t4c_amo75c_afo75c_ac5t9fam_3plusfam_bb_1970
<int><int><int><int><int><int><dbl><dbl><dbl><dbl><int><int><int><int><int><int><int><int><int><int>
11726111 2360.410.5950.970.10 18 13 4 3 3 20 18 10NANA
2172621111500.450.5545.560.12 88 71272020 62 78 45NANA
31726311 4600.480.5252.590.10 27 31 6 2 6 36 52 19NANA
41726411 67 NA NA NA NA NA NANA 1 0 6 2 0NANA
5172651118280.480.5246.310.13114 95612529104151 67NANA
6172652116820.490.5139.040.21136128426448 60 87105NANA
A data.frame: 6 × 14
home_year_idhome_totalbefore_1900from_1900_to_1920from_1921_to_1940from_1941_to_1950from_1951_to_1960from_1961_a_1970from_1971_to_1980from_1981to_1990from_1991_to_2000from_2001_to_2010from_2011_to_2020not_recorded
<dbl><int><int><int><int><int><int><int><int><int><int><int><int><int>
12600201001 9758127 45 54 33 33102 6912934239 18
22600301001 7682190171120 63 90 54 45 21 3618 39
3260050100115814881 57 27 48 57156147129 5736735
426005020011011 9 6 9 24 33 42 39 5711157930 78
5260060100118543654 42 57 7212318924619552551264
62600701001 681 054 99111111 51 75 51 33 51 3 39
A data.frame: 6 × 13
household_structure_idhousehold_structure_totalsingle_female_under_65single_male_under_65single_female_over_65single_male_over_65single_parent_with_a_child_under_25single_parent_with_all_children_over_25couple_without_childrencouple_with_a_child_under_25couple_with_all_children_over_25couple_or_single_parent_with_a_child_under_25_years_and_more_personother_household
<dbl><int><int><int><int><int><int><int><int><int><int><int><int>
12600201001114933 724236 6039162333 78144150
22600501001168633 905148 7857267516 99261189
32600502001168360 87272414730210651 99174174
42600601001251166120784819278402891183282174
52600701001 64215 424524 2448 99171 51 78 45
626008010011374 9 394827 5166189558132168 93

Prepare data#

We only require a subset of the census data for our purposes. We therefore need to extract the relevant data, then combine these to create our vulnerability indicators.

In addition, the raw data is not suitable for use within the vulnerabiltiy assessment. It needs to be normalised based on the number of people/households within each small area. Therefore, the data is converted to percentages based on the total persons/households within each small area.

Supporting data#

Code that uniquely identifies the census area#

identifier <- c('ccaa', 'CPRO', 'CMUN', 'dist', 'secc')
census_area_id <- census_data[, identifier, drop = FALSE]

Merge Census datasets#

# number of rows in the census data - base
head( nrow(census_data) )

# number of rows in the census data - family homes: year of construction
#                                   - family homes: household structure
# Note: information from some census sections has been suppressed to protect statistical confidentiality
head( nrow(census_homes_year_of_construction_data) )
head( nrow(census_household_structure_data) )

# construct the identifier - family homes: year of construction
census_homes_year_of_construction_data$ccaa = as.integer(17)
census_homes_year_of_construction_data$CPRO = as.integer(substr(census_homes_year_of_construction_data$home_year_id, 1, 2))
census_homes_year_of_construction_data$CMUN = as.integer(substr(census_homes_year_of_construction_data$home_year_id, 3, 5))
census_homes_year_of_construction_data$dist = as.integer(substr(census_homes_year_of_construction_data$home_year_id, 6, 7))
census_homes_year_of_construction_data$secc = as.integer(substr(census_homes_year_of_construction_data$home_year_id, 8, 10))

# construct the identifier - family homes: household structure
census_household_structure_data$ccaa = as.integer(17)
census_household_structure_data$CPRO = as.integer(substr(census_household_structure_data$household_structure_id, 1, 2))
census_household_structure_data$CMUN = as.integer(substr(census_household_structure_data$household_structure_id, 3, 5))
census_household_structure_data$dist = as.integer(substr(census_household_structure_data$household_structure_id, 6, 7))
census_household_structure_data$secc = as.integer(substr(census_household_structure_data$household_structure_id, 8, 10))

# merge using the identifier
census_data <- merge(census_data, census_homes_year_of_construction_data, by=identifier, all.x=TRUE)
census_data <- merge(census_data, census_household_structure_data, by=identifier, all.x=TRUE)

head(census_data)
343
202
209
A data.frame: 6 × 75
ccaaCPROCMUNdistsecct1_1t2_1t2_2t3_1t4_1single_male_under_65single_female_over_65single_male_over_65single_parent_with_a_child_under_25single_parent_with_all_children_over_25couple_without_childrencouple_with_a_child_under_25couple_with_all_children_over_25couple_or_single_parent_with_a_child_under_25_years_and_more_personother_household
<int><int><int><int><int><int><dbl><dbl><dbl><dbl><int><int><int><int><int><int><int><int><int><int>
11726111 2360.410.5950.970.10NANANA NANA NA NANA NA NA
2172621111500.450.5545.560.12724236 603916233378144150
31726311 4600.480.5252.590.10NANANA NANA NA NANA NA NA
41726411 67 NA NA NA NANANANA NANA NA NANA NA NA
5172651118280.480.5246.310.13905148 785726751699261189
6172652116820.490.5139.040.218727241473021065199174174

Totals#

Population total#

population_total <- census_data[, 't1_1', drop = FALSE]
names(population_total)[1] <- 'population_total'
head(population_total)
A data.frame: 6 × 1
population_total
<int>
1 236
21150
3 460
4 67
51828
61682

Dwellings total#

dwellings_total <- census_data[, 't18_1', drop = FALSE]
names(dwellings_total)[1] <- 'dwellings_total'
head(dwellings_total)
A data.frame: 6 × 1
dwellings_total
<int>
1 284
2 976
3 768
4 NA
51581
61010

Households total#

households_total <- census_data[, 't21_1', drop = FALSE]
names(households_total)[1] <- 'households_total'
head(households_total)
A data.frame: 6 × 1
households_total
<int>
1112
2495
3215
4 NA
5697
6673

Domain data#

Age domain#

### Domain:Age ###

# Age - early childhood boy (under 5 years old)
early_childhood_boy_fields <- c(
    'c_ab0t4' # Age 0 to 4 boys
)
early_childhood_boy_data <- census_data[, early_childhood_boy_fields, drop = FALSE]
early_childhood_boy <- rowSums(early_childhood_boy_data, na.rm=TRUE)
early_childhood_boy_pct <- (early_childhood_boy / population_total) * 100.0
names(early_childhood_boy_pct)[1] <- 'early_childhood_boy_pct'

# Age - early childhood girl (under 5 years old)
early_childhood_girl_fields <- c(
    'c_ag0t4' # Age 0 to 4 girls
)
early_childhood_girl_data <- census_data[, early_childhood_girl_fields, drop = FALSE]
early_childhood_girl <- rowSums(early_childhood_girl_data, na.rm=TRUE)
early_childhood_girl_pct <- (early_childhood_girl / population_total) * 100.0
names(early_childhood_girl_pct)[1] <- 'early_childhood_girl_pct'

# Age - middle to oldest old male (75+ years old)
age_middle_to_oldest_old_male_fields <- c(
    'c_amo75' # Age 75+ males
)
age_middle_to_oldest_old_male_data <- census_data[, age_middle_to_oldest_old_male_fields, drop = FALSE]
age_middle_to_oldest_old_male <- rowSums(age_middle_to_oldest_old_male_data, na.rm=TRUE)
age_middle_to_oldest_old_male_pct <- (age_middle_to_oldest_old_male / population_total) * 100.0
names(age_middle_to_oldest_old_male_pct)[1] <- 'age_middle_to_oldest_old_male_pct'

# Age - middle to oldest old male (75+ years old)
age_middle_to_oldest_old_female_fields <- c(
    'c_afo75' # Age 75+ females
)
age_middle_to_oldest_old_female_data <- census_data[, age_middle_to_oldest_old_female_fields, drop = FALSE]
age_middle_to_oldest_old_female <- rowSums(age_middle_to_oldest_old_female_data, na.rm=TRUE)
age_middle_to_oldest_old_female_pct <- (age_middle_to_oldest_old_female / population_total) * 100.0
names(age_middle_to_oldest_old_female_pct)[1] <- 'age_middle_to_oldest_old_female_pct'

# Combine all these indicators into an array for this domain
age_domain_pct <- cbind(early_childhood_boy_pct,
                        early_childhood_girl_pct,
                        age_middle_to_oldest_old_male_pct,
                        age_middle_to_oldest_old_female_pct)

# Print the first six rows of the data to visually check it looks OK
head(age_domain_pct)
A data.frame: 6 × 4
early_childhood_boy_pctearly_childhood_girl_pctage_middle_to_oldest_old_male_pctage_middle_to_oldest_old_female_pct
<dbl><dbl><dbl><dbl>
11.27118641.2711868.474576 7.627119
21.73913041.7391305.391304 6.782609
30.43478261.3043487.82608711.304348
41.49253730.0000008.955224 2.985075
51.36761491.5864335.689278 8.260394
63.80499412.8537463.567182 5.172414

Health domain#

### Domain:Health ###

# Health - people with a disability preventing work (already a percentage)
disability_fields <- c(
    't13_1'
)
disability_data <- census_data[, disability_fields, drop = FALSE]
disability <- rowSums(disability_data, na.rm=TRUE)
disability_pct <- disability

# Combine all these indicators into an array for this domain
health_domain_pct <- cbind(disability_pct)

# Print the first six rows of the data to visually check it looks OK
head(health_domain_pct)
A matrix: 6 × 1 of type dbl
disability_pct
0.03
0.02
0.01
0.00
0.03
0.02

Income domain#

### Domain:Income ###

# One parent households
one_parent_households_fields <- c(
    'single_parent_with_a_child_under_25'
)
one_parent_households_data <- census_data[, one_parent_households_fields, drop = FALSE]
one_parent_households <- rowSums(one_parent_households_data, na.rm=TRUE)
one_parent_households_pct <- (one_parent_households / households_total) * 100.0
names(one_parent_households_pct)[1] <- 'one_parent_households_pct'

# Dependants under 16 years of age (already a percentage)
dependants_fields <- c(
    't4_1'
)
dependants_data <- census_data[, dependants_fields, drop = FALSE]
dependants <- rowSums(dependants_data, na.rm=TRUE)
dependants_pct <- dependants

# Unemployment (already a percentage)
unemployment_fields <- c(
    't10_1'
)
unemployment_data <- census_data[, unemployment_fields, drop = FALSE]
unemployment <- rowSums(unemployment_data, na.rm=TRUE)
unemployment_pct <- unemployment

# population attending university (already a percentage)
attending_university_fields <- c(
    't8_1'
)
attending_university_data <- census_data[, attending_university_fields, drop = FALSE]
attending_university <- rowSums(attending_university_data, na.rm=TRUE)
attending_university_pct <- attending_university

# Combine all these indicators into an array for this domain
income_domain_pct <- cbind(one_parent_households_pct,
                           dependants_pct,
                           unemployment_pct,
                           attending_university_pct)

# Print the first six rows of the data to visually check it looks OK
head(income_domain_pct)
A data.frame: 6 × 4
one_parent_households_pctdependants_pctunemployment_pctattending_university_pct
<dbl><dbl><dbl><dbl>
1 0.000000.100.040.05
212.121210.120.160.03
3 0.000000.100.080.04
4 NA0.000.000.00
511.190820.130.180.02
621.842500.210.100.03

Information Access/Use domain#

### Domain:Information Access/Use ###

# No higher education (already a percentage)
no_higher_education_fields <- c(
    't9_1' # need to invert
)
no_higher_education_data <- census_data[, no_higher_education_fields, drop = FALSE]
no_higher_education <- rowSums(no_higher_education_data, na.rm=TRUE)
no_higher_education_pct <- (1.0-no_higher_education) # invert

# Combine all these indicators into an array for this domain
info_domain_pct <- cbind(no_higher_education_pct)

# Print the first six rows of the data to visually check it looks OK
head(info_domain_pct)
A matrix: 6 × 1 of type dbl
no_higher_education_pct
0.77
0.80
0.79
1.00
0.80
0.75

Local knowledge domain#

### Domain:Local Knowledge ###

# Foreign nationals (already a percentage)
foreign_nationals_fields <- c(
    't5_1'
)
foreign_nationals_data <- census_data[, foreign_nationals_fields, drop = FALSE]
foreign_nationals <- rowSums(foreign_nationals_data, na.rm=TRUE)
foreign_nationals_pct <- foreign_nationals

# Combine all these indicators into an array for this domain
local_knowledge_domain_pct <- cbind(foreign_nationals_pct)

# Print the first six rows of the data to visually check it looks OK
head(local_knowledge_domain_pct)
A matrix: 6 × 1 of type dbl
foreign_nationals_pct
0.11
0.16
0.13
0.00
0.15
0.11

Physical access domain#

###  Domain:Tenure ###

# Households renting
rented_fields <- c(
    't20_2'
)
rented_data <- census_data[, rented_fields, drop = FALSE]
rented <- rowSums(rented_data, na.rm=TRUE)
rented_pct <- (rented / dwellings_total) * 100.0
names(rented_pct)[1] <- 'rented_pct'

# Combine all these indicators into an array for this domain
tenure_domain_pct <- cbind(rented_pct)

# Print the first six rows of the data to visually check it looks OK
head(tenure_domain_pct)
A data.frame: 6 × 1
rented_pct
<dbl>
14.577465
24.610656
31.822917
4 NA
54.111322
65.247525

Social Network domain data#

### Domain:Social Network ###

# Primary school age children
primary_school_age_fields <- c(
    'c_ac5t9'
)
primary_school_age_data <- census_data[, primary_school_age_fields, drop = FALSE]
primary_school_age <- rowSums(primary_school_age_data, na.rm=TRUE)
primary_school_age_pct <- (primary_school_age / population_total) * 100.0
names(primary_school_age_pct)[1] <- 'primary_school_age_pct'

# One person households
one_person_households_fields<- c(
    't22_1'
)
one_person_households_data <- census_data[, one_person_households_fields, drop = FALSE]
one_person_households <- rowSums(one_person_households_data, na.rm=TRUE)
one_person_households_pct <- (one_person_households / households_total) * 100.0
names(one_person_households_pct)[1] <- 'one_person_households_pct'

# Combine all these indicators into an array for this domain
social_network_domain_pct <- cbind(primary_school_age_pct,
                                   one_person_households_pct)

# Print the first six rows of the data to visually check it looks OK
head(social_network_domain_pct)
A data.frame: 6 × 2
primary_school_age_pctone_person_households_pct
<dbl><dbl>
14.23728841.07143
23.91304336.76768
34.13043538.60465
40.000000 NA
53.66520831.99426
66.24256829.12333

Housing Characteristics domain#

### Domain: Housing Characteristics ####

# Indicator: Persons residing in family homes - year built pre 1970
#    'fam_bb_1970'
year_built_fields <- c(
    'before_1900',
    'from_1900_to_1920',
    'from_1921_to_1940',
    'from_1941_to_1950',
    'from_1951_to_1960',
    'from_1961_a_1970'
)
year_built_data <- census_data[, year_built_fields, drop = FALSE]
year_built <- rowSums(year_built_data, na.rm=TRUE)
year_built_pct <- (year_built / dwellings_total) * 100.0
names(year_built_pct)[1] <- 'year_built_pct'
#year_built_pct[is.na(year_built_pct)] <- 0

# Combine all these indicators into an array for this domain
housing_characteristics_domain_pct <- cbind(year_built_pct)

# Print the first six rows of the data to visually check it looks OK
head(housing_characteristics_domain_pct)
A data.frame: 6 × 1
year_built_pct
<dbl>
1 0.00000
227.97131
372.26562
4 NA
520.11385
612.17822

Combine all data into one table#

# Combine all data into one table
indicator_domains_pct <- cbind(census_area_id,
                               age_domain_pct,
                               health_domain_pct,
                               income_domain_pct,
                               info_domain_pct,
                               local_knowledge_domain_pct,
                               tenure_domain_pct,
                               social_network_domain_pct,
                               housing_characteristics_domain_pct)
head(indicator_domains_pct)
A data.frame: 6 × 20
ccaaCPROCMUNdistseccearly_childhood_boy_pctearly_childhood_girl_pctage_middle_to_oldest_old_male_pctage_middle_to_oldest_old_female_pctdisability_pctone_parent_households_pctdependants_pctunemployment_pctattending_university_pctno_higher_education_pctforeign_nationals_pctrented_pctprimary_school_age_pctone_person_households_pctyear_built_pct
<int><int><int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
117261111.27118641.2711868.474576 7.6271190.03 0.000000.100.040.050.770.114.5774654.23728841.07143 0.00000
217262111.73913041.7391305.391304 6.7826090.0212.121210.120.160.030.800.164.6106563.91304336.7676827.97131
317263110.43478261.3043487.82608711.3043480.01 0.000000.100.080.040.790.131.8229174.13043538.6046572.26562
417264111.49253730.0000008.955224 2.9850750.00 NA0.000.000.001.000.00 NA0.000000 NA NA
517265111.36761491.5864335.689278 8.2603940.0311.190820.130.180.020.800.154.1113223.66520831.9942620.11385
617265213.80499412.8537463.567182 5.1724140.0221.842500.210.100.030.750.115.2475256.24256829.1233312.17822

Calculate Z-Score#

The raw data is not suitable for use within the vulnerabiltiy assessment. It needs to be standardised. Therefore, the data is converted to z-scores. Z-scores are:

“A statistical measurement of a score’s relationship to the mean (average value) in a group of scores. A Z-score of 0 means the score is the same as the mean (average value). A Z-score can be positive or negative, indicating whether it is above or below the mean and by how many standard deviations. Z-score standardisation represents the deviation of a raw score from its mean in standard deviation units.” (Kazmierczak et al., 2015)

Calculate the Z-score#

# Copy the data
indicator_z_scores <- indicator_domains_pct

# Get the number of columns in the data
num_cols = ncol(indicator_z_scores)

# Calculate the z scores for each of the relevant columns - starting at the 2nd column
for(col in names(indicator_z_scores)[6:num_cols]) {
  # rename column (remove the '_pct' in the name)
  new_col_name <- gsub("_pct", "", col)
  indicator_z_scores[new_col_name] = scale(indicator_z_scores[col])
}

# Remove the original data to leave only the area identifier and the z scores
indicator_z_scores <- indicator_z_scores[-c(6:num_cols)]

head(indicator_z_scores)
A data.frame: 6 × 20
ccaaCPROCMUNdistseccearly_childhood_boyearly_childhood_girlage_middle_to_oldest_old_maleage_middle_to_oldest_old_femaledisabilityone_parent_householdsdependantsunemploymentattending_universityno_higher_educationforeign_nationalsrentedprimary_school_ageone_person_householdsyear_built
<int><int><int><int><int><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]><dbl[,1]>
11726111-0.19258164-0.1467916 0.4818082-0.10021618 1.2753218-1.459709836-0.0938240-1.00214549 1.2303104 0.07143877 0.1446865-0.5073936 0.30936208 1.0319635-0.94812767
21726211 0.23745377 0.3146154-0.2611307-0.31519260 0.3880289-0.007254482 0.1987363 0.87022439 0.2231589 0.27339067 0.7793089-0.5005024 0.16550926 0.4903007 0.24638629
31726311-0.96122760-0.1140934 0.3255496 0.83585043-0.4992639-1.459709836-0.0938240-0.37802220 0.7267347 0.20607337 0.3985355-1.0793008 0.26195604 0.7214991 2.13797352
41726411 0.01083739-1.4002199 0.5976241-1.28188374-1.3865567 NA-1.5566254-1.62626879-1.2875683 1.61973671-1.2514828 NA-1.57053273 NA NA
51726511-0.10396496 0.1640514-0.1893316 0.06098884 1.2753218-0.118741371 0.3450164 1.18228604-0.2804168 0.27339067 0.6523844-0.6041755 0.05555571-0.1104731-0.08916627
61726521 2.13595981 1.4136596-0.7006675-0.72507994 0.3880289 1.157623344 1.5152575-0.06596055 0.2231589-0.06319583 0.1446865-0.3682738 1.19901504-0.4718037-0.42805727

Output the Z-score data#

# Output the z-score data as a csv file
output_file <- file.path(pipeline_dir, "censusDataZ.csv")
write.csv(indicator_z_scores, output_file, row.names = FALSE)

END