* Close any log that is still open and start from an empty dataset
capture log close _all
clear

* Date and time stamps for the log file name
* (blanks removed from the date, colons removed from the time)
local today=subinstr("`c(current_date)'"," ","",.)
local time=subinstr("`c(current_time)'",":","",.)

* stata version compatibility
version 16.0

* log file number
local vnum="1_13"

* Syntax name
local syntax "Prepare01"

* Projectname
local project "C19a"

* Open a named text log; the name is kept in `logname' so that exactly this
* log can be closed again later.
* $logfilepath is a global macro that is not set in this file and is expected
* to be defined before this do-file is run.
log using "$logfilepath/StataLog_AW_`project'_`syntax'_`vnum'_`today'_`time'", replace text name(Merge`syntax')
local logname=r(name)

* Reset timer 1 and switch it on. A timer that is only cleared but never
* switched on does not accumulate any time, so there would be nothing to list.
timer clear 1
timer on 1

* Change Log
********************************
* 1.01: Added lan dummy for Stockholm region
* 1.03: Added egen groupSexDispink5=group(kon dispink2018_5), label
* 1.04: Added drop if tpop_oldnew==1
* 1.05: Corrected sun3
* 1.06: Adapted age split to ages after 98 (now to max 110)
* 1.07: Added more groups to combinations (Sun3lmic dispink3lmic)
* 1.08-1.09: Added comments for public extraction
* 1.10: added drop for missing lmic and dispink
* 1.11: Added mena
* 1.11: Added episode identifier to allow for logistic regression in sensitivity analyses on same data
* 1.12: Added categorical variable for age to allow for more descriptives by age
* 1.13: Code improvements to increase reliability

* Load the stset data set, replacing whatever is currently in memory.
* $datafilepath is a global macro that is not set in this file.
use "$datafilepath/AW_`project'_Stset", clear

* Lower age limit (in years) used for the age split and the age categories
local lar = 20

* Number of individuals who die on the first day of the observation period
count if ddatecod == mdy(3, 12, 2020)

* Remove records that are not part of the survival analysis (_st==0);
* these should be the same individuals as counted above
drop if _st == 0

* Base population
count

* Remove those who are in totalpopulation2020 but not in the original
* totalpopulation of the data, because covid deaths are based on the
* original totalpopulation
drop if tpop_oldnew == 1
count

* Covid death indicator (1 if covid==1, otherwise 0, including when covid is
* missing); used to distinguish all-cause from covid-only models
generate byte cod_cov = (covid == 1)

* Stockholm region indicator built from the lan variable at the end of 2019
* (1 if lan2019==1, otherwise 0, including when lan2019 is missing)
generate stolan = (lan2019 == 1)

label define stolan 0 "Non Stockholm" 1 "Stockholms Lan", replace
label values stolan stolan

* Marital status based on the existing civil status variable at the end of 2019.
* Codes not listed below (and empty strings) stay missing.
generate marital_status = .
replace marital_status = 0 if civil2019 == "OG"
replace marital_status = 1 if inlist(civil2019, "G", "RP")
replace marital_status = 2 if inlist(civil2019, "S", "SP")
replace marital_status = 3 if inlist(civil2019, "Ä", "EP")

label define marital_status 0 "Never Married" 1 "Married" 2 "Sep/Div" 3 "Widowed", replace
label values marital_status marital_status

* Country of birth grouping

* High vs low/middle income countries, World Bank definition.
* Coding applied below, in this order:
*   0 "Sweden" : birthcountry 1
*   1 "HIC"    : birthcountry 2-22, 24 or 42
*   2 "LMIC"   : every other non-missing code except 99, plus codes
*                6, 7, 16, 17 and 18, which are taken out of the 2-22 range again
*   missing    : birthcountry 99 or missing (dropped at the end of this section)
/*Generate high vs low, middle income countries*/
gen lmic = .
replace lmic = 0 if birthcountry==1
replace lmic = 1 if inrange(birthcountry, 2, 22) | inlist(birthcountry, 24, 42)
* A missing birthcountry also satisfies "birthcountry!=99", so without the
* !missing() guard such rows would be coded LMIC and would survive the drop below.
replace lmic = 2 if lmic==. & birthcountry!=99 & !missing(birthcountry)
/*Pick out early countries from the first if statement included in HIC*/
replace lmic = 2 if inlist(birthcountry, 6, 7, 16, 17, 18)

label define lmic 0 "Sweden" 1 "HIC" 2 "LMIC", modify
label variable lmic "HIC vs LMIC, World Bank definitions"
label values lmic lmic
* Cross-check of the grouping against the original codes, missings included
tab birthcountry lmic, m

* Drop those with missing country of birth
drop if lmic==.
count

/*
Notes - because of the pre-existing country groupings, there are some small ambiguities

  e.g. Croatia is HIC, but part of Former Yugoslavia, most of which is LMIC
  
  e.g. Uruguay is HIC, but part of South America, most of which is LMIC
 
*/

/*Generate extended definition to differentiate MENA*/
/*World Bank definition*/
* Coding applied below, in this order:
*   0 "Sweden"     : birthcountry 1
*   1 "HIC"        : birthcountry 2-22, 24 or 42
*   2 "LMIC other" : every other non-missing code except 99, plus codes
*                    6, 7, 16, 17 and 18, which are taken out of the 2-22 range again
*   3 "LMIC MENA"  : birthcountry 30-35 (overrides the value 2 set before)
*   missing        : birthcountry 99 or missing
gen mena = .
replace mena = 0 if birthcountry==1
replace mena = 1 if inrange(birthcountry, 2, 22) | inlist(birthcountry, 24, 42)
* A missing birthcountry also satisfies "birthcountry!=99", so without the
* !missing() guard such rows would be coded "LMIC other" instead of staying missing.
replace mena = 2 if mena==. & birthcountry!=99 & !missing(birthcountry)
/*Pick out early countries from the first if statement included in HIC*/
replace mena = 2 if inlist(birthcountry, 6, 7, 16, 17, 18)
replace mena = 3 if inlist(birthcountry, 30, 31, 32, 33, 34, 35)

label define mena 0 "Sweden" 1 "HIC" 2 "LMIC other" 3 "LMIC MENA", modify
label variable mena "HIC vs LMIC w/MENA World Bank definitions"
label values mena mena
* Cross-check of the grouping against the original codes, missings included
tab birthcountry mena, m

* Drop those with missing country of birth (should affect 0 individuals as done in lmic)
drop if mena==.
count

* Numeric sex variable derived from the existing string variable kon
* (from the birth register).
* NOTE(review): encode numbers the distinct values of kon in sorted order, so
* the labels below assume that the first sorted value of kon stands for men
* and the second for women - confirm against the data.
encode kon, generate(sex)

label define sex 1 "Men" 2 "Women", replace
label values sex sex

* Three-level education variable from the last known education (end of 2018).
* sun2018 is converted to numeric first. Every observation starts out as
* 9 "Missing" and keeps that value unless sun2018 is one of the codes 1-6.
destring sun2018, replace
generate byte sun3 = 9
replace sun3 = 1 if inlist(sun2018, 1, 2)
replace sun3 = 2 if sun2018 == 3
replace sun3 = 3 if inlist(sun2018, 4, 5, 6)

label define sun3 1 "primary" 2 "Secondary" 3 "Post-Second" 9 "Missing", replace
label values sun3 sun3

* Income terciles from the last known income (end of 2018),
* computed on the observations that are still in memory at this point
xtile dispink2018_3 = dispink2018, nquantiles(3)

* Remove observations for which no tercile could be assigned (missing income)
drop if missing(dispink2018_3)

* Age split for every age after age lar
gen age=.

* Date on which each person reaches age lar (same month and day as bdate).
* NOTE(review): mdy() returns missing for a date that does not exist, so a
* 29 February birth date gives a missing value whenever year(bdate)+lar is
* not a leap year - confirm that this cannot occur for the chosen lar.
gen dateage`lar'=mdy(month(bdate), day(bdate), year(bdate)+`lar')

* Split the records at every 365 days counted from that date, up to
* 38325 days (105 x 365). Options are written in the documented form
* after(spec) at(numlist).
stsplit splitage, after(time=dateage`lar') at(0(365)38325)

* splitage==-1 is recoded to age 0; otherwise age is lar plus the number of
* completed 365-day steps since dateage
replace age=0 if splitage==-1
replace age=(splitage/365)+`lar' if splitage>=0

* Broad age categories; age 0 (splitage==-1) and missing age stay missing
gen agecat=.
replace agecat=1 if inrange(age, `lar', 49)
replace agecat=2 if inrange(age, 50, 69)
replace agecat=3 if inrange(age, 70, 79)
replace agecat=4 if inrange(age, 80, 89)
replace agecat=5 if age>=90 & age!=.

label define agecat 1 "20-49" 2 "50-69" 3 "70-79" 4 "80-89" 5 "90+", replace
lab val agecat agecat

* Episode identifier to allow logistic regression on the same data.
* Records are ordered by start time (_t0) within person (lopnr).
* NOTE(review): logistReg==1 flags the LAST record of each lopnr, although
* the earlier wording of this comment spoke of the "first episode" - confirm
* which one is intended.
bysort lopnr (_t0): generate episodeX = _n
by lopnr: generate episodeZ = _N

* 1 for the last record of a person, 0 for all earlier records
generate logistReg = (episodeX == episodeZ)

* Age for the logistic models: equal to age, except that one year is
* subtracted on the last record of persons with more than one record
generate logistage = cond(episodeX == episodeZ & episodeX != 1, age - 1, age)

drop episodeX episodeZ

* Age categories for the logistic models, based on logistage:
* category 1 covers lar-29, categories 2-15 are 5-year bands from 30-34
* to 95-99, category 16 is 100 and above
generate agecatlogist = .
replace agecatlogist = 1 if inrange(logistage, `lar', 29)

* 5-year bands: lower bounds 30, 35, ..., 95 map to categories 2, 3, ..., 15
local band = 2
forvalues lower = 30(5)95 {
    local upper = `lower' + 4
    replace agecatlogist = `band' if inrange(logistage, `lower', `upper')
    local ++band
}

replace agecatlogist = 16 if logistage >= 100 & logistage != .

* Only records flagged for the logistic regression keep a category
replace agecatlogist = . if logistReg == 0

label define agecatlogist 1 "20-29" 2 "30-34" 3 "35-39" 4 "40-44" 5 "45-49" 6 "50-54" 7 "55-59" 8 "60-64" 9 "65-69" 10 "70-74" 11 "75-79" 12 "80-84" 13 "85-89" 14 "90-94" 15 "95-99" 16 "100+", replace
label values agecatlogist agecatlogist

* Combination variables for descriptive statistics.
* The two lists are parallel: the i-th suffix names the new variable and the
* i-th source is the variable that is combined with kon.
local suffixes Sun3 Dispink MaritalStatus lmic mena Stolan
local sources  sun3 dispink2018_3 marital_status lmic mena stolan
local nsources : word count `sources'

* kon by each characteristic
forvalues i = 1/`nsources' {
    local suffix : word `i' of `suffixes'
    local source : word `i' of `sources'
    egen groupSex`suffix' = group(kon `source'), label
}

* Combinations with lmic that do not involve kon
egen groupDispink3lmic = group(dispink2018_3 lmic), label
egen groupSun3lmic = group(sun3 lmic), label

* kon by agecat by each characteristic, for even more descriptive statistics
forvalues i = 1/`nsources' {
    local suffix : word `i' of `suffixes'
    local source : word `i' of `sources'
    egen groupSexagecat`suffix' = group(kon agecat `source'), label
}

* Change the working directory to the output folder
* ($outputpath is a global macro that is not set in this file)
cd "$outputpath"

* Save file
save "$datafilepath/AW_`project'_`syntax'", replace

* Stop timer 1 before listing it, so that the elapsed time is recorded.
* capture keeps the do-file running if the timer was never switched on.
capture timer off 1
timer list 1

* Close the log by the name that was stored in `logname'
log close `logname'