/*
Stata Workshop 2
Data Manipulation and Visualization
Tao Wang
Feb., 2026
*/

// ============= SETUP =============
// Start from an empty session and turn off output paging so the do-file runs
// from top to bottom without pausing.
clear
set more off
// ***** IMPORTANT: CHANGE THE PATH IN THE LINE BELOW *****
// -capture- lets the script continue when the folder does not exist (e.g. if
// Stata is already in the workshop folder). The failure is still reported so
// that a later "file not found" error is easy to trace back to the path.
capture cd "/Users/your_name/your_folder/workshop2"
if _rc != 0 {
	display as text "Note: could not change to the workshop folder; continuing in " c(pwd)
}
// Close a log left open by an earlier run; no error if none is open
capture log close

// Starts a log file to save your work
log using workshop2, replace

// ============= EXPLORING MICRO DATA =============
/* Consumer Expenditure Public Use Microdata, 2023 Q3 
obtained from BLS at https://www.bls.gov/cex/pumd.htm
Selected variables are extracted from "fmli233.dta",
exported to "ce_pumd_fmli233.xls", using the following code:
	use "fmli233.dta", clear
	keep NEWID FINCBTAX FINCBTXM FINATXEM ETOTAPX4 TOTEX4PQ FAM_SIZE
	export excel using "ce_pumd_fmli233.xls", firstrow(variables) replace
Note that the survey is revised frequently, and variables used here may not
exist in older or newer survey data. 
*/

// --- 1 Import Raw Excel Data ---
// The first row of the sheet holds the variable names; case(lower) converts
// them to lowercase on import.
import excel using "ce_pumd_fmli233.xls", clear firstrow case(lower)

// --- 2 Data Cleaning and Preparation ---
// Give the variables intuitive names (old names first, new names second)
rename (fincbtax finatxem totex4pq fam_size) (inc dinc con size)

// Attach descriptive labels
label var inc  "Family income in the last 12 months before tax"
label var dinc "Family income in the last 12 months after tax"
label var con  "Total family expenditure in the previous quarter"
label var size "Family size"

/* Exercise 1:
Find the documentation for the variable etotapx4 on the BLS website. Rename and
label the variable accordingly. 
*/

// Exercise 1 answer: give etotapx4 a readable name and a label
rename etotapx4 outlay
label var outlay "Total family outlay in the previous quarter"

// Tidy up: remove the variable that is not used and put the rest in a fixed
// order at the front of the dataset.
// Equivalent to the drop: keep newid size inc dinc con outlay
drop fincbtxm
order newid size inc dinc con outlay, first

// --- 3 Inspecting Clean Data ---
// The commands below only report on the data in memory; none of them
// creates or modifies a variable.
// List the variables in the dataset
describe
// Open the data in a spreadsheet-style window
// NOTE(review): presumably requires the Stata GUI -- confirm before running in batch mode
browse
count    // check the number of observations
codebook // get a detailed overview of all variables
// Summary statistics for every variable
summarize

// --- 4 Exploring a Single Variable: Income ---
// Numerical summaries of inc first, then two histograms.
codebook inc
inspect inc
// detail adds further statistics to the basic summary
summarize inc, detail
// Each graph is stored under its own name so that a later graph does not
// replace it; name(..., replace) allows the do-file to be re-run.
histogram inc, name(hist_inc, replace) // visualize the distribution
histogram inc, kdensity name(hist_inc1, replace) // add a density

// --- 5 Exploring a Categorical Variable: Family Size ---
codebook size
tabulate size
histogram size, discrete name(hist_size, replace)

// Bar chart of frequencies: one bar per family size
graph bar (count), over(size) name(bar_freq, replace)

// Bar chart of percentages, each bar labeled with its height
graph bar (percent), over(size) blabel(bar) name(bar_pct, replace)

// --- 6 Generating New Variables ---
// Condense the family size variable: families of 6 or more share one category.
// Stata treats missing values as larger than any number, so "size >= 6" alone
// would also recode a missing family size to 6; !missing() leaves it missing.
gen size1 = size                                // generate a new size variable
replace size1 = 6 if size >= 6 & !missing(size) // that condenses the larger values
label variable size1 "Family Size"
label define size1_label 6 "6 or more" // define the value label size1_label
label values size1 size1_label         // apply the label to variable size1
tabulate size1                         // see the result
graph pie, over(size1) plabel(_all percent) name(pie_size, replace) // pie chart

// Alternatively, use egen to split family size into three groups coded 0, 1, 2
egen size2 = cut(size), group(3)
label variable size2 "Family Size"
label define size2_label 0 "Bottom tercile" 1 "Middle tercile" 2 "Top Tercile"
label values size2 size2_label
tab size2

/* Exercise 2:
Create a new variable that represents the quintile of the family's income, and 
label the variable and values appropriately. 
*/

// Create a poverty indicator variable (1 = family in poverty)
/* use Federal poverty level (FPL) for 2023
https://aspe.hhs.gov/sites/default/files/documents/1c92a9207f3ed5915ca020d58fe77696/detailed-guidelines-2023.pdf */
// Threshold used below: 14580 for a one-person family plus 5140 for each
// additional member.
// The indicator is left missing when income or family size is missing.
// Without this guard a missing income would be coded 0 (not poor), and a
// missing family size would be coded 1 (poor), because Stata treats missing
// values as larger than any number.
gen poverty = 0 if !missing(inc, size)
replace poverty = 1 if inc < (14580 + 5140 * (size - 1)) & !missing(inc, size)
label variable poverty "Federal Poverty Indicator (1 = Poverty)"
// Alternatively, use logical operation (a true comparison evaluates to 1,
// a false one to 0).
gen poverty1 = inc < (14580 + 5140 * (size - 1)) if !missing(inc, size)
count if poverty != poverty1	// check if the two variables are identical (expect 0)
drop poverty1
// Save the cleaned dataset, overwriting the file from any earlier run
save "workshop2.dta", replace

// --- 7 Visualizing Two-Way Categorical Data (Optional) ---
/* Stacked bar chart
graph bar, over(poverty) over(size1) asyvars stack ///
	blabel(bar, position(center) format(%-6.3g)) legend(off) ///
	title("Household Size and Federal Poverty Rate (US, 2023 Q3)") ///
	caption("Note: Red = poverty. Percentage of observations shown (n = 4770).", size(small)) ///
	note("Data Source: Consumer Expenditure Public Use Microdata, 2023 Q3")
*/

// --- 8 Scatterplot and Correlation with Micro Data ---
// Scatter of expenditure against after-tax income with a fitted line on top.
// con is labeled as expenditure in the previous quarter while dinc is income
// over the last 12 months, so the subtitle states both periods instead of
// describing the whole graph as annual.
twoway (scatter con dinc) (lfit con dinc), ///
	title("Disposable Income and Personal Consumption Expenditure") ///
	subtitle("Annual income vs. quarterly expenditure ($), United States, 2023 Q3") ///
	note("Data Source: Consumer Expenditure Public Use Microdata, 2023 Q3") ///
	legend(off) name(sc_levels, replace)

correlate con dinc // Compare this correlation to the macro data!

/* Scatterplot matrix (optional)
graph matrix inc dinc con outlay, half name(sc_matrix_micro, replace)
*/

/* Graph in logs (optional)
twoway (scatter con dinc, msize(tiny) colorvar(size1) colordiscrete) ///
	(lfitci con dinc) if dinc > 50 & con > 50, ///
	yscale(log) ylabel(500 5000 50000 500000) ///
	xscale(log) xlabel(500 5000 50000 500000) ///
	title("Disposable Income and Personal Consumption Expenditure") ///
	subtitle("$ annually, United States, 2023") ///
	note("Data Source: Consumer Expenditure Public Use Microdata, 2023 Q3")	///
	legend(off) name(sc_logs1, replace)
*/

/* Exercise 3:
Use appropriate numerical or graphical analysis to investigate the 
relationship between poverty rate and family size. 
*/

// ============= WRAP UP =============
// Close the log opened in the setup section
log close
// Open the saved log file in the Viewer
// NOTE(review): assumes the log was written as workshop2.smcl (SMCL log type) -- confirm
view "workshop2.smcl"
