/*
Stata Workshop 1
Introduction to Stata
Tao Wang
SSQL@Swarthmore
swarthmore.edu/ssql
Jan., 2026
*/

* Session setup: start with no data in memory, disable output paging, move to
* the workshop folder, and open a fresh log file.
clear
set more off // do not pause output for --more-- messages

// Change into the working directory.
// Please replace what's in the quotation marks with your own path.
// -capture- keeps a wrong path from stopping the do-file; the return code is
// then checked so the failure is reported instead of being silently ignored.
capture cd "/Users/.../Documents/Ec31/Stata/workshop1"
if _rc != 0 {
	display as text "Note: could not change directory (r(" _rc ")); " ///
		"continuing in `c(pwd)'"
}
capture log close // close the log file if one is open

log using workshop1, replace // open a log file

********** Part 0: Introduction to Stata Interface and Files *******************
/*
0.1 Windows (with shortcuts): 
1 - Command
2 - Results
3 - History
4 - Variables
5 - Properties
6 - Graphs 
7 - Viewer
8 - Data Editor
9 - Do-file editor

0.2 Dropdown menu

0.3 Keyboard shortcuts

0.4 Help/Search

0.5 Stata files: 
data (.dta), do (.do), log (.smcl or .log)

0.6 The Do file - how to "write" Stata code:
- use templates, follow examples;
- learn the Stata syntax using the help files;
- use the graphical interface to generate commands;
- ask AI for help.
*/

********** Part 1: Income-Expenditure (Macro data) *****************************

/* Income and consumption data for the U.S., 1929-2022, obtained from the BEA at
https://apps.bea.gov/iTable/?isuri=1&reqid=19&step=4&categories=flatfiles&nipa_table_list=1
("Section 2 Personal Income and Outlays").
"Section2All_xls.xlsx" is the original downloaded file.
"consumption.csv" is a cleaned version that can be imported directly into Stata.
*/

* 1.1 Import dataset
* 1.1.1 Method 1: copy and paste

* 1.1.2 Method 2: import from spreadsheet

* Read the comma-separated file into memory; the clear option discards any
* data currently loaded.
import delimited "consumption.csv", clear // ".csv" can be omitted
// the quotation marks can be left out when the file name contains no spaces
help import // open the help file for the import command

/* VERY OPTIONAL: Using Stata to prepare data
import excel "Section2All_xls.xlsx", ///
	sheet("T20100-A") firstrow cellrange(B8:CS54) clear	
keep if B == "Personal income" | ///
	B == "Equals: Disposable personal income" | ///
	B == "  Personal consumption expenditures" // keep income and consumption
destring D-AN, replace force // destring, missing values will be generated
xpose, clear // transpose dataset variables <--> observations
rename (v1 v2 v3) (inc dinc con) // rename variables
drop if inc == . // drop empty observations
gen year = 1928 + _n // generate variable year
order year inc dinc con // order variables 
*/

* 1.1.3 Load Stata dataset
* The imported data are first written to disk in Stata format (replace
* overwrites an existing file), then read back in with -use-.
save "consumption.dta", replace // ".dta" can be omitted
use "consumption.dta", clear // clear discards the data currently in memory

* 1.2 Inspect data
* 1.2.1 Describe data
describe // or use keyboard F2
desc // commands can be abbreviated as long as there is no ambiguity

* 1.2.2 Browse using Data Editor
browse // view data in Data Editor (browse)
br // can use menu icon on top
br di c // variable names can be abbreviated as well (di and c are meant as dinc and con)
// can specify a subset of variables, or choose in "Variables" window

* 1.3 Summary Statistics
summarize // summary statistics for all variables
sum dinc, detail // detailed statistics (percentiles, variance, skewness, kurtosis)

/* Exercise 1: 
Show the summary statistics for consumption. Find the mean, variance, standard
deviation, and the five number summary. What kind of variable is consumption?
*/

* 1.4 Visualizing Distribution
histogram con // histogram of consumption
generate logcon = log(con) // new variable: natural log of consumption
hist logc // histogram of logcon, using abbreviated command and variable name

/* Exercise 2: 
Create a new variable that represents the natural log of disposable income, and
generate a histogram of the variable. Describe the shape of the distribution.
*/

* 1.5 Two-Way Relationship
twoway (scatter con dinc) // scatter plot: con on the y-axis, dinc on the x-axis
// same plot with abbreviated command and variable name; name() stores the
// graph under the name dinc_con, replacing any existing graph of that name
sc con din, name(dinc_con, replace)

/* OPTIONAL: make the figure nicer, using options in the GUI
twoway (scatter con dinc, mlabel(year) mlabsize(tiny)) /// add label
	(lfit con dinc), /// add linear fit
	title("Disposable Income and Personal Consumption Expenditure") ///
	subtitle("(Millions of $), United States, 1929-2022") ///
	note("Source: Bureau of Economic Analysis")  ///
	name(dinc_con, replace) // name graph so that it is preserved

label variable year "Year" // Label variable(s)
label variable inc "Income"
label variable dinc "Disposable Income"
label variable con "Personal Consumption Expenditure"

twoway (scatter con dinc, mlabel(year) mlabsize(tiny)) (lfit con dinc), ///
	ytitle("Personal Consumption Expenditure") /// add title of Y-axis
	title("Disposable Income and Personal Consumption Expenditure") ///
	subtitle("(Millions of $), United States, 1929-2022") ///
	note("Source: Bureau of Economic Analysis") ///
	legend(off) /// hide legend
	name(dinc_con, replace)
	
twoway (scatter con dinc, mlabel(year) mlabsize(tiny)) (lfit con dinc), ///
	yscale(log) ylabel(30000 300000 3000000 30000000) /// log scale
	xscale(log) xlabel(30000 300000 3000000 30000000) /// specify ticks
	ytitle("Personal Consumption Expenditure") /// add title of Y-axis
	title("Disposable Income and Personal Consumption Expenditure") ///
	subtitle("(Millions of $), United States, 1929-2022") ///
	note("Source: Bureau of Economic Analysis") ///
	legend(off) /// hide legend
	name(logdinc_logcon, replace)
*/

correlate dinc con // linear (Pearson) correlation coefficient between dinc and con

/* Exercise 3:
Find the linear correlation coefficient between log disposable income and log 
consumption. Create a scatter plot to show the relationship. 
*/
	
/* OPTIONAL: 1.6 Regression
regress con dinc // simple regression
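regress logc logdi // log-log regression; logdi must match the log disposable income variable you created in Exercise 2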
*/

log close // close the log file opened with -log using- at the top of this do-file
view "workshop1.smcl" // display the saved log in the Viewer
