3.1 R and SAS Syntax

3.1.1 Packages and Sample Data

# Data
adam_path <- "https://github.com/phuse-org/TestDataFactory/raw/main/Updated/TDF_ADaM/"
adsl <- haven::read_xpt(paste0(adam_path, "adsl.xpt"))

# Select a few variables
adsl <- adsl %>%
  dplyr::select(STUDYID, USUBJID, SUBJID, AGE, TRT01P, TRTSDT, TRTEDT, RACE, SEX, DISCONFL)

3.1.2 PROC CONTENTS \(\rightarrow\) summary()

To explore the variables of your dataset you can use PROC contents in SAS:

proc contents data = adsl;
run;

In R, you can use str() and summary() instead:

str(adsl)

## tibble [254 × 10] (S3: tbl_df/tbl/data.frame)
##  $ STUDYID : chr [1:254] "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" ...
##   ..- attr(*, "label")= chr "Study Identifier"
##  $ USUBJID : chr [1:254] "01-701-1015" "01-701-1023" "01-701-1028" "01-701-1033" ...
##   ..- attr(*, "label")= chr "Unique Subject Identifier"
##  $ SUBJID  : chr [1:254] "1015" "1023" "1028" "1033" ...
##   ..- attr(*, "label")= chr "Subject Identifier for the Study"
##  $ AGE     : num [1:254] 63 64 71 74 77 85 68 81 84 52 ...
##   ..- attr(*, "label")= chr "Age"
##  $ TRT01P  : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
##   ..- attr(*, "label")= chr "Planned Treatment for Period 01"
##  $ TRTSDT  : Date[1:254], format: "2014-01-02" "2012-08-05" ...
##  $ TRTEDT  : Date[1:254], format: "2014-07-02" "2012-09-01" ...
##  $ RACE    : chr [1:254] "WHITE" "WHITE" "WHITE" "WHITE" ...
##   ..- attr(*, "label")= chr "Race"
##  $ SEX     : chr [1:254] "F" "M" "M" "M" ...
##   ..- attr(*, "label")= chr "Sex"
##  $ DISCONFL: chr [1:254] "" "Y" "" "Y" ...
##   ..- attr(*, "label")= chr "Did the Subject Discontinue the Study?"

summary(adsl)

##    STUDYID            USUBJID             SUBJID               AGE       
##  Length:254         Length:254         Length:254         Min.   :51.00  
##  Class :character   Class :character   Class :character   1st Qu.:70.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :77.00  
##                                                           Mean   :75.09  
##                                                           3rd Qu.:81.00  
##                                                           Max.   :89.00  
##     TRT01P              TRTSDT               TRTEDT          
##  Length:254         Min.   :2012-07-09   Min.   :2012-08-28  
##  Class :character   1st Qu.:2013-01-26   1st Qu.:2013-05-12  
##  Mode  :character   Median :2013-06-13   Median :2013-09-27  
##                     Mean   :2013-06-17   Mean   :2013-10-10  
##                     3rd Qu.:2013-11-07   3rd Qu.:2014-03-15  
##                     Max.   :2014-09-02   Max.   :2015-03-05  
##      RACE               SEX              DISCONFL        
##  Length:254         Length:254         Length:254        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##

3.1.3 PROC FREQ \(\rightarrow\) count()

In order to get the frequencies for one variables you use PROC FREQ in SAS.

PROC FREQ data = adsl;
  TABLES SEX;
RUN;

And for cross tables:

PROC FREQ data = adsl;
  TABLES SEX * TRT01P * RACE;
RUN;

In R we use table() or count() from the tidyverse package:

For One Variable

table(adsl$SEX)

adsl %>%
  count(SEX)

## 
##   F   M 
## 143 111

SEX	n
F	143
M	111

For Multiple Variables

#for multiple variables
table(adsl$SEX, adsl$TRT01P, adsl$RACE) 

adsl %>%
  count(SEX, TRT01P, RACE)

## , ,  = AMERICAN INDIAN OR ALASKA NATIVE
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F       0                    0                   0
##   M       0                    1                   0
## 
## , ,  = BLACK OR AFRICAN AMERICAN
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F       5                    6                   6
##   M       3                    3                   0
## 
## , ,  = WHITE
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F      48                   34                  44
##   M      30                   40                  34

SEX	TRT01P	RACE	n
F	Placebo	BLACK OR AFRICAN AMERICAN	5
F	Placebo	WHITE	48
F	Xanomeline High Dose	BLACK OR AFRICAN AMERICAN	6
F	Xanomeline High Dose	WHITE	34
F	Xanomeline Low Dose	BLACK OR AFRICAN AMERICAN	6
F	Xanomeline Low Dose	WHITE	44
M	Placebo	BLACK OR AFRICAN AMERICAN	3
M	Placebo	WHITE	30
M	Xanomeline High Dose	AMERICAN INDIAN OR ALASKA NATIVE	1
M	Xanomeline High Dose	BLACK OR AFRICAN AMERICAN	3
M	Xanomeline High Dose	WHITE	40
M	Xanomeline Low Dose	WHITE	34

3.1.4 KEEP/DROP \(\rightarrow\) select()

DATA adsl2;
  SET adsl;
  KEEP subjidn; *or drop subjidn;
RUN;

In R we use the select() function from the tidyverse package.

# to keep (only) SUBJID
adsl %>%
  select(SUBJID) %>%
  head() # keep only first 6 rows

# to keep all variables except SUBJID
adsl %>%
  select(-SUBJID) %>%
  head()

SUBJID
1015
1023
1028
1033
1034
1047

STUDYID	USUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL
CDISCPILOT01	01-701-1015	63	Placebo	2014-01-02	2014-07-02	WHITE	F
CDISCPILOT01	01-701-1023	64	Placebo	2012-08-05	2012-09-01	WHITE	M	Y
CDISCPILOT01	01-701-1028	71	Xanomeline High Dose	2013-07-19	2014-01-14	WHITE	M
CDISCPILOT01	01-701-1033	74	Xanomeline Low Dose	2014-03-18	2014-03-31	WHITE	M	Y
CDISCPILOT01	01-701-1034	77	Xanomeline High Dose	2014-07-01	2014-12-30	WHITE	F
CDISCPILOT01	01-701-1047	85	Placebo	2013-02-12	2013-03-09	WHITE	F	Y

3.1.5 Subsetting data (WHERE/IF) \(\rightarrow\) filter()

DATA female;
  SET adsl;
  WHERE SEX = 'F';
RUN;

adsl %>%
  filter(SEX == "F") %>%
  head() 

# multiple conditions:
adsl %>%
  filter(SEX == "F" & AGE > 70) %>%
  head()

STUDYID	USUBJID	SUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL
CDISCPILOT01	01-701-1015	1015	63	Placebo	2014-01-02	2014-07-02	WHITE	F
CDISCPILOT01	01-701-1034	1034	77	Xanomeline High Dose	2014-07-01	2014-12-30	WHITE	F
CDISCPILOT01	01-701-1047	1047	85	Placebo	2013-02-12	2013-03-09	WHITE	F	Y
CDISCPILOT01	01-701-1111	1111	81	Xanomeline Low Dose	2012-09-07	2012-09-16	WHITE	F	Y
CDISCPILOT01	01-701-1133	1133	81	Xanomeline High Dose	2012-10-28	2013-04-28	WHITE	F
CDISCPILOT01	01-701-1146	1146	75	Xanomeline High Dose	2013-05-20	2013-06-26	WHITE	F	Y

STUDYID	USUBJID	SUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL
CDISCPILOT01	01-701-1034	1034	77	Xanomeline High Dose	2014-07-01	2014-12-30	WHITE	F
CDISCPILOT01	01-701-1047	1047	85	Placebo	2013-02-12	2013-03-09	WHITE	F	Y
CDISCPILOT01	01-701-1111	1111	81	Xanomeline Low Dose	2012-09-07	2012-09-16	WHITE	F	Y
CDISCPILOT01	01-701-1133	1133	81	Xanomeline High Dose	2012-10-28	2013-04-28	WHITE	F
CDISCPILOT01	01-701-1146	1146	75	Xanomeline High Dose	2013-05-20	2013-06-26	WHITE	F	Y
CDISCPILOT01	01-701-1153	1153	79	Placebo	2013-09-23	2014-03-16	WHITE	F

3.1.6 Sorting data

PROC SORT data=adsl out=adsl_sort_age;
  BY AGE; 
RUN;

In R we use the arrange() function from tidyverse:

# ascending
adsl %>%
  arrange(AGE)%>%
  head() 

# descending
adsl %>%
  arrange(-AGE)%>%
  head()

STUDYID	USUBJID	SUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL
CDISCPILOT01	01-701-1341	1341	51	Xanomeline Low Dose	2013-01-05	2013-01-26	WHITE	M	Y
CDISCPILOT01	01-701-1118	1118	52	Placebo	2014-03-12	2014-09-09	WHITE	M
CDISCPILOT01	01-709-1007	1007	54	Xanomeline Low Dose	2012-07-31	2012-08-28	WHITE	F	Y
CDISCPILOT01	01-701-1180	1180	56	Xanomeline High Dose	2013-02-12	2013-03-18	WHITE	M	Y
CDISCPILOT01	01-701-1239	1239	56	Xanomeline High Dose	2014-01-11	2014-07-10	WHITE	M
CDISCPILOT01	01-701-1287	1287	56	Xanomeline High Dose	2014-01-25	2014-07-26	WHITE	F

STUDYID	USUBJID	SUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL
CDISCPILOT01	01-710-1083	1083	89	Placebo	2013-07-22	2013-08-01	WHITE	F	Y
CDISCPILOT01	01-703-1295	1295	88	Xanomeline High Dose	2013-11-21	2014-04-19	WHITE	F	Y
CDISCPILOT01	01-710-1002	1002	88	Xanomeline Low Dose	2014-01-14	2014-01-18	WHITE	M	Y
CDISCPILOT01	01-710-1368	1368	88	Placebo	2013-10-23	2014-04-24	WHITE	F
CDISCPILOT01	01-714-1035	1035	88	Placebo	2014-04-17	2014-10-16	WHITE	F
CDISCPILOT01	01-701-1387	1387	87	Placebo	2014-03-12	2014-03-25	WHITE	F	Y

3.1.7 Creating new variables

DATA adsl;
  SET adsl;
  length AGEGR1 $20.; * length function in R means something different
  IF age > 50 then AGEGR1 = ‘> 50 years old’;
  ELSE if age <= 50 then AGEGR1 = ‘<= 50 years old’;
run;

In R we use the mutate() function:

adsl %>%
  mutate(AGEGR1 = case_when(
    AGE > 50 ~"> 50 years old",
    AGE <= 50 ~"<= 50 years old",
  )) %>%
  head()

STUDYID	USUBJID	SUBJID	AGE	TRT01P	TRTSDT	TRTEDT	RACE	SEX	DISCONFL	AGEGR1
CDISCPILOT01	01-701-1015	1015	63	Placebo	2014-01-02	2014-07-02	WHITE	F		> 50 years old
CDISCPILOT01	01-701-1023	1023	64	Placebo	2012-08-05	2012-09-01	WHITE	M	Y	> 50 years old
CDISCPILOT01	01-701-1028	1028	71	Xanomeline High Dose	2013-07-19	2014-01-14	WHITE	M		> 50 years old
CDISCPILOT01	01-701-1033	1033	74	Xanomeline Low Dose	2014-03-18	2014-03-31	WHITE	M	Y	> 50 years old
CDISCPILOT01	01-701-1034	1034	77	Xanomeline High Dose	2014-07-01	2014-12-30	WHITE	F		> 50 years old
CDISCPILOT01	01-701-1047	1047	85	Placebo	2013-02-12	2013-03-09	WHITE	F	Y	> 50 years old

3.1.8 Handling of missing values

Missing values in SAS and R:

SAS
- Missing value is a blank/a single decimal point for character/numeric variables, see details in Missing Values in SAS;
R
- Missing data in R appears as NA. NA is not a string nor a numeric value, but an indicator of missingness, see detailed examples in Advanced R #3.2.3 Missing values;
- NA and "" are different in R: "" is a blank string, while NA is missing;
- “Missing values (NA) and NaN values are regarded as non-comparable even to themselves, so comparisons involving them will always result in NA”
- admiral::convert_blanks_to_na() can turn SAS blank strings into proper R NA (from admiral documentation).

# Unlike SAS, space(s) != blank string in R, 
" " == ""

## [1] FALSE

# is.na to check if it's missing
c(" ", "Y", NA_character_) %>% is.na()

## [1] FALSE FALSE  TRUE

# NA is non-comparable, result in NA
c(" ", "Y", NA_character_) != "Y"

## [1]  TRUE FALSE    NA

Unexpected results might occur with “SAS habits”, below are three examples to illustrate the different handling of missing values in R.

3.1.8.1 Subsetting data

!= is not the same in R when it involves NA

# add one more row to the data with missing AGE and DISCONFL, select the first 3 rows
adsl_na_example <- adsl %>% 
  add_row(SUBJID = "1", DISCONFL = NA_character_, .before = T) %>% 
  slice(1:3) %>% 
  select(SUBJID, DISCONFL)
# demo data
adsl_na_example

SUBJID	DISCONFL
1	NA
1015
1023	Y

Subset to DISCONFL != "Y", only DISCONFL = " " is selected, not NA

adsl_na_example %>% filter(DISCONFL != "Y")

SUBJID	DISCONFL
1015

To display NA, add is.na(DISCONFL) to the filter

# include is.na(DISCONFL) in the filter
adsl_na_example %>% filter(DISCONFL != "Y" | is.na(DISCONFL))

SUBJID	DISCONFL
1	NA
1015

3.1.8.2 Sorting data

In SAS, missing is treated as the smallest value in proc sort; while in R, NA is always at the bottom after ascending or descending sorting.

Create demo data

# add one more row to the data with missing AGE and DISCONFL 
# select the first 3 rows
adsl_na_sort <- adsl %>% 
  add_row(SUBJID = "1", AGE = NA_integer_, .before = T) %>% 
  slice(1:5) %>% 
  select(SUBJID, AGE)

SUBJID	AGE
1	NA
1015	63
1023	64
1028	71
1033	74

Sort by ascending order of AGE

# ascending
adsl_na_sort %>%
  arrange(AGE)

SUBJID	AGE
1015	63
1023	64
1028	71
1033	74
1	NA

Sort by descending order of AGE

# descending
adsl_na_sort %>%
  arrange(-AGE)

SUBJID	AGE
1033	74
1028	71
1023	64
1015	63
1	NA

NA can also be placed at the top with below workaround

# to be consistent with SAS - NA at the top when ascending
adsl_na_sort %>%
  arrange(!is.na(AGE), AGE)

SUBJID	AGE
1	NA
1015	63
1023	64
1028	71
1033	74

3.1.8.3 Creating new variables

When AGE is missing, AGEGR1 is set to ‘<= 50 years old’ in SAS; while in R it is set to NA_character_.

adsl_na_sort %>%
  mutate(AGEGR1 = case_when(
    AGE > 50 ~"> 50 years old",
    AGE <= 50 ~"<= 50 years old",
  ))

SUBJID	AGE	AGEGR1
1	NA	NA
1015	63	> 50 years old
1023	64	> 50 years old
1028	71	> 50 years old
1033	74	> 50 years old

3.1.9 Merging Data

We create another data set with weight information for some of the subjects:

SUBJID <- sample(adsl$SUBJID, 100, replace = FALSE)
WEIGHT <- sample(50:100, 100, replace = TRUE)

adsl_weight <- data.frame(SUBJID, WEIGHT)

Now, we would like to join the weight column to our adsl data set.

In SAS, we use the merge function:

* inner join;
data adsl_merge;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl and inweight; * inner join in SAS
run;

* outer join;
data adsl_merge_all;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl or inweight; * default in SAS
run;

* left join;
data adsl_merge_left;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl;
run;

In R, we use again the tidyverse package:

# inner join
adsl%>%
  inner_join(adsl_weight, by = "SUBJID") 

# outer join
adsl %>%
  full_join(adsl_weight, by = "SUBJID")

# left join
adsl %>%
  left_join(adsl_weight, by = "SUBJID")

3.1.10 Concatenating Data

To demonstrate stacking or concatenating data sets, we can first split adsl into two parts, each having 5 records with the same columns in common.

# first 5 records of adsl
adsl_1 <- adsl[c(1:5),]

# the next 5 records of adsl
adsl_2 <- adsl[c(6:10),]

In SAS, you could use the set command:

DATA adsl_stacked;
 SET adsl_1 adsl_2;
RUN;

In R, you may accomplish this like so:

# tidyverse method - bind_rows()
adsl_stacked <- adsl_1 %>%
  bind_rows(adsl_2)