3.1 R and SAS Syntax

Below are some examples that use adsl to show how common operations are done in SAS and R.

3.1.1 Packages and Sample Data

# Data
adam_path <- "https://github.com/phuse-org/TestDataFactory/raw/main/Updated/TDF_ADaM/"
adsl <- haven::read_xpt(paste0(adam_path, "adsl.xpt"))

# Select a few variables
adsl <- adsl %>%
  dplyr::select(STUDYID, USUBJID, SUBJID, AGE, TRT01P, TRTSDT, TRTEDT, RACE, SEX, DISCONFL)

3.1.2 PROC CONTENTS \(\rightarrow\) summary()

To explore the variables of your dataset you can use PROC contents in SAS:

proc contents data = adsl;
run;

In R, you can use str() and summary() instead:

str(adsl)
## tibble [254 × 10] (S3: tbl_df/tbl/data.frame)
##  $ STUDYID : chr [1:254] "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" ...
##   ..- attr(*, "label")= chr "Study Identifier"
##  $ USUBJID : chr [1:254] "01-701-1015" "01-701-1023" "01-701-1028" "01-701-1033" ...
##   ..- attr(*, "label")= chr "Unique Subject Identifier"
##  $ SUBJID  : chr [1:254] "1015" "1023" "1028" "1033" ...
##   ..- attr(*, "label")= chr "Subject Identifier for the Study"
##  $ AGE     : num [1:254] 63 64 71 74 77 85 68 81 84 52 ...
##   ..- attr(*, "label")= chr "Age"
##  $ TRT01P  : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
##   ..- attr(*, "label")= chr "Planned Treatment for Period 01"
##  $ TRTSDT  : Date[1:254], format: "2014-01-02" "2012-08-05" ...
##  $ TRTEDT  : Date[1:254], format: "2014-07-02" "2012-09-01" ...
##  $ RACE    : chr [1:254] "WHITE" "WHITE" "WHITE" "WHITE" ...
##   ..- attr(*, "label")= chr "Race"
##  $ SEX     : chr [1:254] "F" "M" "M" "M" ...
##   ..- attr(*, "label")= chr "Sex"
##  $ DISCONFL: chr [1:254] "" "Y" "" "Y" ...
##   ..- attr(*, "label")= chr "Did the Subject Discontinue the Study?"
summary(adsl) 
##    STUDYID            USUBJID             SUBJID               AGE       
##  Length:254         Length:254         Length:254         Min.   :51.00  
##  Class :character   Class :character   Class :character   1st Qu.:70.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :77.00  
##                                                           Mean   :75.09  
##                                                           3rd Qu.:81.00  
##                                                           Max.   :89.00  
##     TRT01P              TRTSDT               TRTEDT          
##  Length:254         Min.   :2012-07-09   Min.   :2012-08-28  
##  Class :character   1st Qu.:2013-01-26   1st Qu.:2013-05-12  
##  Mode  :character   Median :2013-06-13   Median :2013-09-27  
##                     Mean   :2013-06-17   Mean   :2013-10-10  
##                     3rd Qu.:2013-11-07   3rd Qu.:2014-03-15  
##                     Max.   :2014-09-02   Max.   :2015-03-05  
##      RACE               SEX              DISCONFL        
##  Length:254         Length:254         Length:254        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

3.1.3 PROC FREQ \(\rightarrow\) count()

In order to get the frequencies for one variables you use PROC FREQ in SAS.

PROC FREQ data = adsl;
  TABLES SEX;
RUN;

And for cross tables:

PROC FREQ data = adsl;
  TABLES SEX * TRT01P * RACE;
RUN;

In R we use table() or count() from the tidyverse package:

For One Variable

table(adsl$SEX)

adsl %>%
  count(SEX)
## 
##   F   M 
## 143 111
SEX n
F 143
M 111

For Multiple Variables

#for multiple variables
table(adsl$SEX, adsl$TRT01P, adsl$RACE) 

adsl %>%
  count(SEX, TRT01P, RACE)
## , ,  = AMERICAN INDIAN OR ALASKA NATIVE
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F       0                    0                   0
##   M       0                    1                   0
## 
## , ,  = BLACK OR AFRICAN AMERICAN
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F       5                    6                   6
##   M       3                    3                   0
## 
## , ,  = WHITE
## 
##    
##     Placebo Xanomeline High Dose Xanomeline Low Dose
##   F      48                   34                  44
##   M      30                   40                  34
SEX TRT01P RACE n
F Placebo BLACK OR AFRICAN AMERICAN 5
F Placebo WHITE 48
F Xanomeline High Dose BLACK OR AFRICAN AMERICAN 6
F Xanomeline High Dose WHITE 34
F Xanomeline Low Dose BLACK OR AFRICAN AMERICAN 6
F Xanomeline Low Dose WHITE 44
M Placebo BLACK OR AFRICAN AMERICAN 3
M Placebo WHITE 30
M Xanomeline High Dose AMERICAN INDIAN OR ALASKA NATIVE 1
M Xanomeline High Dose BLACK OR AFRICAN AMERICAN 3
M Xanomeline High Dose WHITE 40
M Xanomeline Low Dose WHITE 34

3.1.4 KEEP/DROP \(\rightarrow\) select()

DATA adsl2;
  SET adsl;
  KEEP subjidn; *or drop subjidn;
RUN;

In R we use the select() function from the tidyverse package.

# to keep (only) SUBJID
adsl %>%
  select(SUBJID) %>%
  head() # keep only first 6 rows

# to keep all variables except SUBJID
adsl %>%
  select(-SUBJID) %>%
  head()
SUBJID
1015
1023
1028
1033
1034
1047
STUDYID USUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL
CDISCPILOT01 01-701-1015 63 Placebo 2014-01-02 2014-07-02 WHITE F
CDISCPILOT01 01-701-1023 64 Placebo 2012-08-05 2012-09-01 WHITE M Y
CDISCPILOT01 01-701-1028 71 Xanomeline High Dose 2013-07-19 2014-01-14 WHITE M
CDISCPILOT01 01-701-1033 74 Xanomeline Low Dose 2014-03-18 2014-03-31 WHITE M Y
CDISCPILOT01 01-701-1034 77 Xanomeline High Dose 2014-07-01 2014-12-30 WHITE F
CDISCPILOT01 01-701-1047 85 Placebo 2013-02-12 2013-03-09 WHITE F Y

3.1.5 Subsetting data (WHERE/IF) \(\rightarrow\) filter()

DATA female;
  SET adsl;
  WHERE SEX = 'F';
RUN;
adsl %>%
  filter(SEX == "F") %>%
  head() 

# multiple conditions:
adsl %>%
  filter(SEX == "F" & AGE > 70) %>%
  head() 
STUDYID USUBJID SUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL
CDISCPILOT01 01-701-1015 1015 63 Placebo 2014-01-02 2014-07-02 WHITE F
CDISCPILOT01 01-701-1034 1034 77 Xanomeline High Dose 2014-07-01 2014-12-30 WHITE F
CDISCPILOT01 01-701-1047 1047 85 Placebo 2013-02-12 2013-03-09 WHITE F Y
CDISCPILOT01 01-701-1111 1111 81 Xanomeline Low Dose 2012-09-07 2012-09-16 WHITE F Y
CDISCPILOT01 01-701-1133 1133 81 Xanomeline High Dose 2012-10-28 2013-04-28 WHITE F
CDISCPILOT01 01-701-1146 1146 75 Xanomeline High Dose 2013-05-20 2013-06-26 WHITE F Y
STUDYID USUBJID SUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL
CDISCPILOT01 01-701-1034 1034 77 Xanomeline High Dose 2014-07-01 2014-12-30 WHITE F
CDISCPILOT01 01-701-1047 1047 85 Placebo 2013-02-12 2013-03-09 WHITE F Y
CDISCPILOT01 01-701-1111 1111 81 Xanomeline Low Dose 2012-09-07 2012-09-16 WHITE F Y
CDISCPILOT01 01-701-1133 1133 81 Xanomeline High Dose 2012-10-28 2013-04-28 WHITE F
CDISCPILOT01 01-701-1146 1146 75 Xanomeline High Dose 2013-05-20 2013-06-26 WHITE F Y
CDISCPILOT01 01-701-1153 1153 79 Placebo 2013-09-23 2014-03-16 WHITE F

3.1.6 Sorting data

PROC SORT data=adsl out=adsl_sort_age;
  BY AGE; 
RUN;

In R we use the arrange() function from tidyverse:

# ascending
adsl %>%
  arrange(AGE)%>%
  head() 

# descending
adsl %>%
  arrange(-AGE)%>%
  head() 
STUDYID USUBJID SUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL
CDISCPILOT01 01-701-1341 1341 51 Xanomeline Low Dose 2013-01-05 2013-01-26 WHITE M Y
CDISCPILOT01 01-701-1118 1118 52 Placebo 2014-03-12 2014-09-09 WHITE M
CDISCPILOT01 01-709-1007 1007 54 Xanomeline Low Dose 2012-07-31 2012-08-28 WHITE F Y
CDISCPILOT01 01-701-1180 1180 56 Xanomeline High Dose 2013-02-12 2013-03-18 WHITE M Y
CDISCPILOT01 01-701-1239 1239 56 Xanomeline High Dose 2014-01-11 2014-07-10 WHITE M
CDISCPILOT01 01-701-1287 1287 56 Xanomeline High Dose 2014-01-25 2014-07-26 WHITE F
STUDYID USUBJID SUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL
CDISCPILOT01 01-710-1083 1083 89 Placebo 2013-07-22 2013-08-01 WHITE F Y
CDISCPILOT01 01-703-1295 1295 88 Xanomeline High Dose 2013-11-21 2014-04-19 WHITE F Y
CDISCPILOT01 01-710-1002 1002 88 Xanomeline Low Dose 2014-01-14 2014-01-18 WHITE M Y
CDISCPILOT01 01-710-1368 1368 88 Placebo 2013-10-23 2014-04-24 WHITE F
CDISCPILOT01 01-714-1035 1035 88 Placebo 2014-04-17 2014-10-16 WHITE F
CDISCPILOT01 01-701-1387 1387 87 Placebo 2014-03-12 2014-03-25 WHITE F Y

3.1.7 Creating new variables

DATA adsl;
  SET adsl;
  length AGEGR1 $20.; * length function in R means something different
  IF age > 50 then AGEGR1 = ‘> 50 years old’;
  ELSE if age <= 50 then AGEGR1 = ‘<= 50 years old’;
run;

In R we use the mutate() function:

adsl %>%
  mutate(AGEGR1 = case_when(
    AGE > 50 ~"> 50 years old",
    AGE <= 50 ~"<= 50 years old",
  )) %>%
  head()
STUDYID USUBJID SUBJID AGE TRT01P TRTSDT TRTEDT RACE SEX DISCONFL AGEGR1
CDISCPILOT01 01-701-1015 1015 63 Placebo 2014-01-02 2014-07-02 WHITE F > 50 years old
CDISCPILOT01 01-701-1023 1023 64 Placebo 2012-08-05 2012-09-01 WHITE M Y > 50 years old
CDISCPILOT01 01-701-1028 1028 71 Xanomeline High Dose 2013-07-19 2014-01-14 WHITE M > 50 years old
CDISCPILOT01 01-701-1033 1033 74 Xanomeline Low Dose 2014-03-18 2014-03-31 WHITE M Y > 50 years old
CDISCPILOT01 01-701-1034 1034 77 Xanomeline High Dose 2014-07-01 2014-12-30 WHITE F > 50 years old
CDISCPILOT01 01-701-1047 1047 85 Placebo 2013-02-12 2013-03-09 WHITE F Y > 50 years old

3.1.8 Handling of missing values

Missing values in SAS and R:

  • SAS
    • Missing value is a blank/a single decimal point for character/numeric variables, see details in Missing Values in SAS;
  • R
    • Missing data in R appears as NA. NA is not a string nor a numeric value, but an indicator of missingness, see detailed examples in Advanced R #3.2.3 Missing values;
    • NA and "" are different in R: "" is a blank string, while NA is missing;
    • “Missing values (NA) and NaN values are regarded as non-comparable even to themselves, so comparisons involving them will always result in NA
    • admiral::convert_blanks_to_na() can turn SAS blank strings into proper R NA (from admiral documentation).
# Unlike SAS, space(s) != blank string in R, 
" " == ""
## [1] FALSE
# is.na to check if it's missing
c(" ", "Y", NA_character_) %>% is.na()
## [1] FALSE FALSE  TRUE
# NA is non-comparable, result in NA
c(" ", "Y", NA_character_) != "Y"
## [1]  TRUE FALSE    NA

Unexpected results might occur with “SAS habits”, below are three examples to illustrate the different handling of missing values in R.

3.1.8.1 Subsetting data

!= is not the same in R when it involves NA

# add one more row to the data with missing AGE and DISCONFL, select the first 3 rows
adsl_na_example <- adsl %>% 
  add_row(SUBJID = "1", DISCONFL = NA_character_, .before = T) %>% 
  slice(1:3) %>% 
  select(SUBJID, DISCONFL)
# demo data
adsl_na_example
SUBJID DISCONFL
1 NA
1015
1023 Y

Subset to DISCONFL != "Y", only DISCONFL = " " is selected, not NA

adsl_na_example %>% filter(DISCONFL != "Y")
SUBJID DISCONFL
1015

To display NA, add is.na(DISCONFL) to the filter

# include is.na(DISCONFL) in the filter
adsl_na_example %>% filter(DISCONFL != "Y" | is.na(DISCONFL))
SUBJID DISCONFL
1 NA
1015

3.1.8.2 Sorting data

In SAS, missing is treated as the smallest value in proc sort; while in R, NA is always at the bottom after ascending or descending sorting.

Create demo data

# add one more row to the data with missing AGE and DISCONFL 
# select the first 3 rows
adsl_na_sort <- adsl %>% 
  add_row(SUBJID = "1", AGE = NA_integer_, .before = T) %>% 
  slice(1:5) %>% 
  select(SUBJID, AGE)
SUBJID AGE
1 NA
1015 63
1023 64
1028 71
1033 74

Sort by ascending order of AGE

# ascending
adsl_na_sort %>%
  arrange(AGE)
SUBJID AGE
1015 63
1023 64
1028 71
1033 74
1 NA

Sort by descending order of AGE

# descending
adsl_na_sort %>%
  arrange(-AGE)
SUBJID AGE
1033 74
1028 71
1023 64
1015 63
1 NA

NA can also be placed at the top with below workaround

# to be consistent with SAS - NA at the top when ascending
adsl_na_sort %>%
  arrange(!is.na(AGE), AGE)
SUBJID AGE
1 NA
1015 63
1023 64
1028 71
1033 74

3.1.8.3 Creating new variables

When AGE is missing, AGEGR1 is set to ‘<= 50 years old’ in SAS; while in R it is set to NA_character_.

adsl_na_sort %>%
  mutate(AGEGR1 = case_when(
    AGE > 50 ~"> 50 years old",
    AGE <= 50 ~"<= 50 years old",
  )) 
SUBJID AGE AGEGR1
1 NA NA
1015 63 > 50 years old
1023 64 > 50 years old
1028 71 > 50 years old
1033 74 > 50 years old

3.1.9 Merging Data

We create another data set with weight information for some of the subjects:

SUBJID <- sample(adsl$SUBJID, 100, replace = FALSE)
WEIGHT <- sample(50:100, 100, replace = TRUE)

adsl_weight <- data.frame(SUBJID, WEIGHT)

Now, we would like to join the weight column to our adsl data set.

In SAS, we use the merge function:

* inner join;
data adsl_merge;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl and inweight; * inner join in SAS
run;

* outer join;
data adsl_merge_all;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl or inweight; * default in SAS
run;

* left join;
data adsl_merge_left;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl;
run;

In R, we use again the tidyverse package:

# inner join
adsl%>%
  inner_join(adsl_weight, by = "SUBJID") 

# outer join
adsl %>%
  full_join(adsl_weight, by = "SUBJID")

# left join
adsl %>%
  left_join(adsl_weight, by = "SUBJID")

3.1.10 Concatenating Data

To demonstrate stacking or concatenating data sets, we can first split adsl into two parts, each having 5 records with the same columns in common.

# first 5 records of adsl
adsl_1 <- adsl[c(1:5),]

# the next 5 records of adsl
adsl_2 <- adsl[c(6:10),]

In SAS, you could use the set command:

DATA adsl_stacked;
 SET adsl_1 adsl_2;
RUN;

In R, you may accomplish this like so:

# tidyverse method - bind_rows()
adsl_stacked <- adsl_1 %>%
  bind_rows(adsl_2)