/* Reading TAB delimited files provided by Malcolm Baker, June 15, 2004 */
/* He provided 2 lists of IPOs for 2002 paper with Jeffrey Wurgler:     */
/* - Jay Ritter's list of IPOs                                          */
/* - A list they generated based on SDC                                 */
/* July 1, 2004                                                        */

options ls=120 nocenter;
libname corpfin '/projects/nwu/pledesma/corpfin';

/* Ritter file has 8 digit CUSIP */
/* SDC file has 6 digit CUSIP    */
filename ritter '/projects/nwu/pledesma/corpfin/raw/Baker_and_Wurgler_RitterIPOlist.txt';
filename sdc '/projects/nwu/pledesma/corpfin/raw/Baker_and_Wurgler_SDC_IPOlist.txt';

data ritterlist;
 infile ritter;
 input cusip $ tdate $;
 date=input(tdate,yymmdd.);
 format date yymmddn8.;
 drop tdate;

data sdclist;
 infile sdc;
 input cusip $ tdate $;
 date=input(tdate,yymmdd.);
 format date yymmddn8.;
 drop tdate;

/* Verify formats look OK */
/*
proc contents data=ritterlist;
proc contents data=sdclist;

proc print data=ritterlist (obs=10);
proc print data=sdclist (obs=10);
*/

/* Check for duplicates in data */
/* No duplicates in Ritter list */
/*
proc sort data=ritterlist;
 by cusip;

proc freq data=ritterlist noprint;
 tables cusip / out=test;

proc print data=test;
 where count > 1;
*/

/* Checking duplicate records in SDC list */
/* there are 31 duplicate CUSIP numbers   */

proc sort data=sdclist nodupkey;
 by cusip date;

/*
proc freq data=sdclist noprint;
 tables cusip / out=test;

proc print data=test;
 where count > 1;

data test; set test;
 where count > 1;

data test;
 merge sdclist test (in=b);
 by cusip;
 if b;
proc print data=test;
*/

/* Match Ritter's list to historical CUSIP in DFSNAMES */
/* Create variable numday as difference between        */
/* start of data in CRSP and IPO date                  */

proc sql;
 create table rittermatch as select
   ritterlist.*, dsfnames.permno, dsfnames.st_date, dsfnames.end_date, dsfnames.comnam,
   (dsfnames.st_date-ritterlist.date) as numday
 from ritterlist, crsp.dsfnames
 where ritterlist.cusip=dsfnames.ncusip;
quit;

proc sort data=rittermatch nodupkey;
 by cusip permno;

*proc freq data=rittermatch;
*  tables numday; 

proc sort data=rittermatch;
 by descending numday;

/* Note: Investigated some of the companies listed as  */
/* having the longest delays - Compustat started the   */
/* series earlier than CRSP; checked a couple of firms */
/* against Historical WSJ archive: permno 15465,       */
/* Astrosystems Inc and permno 30876, Dunkin Donuts.   */
/* Both firms traded OTC in 1968.                      */
/*
proc print data=rittermatch;
 where numday > 30;
*/
/* Match the SDC list to historical CUSIP in DFSNAMES  */
/* Create variable numday as difference between        */
/* start of data in CRSP and IPO date                  */
/* Match based on 6-digit CUSIP                        */
proc sql;
 create table sdcmatch as select
   sdclist.*, dsfnames.permno, dsfnames.st_date, dsfnames.end_date, dsfnames.comnam,
   (dsfnames.st_date-sdclist.date) as numday
 from sdclist, crsp.dsfnames
 where sdclist.cusip=substr(dsfnames.ncusip,1,6);
quit;

/* Leaving IPOs with two different dates in the matched list */

proc sort data=sdcmatch nodupkey;
 by cusip permno date;

/* Retrieve Compustat GVKEY from CSTLINK */

proc sql;
 create table rittermatch as select
  rittermatch.*, cstlink.gvkey
  from rittermatch, crsp.cstlink
 where rittermatch.permno=cstlink.npermno;
quit;

proc sql;
 create table sdcmatch as select
 sdcmatch.*, cstlink.gvkey
 from sdcmatch, crsp.cstlink
 where sdcmatch.permno=cstlink.npermno;
quit;

/* Baker and Wurgler use SDC when the IPO date was not */
/* in the Ritter list -> append firms that did not     */
/* appear in Ritter.                                   */

proc sort data=rittermatch; by permno date;
proc sort data=sdcmatch; by permno date;

/* List of permno to be added to the Ritter list       */

data sdcadds2ritter;
 merge rittermatch (in=a) sdcmatch (in=b);
 by permno;
 if b and not a;

proc append base=rittermatch data=sdcadds2ritter;

proc sort data=rittermatch nodupkey;
 by permno gvkey date;

/* Checking duplicate GVKEYs and PERMNOs */

proc freq data=rittermatch noprint; tables gvkey / out=gvkeycounts;

proc freq data=rittermatch noprint; tables permno / out=permcounts;

data repeatedgvkeys (rename=(count=gvkeycount));
 set gvkeycounts;
 where count>1;

data repeatedperms (rename=(count=permcount));
 set permcounts;
 where count>1;

proc sort data=rittermatch; by gvkey;
data t1;
 merge rittermatch repeatedgvkeys (in=a);
 by gvkey;
 if not a then delete;

proc sort data=rittermatch; by permno;
data t2;
 merge rittermatch repeatedperms (in=a);
 by permno;
 if not a then delete;

*proc print data=t1;
*proc print data=t2;

/* Baker and Wurgler use IPOs between 1968 and 1998 */

data rittermatch; set rittermatch; where year(date) between 1968 and 1998;

/* Eliminate duplicate GVKEY-DATE combinations          */
/* (ignore different PERMNOs for firms with same GVKEY) */

proc sort data=rittermatch nodupkey; by gvkey date;

/* Match with Compustat to get first year of data available */

proc sql;
 create table t3 as select
 rittermatch.*, compann.data6, compann.yeara
 from rittermatch, comp.compann
 where rittermatch.gvkey=compann.gvkey and compann.data6 ne .;
quit;

proc sort data=t3; by gvkey yeara;

data st_cstat (keep=gvkey yeara) end_cstat (keep=gvkey yeara);
 set t3;
 by gvkey yeara;
 if first.gvkey then output st_cstat;
 if last.gvkey then output end_cstat;

data corpfin.bakerwurgler;
 merge rittermatch st_cstat (rename=(yeara=st_cstat)) end_cstat (rename=(yeara=end_cstat));
 by gvkey;
 numyrs=st_cstat-year(date);
 if st_cstat=. then delete;

/* Create a dataset with the list of macthed IPOs and the years in which data for assets, */
/* market capitizalization (stock price * shares outstanding) start and end in Compustat. */

proc sql;
 create table temp as select
 rittermatch.*, compann.data6, compann.data25, compann.data199, compann.yeara,
  (compann.data25*compann.data199) as mkval
 from rittermatch, comp.compann
 where rittermatch.gvkey=compann.gvkey and compann.data25 ne . and compann.data199 ne . and compann.dnum not between 6000 and 6999;
quit;

proc sort data=temp; by gvkey yeara;

data assets;
 set temp;
where data6 ne .;

data st_assets (keep=gvkey yeara) end_assets (keep=gvkey yeara);
 set assets;
 by gvkey yeara;
 if first.gvkey then output st_assets;
 if last.gvkey then output end_assets;

data mkval;
 set temp;
 where mkval ne .;

data st_mkval (keep=gvkey yeara) end_mkval (keep=gvkey yeara);
 set mkval;
 by gvkey yeara;
 if first.gvkey then output st_mkval;
 if last.gvkey then output end_mkval;

data corpfin.bakerwurgler2;
 merge rittermatch (in=a) st_assets (in=b rename=(yeara=st_assets)) end_assets (in=c rename=(yeara=end_assets)) 
      st_mkval (in=d rename=(yeara=st_mkval)) end_mkval (in=e rename=(yeara=end_mkval));
 by gvkey;
 if a and b and c and d and e;
 yrsmkval=end_mkval-st_mkval+1;
 yrsassets=end_assets-st_assets+1;
 numyrs=st_mkval-year(date);
 label 
 st_assets="Year Total Assets series starts"
 end_assets="Year Total Assets series ends"
 st_mkval="Year Market Value series starts"
 end_mkval="Year Market Value series ends"
 yrsmkval="Number of years Market Value should be available"
 yrsassets="Number of years Total Assets should be available"
 cusip="CUSIP number"
 date="IPO date"
 numday="Number of days after IPO date to appear in CRSP"
 numyrs="Number of years after IPO year to appear in Compustat";

proc contents data=corpfin.bakerwurgler2;
*proc print data=corpfin.bakerwurgler (obs=50);

data corpfin.ipolist;
 set corpfin.bakerwurgler (keep=gvkey date);
run;

proc export data=corpfin.bakerwurgler
     outfile="/projects/nwu/pledesma/corpfin/raw/bakerwurgler.csv"
     dbms=csv replace;

proc export data=corpfin.bakerwurgler2
     outfile="/projects/nwu/pledesma/corpfin/raw/bakerwurgler2.csv"
     dbms=csv replace;

proc freq data=corpfin.bakerwurgler2 noprint;
 tables gvkey / out=gvkeycounts;

data gvkeycounts; set gvkeycounts; where count > 1;

data gvkeycounts;
 merge corpfin.bakerwurgler gvkeycounts (in=a);
 by gvkey;
 if not a then delete;

proc print data=gvkeycounts noobs;
 var gvkey permno cusip comnam date st_date st_cstat;

proc freq data=corpfin.bakerwurgler2;
tables numyrs;

endsas;


/* tidbits */

/* SQL to retrieve only those PERMNO closest to offer date */
proc sql;
 create table rittermatch as select
   ritterlist.*, msfnames.permno, msfnames.st_date, msfnames.end_date,
   (msfnames.st_date-ritterlist.date) as numday
 from ritterlist left join crsp.msfnames
 on abs(msfnames.st_date-ritterlist.date)<=60
 where ritterlist.cusip=msfnames.ncusip;
quit;