The Art of Streetplay

Sunday, September 25, 2005

Yahoo Stock R Mining Functions

Here are some functions which may be of use to those of you who use R.  Gotta do my part for the open source movement!  Pretty tame. "yimp" gathers price data for an arbitrary number of stocks over an arbitrary time period.  "ksImport" gathers a handful of key statistics for whatever stocks you want and throws them into a list.  Check it out.  If anyone has any follow-ups, corrections or comments please feel free to email me.
-Danny

#Yahoo Price History Gatherer -- for example, yimp(c("IBM","GE"),20050101,20050901)

# Yahoo price-history gatherer.
#
# Downloads the daily price table of every symbol in `ticker.list` for the
# period `start.date`..`end.date` and collects the adjusted closing prices
# into one data frame.
#
# Arguments:
#   ticker.list  character vector of ticker symbols, e.g. c("IBM", "GE").
#   start.date   first day requested, written YYYYMMDD (number or string).
#   end.date     last day requested, written YYYYMMDD (number or string).
#   data         if TRUE (default) the combined data frame is returned.
#   plot         if TRUE, every series is drawn on its own graphics device
#                together with its mean (thick red line) and mean +/- one
#                standard deviation (thin red lines).
#
# Value:
#   When `data` is TRUE, a data frame whose first column is "Date" and which
#   has one further column per ticker (named after the ticker, in the order
#   given).  Shorter series are padded with NA at the bottom.  When `data` is
#   FALSE the function returns NULL invisibly.
#
# NOTE(review): the ichart.finance.yahoo.com endpoint and its CSV layout
# (dates such as "1-Sep-05", a column whose name starts with "Adj") are taken
# as given here -- confirm that the service is still available.
yimp <- function(ticker.list, start.date, end.date, data = TRUE, plot = FALSE) {
  if (length(ticker.list) == 0) {
    stop("ticker.list must contain at least one symbol", call. = FALSE)
  }
  Source <- "http://ichart.finance.yahoo.com/table.csv?"

  # Work on plain digit strings so that numeric input such as 20050101 can
  # never be rendered in scientific notation.
  as.ymd <- function(d) {
    if (is.numeric(d)) sprintf("%.0f", d) else as.character(d)
  }
  piece <- function(d, from, to) as.numeric(substring(d, from, to))
  start.date <- as.ymd(start.date)
  end.date <- as.ymd(end.date)

  # The query is the same for every ticker, so it is built once.  The month
  # fields are sent zero-based (hence the -1) and zero-padded to two digits;
  # sprintf() pads exactly once no matter how many tickers are requested.
  Query <- paste0(
    "&a=", sprintf("%02d", piece(start.date, 5, 6) - 1),
    "&b=", piece(start.date, 7, 8),
    "&c=", piece(start.date, 1, 4),
    "&d=", sprintf("%02d", piece(end.date, 5, 6) - 1),
    "&e=", piece(end.date, 7, 8),
    "&f=", piece(end.date, 1, 4),
    "&g=d&ignore=.csv"
  )

  # Download into a session temp file and remove it again on exit.
  dest <- tempfile()
  on.exit(unlink(dest), add = TRUE)

  series <- vector("list", length(ticker.list))
  for (i in seq_along(ticker.list)) {
    ticker <- ticker.list[i]
    download.file(url = paste0(Source, "&s=", ticker, Query),
                  destfile = dest, quiet = TRUE)
    raw <- read.delim(dest, sep = ",", as.is = TRUE, fill = TRUE)

    # read.delim() turns the header "Adj. Close*" into "Adj..Close."; match on
    # the prefix so that small variations of that header are accepted too.
    adj.col <- grep("^Adj", colnames(raw))[1]
    if (is.na(adj.col)) {
      stop("no adjusted-close column in the data returned for ", ticker,
           call. = FALSE)
    }

    # Dates arrive as e.g. "1-Sep-05"; drop the dashes and parse.
    stamp <- gsub("-", "", raw[, "Date"], fixed = TRUE)
    when <- as.POSIXct(strptime(stamp, "%d%b%y"))
    prices <- as.numeric(raw[, adj.col])
    series[[i]] <- data.frame(Date = when, Price = prices)

    if (plot) {
      # dev.new() opens a fresh device on every platform.
      dev.new()
      span <- c(when[1], when[length(when)])
      plot(x = when, y = prices, type = "l", col = "blue", lwd = 1,
           main = paste0("Prices for ", ticker, " from ", span[1], " to ",
                         span[2]),
           xlab = "Date", ylab = "Price")
      mid <- mean(prices)
      spread <- sd(prices)
      lines(span, c(mid, mid), col = "red", lwd = 2)
      lines(span, c(mid + spread, mid + spread), col = "red", lwd = 1)
      lines(span, c(mid - spread, mid - spread), col = "red", lwd = 1)
    }
  }

  # Combine the series.  The dates of the longest series (the first one, if
  # several share the maximum length) label the rows, and every shorter series
  # is padded with NA at the bottom.  Each series is added exactly once.
  #
  # Note: this makes the assumption that up until we have no price data for a
  # particular stock, all stocks in the set trade on the same days.  This will
  # be true almost all the time, except for instances in which a particular
  # stock is forced to cease trading (for example, for regulatory reasons).
  lens <- vapply(series, nrow, integer(1))
  target <- max(lens)
  result <- data.frame(Date = series[[which.max(lens)]]$Date)
  for (i in seq_along(series)) {
    result[[i + 1]] <- c(series[[i]]$Price, rep(NA_real_, target - lens[i]))
  }
  colnames(result) <- c("Date", ticker.list)

  if (data) result else invisible(NULL)
}




#Key Statistics Importer -- Grab a handful of Key Statistics (e.g. ksImport(query="IBM"))



# Key statistics importer.
#
# For every symbol in `query` the page `source1` + symbol is downloaded, cut
# into pieces at every ">" and searched for a fixed set of labels ("Market
# Cap", "Trailing P/E", ...).  The text found `offset` pieces after the piece
# carrying the ":</td" marker that closes a label is taken as that label's
# value.  The industry is read the same way from `source2` + symbol.
#
# Arguments:
#   file     path of the scratch file each page is downloaded to.
#   source1  URL prefix of the key-statistics page.
#   source2  URL prefix of the industry page.
#   query    character vector of ticker symbols (required).
#
# Value:
#   One symbol: a named character vector with the elements "Date" (today),
#   the labels listed in `keynames` (without their trailing character) and
#   "Industry".  Several symbols: a data frame with one row per symbol and the
#   columns ticker, date, mktcap, EV, PEttm, PEfwd, PtoB, EVtoEBITDA, DivYld,
#   EBITDA, NetIncome, Revenue, TotCash, TotDebt, AvgVol, TotShort, TotShares,
#   Industry.  Every value is text exactly as scraped; anything that could not
#   be obtained is the string "NA".
#
# NOTE(review): the page layout this parser expects is only known from the
# patterns searched for below -- confirm against a live page before use.
ksImport <- function(file = "tempfile",
                     source1 = "http://finance.yahoo.com/q/ks?s=",
                     source2 = "http://finance.yahoo.com/q/in?s=",
                     query) {
  if (length(query) == 0) {
    stop("query must contain at least one symbol", call. = FALSE)
  }
  pointer <- ":</td"
  offset <- 2
  keynames <- c(
    "Market Cap ", "Enterprise Value ", "Trailing P/E ", "Forward P/E ",
    "Price/Book ", "Enterprise Value/EBITDA ", "Trailing Annual Dividend ",
    "EBITDA ", "Net Income Avl to Common ", "Revenue ", "Total Cash ",
    "Total Debt ", "Average Volume ", "Shares Short ", "Shares Outstanding:"
  )
  # Labels for which, when the text occurs more than once on the page, the
  # second occurrence is the wanted one; all other labels use the first.
  use.second <- c("EBITDA ", "Revenue ", "Trailing Annual Dividend ")

  # Download `url` into `file` and return its content split at ">".
  fetch <- function(url) {
    download.file(url, file, quiet = TRUE)
    scan(file, what = "", sep = ">", quiet = TRUE)
  }

  # TRUE when any piece contains the literal `phrase`.
  mentions <- function(tokens, phrase) {
    length(grep(phrase, tokens, fixed = TRUE)) != 0
  }

  # Value belonging to label `key`, or "NA" when the label, its closing
  # marker or the value piece is not present (so the search always ends).
  lookup <- function(key, tokens) {
    loc <- grep(key, tokens, fixed = TRUE)
    if (length(loc) == 0) {
      return("NA")
    }
    loc <- if (length(loc) > 1 && key %in% use.second) loc[2] else loc[1]
    # First piece at or after the label that carries the closing marker.
    hit <- grep(pointer, tokens[loc:length(tokens)], fixed = TRUE)
    if (length(hit) == 0) {
      return("NA")
    }
    value.at <- loc + hit[1] - 1 + offset
    if (value.at > length(tokens)) {
      return("NA")
    }
    paste(sub("</td", "", tokens[value.at], fixed = TRUE))
  }

  # One column per symbol: date, the key statistics, industry.  Columns of
  # symbols that cannot be processed keep the "NA" they start with.
  stats <- matrix("NA", length(keynames) + 2, length(query))

  for (j in seq_along(query)) {
    x <- fetch(paste0(source1, query[j]))

    # A "no longer valid" page names the replacement symbol after "?s=";
    # adopt it and fetch again.
    moved <- grep("no longer valid", x, fixed = TRUE)
    if (length(moved) != 0) {
      query[j] <- strsplit(x[moved], split = "?s=", fixed = TRUE)[[1]][2]
      x <- fetch(paste0(source1, query[j]))
    }
    if (is.na(query[j]) || query[j] == "") {
      next
    }

    # An over-long piece is split at ">" once more and replaces the pieces.
    long <- which(nchar(x) > 15000)
    if (length(long) != 0) {
      x <- strsplit(x[long], split = ">")[[1]]
    }
    if (mentions(x, "There is no  data available") ||
        mentions(x, "Invalid Ticker Symbol")) {
      next
    }

    values <- vapply(keynames, lookup, character(1), tokens = x,
                     USE.NAMES = FALSE)

    x <- fetch(paste0(source2, query[j]))
    # The second piece mentioning "Industry:" holds the name from its 11th
    # character on.
    industry <- paste(substring(
      sub("</b", "", x[grep("Industry:", x)][2]), 11
    ))

    stats[, j] <- c(as.character(Sys.Date()), values, industry)
  }

  # Tidy up: one row per symbol, columns named after the labels with their
  # trailing space / colon removed.
  labels <- c("Date", substr(keynames, 1, nchar(keynames) - 1), "Industry")
  output <- t(stats)
  dimnames(output) <- list(query, labels)

  # Decode the HTML entity for "&" in industry names.
  output[, "Industry"] <- gsub("&amp;", "&", output[, "Industry"],
                               fixed = TRUE)
  # An empty dividend field becomes "0"; a percent sign is removed.
  dividend <- output[, "Trailing Annual Dividend"]
  dividend[dividend %in% ""] <- "0"
  output[, "Trailing Annual Dividend"] <- sub("%", "", dividend, fixed = TRUE)

  if (length(query) == 1) {
    # A single symbol is returned as a named character vector.
    output <- output[1, ]
  } else {
    output <- data.frame(rownames(output), output)
    colnames(output) <- c(
      "ticker", "date", "mktcap", "EV", "PEttm", "PEfwd", "PtoB",
      "EVtoEBITDA", "DivYld", "EBITDA", "NetIncome", "Revenue", "TotCash",
      "TotDebt", "AvgVol", "TotShort", "TotShares", "Industry"
    )
  }
  output
}


0 Comments:

Post a Comment

<< Home