/2010_12/questionare1.r
R | 311 lines | 229 code | 37 blank | 45 comment | 12 complexity | 8008f51cef83815cc83e17a512d905f3 MD5 | raw file
- # R script for cleaning up the dou questionnaire data and producing some statistics.
- # (see the report at http://www.developers.org.ua/archives/rssh/2010/12/14/programming-languages-rating-2010/ )
- # The statistics are incomplete: some steps (such as calculating the right bounds
- # for statistically corrected data) are not included.
- #
- # Use this at your own risk.
- #
- # You can use and redistribute these scripts under the terms of the
- # GNU General Public License version 2 or later.
- #
- #
- # (C) Ruslan Shevchenko <ruslan@shevchenko.kiev.ua> 2010
- #
- # Note that this file contains Cyrillic comments in UTF-8 encoding.
# Configuration ----
# When TRUE, the charts in this script are rendered to PNG files in the
# working directory.
drawNow <- TRUE

# Load the raw survey answers.
# `header` is spelled out in full (the original `head=` only worked through
# partial argument matching). `stringsAsFactors = TRUE` is explicit because
# code further down calls summary() on a text column and expects per-level
# counts; since R 4.0 read.csv() no longer converts strings to factors.
qs <- read.csv(file = "questionare1.csv", header = TRUE, sep = ",",
               stringsAsFactors = TRUE)
# Normalize free-text language names ----
# Maps the many spellings typed for a language onto one canonical label so
# that mentions can be counted.
#
# x: character vector (or anything gsub() coerces to character) of raw names.
# Returns a character vector of the same length. A few tokens (MXML, Qt,
# English, a bare XML) are deleted rather than renamed.
#
# The rules run top to bottom and each rule sees the output of the previous
# ones, so their order is significant.
normalizeLanguage <- function(x) {
  rules <- matrix(c(
    # Strip all surrounding whitespace, not just a single space per side.
    "^\\s+|\\s+$", "",
    # ActionScript / Flash family is counted as JavaScript. The short aliases
    # are matched as whole words only, so "ASM", "BASIC", "ASP" stay intact.
    "(ActionScript(.+)$)|(Action *Script.*$)|\\bAS3\\b|\\bAS\\b|\\bas3\\b",
    "JavaScript",
    "actionscript|Actionscript|ActionScript|actionScript|(Adobe Fl.*)",
    "JavaScript",
    "JavaScript(.*)", "JavaScript",
    "groovy", "Groovy",
    "Groovy / Grails at back-end", "Groovy",
    "(Groovy.*)|& Groovy", "Groovy",
    "Android Java|Android", "Java",
    "xslt", "XSLT",
    "(built-in.*)", "Other",
    "Delphi.*", "Delphi",
    "(erlang)|(Erlang \\(server side\\))", "Erlang",
    "Flash|Flex", "JavaScript",
    "Google Go", "Go",
    "Shell.*|bash|UNIX shell|unix shell|Shell.*Bash|shell|Bash|ksh", "Shell",
    "(Shell.scripts)|(Shell-scripting)", "Shell",
    "FoxPro", "DBase-????????",
    "Matlab", "MatLab",
    "Ocaml|ocaml", "OCaml",
    "(VBScript)|(VB.Net)|(VB.NET)|VBS|PureBasic", "Basic",
    # Second pass: the Flash/Flex rule above may have produced
    # "JavaScript" followed by trailing text.
    "JavaScript(.*)$", "JavaScript",
    "MXML", "",
    "(^XML$)|(^xml$)", "",
    "^XSL$", "XSLT",
    "Qt", "",
    "English", "",
    # Whole word only: a bare "sh" is a shell, "Csharp" is not.
    "\\bsh\\b", "Shell",
    "php", "PHP",
    # Whole word only, so "Django", "Algol", "Logo" are left alone.
    "\\bgo!|\\bgo\\b", "Go",
    "TCL|tcl", "Tcl",
    "TurboProlog", "Prolog",
    "CPL(.*)", "CPL",
    "D \\(client side\\)", "D",
    "jruby", "Ruby",
    "clojure", "Clojure",
    "f#", "F#",
    "^c$", "C/C++",
    # NOTE(review): the non-ASCII phrase of this rule survives only as "?" in
    # this copy of the file. A pattern starting with a bare "?" is not a valid
    # regular expression, so the "?" are matched literally here; restore the
    # original phrase when it is available.
    "[?]{3} [?]{5}(.*)", "",
    "PL-SQL|pl/sql|pl/pgsql", "PL/SQL",
    "^sql$", "SQL"
  ), ncol = 2, byrow = TRUE)

  for (i in seq_len(nrow(rules))) {
    x <- gsub(rules[i, 1], rules[i, 2], x)
  }
  x
}
# Collect every language mentioned in one free-text survey column.
#
# cname: name of a column whose cells hold comma-separated language names.
# data:  data frame to read the column from; defaults to the global survey
#        table `qs`, which is what the original implementation always used.
#
# Returns a factor with one element per mention, after normalizeLanguage()
# has been applied, so summary() on it yields mention counts.
languagesColumn <- function(cname, data = qs) {
  mentions <- strsplit(as.character(data[, cname]), ",")
  # normalizeLanguage() is built from gsub() calls and therefore works
  # element-wise, so the answers are flattened once instead of growing a
  # vector with c() on every loop iteration.
  factor(normalizeLanguage(unlist(mentions)))
}
# Per-mention factors for the two comma-separated, free-text questions.
al <- languagesColumn("AdditionalLanguages")
pl <- languagesColumn("PetProjectsLanguages")
# Clean an answer to one of the single-choice language questions.
#
# x: character vector (or anything gsub() coerces to character).
# Returns a character vector of the same length with surrounding whitespace
# removed and the parenthesised placeholder answer replaced by "??????".
#
# NOTE(review): the non-ASCII text of both strings survives only as "?" in
# this copy of the file. The placeholder is matched as a fixed string because,
# read as a regular expression, the "?" characters act as quantifiers and the
# pattern no longer matches its own literal text. Restore the original strings
# when they are available.
normalizeFixLanguage <- function(x) {
  x <- gsub("^\\s+|\\s+$", "", x)
  gsub("(???????? ?? ??????)", "??????", x, fixed = TRUE)
}
# Clean the three single-choice language columns and store them as factors.
qs[c("NowLanguage", "NextLanguage", "FirstLanguage")] <- lapply(
  qs[c("NowLanguage", "NextLanguage", "FirstLanguage")],
  function(answers) factor(normalizeFixLanguage(answers))
)

# Overwrite the two columns that are not analysed with a constant.
qs[["Comment"]] <- 0
qs[["Timestamp"]] <- 0
# Rankings for the three single-choice language questions ----
# "??????" is the level normalizeFixLanguage() substitutes for the
# parenthesised placeholder answer; it is left out of every ranking.
# The level is removed by name rather than with `x[-match(...)]`: when the
# level is absent match() returns NA, and indexing with -NA replaces the whole
# vector by NA values.

# FirstLanguage: counts in ascending order; only counts above 10 are charted.
sfl <- summary(qs[["FirstLanguage"]])
sfl <- sfl[order(sfl)]
sfl <- sfl[names(sfl) != "??????"]
if (drawNow) {
  png(file = "sfl.png")
  dotchart(sfl[sfl > 10])
  title("?? ????? ????? ?? ???????? ????????.")
  dev.off()
}

# NowLanguage: counts in ascending order. `snl` is reused by later sections.
snl <- summary(qs[["NowLanguage"]])
snl <- snl[order(snl)]
snl <- snl[names(snl) != "??????"]
if (drawNow) {
  png(file = "snl.png")
  dotchart(snl)
  title("?? ????? ????? ?? ?????? ??? ?????? ?????? ")
  dev.off()
}

# NextLanguage: counts in descending order.
sxl <- summary(qs[["NextLanguage"]])
sxl <- sxl[rev(order(sxl))]
sxl <- sxl[names(sxl) != "??????"]
if (drawNow) {
  png(file = "sxl.png")
  dotchart(sxl)
  title("???? ?? ?? ???????? ?????? ???????????? ?????? \n ? ? ??? ????-?? ??????? ?????? ...")
  dev.off()
}
# Per current language: share of "???" answers among the "???" and "??"
# answers in the Change column.
lct <- table(qs$NowLanguage, qs$Change)
lci <- lct[, "???"] / (lct[, "???"] + lct[, "??"])

# Keep only languages with more than 15 respondents, smallest share first.
lci <- lci[names(snl)[snl > 15]]
lci <- lci[order(lci)]

if (drawNow) {
  png(file = "lci.png")
  barplot(lci, las = 2)
  title("?????? ?????????????? ? ?????? ...")
  dev.off()
}
# Cross-tabulation: current language (rows) by wanted next language (columns).
lct <- table(qs$NowLanguage, qs$NextLanguage)

# languageAfter(name)
#   name: a row label of `lct` (a current language).
#   Divides that row of `lct` by snl[name], keeps the entries above 0.01 and
#   orders them from largest to smallest. The result is returned invisibly.
#   Reads the globals `lct` and `snl` at call time, so it must be called
#   before `lct` is reassigned further down.
languageAfter <- function(name) {
  shares <- lct[name, ] / snl[name]
  shares <- shares[shares > 0.01]
  invisible(shares[rev(order(shares))])
}

# Print the breakdown for four current languages.
print("mirgation from PHP:")
print(languageAfter("PHP"))

print("mirgation from Delphi:")
print(languageAfter("Delphi"))

print("mirgation from Java:")
print(languageAfter("Java"))

print("mirgation from C#")
print(languageAfter("C#"))
# Experience distribution per current language ----
lct <- table(qs$NowLanguage, qs$Experience)

# langPercentage(x): rows `x` of the global `lct` divided by the matching
# entries of the global `snl`.
langPercentage <- function(x) {
  lct[x, ] / snl[x]
}

# Keep languages with more than 50 respondents, put the experience columns in
# numeric order and shorten the label of the last one.
lct <- lct[names(snl)[snl > 50], ]
lct <- lct[, c(as.character(0:9), "10 ? ??????")]
colnames(lct) <- c(as.character(0:9), ">10")

# Only referenced by plotting code that is currently disabled.
plct <- langPercentage(rownames(lct))

if (drawNow) {
  # Chart 1: C#, Java, C/C++.
  png(file = "experience1.png")
  names <- c("C#", "Java", "C/C++")
  barplot(lct[names, ], beside = TRUE, col = rainbow(3))
  legend("topleft", names, fill = rainbow(3))
  title("???? ??????: C#, Java, C/C++")
  dev.off()

  # Chart 2: PHP, Python, Ruby.
  png(file = "experience2.png")
  names <- c("PHP", "Python", "Ruby")
  barplot(lct[names, ], beside = TRUE, col = rainbow(3))
  title("???? ??????: ??????-?????")
  legend("topleft", names, fill = rainbow(3))
  dev.off()
}
# Mention counts for the two free-text questions ----
# summary() on a factor reports at most 100 levels by default and folds the
# remainder into a single "(Other)" entry. These columns are free text, so
# `maxsum = Inf` is passed to keep a separate count for every language.

# AdditionalLanguages: languages with more than 10 mentions, ascending.
sal <- summary(al, maxsum = Inf)
sal <- sal[sal > 10]
sal <- sal[order(sal)]
if (drawNow) {
  png(file = "sal.png")
  dotchart(sal)
  title("????? ?????????????? ????? ?? ??????????? ??? ?????? ?")
  dev.off()
}

# PetProjectsLanguages: languages with more than 10 mentions, ascending.
spl <- summary(pl, maxsum = Inf)
spl <- spl[spl > 10]
spl <- spl[order(spl)]
if (drawNow) {
  png(file = "spl.png")
  dotchart(spl)
  title("???? ?? ? ??? ???? pet-projects ?\n???? ????, ?? ?? ????? ?????? ?")
  dev.off()
}
## Print how many entries of PetProjectsLanguages are not the empty string.
ppl <- qs[["PetProjectsLanguages"]]
cat("?-???? ?????????????, ? ???? ???? ???? ???????:", length(ppl[ppl != ""]), "\n")
## Answer counts for the `Choos` column, drawn as a bar chart.
# The column is wrapped in factor() so that summary() returns per-answer
# counts even when read.csv() delivered it as plain character (the default
# since R 4.0); for a column that is already a factor the counts are unchanged.
# NOTE(review): `qs$Choos` relies on `$` partial matching unless a column is
# literally named "Choos" — confirm the full column name.
scl <- summary(factor(qs$Choos))

# Display labels, assigned by position.
# NOTE(review): this assumes the column has exactly five distinct answers in
# this (level) order — verify against the data.
names(scl) <- c("??????? \n?? ???????","??????????","????????????","??? ???????????\n ???????????????? ?? ???? ?????","?????????")

if (drawNow) {
  png(file = "scl.png")
  barplot(scl)
  title("??? ??????????? ??????? ? ?????? ????? ?")
  # Two labels are drawn a second time as text inside the plot area.
  text(3, 1, labels = c("????????????"))
  text(5.5, 1, labels = c("?????????"))
  dev.off()
}
## Experience (columns) by the inUA answer (rows).
stu <- table(qs$inUA, qs$Experience)

# Put the experience columns in numeric order and shorten the last label.
stu <- stu[, c(as.character(0:9), "10 ? ??????")]
colnames(stu) <- c(as.character(0:9), ">10")

# Keep only the "??" and "???" rows, in that order.
stu <- stu[c("??", "???"), ]
# Convert a two-way table of counts into row proportions.
#
# tbl: a matrix or table.
# Returns an object of the same shape and dimnames in which every row has been
# divided by its own sum (rows summing to zero become NaN).
#
# Uses prop.table() instead of a loop over rownames(tbl): the loop silently
# left the input untouched when the table had no row names.
normalizePercents <- function(tbl) {
  # Nothing to normalize row-wise without at least two dimensions.
  if (length(dim(tbl)) < 2L) {
    return(tbl)
  }
  prop.table(tbl, margin = 1)
}
# Turn the counts into per-row proportions and draw them as grouped bars.
stu <- normalizePercents(stu)

if (drawNow) {
  png(file = "expUA.png")
  # `legend.text` is spelled out in full; the original `legend=` only reached
  # it through partial argument matching.
  barplot(stu, beside = TRUE, legend.text = TRUE)
  dev.off()
}
# Current-language shares, split by the inUA answer ----
stu <- table(qs$NowLanguage, qs$inUA)

# Column "??": share of each language, ascending, keeping shares above 0.03.
linUA <- stu[, "??"]
linUA <- linUA / sum(linUA)
linUA <- linUA[order(linUA)]
linUA <- linUA[linUA > 0.03]

# Column "???": same steps, but the cut-off here is 0.05.
linNotUA <- stu[, "???"]
linNotUA <- linNotUA / sum(linNotUA)
linNotUA <- linNotUA[order(linNotUA)]
linNotUA <- linNotUA[linNotUA > 0.05]

# Side-by-side comparison for six languages; a language removed by one of the
# cut-offs above shows up as NA in that column.
names <- c("C#", "Java", "PHP", "C/C++", "Python", "Ruby")
lnua <- cbind(linUA[names], linNotUA[names])
colnames(lnua) <- c("in UA", "not in UA")

if (drawNow) {
  png(file = "lnUA.png")
  barplot(lnua, beside = TRUE, col = rainbow(6))
  legend("topright", names, fill = rainbow(6))
  dev.off()
}
# Summary table across the language questions ----
# NOTE(review): this hand-picked list is overwritten by names(snl) a few lines
# below before it is ever read — confirm which of the two was intended.
names <- c("C#", "Java", "C/C++", "PHP", "Python", "Ruby", "Objective-C", "Scala",
           "Delphi", "JavaScript", "Perl", "Haskell", "Lisp", "1?", "Basic",
           "DBase-????????", "Asm", "Lua", "Fortran", "Cobol", "Groovy")

# Add two entries to the current-language counts: Groovy takes its value from
# `sal` (NA when `sal` has no "Groovy" entry), Cobol is set to 0.
snl["Groovy"] <- sal["Groovy"]
snl["Cobol"] <- 0
snl <- snl[rev(order(snl))]
names <- names(snl)
rxsnl <- snl[names]

# The same two entries for the first-language and next-language counts.
sfl["Cobol"] <- 0
sfl["Groovy"] <- NA
rxsfl <- sfl[names]
sxl["Cobol"] <- 0
sxl["Groovy"] <- NA
rxsxl <- sxl[names]

# Full mention counts for the free-text questions. `maxsum = Inf` stops
# summary() from folding everything beyond 100 levels into "(Other)", which
# would make the lookups by name below return NA for those languages.
splf <- summary(pl, maxsum = Inf)
rxspl <- splf[names]
salf <- summary(al, maxsum = Inf)
rxsal <- salf[names]

# Percentage of each language among the current-language counts. `na.rm`
# keeps a single NA entry (e.g. Groovy, see above) from turning every
# percentage into NA.
rxpsnl <- rxsnl / sum(rxsnl, na.rm = TRUE) * 100
res <- cbind(rxpsnl, rxsnl, rxsxl, rxsal, rxspl)