PageRenderTime 44ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/rmr2/pkg/tests/logistic-regression.R

http://github.com/RevolutionAnalytics/RHadoop
R | 78 lines | 46 code | 6 blank | 26 comment | 2 complexity | 289afd649a0f4118d840c252ef693194 MD5 | raw file
  1. # Copyright 2011 Revolution Analytics
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. ## see spark implementation http://www.spark-project.org/examples.html
  15. ## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf
  16. library(rmr2)
  ## Logistic regression by batch gradient ascent, one map-reduce job per step.
  ##
  ## input       data set handed to mapreduce(); every chunk reaching the mapper
  ##             is indexed as a matrix whose first column is the label and
  ##             whose remaining columns are the features (labels are
  ##             presumably coded -1/+1 -- confirm against the data producer)
  ## iterations  number of gradient steps; 0 returns the all-zero start plane
  ## dims        number of feature columns, i.e. length of the weight vector
  ## alpha       step size applied to the summed gradient at each iteration
  ##
  ## Returns the weights as a 1 x dims matrix.
## @knitr logistic.regression-signature
  logistic.regression =
    function(input, iterations, dims, alpha) {
## @knitr logistic.regression-map
      lr.map =
        function(., M) {
          Y = M[, 1]
          # drop = FALSE keeps X a matrix for one-row chunks and when dims == 1
          X = M[, -1, drop = FALSE]
          keyval(
            1,
            Y * X *
              g(-Y * as.numeric(X %*% t(plane))))}
## @knitr logistic.regression-reduce
      lr.reduce =
        function(k, Z)
          keyval(k, t(colSums(Z)))
## @knitr logistic.regression-main
      plane = matrix(0, nrow = 1, ncol = dims)
      g = function(z) 1/(1 + exp(-z))
      # seq_len() so iterations = 0 skips the loop instead of walking 1:0
      for (i in seq_len(iterations)) {
        gradient =
          values(
            from.dfs(
              mapreduce(
                input,
                map = lr.map,
                reduce = lr.reduce,
                combine = TRUE)))
        plane = plane + alpha * gradient }
      plane }
## @knitr end
  ## Fit the same model on each rmr2 backend and require matching results.
  out = list()
  test.size = 1e5
  for (backend in c("local", "hadoop")) {
    rmr.options(backend = backend)
    ## reseed on every pass so both backends see the same test set
    set.seed(0)
## @knitr logistic.regression-data
    eps = rnorm(test.size)
    testdata =
      to.dfs(
        as.matrix(
          data.frame(
            y = ifelse(eps > 0, 1, -1),
            x1 = seq_len(test.size),
            x2 = seq_len(test.size) + eps)))
## @knitr end
    out[[backend]] =
## @knitr logistic.regression-run
      logistic.regression(
        testdata, 3, 2, 0.05)
## @knitr end
    ## the data above are separable, so the max likelihood solution
    ## diverges to (-inf, inf); only a few fixed steps are compared
  }
  stopifnot(
    isTRUE(all.equal(out[["local"]], out[["hadoop"]])))