/examples/world_development_indicators/src/main/java/com/mongodb/hadoop/examples/world_development/WorldDevIndicatorDataLoader.java

http://github.com/mongodb/mongo-hadoop · Java · 139 lines · 75 code · 37 blank · 27 comment · 21 complexity · d48844593e6427ea8c4a3b8ebca8173c MD5 · raw file

  1. /*
  2. * Copyright 2011 10gen Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package com.mongodb.hadoop.examples.world_development;
  17. // Mongo
  18. import com.mongodb.Mongo;
  19. import com.mongodb.DBAddress;
  20. import com.mongodb.BasicDBObject;
  21. import org.bson.types.ObjectId;
  22. // Java
  23. import java.io.FileInputStream;
  24. import java.io.DataInputStream;
  25. import java.io.BufferedReader;
  26. import java.io.InputStreamReader;
  27. import java.util.HashMap;
  28. import java.util.List;
  29. import java.util.LinkedList;
  30. import java.util.regex.Matcher;
  31. import java.util.regex.Pattern;
  32. /**
  33. * The world development indicator data loader. Loads the CSV data into Mongo.
  34. */
  35. public class WorldDevIndicatorDataLoader {
  36. public static void main( final String[] pArgs ) throws Exception{
  37. final Mongo mongo = new Mongo( new DBAddress( "127.0.0.1:27017", "test" ) );
  38. mongo.getDB( "test" ).getCollection( "worldDevelopmentIndicators.in" ).remove( new BasicDBObject() );
  39. final DataInputStream in = new DataInputStream( new FileInputStream( DATA_FILE ) );
  40. final BufferedReader br = new BufferedReader( new InputStreamReader( in ) );
  41. final HashMap<Integer, String> fieldPositions = new HashMap<Integer, String>();
  42. _csvPattern = Pattern.compile( CSV_REGEXP );
  43. try {
  44. final LinkedList<String> vals = new LinkedList<String>();
  45. String line;
  46. int count = 0;
  47. while ( ( line = br.readLine() ) != null ){
  48. int position = 0;
  49. // If this is the first line, read the field positions.
  50. if ( count == 0 ){
  51. for ( final String field : line.split( "," ) )
  52. fieldPositions.put( position++, field );
  53. count++;
  54. continue;
  55. }
  56. final BasicDBObject doc = new BasicDBObject();
  57. doc.put( "_id", ObjectId.get().toString() );
  58. // Loop through the data and insert.
  59. parseCsvLine( line, vals );
  60. if ( vals.isEmpty() ) continue;
  61. for ( final String data : vals ){
  62. final String field = fieldPositions.get( position++ );
  63. if ( field == null ) continue;
  64. if ( data == null || data.equals( "" ) ) continue;
  65. // Check to see if this is a number.
  66. try {
  67. doc.put( field, Double.parseDouble( data ) );
  68. }
  69. catch ( final NumberFormatException nfe ) {
  70. // This is a string.
  71. doc.put( field, data );
  72. }
  73. }
  74. mongo.getDB( "test" ).getCollection( "worldDevelopmentIndicators.in" ).insert( doc );
  75. }
  76. }
  77. finally {
  78. if ( in != null ) in.close();
  79. }
  80. }
  81. /**
  82. * Parse the CSV line.
  83. */
  84. private static void parseCsvLine( final String pLine, final LinkedList<String> pVals ){
  85. pVals.clear();
  86. final Matcher matcher = _csvPattern.matcher( pLine );
  87. while ( matcher.find() ){
  88. String match = matcher.group();
  89. if ( match == null ) break;
  90. if ( match.endsWith( "," ) )
  91. match = match.substring( 0, match.length() - 1 );
  92. if ( match.startsWith( "\"" ) )
  93. match = match.substring( 1, match.length() - 1 );
  94. if ( match.length() == 0 ) match = null;
  95. pVals.addLast( match );
  96. }
  97. }
  98. private static Pattern _csvPattern;
  99. private static final String CSV_REGEXP = "\"([^\"]+?)\",?|([^,]+),?|,";
  100. private static final String DATA_FILE
  101. = "examples/world_development_indicators/resources/WDI_GDF_Data.csv";
  102. }