PageRenderTime 110ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/de/jetwick/util/Util.java

https://github.com/neville-agius/Jetwick
Java | 353 lines | 276 code | 43 blank | 34 comment | 26 complexity | df3055b4e92fd93cefd0bb4cb8b9c035 MD5 | raw file
  1. /**
  2. * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package de.jetwick.util;
  17. import com.google.inject.Guice;
  18. import com.google.inject.Inject;
  19. import com.google.inject.Injector;
  20. import com.google.inject.Module;
  21. import com.google.inject.Provider;
  22. import de.jetwick.config.Configuration;
  23. import de.jetwick.config.DefaultModule;
  24. import de.jetwick.es.ElasticTweetSearch;
  25. import de.jetwick.es.ElasticUserSearch;
  26. import de.jetwick.rmi.RMIClient;
  27. import de.jetwick.es.JetwickQuery;
  28. import de.jetwick.data.JUser;
  29. import de.jetwick.es.CreateObjectsInterface;
  30. import de.jetwick.es.TweetQuery;
  31. import de.jetwick.tw.Credits;
  32. import de.jetwick.tw.MyTweetGrabber;
  33. import de.jetwick.tw.TwitterSearch;
  34. import de.jetwick.tw.queue.QueueThread;
  35. import java.util.Arrays;
  36. import java.util.LinkedHashSet;
  37. import java.util.List;
  38. import java.util.Map;
  39. import java.util.Set;
  40. import java.util.TreeSet;
  41. import org.elasticsearch.action.search.SearchResponse;
  42. import org.elasticsearch.index.query.xcontent.ExistsFilterBuilder;
  43. import org.elasticsearch.index.query.xcontent.FilterBuilders;
  44. import org.elasticsearch.search.facet.terms.TermsFacet;
  45. import org.slf4j.Logger;
  46. import org.slf4j.LoggerFactory;
  47. /**
  48. *
  49. * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
  50. */
  51. public class Util {
  52. private static Logger logger = LoggerFactory.getLogger(Util.class);
  53. @Inject
  54. private ElasticUserSearch userSearch;
  55. @Inject
  56. private ElasticTweetSearch tweetSearch;
  57. private int userCounter;
  58. private Configuration config = new Configuration();
  59. public static void main(String[] args) {
  60. Map<String, String> map = Helper.parseArguments(args);
  61. Util util = new Util();
  62. String argStr = "";
  63. if (!Helper.isEmpty(map.get("deleteAll"))) {
  64. util.deleteAll();
  65. return;
  66. }
  67. argStr = map.get("fillFrom");
  68. if (!Helper.isEmpty(argStr)) {
  69. String fromUrl = argStr;
  70. util.fillFrom(fromUrl);
  71. return;
  72. }
  73. argStr = map.get("clearUserTokens");
  74. if (!Helper.isEmpty(argStr)) {
  75. String newUserIndexName = argStr;
  76. util.clearUserTokens(newUserIndexName);
  77. return;
  78. }
  79. if (!Helper.isEmpty(map.get("copyStaticTweets"))) {
  80. util.copyStaticTweets();
  81. return;
  82. }
  83. argStr = map.get("showFollowers");
  84. if (!Helper.isEmpty(argStr)) {
  85. String user = argStr;
  86. util.showFollowers(user);
  87. return;
  88. }
  89. if (!Helper.isEmpty(map.get("optimize"))) {
  90. util.optimize();
  91. return;
  92. }
  93. // copyUserIndex=newtwindex
  94. argStr = map.get("copyUserIndex");
  95. if (!Helper.isEmpty(argStr)) {
  96. String newIndex = argStr;
  97. util.copyUserIndex(newIndex);
  98. return;
  99. }
  100. // copyUserIndex=newtwindex
  101. argStr = map.get("copyTweetIndex");
  102. if (!Helper.isEmpty(argStr)) {
  103. String newIndex = argStr;
  104. util.copyTweetIndex(newIndex);
  105. return;
  106. }
  107. // call after copyIndex:
  108. // removeOldTweetIndexAndAlias=newtwindex
  109. argStr = map.get("removeIndexAndAddAlias");
  110. if (!Helper.isEmpty(argStr)) {
  111. String oldIndex = argStr.split(",")[0];
  112. String newIndex = argStr.split(",")[1];
  113. util.removeIndexAndAddAlias(oldIndex, newIndex);
  114. return;
  115. }
  116. }
  /**
   * Creates the utility and lets a freshly created Guice injector, configured
   * with a {@code DefaultModule}, fill the {@code @Inject} annotated fields of
   * this instance.
   */
  public Util() {
      Injector injector = Guice.createInjector(new DefaultModule());
      injector.injectMembers(this);
  }
  121. public void deleteAll() {
  122. // why don't we need to set? query.setQueryType("simple")
  123. userSearch.deleteAll();
  124. userSearch.refresh();
  125. logger.info("Successfully finished deleteAll");
  126. }
  127. private void copyStaticTweets() {
  128. Module module = new DefaultModule();
  129. Injector injector = Guice.createInjector(module);
  130. Provider<RMIClient> rmiProvider = injector.getProvider(RMIClient.class);
  131. Configuration cfg = injector.getInstance(Configuration.class);
  132. TwitterSearch twSearch = injector.getInstance(TwitterSearch.class);
  133. twSearch.initTwitter4JInstance(cfg.getTwitterSearchCredits().getToken(), cfg.getTwitterSearchCredits().getTokenSecret());
  134. ElasticTweetSearch fromUserSearch = new ElasticTweetSearch(injector.getInstance(Configuration.class));
  135. JetwickQuery query = new TweetQuery().addFilterQuery(ElasticTweetSearch.UPDATE_DT, "[* TO *]");
  136. // TODO set facetlimit to 2000
  137. query.addFacetField("user").setSize(0);
  138. SearchResponse rsp = fromUserSearch.search(query);
  139. TermsFacet tf = (TermsFacet) rsp.getFacets().facet("user");
  140. logger.info("found: " + tf.entries().size() + " users with the specified criteria");
  141. int SLEEP = 30;
  142. int counter = 0;
  143. for (TermsFacet.Entry tmpUser : tf.entries()) {
  144. if (tmpUser.getCount() < 20)
  145. break;
  146. while (twSearch.getRateLimit() <= 3) {
  147. try {
  148. logger.info("sleeping " + SLEEP + " seconds to avoid ratelimit violation");
  149. Thread.sleep(1000 * SLEEP);
  150. } catch (InterruptedException ex) {
  151. throw new IllegalStateException(ex);
  152. }
  153. }
  154. logger.info(counter++ + "> feed pipe from " + tmpUser.getTerm() + " with " + tmpUser.getCount() + " tweets");
  155. MaxBoundSet boundSet = new MaxBoundSet<String>(0, 0);
  156. // try updating can fail so try max 3 times
  157. for (int trial = 0; trial < 3; trial++) {
  158. MyTweetGrabber grabber = new MyTweetGrabber().setMyBoundSet(boundSet).
  159. init(null, null, tmpUser.getTerm()).setTweetsCount((int) tmpUser.getCount()).
  160. setRmiClient(rmiProvider).setTwitterSearch(twSearch);
  161. QueueThread pkg = grabber.queueTweetPackage();
  162. Thread t = new Thread(pkg);
  163. t.start();
  164. try {
  165. t.join();
  166. if (pkg.getException() == null)
  167. break;
  168. logger.warn(trial + "> Try again feeding of user " + tmpUser.getTerm() + " for tweet package " + pkg);
  169. } catch (InterruptedException ex) {
  170. logger.warn("interrupted", ex);
  171. break;
  172. }
  173. }
  174. }
  175. // TODO send via RMI
  176. }
  177. public void fillFrom(final String fromUrl) {
  178. ElasticTweetSearch fromTweetSearch = new ElasticTweetSearch(fromUrl, null, null);
  179. JetwickQuery query = new TweetQuery();
  180. long maxPage = 1;
  181. int hitsPerPage = 300;
  182. Set<JUser> users = new LinkedHashSet<JUser>();
  183. Runnable optimizeOnExit = new Runnable() {
  184. @Override
  185. public void run() {
  186. userSearch.refresh();
  187. logger.info(userCounter + " users pushed to default tweet search from " + fromUrl);
  188. }
  189. };
  190. Runtime.getRuntime().addShutdownHook(new Thread(optimizeOnExit));
  191. for (int page = 0; page < maxPage; page++) {
  192. query.attachPagability(page, hitsPerPage);
  193. users.clear();
  194. SearchResponse rsp;
  195. try {
  196. rsp = fromTweetSearch.search(users, query);
  197. } catch (Exception ex) {
  198. logger.warn("Error while searching!", ex);
  199. continue;
  200. }
  201. if (maxPage == 1) {
  202. maxPage = rsp.getHits().getTotalHits() / hitsPerPage + 1;
  203. logger.info("Paging though query:" + query.toString());
  204. logger.info("Set numFound to " + rsp.getHits().getTotalHits());
  205. }
  206. for (JUser user : users) {
  207. userSearch.save(user, false);
  208. }
  209. userCounter += users.size();
  210. logger.info("Page " + page + " out of " + maxPage + " hitsPerPage:" + hitsPerPage);
  211. if (page * hitsPerPage % 100000 == 0) {
  212. logger.info("Commit ...");
  213. userSearch.refresh();
  214. }
  215. }
  216. }
  217. public void showFollowers(String user) {
  218. // ElasticUserSearch uSearch = createUserSearch();
  219. // Set<SolrUser> jetwickUsers = new LinkedHashSet<SolrUser>();
  220. // uSearch.search(jetwickUsers, new SolrQuery().setRows(10000));
  221. final Set<String> set = new TreeSet<String>();
  222. // for (SolrUser u : jetwickUsers) {
  223. // set.add(u.getScreenName());
  224. // }
  225. Credits credits = config.getTwitterSearchCredits();
  226. TwitterSearch tw4j = new TwitterSearch().setConsumer(credits.getConsumerKey(), credits.getConsumerSecret());
  227. tw4j.initTwitter4JInstance(credits.getToken(), credits.getTokenSecret());
  228. tw4j.getFollowers(user, new AnyExecutor<JUser>() {
  229. @Override
  230. public JUser execute(JUser o) {
  231. // if (set.contains(o.getScreenName()))
  232. set.add(o.getScreenName());
  233. return null;
  234. }
  235. });
  236. for (String u : set) {
  237. System.out.println(u);
  238. }
  239. }
  240. public void optimize() {
  241. tweetSearch.optimize();
  242. }
  243. public void copyTweetIndex(String newIndex) {
  244. try {
  245. logger.info("Old index has totalhits:" + tweetSearch.countAll());
  246. if (!tweetSearch.indexExists(newIndex)) {
  247. logger.info("New Index '" + newIndex + "' does not exist! create it before copy!");
  248. return;
  249. }
  250. ExistsFilterBuilder filter = FilterBuilders.existsFilter(ElasticTweetSearch.UPDATE_DT);
  251. logger.info("Now copy from " + tweetSearch.getIndexName() + " to " + newIndex + " with exist filter: " + filter);
  252. tweetSearch.mergeIndices(Arrays.asList(tweetSearch.getIndexName()), newIndex,
  253. 10000, true, tweetSearch, filter);
  254. tweetSearch.setIndexName(newIndex);
  255. logger.info("New index has totalhits:" + tweetSearch.countAll() + " Now optimize ...");
  256. tweetSearch.optimize();
  257. } catch (Exception ex) {
  258. logger.error("Exception while copyIndex", ex);
  259. }
  260. }
  261. public void copyUserIndex(String newIndex) {
  262. try {
  263. logger.info("Old index has totalhits:" + userSearch.countAll());
  264. if (!userSearch.indexExists(newIndex)) {
  265. logger.info("New Index '" + newIndex + "' does not exist! create it before copy!");
  266. return;
  267. }
  268. logger.info("Now copy from " + userSearch.getIndexName() + " to " + newIndex);
  269. userSearch.mergeIndices(Arrays.asList(userSearch.getIndexName()), newIndex,
  270. 10000, true, userSearch, null);
  271. userSearch.setIndexName(newIndex);
  272. logger.info("New index has totalhits:" + userSearch.countAll() + " Now optimize ...");
  273. userSearch.optimize();
  274. } catch (Exception ex) {
  275. logger.error("Exception while copyIndex", ex);
  276. }
  277. }
  278. public void removeIndexAndAddAlias(String oldIndex, String newIndex) {
  279. tweetSearch.deleteIndex(oldIndex);
  280. tweetSearch.addIndexAlias(newIndex, oldIndex);
  281. logger.info("added alias:" + newIndex + " for deleted:" + oldIndex);
  282. }
  283. public void clearUserTokens(String newIndex) {
  284. try {
  285. logger.info("Old index has totalhits:" + userSearch.countAll());
  286. if (!userSearch.indexExists(newIndex)) {
  287. logger.info("New Index '" + newIndex + "' does not exist! create it before copy!");
  288. return;
  289. }
  290. logger.info("Now copy from " + userSearch.getIndexName() + " to " + newIndex);
  291. userSearch.mergeIndices(Arrays.asList(userSearch.getIndexName()), newIndex, 10000, true,
  292. new CreateObjectsInterface<JUser>() {
  293. @Override
  294. public List<JUser> collectObjects(SearchResponse rsp) {
  295. List<JUser> users = userSearch.collectObjects(rsp);
  296. for (JUser u : users) {
  297. u.setTwitterToken(null);
  298. u.setTwitterTokenSecret(null);
  299. }
  300. return users;
  301. }
  302. }, null);
  303. userSearch.setIndexName(newIndex);
  304. logger.info("New index has totalhits:" + userSearch.countAll() + " Now optimize ...");
  305. userSearch.optimize();
  306. } catch (Exception ex) {
  307. logger.error("Exception while copyIndex", ex);
  308. }
  309. }
  310. }