PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/object.c

https://github.com/russross/envoy
C | 502 lines | 374 code | 95 blank | 33 comment | 73 complexity | 25509f3aa999773d097947d601834e40 MD5 | raw file
  1. #include <assert.h>
  2. #include <pthread.h>
  3. #include <gc/gc.h>
  4. #include <stdlib.h>
  5. #include <unistd.h>
  6. #include <errno.h>
  7. #include <string.h>
  8. #include "types.h"
  9. #include "9p.h"
  10. #include "list.h"
  11. #include "transaction.h"
  12. #include "util.h"
  13. #include "config.h"
  14. #include "object.h"
  15. #include "dispatch.h"
  16. #include "worker.h"
  17. #include "lru.h"
  18. #include "disk.h"
  19. /* Operations on storage objects.
  20. * These functions allow simple calls to the object storage service. They
  21. * handle local caching, find storage servers based on OID, and handle
  22. * replication. */
  23. /* pool of reserved oids */
  24. static u64 object_reserve_next; /* next oid that object_reserve_oid() will hand out */
  25. static u32 object_reserve_remaining; /* oids left in the current batch; 0 makes object_reserve_oid() request more */
  26. static pthread_cond_t *object_reserve_wait; /* non-NULL while a reserve request is in flight; other callers wait on it */
  27. static Lru *object_cache_status; /* lru keyed by oid: an entry means the oid was marked valid in the cache; NULL when objectroot is NULL */
  28. void object_cache_validate(u64 oid) {
  29. u64 *key;
  30. if (objectroot == NULL)
  31. return;
  32. key = GC_NEW_ATOMIC(u64);
  33. assert(key != NULL);
  34. *key = oid;
  35. lru_add(object_cache_status, key, key);
  36. }
  37. void object_cache_invalidate(u64 oid) {
  38. if (objectroot == NULL)
  39. return;
  40. lru_remove(object_cache_status, &oid);
  41. }
  42. void object_cache_invalidate_all(void) {
  43. if (objectroot == NULL)
  44. return;
  45. lru_clear(object_cache_status);
  46. }
  47. int object_cache_isvalid(u64 oid) {
  48. return objectroot != NULL && lru_get(object_cache_status, &oid) != NULL;
  49. }
  50. static void send_request_to_all(Transaction *trans,
  51. void (*callback)(void *), void *env)
  52. {
  53. List *requests = cons(trans, NULL);
  54. int i;
  55. for (i = 1; i < storage_server_count; i++) {
  56. Transaction *newtrans =
  57. trans_new(storage_servers[i], NULL, message_new());
  58. /* copy the whole mess over */
  59. memcpy(newtrans->out, trans->out, sizeof(Message));
  60. /* for tswrite, we need to copy the data payload as well */
  61. if (newtrans->out->raw != NULL) {
  62. struct Tswrite *req = &newtrans->out->msg.tswrite;
  63. assert(trans->out->id == TSWRITE);
  64. newtrans->out->raw = raw_new();
  65. req->data = newtrans->out->raw + TWRITE_DATA_OFFSET;
  66. memcpy(req->data, trans->out->raw + TWRITE_DATA_OFFSET, req->count);
  67. }
  68. requests = cons(newtrans, requests);
  69. }
  70. /* send request to all storage servers and wait for all to respond */
  71. send_requests(requests, callback, env);
  72. /* make sure they all succeeded */
  73. while (!null(requests)) {
  74. trans = car(requests);
  75. assert(trans->in != NULL && trans->in->id == trans->out->id + 1);
  76. requests = cdr(requests);
  77. }
  78. }
  79. u64 object_reserve_oid(Worker *worker) {
  80. assert(storage_server_count > 0);
  81. /* is someone else in the process of requesting new oids? */
  82. while (object_reserve_wait != NULL)
  83. cond_wait(object_reserve_wait);
  84. /* do we need to request a fresh batch of oids? */
  85. if (object_reserve_remaining == 0) {
  86. /* the first storage server is considered the master */
  87. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  88. struct Rsreserve *res;
  89. pthread_cond_t *wait;
  90. trans->out->tag = ALLOCTAG;
  91. trans->out->id = TSRESERVE;
  92. wait = object_reserve_wait = cond_new();
  93. send_request(trans);
  94. object_reserve_wait = NULL;
  95. cond_broadcast(wait);
  96. res = &trans->in->msg.rsreserve;
  97. object_reserve_next = res->firstoid;
  98. object_reserve_remaining = res->count;
  99. }
  100. object_reserve_remaining--;
  101. return object_reserve_next++;
  102. }
  103. struct qid object_create(Worker *worker, u64 oid, u32 mode,
  104. u32 ctime, char *uid, char *gid, char *extension)
  105. {
  106. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  107. struct Rscreate *res;
  108. int len;
  109. /* create it in the cache */
  110. if (objectroot != NULL) {
  111. len = disk_create(worker, oid, mode, ctime, uid, gid, extension);
  112. assert(len >= 0);
  113. object_cache_validate(oid);
  114. }
  115. /* create it on the storage servers */
  116. trans->out->tag = ALLOCTAG;
  117. trans->out->id = TSCREATE;
  118. set_tscreate(trans->out, oid, mode, ctime, uid, gid, extension);
  119. send_request_to_all(trans, NULL, NULL);
  120. res = &trans->in->msg.rscreate;
  121. return res->qid;
  122. }
  123. struct object_clone_env {
  124. Worker *worker;
  125. u64 oid;
  126. u64 newoid;
  127. };
  128. static void object_clone_cb(struct object_clone_env *env) {
  129. /* clone it locally if we have it in the cache */
  130. if (object_cache_isvalid(env->oid)) {
  131. int res = disk_clone(env->worker, env->oid, env->newoid);
  132. assert(res >= 0);
  133. object_cache_validate(env->newoid);
  134. }
  135. }
  136. void object_clone(Worker *worker, u64 oid, u64 newoid) {
  137. struct object_clone_env env = {
  138. .worker = worker,
  139. .oid = oid,
  140. .newoid = newoid
  141. };
  142. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  143. trans->out->tag = ALLOCTAG;
  144. trans->out->id = TSCLONE;
  145. set_tsclone(trans->out, oid, newoid);
  146. send_request_to_all(trans, (void (*)(void *)) object_clone_cb, &env);
  147. }
  148. void *object_read(Worker *worker, u64 oid, u32 atime, u64 offset, u32 count,
  149. u32 *bytesread, u8 **data)
  150. {
  151. int i;
  152. Transaction *trans;
  153. struct Rsread *res;
  154. void *result;
  155. /* read from the cache if it exists */
  156. if (object_cache_isvalid(oid)) {
  157. u8 *raw = raw_new();
  158. int len;
  159. *data = raw + RSREAD_DATA_OFFSET;
  160. len = disk_read(worker, oid, atime, offset, count, *data);
  161. assert(len > 0);
  162. *bytesread = len;
  163. return raw;
  164. }
  165. i = randInt(storage_server_count);
  166. trans = trans_new(storage_servers[i], NULL, message_new());
  167. trans->out->tag = ALLOCTAG;
  168. trans->out->id = TSREAD;
  169. set_tsread(trans->out, oid, atime, offset, count);
  170. /* send the request to one randomly chosen storage server */
  171. send_request(trans);
  172. assert(trans->in != NULL && trans->in->id == RSREAD);
  173. res = &trans->in->msg.rsread;
  174. *bytesread = res->count;
  175. *data = res->data;
  176. result = trans->in->raw;
  177. trans->in->raw = NULL;
  178. return result;
  179. }
  180. struct object_write_env {
  181. Worker *worker;
  182. u64 oid;
  183. u32 mtime;
  184. u64 offset;
  185. u32 count;
  186. u8 *data;
  187. };
  188. static void object_write_cb(struct object_write_env *env) {
  189. /* write to the cache if it exists */
  190. if (object_cache_isvalid(env->oid)) {
  191. int len = disk_write(env->worker, env->oid, env->mtime,
  192. env->offset, env->count, env->data);
  193. assert(len > 0);
  194. }
  195. }
  196. u32 object_write(Worker *worker, u64 oid, u32 mtime, u64 offset,
  197. u32 count, u8 *data, void *raw)
  198. {
  199. struct object_write_env env = {
  200. .worker = worker,
  201. .oid = oid,
  202. .mtime = mtime,
  203. .offset = offset,
  204. .count = count,
  205. .data = data
  206. };
  207. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  208. struct Rswrite *res;
  209. assert(raw != NULL);
  210. trans->out->raw = raw;
  211. trans->out->tag = ALLOCTAG;
  212. trans->out->id = TSWRITE;
  213. set_tswrite(trans->out, mtime, offset, count, data, oid);
  214. send_request_to_all(trans, (void (*)(void *)) object_write_cb, &env);
  215. res = &trans->in->msg.rswrite;
  216. return res->count;
  217. }
  218. struct p9stat *object_stat(Worker *worker, u64 oid, char *filename) {
  219. int i;
  220. Transaction *trans;
  221. struct Rsstat *res;
  222. struct p9stat *info;
  223. /* handle it from the cache if it exists */
  224. if (object_cache_isvalid(oid)) {
  225. info = disk_stat(worker, oid);
  226. info->name = filename;
  227. return info;
  228. }
  229. i = randInt(storage_server_count);
  230. trans = trans_new(storage_servers[i], NULL, message_new());
  231. trans->out->tag = ALLOCTAG;
  232. trans->out->id = TSSTAT;
  233. set_tsstat(trans->out, oid);
  234. /* send the request to one randomly chosen storage server */
  235. send_request(trans);
  236. assert(trans->in != NULL && trans->in->id == RSSTAT);
  237. res = &trans->in->msg.rsstat;
  238. /* insert the filename supplied by the caller */
  239. res->stat->name = filename;
  240. /* check if we have a cache entry with matching stats */
  241. if (objectroot != NULL && (info = disk_stat(worker, oid)) != NULL) {
  242. info->name = filename;
  243. info->atime = res->stat->atime;
  244. /* if it's up-to-date, note it as a valid entry */
  245. if (!p9stat_cmp(info, res->stat))
  246. object_cache_validate(oid);
  247. }
  248. return res->stat;
  249. }
  250. struct object_wstat_env {
  251. Worker *worker;
  252. u64 oid;
  253. struct p9stat *info;
  254. };
  255. static void object_wstat_cb(struct object_wstat_env *env) {
  256. /* update the cache if it exists */
  257. if (object_cache_isvalid(env->oid)) {
  258. int res = disk_wstat(env->worker, env->oid, env->info);
  259. assert(res >= 0);
  260. }
  261. }
  262. void object_wstat(Worker *worker, u64 oid, struct p9stat *info) {
  263. struct object_wstat_env env = {
  264. .worker = worker,
  265. .oid = oid,
  266. .info = info
  267. };
  268. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  269. trans->out->tag = ALLOCTAG;
  270. trans->out->id = TSWSTAT;
  271. set_tswstat(trans->out, oid, info);
  272. send_request_to_all(trans, (void (*)(void *)) object_wstat_cb, &env);
  273. }
  274. struct object_delete_env {
  275. Worker *worker;
  276. u64 oid;
  277. };
  278. static void object_delete_cb(struct object_delete_env *env) {
  279. /* delete the cache entry if it exists */
  280. if (objectroot != NULL) {
  281. int res = disk_delete(env->worker, env->oid);
  282. object_cache_invalidate(env->oid);
  283. if (res < 0) {
  284. /* no entry is okay for the cache */
  285. assert(-res == ENOENT);
  286. }
  287. }
  288. }
  289. void object_delete(Worker *worker, u64 oid) {
  290. struct object_delete_env env = {
  291. .worker = worker,
  292. .oid = oid
  293. };
  294. Transaction *trans = trans_new(storage_servers[0], NULL, message_new());
  295. trans->out->tag = ALLOCTAG;
  296. trans->out->id = TSDELETE;
  297. set_tsdelete(trans->out, oid);
  298. send_request_to_all(trans, (void (*)(void *)) object_delete_cb, &env);
  299. }
  300. struct object_fetch_env {
  301. Openfile *file;
  302. pthread_cond_t *wait;
  303. };
  304. static void object_fetch_iter(struct object_fetch_env *env, Transaction *trans)
  305. {
  306. struct Rsread *res;
  307. int x;
  308. assert(trans->in != NULL && trans->in->id == RSREAD);
  309. res = &trans->in->msg.rsread;
  310. while (env->wait != NULL)
  311. cond_wait(env->wait);
  312. env->wait = cond_new();
  313. unlock();
  314. x = lseek(env->file->fd, trans->out->msg.tsread.offset, SEEK_SET);
  315. assert(x >= 0);
  316. x = write(env->file->fd, res->data, res->count);
  317. assert(res->count == (u32) x);
  318. lock();
  319. raw_delete(trans->in->raw);
  320. trans->in->raw = NULL;
  321. cond_broadcast(env->wait);
  322. env->wait = NULL;
  323. }
  324. void object_fetch(Worker *worker, u64 oid, struct p9stat *info) {
  325. int res;
  326. u32 packetsize;
  327. u32 packetcount;
  328. u64 offset;
  329. int i;
  330. int start;
  331. u32 time = now();
  332. List **queues;
  333. struct object_fetch_env env;
  334. if (objectroot == NULL || object_cache_isvalid(oid))
  335. return;
  336. /* delete any existing entry in the cache */
  337. res = disk_delete(worker, oid);
  338. assert(res >= 0 || -res == ENOENT);
  339. /* create the file */
  340. disk_create(worker, oid, info->mode, info->mtime, info->uid,
  341. info->gid, info->extension);
  342. /* empty file? */
  343. if (info->length == 0 || !emptystring(info->extension)) {
  344. int res = disk_wstat(worker, oid, info);
  345. assert(res == 0);
  346. return;
  347. }
  348. /* stripe the reads across the storage servers */
  349. queues = GC_MALLOC(sizeof(List *) * storage_server_count);
  350. assert(queues != NULL);
  351. queues[0] = NULL;
  352. packetsize = (storage_servers[0]->maxSize / BLOCK_SIZE) * BLOCK_SIZE;
  353. for (i = 1; i < storage_server_count; i++) {
  354. int size = (storage_servers[i]->maxSize / BLOCK_SIZE) * BLOCK_SIZE;
  355. packetsize = min(packetsize, size);
  356. queues[i] = NULL;
  357. }
  358. packetcount = (info->length + (packetsize - 1)) / packetsize;
  359. i = 0;
  360. offset = 0;
  361. start = randInt(storage_server_count);
  362. /* create read requests in contiguous chunks for each server */
  363. while (offset < info->length) {
  364. u64 size = info->length - offset;
  365. if (size > packetsize)
  366. size = packetsize;
  367. Transaction *trans = trans_new(
  368. storage_servers[(i + start) % storage_server_count],
  369. NULL, message_new());
  370. trans->out->tag = ALLOCTAG;
  371. trans->out->id = TSREAD;
  372. set_tsread(trans->out, oid, time, offset, (u32) size);
  373. queues[i] = cons(trans, queues[i]);
  374. offset += size;
  375. /* time to switch to next server? */
  376. if (offset * storage_server_count > info->length * (i + 1))
  377. i++;
  378. }
  379. /* put the requests in sequential order */
  380. for (i = 0; i < storage_server_count; i++)
  381. queues[i] = reverse(queues[i]);
  382. env.file = disk_get_openfile(worker, oid);
  383. assert(env.file != NULL);
  384. env.wait = NULL;
  385. if (ftruncate(env.file->fd, info->length) < 0)
  386. assert(0);
  387. send_requests_streamed(queues, storage_server_count,
  388. (void (*)(void *, Transaction *)) object_fetch_iter, &env);
  389. if (disk_wstat(worker, oid, info) != 0)
  390. assert(0);
  391. object_cache_validate(oid);
  392. }
  393. void object_state_init(void) {
  394. object_reserve_next = ~ (u64) 0;
  395. object_reserve_remaining = 0;
  396. object_reserve_wait = NULL;
  397. if (objectroot == NULL) {
  398. object_cache_status = NULL;
  399. } else {
  400. object_cache_status = lru_new(
  401. OBJECT_CACHE_STATE_SIZE,
  402. (Hashfunc) u64_hash,
  403. (Cmpfunc) u64_cmp,
  404. NULL,
  405. NULL);
  406. }
  407. }