/arch/ia64/sn/kernel/bte_error.c

https://bitbucket.org/evzijst/gittest · C · 198 lines · 118 code · 24 blank · 56 comment · 19 complexity · 3a7bbaaaa509bf3498496c70369f7c2e MD5 · raw file

  1. /*
  2. * This file is subject to the terms and conditions of the GNU General Public
  3. * License. See the file "COPYING" in the main directory of this archive
  4. * for more details.
  5. *
  6. * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
  7. */
  #include <linux/types.h>
  #include <linux/jiffies.h>
  #include <asm/sn/sn_sal.h>
  #include "ioerror.h"
  #include <asm/sn/addrs.h>
  #include <asm/sn/shubio.h>
  #include <asm/sn/geo.h>
  #include "xtalk/xwidgetdev.h"
  #include "xtalk/hubdev.h"
  #include <asm/sn/bte.h>
  #include <asm/param.h>
  18. /*
  19. * Bte error handling is done in two parts. The first captures
  20. * any crb related errors. Since there can be multiple crbs per
  21. * interface and multiple interfaces active, we need to wait until
  22. * all active crbs are completed. This is the first job of the
  23. * second part error handler. When all bte related CRBs are cleanly
  24. * completed, it resets the interfaces and gets them ready for new
  25. * transfers to be queued.
  26. */
  27. void bte_error_handler(unsigned long);
  28. /*
  29. * Wait until all BTE related CRBs are completed
  30. * and then reset the interfaces.
  31. */
  32. void bte_error_handler(unsigned long _nodepda)
  33. {
  34. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  35. spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
  36. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  37. nasid_t nasid;
  38. int i;
  39. int valid_crbs;
  40. unsigned long irq_flags;
  41. volatile u64 *notify;
  42. bte_result_t bh_error;
  43. ii_imem_u_t imem; /* II IMEM Register */
  44. ii_icrb0_d_u_t icrbd; /* II CRB Register D */
  45. ii_ibcr_u_t ibcr;
  46. ii_icmr_u_t icmr;
  47. ii_ieclr_u_t ieclr;
  48. BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
  49. smp_processor_id()));
  50. spin_lock_irqsave(recovery_lock, irq_flags);
  51. if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
  52. (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
  53. BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
  54. smp_processor_id()));
  55. spin_unlock_irqrestore(recovery_lock, irq_flags);
  56. return;
  57. }
  58. /*
  59. * Lock all interfaces on this node to prevent new transfers
  60. * from being queued.
  61. */
  62. for (i = 0; i < BTES_PER_NODE; i++) {
  63. if (err_nodepda->bte_if[i].cleanup_active) {
  64. continue;
  65. }
  66. spin_lock(&err_nodepda->bte_if[i].spinlock);
  67. BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
  68. smp_processor_id(), i));
  69. err_nodepda->bte_if[i].cleanup_active = 1;
  70. }
  71. /* Determine information about our hub */
  72. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  73. /*
  74. * A BTE transfer can use multiple CRBs. We need to make sure
  75. * that all the BTE CRBs are complete (or timed out) before
  76. * attempting to clean up the error. Resetting the BTE while
  77. * there are still BTE CRBs active will hang the BTE.
  78. * We should look at all the CRBs to see if they are allocated
  79. * to the BTE and see if they are still active. When none
  80. * are active, we can continue with the cleanup.
  81. *
  82. * We also want to make sure that the local NI port is up.
  83. * When a router resets the NI port can go down, while it
  84. * goes through the LLP handshake, but then comes back up.
  85. */
  86. icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
  87. if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
  88. /*
  89. * There are errors which still need to be cleaned up by
  90. * hubiio_crb_error_handler
  91. */
  92. mod_timer(recovery_timer, HZ * 5);
  93. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  94. smp_processor_id()));
  95. spin_unlock_irqrestore(recovery_lock, irq_flags);
  96. return;
  97. }
  98. if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {
  99. valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;
  100. for (i = 0; i < IIO_NUM_CRBS; i++) {
  101. if (!((1 << i) & valid_crbs)) {
  102. /* This crb was not marked as valid, ignore */
  103. continue;
  104. }
  105. icrbd.ii_icrb0_d_regval =
  106. REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
  107. if (icrbd.d_bteop) {
  108. mod_timer(recovery_timer, HZ * 5);
  109. BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
  110. err_nodepda, smp_processor_id(),
  111. i));
  112. spin_unlock_irqrestore(recovery_lock,
  113. irq_flags);
  114. return;
  115. }
  116. }
  117. }
  118. BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
  119. /* Reenable both bte interfaces */
  120. imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
  121. imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
  122. REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);
  123. /* Clear BTE0/1 error bits */
  124. ieclr.ii_ieclr_regval = 0;
  125. if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
  126. ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
  127. if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
  128. ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
  129. REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);
  130. /* Reinitialize both BTE state machines. */
  131. ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
  132. ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
  133. REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);
  134. for (i = 0; i < BTES_PER_NODE; i++) {
  135. bh_error = err_nodepda->bte_if[i].bh_error;
  136. if (bh_error != BTE_SUCCESS) {
  137. /* There is an error which needs to be notified */
  138. notify = err_nodepda->bte_if[i].most_rcnt_na;
  139. BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
  140. err_nodepda->bte_if[i].bte_cnode,
  141. err_nodepda->bte_if[i].bte_num,
  142. IBLS_ERROR | (u64) bh_error));
  143. *notify = IBLS_ERROR | bh_error;
  144. err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
  145. }
  146. err_nodepda->bte_if[i].cleanup_active = 0;
  147. BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
  148. smp_processor_id(), i));
  149. spin_unlock(&err_nodepda->bte_if[i].spinlock);
  150. }
  151. del_timer(recovery_timer);
  152. spin_unlock_irqrestore(recovery_lock, irq_flags);
  153. }
  154. /*
  155. * First part error handler. This is called whenever any error CRB interrupt
  156. * is generated by the II.
  157. */
  158. void
  159. bte_crb_error_handler(cnodeid_t cnode, int btenum,
  160. int crbnum, ioerror_t * ioe, int bteop)
  161. {
  162. struct bteinfo_s *bte;
  163. bte = &(NODEPDA(cnode)->bte_if[btenum]);
  164. /*
  165. * The caller has already figured out the error type, we save that
  166. * in the bte handle structure for the thread excercising the
  167. * interface to consume.
  168. */
  169. bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
  170. bte->bte_error_count++;
  171. BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
  172. bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
  173. bte_error_handler((unsigned long) NODEPDA(cnode));
  174. }