XRootD
Loading...
Searching...
No Matches
XrdCmsManager.cc
Go to the documentation of this file.
1/******************************************************************************/
2/* */
3/* X r d C m s M a n a g e r . c c */
4/* */
5/* (c) 2007 by the Board of Trustees of the Leland Stanford, Jr., University */
6/* All Rights Reserved */
7/* Produced by Andrew Hanushevsky for Stanford University under contract */
8/* DE-AC02-76-SFO0515 with the Department of Energy */
9/* */
10/* This file is part of the XRootD software suite. */
11/* */
12/* XRootD is free software: you can redistribute it and/or modify it under */
13/* the terms of the GNU Lesser General Public License as published by the */
14/* Free Software Foundation, either version 3 of the License, or (at your */
15/* option) any later version. */
16/* */
17/* XRootD is distributed in the hope that it will be useful, but WITHOUT */
18/* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or */
19/* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public */
20/* License for more details. */
21/* */
22/* You should have received a copy of the GNU Lesser General Public License */
23/* along with XRootD in a file called COPYING.LESSER (LGPL license) and file */
24/* COPYING (GPL license). If not, see <http://www.gnu.org/licenses/>. */
25/* */
26/* The copyright holder's institutional names and contributor's names may not */
27/* be used to endorse or promote products derived from this software without */
28/* specific prior written permission of the institution or contributor. */
29/******************************************************************************/
30
31#include <cstdlib>
32#include <cstdio>
33#include <unistd.h>
34#include <netinet/in.h>
35#include <sys/types.h>
36
37#include "Xrd/XrdScheduler.hh"
38
42#include "XrdCms/XrdCmsNode.hh"
45#include "XrdCms/XrdCmsUtils.hh"
46#include "XrdCms/XrdCmsTrace.hh"
47
48#include "XrdNet/XrdNetAddr.hh"
49
50#include "XrdOuc/XrdOucTList.hh"
52
53#include "XrdSys/XrdSysError.hh"
54#include "XrdSys/XrdSysTimer.hh"
55
56/******************************************************************************/
57/* G l o b a l O b j e c t s */
58/******************************************************************************/
59
60namespace XrdCms
61{
63
65}
66
67using namespace XrdCms;
68
69/******************************************************************************/
70/* S t a t i c M e m b e r s */
71/******************************************************************************/
72
73XrdSysMutex XrdCmsManager::MTMutex;
74XrdCmsNode *XrdCmsManager::MastTab[MTMax] = {0};
75char XrdCmsManager::MastSID[MTMax] = {0};
76int XrdCmsManager::MTHi = -1;
77
78/******************************************************************************/
79/* L o c a l C l a s s e s */
80/******************************************************************************/
81
83{
84public:
85
86 void DoIt() {nodeP->Delete(XrdCmsManager::MTMutex);
87 delete this;
88 }
89
90 XrdCmsDelNode(XrdCmsNode *nP) : XrdJob("delete node"), nodeP(nP)
91 {Sched->Schedule((XrdJob *)this);}
92
94
96};
97
98/******************************************************************************/
99/* C o n s t r u c t o r */
100/******************************************************************************/
101
103{
104 myMans = 0;
105 ManTree = 0;
106 curManCnt = 0;
107 curManList= mlP;
108 newManList= 0;
109 theSite = 0;
110 theHost = 0;
111 theSID = 0;
112 siteID = snum;
113 wasRedir = false;
114}
115
116/******************************************************************************/
117/* A d d */
118/******************************************************************************/
119
120XrdCmsNode *XrdCmsManager::Add(XrdLink *lp, int Lvl, bool &xit)
121{
122 EPNAME("Add")
123 XrdCmsNode *nP;
124 int i;
125
126// Check if there is a pending reconfiguration. If so, return no node but
127// tell the caller to finish so we can proceed with the reconfiguration
128//
129 MTMutex.Lock();
130 lp->setID("manager",0);
131 if (newManList) {MTMutex.UnLock(); xit = true; return 0;}
132 xit = false;
133
134// Find available ID for this node
135//
136 for (i = 0; i < MTMax; i++) if (!MastTab[i]) break;
137
138// Check if we have too many here
139//
140 if (i >= MTMax)
141 {MTMutex.UnLock();
142 Say.Emsg("Manager", "Login to", lp->Name(), "failed; too many managers");
143 return 0;
144 }
145
146// Obtain a new a new node object
147//
148 if (!(nP = new XrdCmsNode(lp, 0, 0, 0, Lvl, i)))
149 {Say.Emsg("Manager", "Unable to obtain node object."); return 0;}
150
151// Assign new manager
152//
153 MastTab[i] = nP;
154 MastSID[i] = siteID;
155 if (i > MTHi) MTHi = i;
156 nP->isOffline = 0;
157 nP->isNoStage = 0;
158 nP->isBad = 0;
159 nP->isBound = 1;
160 nP->isConn = 1;
161 nP->isMan = (Config.asManager() ? 1 : 0);
162 nP->setManager(this);
163 MTMutex.UnLock();
164
165// Document login
166//
167 DEBUG(nP->Name() <<" to manager config; id=" <<i);
168 return nP;
169}
170
171/******************************************************************************/
172/* D e l e t e */
173/******************************************************************************/
174
176{
177 new XrdCmsDelNode(nodeP);
178}
179
180/******************************************************************************/
181/* F i n i s h e d */
182/******************************************************************************/
183
184void XrdCmsManager::Finished(const char *manP, int mPort)
185{
186 XrdOucTList *mP;
187 char mbuff[16];
188
189// Indicate what we are disbanding
190//
191 sprintf(mbuff, ":%d", mPort);
192 Say.Say("Config ", "Manager ", manP, mbuff, " unconfigured.");
193
194// Serialize
195//
196 MTMutex.Lock();
197
198// If this is this is the last manager connection and we have a pending new
199// list of managers, run those now. We waited so as to not overwhelm the system.
200//
201 curManCnt--;
202 if (curManCnt > 0 || !newManList) {MTMutex.UnLock(); return;}
203
204// Remove all vestigial information
205//
206 for (int i = 0; i <= MTHi; i++)
207 {if (MastSID[i] == siteID) {MastTab[i] = 0; MastSID[i] = 0;}}
208
209// Readjust the high water mark
210//
211 while(MTHi >= 0 && !MastTab[MTHi]) MTHi--;
212
213// Delete the current manager list, it is safe to do so
214//
215 while((mP = curManList)) {curManList = curManList->next; delete mP;}
216 curManList = newManList;
217 newManList = 0;
218
219// Run the new manager setup
220//
221 Say.Say("Config ","Manager subsystem reconfiguration completed; restarting.");
222 Run(curManList);
223
224// All done
225//
226 MTMutex.UnLock();
227}
228
229/******************************************************************************/
230/* I n f o r m */
231/******************************************************************************/
232
233void XrdCmsManager::Inform(const char *What, const char *Data, int Dlen)
234{
235 EPNAME("Inform");
236 XrdCmsNode *nP;
237 int i;
238
239// Obtain a lock on the table
240//
241 MTMutex.Lock();
242
243// Run through the table looking for managers to send messages to
244//
245 for (i = 0; i <= MTHi; i++)
246 {if ((nP=MastTab[i]) && !nP->isOffline)
247 {nP->Lock();
248 MTMutex.UnLock();
249 DEBUG(nP->Name() <<" " <<What);
250 nP->Send(Data, Dlen);
251 nP->UnLock();
252 MTMutex.Lock();
253 }
254 }
255 MTMutex.UnLock();
256}
257
258/******************************************************************************/
259
260void XrdCmsManager::Inform(const char *What, struct iovec *vP, int vN, int vT)
261{
262 EPNAME("Inform");
263 int i;
264 XrdCmsNode *nP;
265
266// Obtain a lock on the table
267//
268 MTMutex.Lock();
269
270// Run through the table looking for managers to send messages to
271//
272 for (i = 0; i <= MTHi; i++)
273 {if ((nP=MastTab[i]) && !nP->isOffline)
274 {nP->Lock();
275 MTMutex.UnLock();
276 DEBUG(nP->Name() <<" " <<What);
277 nP->Send(vP, vN, vT);
278 nP->UnLock();
279 MTMutex.Lock();
280 }
281 }
282 MTMutex.UnLock();
283}
284
285/******************************************************************************/
286
288 const char *Arg, int Alen)
289{
290 CmsRRHdr Hdr = {0, (kXR_char)rCode, (kXR_char)rMod,
291 htons(static_cast<unsigned short>(Alen))};
292 struct iovec ioV[2] = {{(char *)&Hdr, sizeof(Hdr)},
293 {(char *)Arg, (size_t)Alen}};
294
295 Inform(Router.getName((int)rCode), ioV, (Arg ? 2 : 1), Alen+sizeof(Hdr));
296}
297
298/******************************************************************************/
299
300void XrdCmsManager::Inform(CmsRRHdr &Hdr, const char *Arg, int Alen)
301{
302 struct iovec ioV[2] = {{(char *)&Hdr, sizeof(Hdr)},
303 {(char *)Arg, (size_t)Alen}};
304
305 Hdr.datalen = htons(static_cast<unsigned short>(Alen));
306
307 Inform(Router.getName(Hdr.rrCode), ioV, (Arg ? 2 : 1), Alen+sizeof(Hdr));
308}
309
310/******************************************************************************/
311/* R e m o v e */
312/******************************************************************************/
313
314void XrdCmsManager::Remove(XrdCmsNode *nP, const char *reason)
315{
316 EPNAME("Remove")
317 int sinst, sent = nP->ID(sinst);
318
319// Obtain a lock on the servtab
320//
321 MTMutex.Lock();
322
323// Make sure this node is the right one
324//
325 if (!(nP == MastTab[sent]))
326 {MTMutex.UnLock();
327 DEBUG("manager " <<sent <<'.' <<sinst <<" failed.");
328 return;
329 }
330
331// Remove node from the manager table
332//
333 MastTab[sent] = 0;
334 MastSID[sent] = 0;
335 nP->isOffline = 1;
336 DEBUG("completed " <<nP->Name() <<" manager " <<sent <<'.' <<sinst);
337
338// Readjust MTHi
339//
340 if (sent == MTHi) while(MTHi >= 0 && !MastTab[MTHi]) MTHi--;
341 MTMutex.UnLock();
342
343// Document removal
344// .
345 if (reason) Say.Emsg("Manager", nP->Ident, "removed;", reason);
346}
347
348/******************************************************************************/
349/* R e r u n */
350/******************************************************************************/
351
352void XrdCmsManager::Rerun(char *newMans)
353{
354 static CmsDiscRequest discRequest = {{0, kYR_disc, 0, 0}};
355 XrdOucTList *tP;
356 const char *eText;
357 char *hP;
358 int newManCnt = 0;
359
360// Lock ourselves
361//
362 MTMutex.Lock();
363 wasRedir = true;
364
365// If we already have a pending new sequence, then just return
366//
367 if (newManList) {MTMutex.UnLock(); return;}
368
369// Indicate that we will be re-initialzing
370//
371 Say.Say("Config ", "Manager subsystem reconfiguring using ", newMans);
372
373// Process the new man list
374//
375 XrdNetAddr manAddr;
376 XrdOucTokenizer mList((char *)newMans);
377 hP = mList.GetLine();
378
379// Add each manager in the list. These have already been expanded and are
380// gaurenteed to contain a port number as the list is provided by the cmsd.
381// However, we will check for duplicates and ignore any overage.
382//
383 while((hP = mList.GetToken()))
384 {if ((eText = manAddr.Set(hP)))
385 {Say.Emsg("Config","Ignoring manager", hP, eText); continue;}
386 tP = newManList;
387 while(tP && strcmp(hP, tP->text)) tP = tP->next;
388 if (tP) {Say.Emsg("Config","Ignoring duplicate manager", hP);
389 continue;
390 }
391 if (newManCnt >=MTMax)
392 {Say.Emsg("Config","Ignoring manager", hP,
393 "and remaining entries; limit exceeded!");
394 break;
395 }
396 newManList = new XrdOucTList(manAddr.Name(),manAddr.Port(),newManList);
397 newManCnt++;
398 }
399
400// If we have managers then tell the cluster builder to abort as we will
401// be restarting this whole process (we don't want any hung nodes here).
402//
403 if (newManCnt) ManTree->Abort();
404
405// Now run through the node table and doom all current site connections as we
406// need to reinitialize the whole manager subsystem. Note that none of these
407// objects can escape without us removing them from the table.
408//
409 if (newManCnt)
410 {for (int i = 0; i <= MTHi; i++)
411 if (MastTab[i] && (MastSID[i] == siteID))
412 {MastTab[i]->isBad |= XrdCmsNode::isBlisted|XrdCmsNode::isDoomed;
413 MastTab[i]->Send((char *)&discRequest, sizeof(discRequest));
414 }
415 }
416
417// We are done
418//
419 MTMutex.UnLock();
420}
421
422/******************************************************************************/
423/* R e s e t */
424/******************************************************************************/
425
427{
428 EPNAME("Reset");
429 static CmsStatusRequest myState = {{0, kYR_status,
431 static const int szReqst = sizeof(CmsStatusRequest);
432 XrdCmsNode *nP;
433 int i;
434
435// Obtain a lock on the table
436//
437 MTMutex.Lock();
438
439// Run through the table looking for managers to send a reset request
440//
441 for (i = 0; i <= MTHi; i++)
442 {if ((nP=MastTab[i]) && !nP->isOffline && nP->isKnown)
443 {nP->Lock();
444 nP->isKnown = 0;
445 MTMutex.UnLock();
446 DEBUG("sent to " <<nP->Name());
447 nP->Send((char *)&myState, szReqst);
448 nP->UnLock();
449 MTMutex.Lock();
450 }
451 }
452 MTMutex.UnLock();
453}
454
455/******************************************************************************/
456/* Private: R u n */
457/******************************************************************************/
458
459int XrdCmsManager::Run(XrdOucTList *manL)
460{
461 XrdOucTList *tP = manL;
462 XrdJob *jP, *jFirst = 0, *jLast = 0;
463
464// This method is either called during initial start-up or if we were wholly
465// redirected elsewhere due to a blacklist. In the latter case, the caller
466// must have obtained all the required locks
467//
468 curManCnt = 0;
469 if (!manL) return 0;
470
471// Prime the manager subsystem. We check here to make sure we will not be
472// tying to connect to ourselves. This is possible if the manager and meta-
473// manager were defined to be the same and we are a manager. We would have
474// liked to screen this out earlier but port discovery prevents it.
475//
476 while(tP)
477 {if (strcmp(tP->text, Config.myName) || tP->val != Config.PortTCP)
478 {jP = (XrdJob *)XrdCmsProtocol::Alloc(Config.myRole, this,
479 tP->text, tP->val);
480 if (!jFirst) jFirst = jLast = jP;
481 else {jLast->NextJob = jP; jLast = jP;}
482 curManCnt++;
483 } else {
484 char buff[512];
485 sprintf(buff, "%s:%d", tP->text, tP->val);
486 Say.Emsg("Config", "Circular connection to", buff, "ignored.");
487 }
488 tP = tP->next;
489 }
490
491// Make sure we have something to start up
492//
493 if (!curManCnt)
494 {Say.Emsg("Config","No managers can be started; we are now unreachable!");
495 return 0;
496 }
497
498// We now know there is no pandering going on, so we need to initialize the
499// the tree management subsystem to get it into a fresh state.
500//
501 if (myMans) delete myMans;
502 myMans = new XrdCmsManList;
503 if (ManTree) delete ManTree;
504 ManTree = new XrdCmsManTree(curManCnt);
505 if (theSID) {free(theSID); theSID = 0;}
506 if (theSite) {free(theSite); theSite = 0;}
507
508// Now start up all of the threads
509//
510 if (jFirst == jLast) Sched->Schedule(jFirst);
511 else Sched->Schedule(curManCnt, jFirst, jLast);
512
513// All done
514//
515 return curManCnt;
516}
517
518/******************************************************************************/
519/* S t a r t */
520/******************************************************************************/
521
523{
524 XrdOucTList *manVec[MTMax] = {0};
525 XrdCmsManager *manP;
526 char buff[1024];
527 int n, sid, snum = 0, mtot = 0, mnum = 0, xnum = 0;
528
529// If there is no manager list then we must not be connecting to anyone
530//
531 if (!mL) return true;
532
533// Segregate the manager list by site and run them that way. Unfortunately,
534// that means we have to copy the TList. This ok as this happens once.
535//
536 while(mL)
537 {sid = mL->ival[1]; mtot++;
538 if (sid >= MTMax)
539 {sprintf(buff, "%d", sid);
540 Say.Say("Config ", "Invalid site ID ", buff, " for ", mL->text);
541 } else {
542 manVec[sid] = new XrdOucTList(mL->text, mL->val, manVec[sid]);
543 mnum++;
544 }
545 mL = mL->next;
546 }
547
548// Count how many sites we have
549//
550 for (n = 0; n < MTMax; n++) if (manVec[n]) snum++;
551
552// Indicate what we are about to do
553//
554 snprintf(buff, sizeof(buff),"%d manager%s and %d site%s.", mnum,
555 (mnum != 1 ? "s":""), snum, (snum != 1 ? "s":""));
556 Say.Say("Config Connecting to ", buff);
557
558// Now run each one
559//
560 for (n = 0; n < MTMax; n++)
561 {if (manVec[n])
562 {manP = new XrdCmsManager(manVec[n], n);
563 xnum += manP->Run(manVec[n]);
564 }
565 }
566
567// Check if we should issue a warning
568//
569 if (xnum < mtot)
570 {snprintf(buff, sizeof(buff), "%d of %d", xnum, mtot);
571 Say.Say("Config Warning! Only ", buff, " manager(s) will be contacted!");
572 }
573
574// All done
575//
576 return xnum == mtot;
577}
578
579/******************************************************************************/
580/* V e r i f y */
581/******************************************************************************/
582
583bool XrdCmsManager::Verify(XrdLink *lP, const char *sid, const char *sname)
584{
585 XrdSysMutexHelper hMutex(MTMutex);
586 const char *sidP;
587
588// Trim off the type of service in the sid
589//
590 if ((sidP = index(sid, ' '))) sidP++;
591 else sidP = sid;
592
593// If we have no sid, just record it
594//
595 if (!theSID)
596 {theSID = strdup(sidP);
597 if (theSite) free(theSite);
598 theHost = strdup(lP->Host());
599 theSite = (sname ? strdup(sname) : strdup("anonymous"));
600 return true;
601 }
602
603// Make sure we are connecting to the same cluster as before
604//
605 if (!strcmp(theSID, sidP)) return true;
606
607// Compute the offending site configuration
608//
609 char mBuff[1024];
610 snprintf(mBuff,sizeof(mBuff),"%s for site %s; "
611 "making file location unpredictable!", theHost,
612 (wasRedir ? theSite : XrdCmsUtils::SiteName(siteID)));
613
614// There seems to be a configuration error here
615//
616 Say.Emsg("Manager", lP->Host(), "manager configuration differs from", mBuff);
617 return false;
618}
unsigned char kXR_char
Definition XPtypes.hh:65
#define DEBUG(x)
#define EPNAME(x)
XrdCmsDelNode(XrdCmsNode *nP)
XrdCmsNode * nodeP
static void Inform(const char *What, const char *Data, int Dlen)
XrdCmsNode * Add(XrdLink *lp, int Lvl, bool &xit)
void Rerun(char *newMans)
void Finished(const char *manP, int mPort)
static void Reset()
friend class XrdCmsDelNode
static bool Start(const XrdOucTList *mL)
XrdCmsManTree * ManTree
void Delete(XrdCmsNode *nodeP)
XrdCmsManager(XrdOucTList *mlP, int snum)
void Remove(XrdCmsNode *nP, const char *reason=0)
bool Verify(XrdLink *lP, const char *sid, const char *sname)
static const int MTMax
XrdCmsManList * myMans
void setManager(XrdCmsManager *mP)
char * Ident
Definition XrdCmsNode.hh:61
int Send(const char *buff, int blen=0)
void Lock()
char * Name()
char isOffline
Definition XrdCmsNode.hh:64
int ID(int &INum)
char isNoStage
Definition XrdCmsNode.hh:66
void UnLock()
static const char isDoomed
Definition XrdCmsNode.hh:82
static const char isBlisted
Definition XrdCmsNode.hh:79
static XrdCmsProtocol * Alloc(const char *theRole="", XrdCmsManager *mP=0, const char *theMan=0, int thePort=0)
static const char * SiteName(int snum)
XrdJob(const char *desc="")
Definition XrdJob.hh:51
const char * Name(const char *eName=0, const char **eText=0)
int Port(int pNum=-1)
const char * Set(const char *hSpec, int pNum=PortInSpec)
XrdOucTList * next
char * GetToken(char **rest=0, int lowcase=0)
void Schedule(XrdJob *jp)
int Emsg(const char *esfx, int ecode, const char *text1, const char *text2=0)
XrdCmsRouter Router
kXR_unt16 datalen
Definition YProtocol.hh:86
XrdScheduler * Sched
XrdSysError Say
kXR_char rrCode
Definition YProtocol.hh:84
XrdSysTrace Trace("cms")
XrdCmsConfig Config
@ kYR_status
Definition YProtocol.hh:112