The Joint Accelerator Conferences Website (JACoW) is an international collaboration that publishes the proceedings of accelerator conferences held around the world.
% Paper TUDPP01 in the ICALEPCS'19 proceedings (publisher field: JACoW Publishing).
% Authors are entered as "Last, First" so the compound surname "Chibante Barroso"
% is kept whole; in "First Last" form BibTeX would take only "Barroso" as the surname.
% Only the acronyms in the title are brace-protected, so styles may still recase the rest.
% NOTE(review): the note field repeats the DOI as a resolver URL, presumably so that
% styles which ignore the doi field still print it -- confirm against the target style
% and drop it if the style already renders doi.
@inproceedings{vino:icalepcs2019-tudpp01,
  author    = {Vino, G. and Chibante Barroso, V. and Elia, D. and Wegrzynek, A.},
  title     = {A Monitoring System for the New {ALICE} {O2} Farm},
  booktitle = {Proc. ICALEPCS'19},
  pages     = {835--840},
  paper     = {TUDPP01},
  language  = {english},
  keywords  = {monitoring, detector, network, database, controls},
  venue     = {New York, NY, USA},
  series    = {International Conference on Accelerator and Large Experimental Physics Control Systems},
  number    = {17},
  publisher = {JACoW Publishing, Geneva, Switzerland},
  month     = aug,
  year      = {2020},
  issn      = {2226-0358},
  isbn      = {978-3-95450-209-7},
  doi       = {10.18429/JACoW-ICALEPCS2019-TUDPP01},
  url       = {https://jacow.org/icalepcs2019/papers/tudpp01.pdf},
  note      = {https://doi.org/10.18429/JACoW-ICALEPCS2019-TUDPP01},
  abstract  = {The ALICE Experiment has been designed to study the physics of strongly interacting matter with heavy-ion collisions at the CERN LHC. A major upgrade of the detector and computing model (O2, Offline-Online) is currently ongoing. The ALICE O2 farm will consist of almost 1000 nodes enabled to readout and process on-the-fly about 27 Tb/s of raw data. To increase the efficiency of computing farm operations a general-purpose near real-time monitoring system has been developed: it lays on features like high-performance, high-availability, modularity, and open source. The core component (Apache Kafka) ensures high throughput, data pipelines, and fault-tolerant services. Additional monitoring functionality is based on Telegraf as metric collector, Apache Spark for complex aggregation, InfluxDB as time-series database, and Grafana as visualization tool. A logging service based on Elasticsearch stack is also included. The designed system handles metrics coming from operating system, network, custom hardware, and in-house software. A prototype version is currently running at CERN and has been also successfully deployed by the ReCaS Datacenter at INFN Bari for both monitoring and logging.},
}