% Conference paper in the DATA 2020 revised selected papers volume (CCIS 1446).
% The funding statement is kept in the non-standard "funding" field so that
% bibliography styles do not print it; "day" is likewise ignored by classic styles.
@inproceedings{e02a8f6ee2e045d18bda8982a203f1be,
  author    = {Riquelme-Granada, Nery and Nguyen, {Khuong An} and Luo, Zhiyuan},
  title     = {Coreset-Based Data Compression for Logistic Regression},
  booktitle = {Data Management Technologies and Applications -- 9th International Conference, {DATA} 2020, Revised Selected Papers},
  editor    = {Hammoudi, Slimane and Quix, Christoph and Bernardino, Jorge},
  series    = {Communications in Computer and Information Science},
  volume    = {1446},
  pages     = {195--222},
  publisher = {Springer},
  year      = {2021},
  month     = jul,
  day       = {23},
  isbn      = {9783030830137},
  doi       = {10.1007/978-3-030-83014-4_10},
  language  = {English},
  keywords  = {Coresets, Logistic Regression, Data compression},
  abstract  = {The coreset paradigm is a fundamental tool for analysing complex and large datasets. Although coresets are used as an acceleration technique for many learning problems, the algorithms used for constructing them may become computationally exhaustive in some settings. We show that this can easily happen when computing coresets for learning a logistic regression classifier. We overcome this issue with two methods: Accelerating Clustering via Sampling (ACvS) and Regressed Data Summarisation Framework (RDSF); the former is an acceleration procedure based on a simple theoretical observation on using Uniform Random Sampling for clustering problems, the latter is a coreset-based data-summarising framework that builds on ACvS and extends it by using a regression algorithm as part of the construction. We tested both procedures on five public datasets, and observed that computing the coreset and learning from it, is 11 times faster than learning directly from the full input data in the worst case, and 34 times faster in the best case. We further observed that the best regression algorithm for creating summaries of data using the RDSF framework is the Ordinary Least Squares (OLS).},
  funding   = {This research is supported by AstraZeneca and the Paraguayan Government.},
}