DataPlug Installation
Install DataPlug
pip install git+https://github.com/CLOUDLAB-URV/dataplug
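To check that the install succeeded, try importing the package (a quick sanity check, not part of the documented workflow):

import dataplug  # an ImportError here means the installation failed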
Import the library
from dataplug import CloudObject
Initialize a CloudObject
co = CloudObject.from_s3(DataFormat, "s3://bucket/path")
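For instance, with the bundled FASTA format (the bucket and key below are placeholders, and default AWS credentials are assumed):

from dataplug import CloudObject
from dataplug.formats.genomics.fasta import FASTA

co = CloudObject.from_s3(FASTA, "s3://genomics/fasta_sample.fasta")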
Basic Usage
import logging
import math

from dataplug import CloudObject
from dataplug.formats.genomics.fasta import FASTA, partition_chunks_strategy
from dataplug.util import setup_logging


def main():
    setup_logging(logging.DEBUG)
    FASTA.debug()

    # Localhost MinIO config
    minio = {"endpoint_url": "http://127.0.0.1:9000",
             "role_arn": "arn:aws:iam::123456789012:role/S3Access"}

    co = CloudObject.from_s3(FASTA, "s3://genomics/fasta_sample.fasta",
                             s3_config=minio)

    # Preprocess in 4 parallel jobs (chunk size = total size / 4)
    parallel_config = {"verbose": 10}
    chunk_size = math.ceil(co.size / 4)
    co.preprocess(parallel_config=parallel_config, chunk_size=chunk_size)

    print(f"FASTA file has {co.attributes.num_sequences} sequences")

    # Partition the object into 8 chunks; each slice fetches its byte range on .get()
    data_slices = co.partition(partition_chunks_strategy, num_chunks=8)
    for data_slice in data_slices:
        batch = data_slice.get().decode('utf-8')
        print(batch)
        print('---')


if __name__ == "__main__":
    main()

For optimal performance, experiment with different chunk sizes, distribute slices across multiple workers, ensure a high-bandwidth connection to S3, and monitor memory usage. A sketch of the multi-worker pattern follows below.
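Building on the example above (reusing co and partition_chunks_strategy), here is a minimal sketch of fanning slices out to a thread pool; the worker function, pool size, and character counting are illustrative assumptions, not part of DataPlug's API:

import concurrent.futures


def count_chars(data_slice):
    # Each worker fetches only its own byte range from S3
    return len(data_slice.get().decode("utf-8"))  # stand-in for real per-chunk work


data_slices = co.partition(partition_chunks_strategy, num_chunks=8)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    counts = list(pool.map(count_chars, data_slices))
print(f"Processed {sum(counts)} characters across {len(counts)} slices")

A thread pool suffices here because fetching slices is I/O-bound; for CPU-heavy per-chunk parsing, a process pool or a distributed executor would be the natural next step.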