# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4

PortSystem          1.0
PortGroup           cmake 1.1
PortGroup           github 1.0

# If you update this port, also update py-sentencepiece to the latest version.
# `py-sentencepiece` is closely coupled to `sentencepiece`.
# --- Upstream source, version and distfile checksums ---
github.setup        google sentencepiece 0.2.0 v
revision            0
checksums           rmd160  9940d5c995a5ee0895854fdd0ca3f8ae4fed6d95 \
                    sha256  9970f0a0afee1648890293321665e5b2efa04eaec9f1671fcf8048f456f5bb86 \
                    size    11980811

# --- Port metadata ---
categories          textproc
license             Apache-2
maintainers         nomaintainer

description         Unsupervised text tokenizer for Neural Network-based text generation.

long_description    SentencePiece is an unsupervised text tokenizer and \
                    detokenizer mainly for Neural Network-based text \
                    generation systems where the vocabulary size is \
                    predetermined prior to the neural model training. \
                    SentencePiece implements subword units (e.g., \
                    byte-pair-encoding (BPE) (Sennrich et al.) and \
                    unigram language model (Kudo)) with the extension \
                    of direct training from raw sentences. \
                    SentencePiece allows us to make a purely end-to-end \
                    system that does not depend on language-specific \
                    pre/postprocessing.

# Fetch the distfile from GitHub's "archive" tarball source.
github.tarball_from archive

# Library dependencies, one per statement.
depends_lib-append  port:gperftools
depends_lib-append  port:protobuf3-cpp

# Compiler requirements: C++17 and thread-local storage support.
compiler.cxx_standard           2017
compiler.thread_local_storage   yes

# Turn off the project's built-in protobuf option; presumably this makes the
# build use the protobuf3-cpp dependency declared above (confirm against
# upstream CMakeLists.txt).
configure.args      -DSPM_USE_BUILTIN_PROTOBUF=OFF

platform darwin {
    # When the selected compiler's name contains "gcc", link libatomic
    # explicitly. The original note cites the symbol ___atomic_fetch_add_8,
    # presumably left unresolved at link time without it.
    if {[string first gcc ${configure.compiler}] >= 0} {
        configure.ldflags-append  -latomic
    }
}
