Skip to content

Commit

Permalink
docs v3.7: add archive.md and refs, main readme, API comments and ref
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Aug 22, 2021
1 parent ecf466a commit f85b4c8
Show file tree
Hide file tree
Showing 13 changed files with 88 additions and 65 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ Further, there's the capability referred to as [global namespace](/docs/provider
- Tutorials
- [Tutorials](/docs/tutorials/README.md)
- [Videos](/docs/videos.md)
- Power tools and extensions
- [Reading, writing, and listing *archives*](/docs/archive.md)
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- [Tools and utilities](/docs/tools.md)
- Benchmarking and tuning Performance
- [AIS Load Generator: integrated benchmark tool](/docs/aisloader.md)
- [How to benchmark](/docs/howto_benchmark.md)
Expand All @@ -92,12 +98,6 @@ Further, there's the capability referred to as [global namespace](/docs/provider
- [Storage Services](/docs/storage_svcs.md)
- [Checksumming: brief theory of operations](/docs/checksum.md)
- [S3 compatibility](/docs/s3compat.md)
- Power tools and extensions
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- Reading, writing, and listing *archives* (objects formatted as TAR, TGZ, ZIP, etc.)
- [Tools and utilities](/docs/tools.md)
- Cluster Management
- [Joining AIS cluster](/docs/join_cluster.md)
- [Leaving AIS cluster](/docs/leave_cluster.md)
Expand Down
20 changes: 16 additions & 4 deletions api/bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,15 @@ func waitForAsyncReqComplete(reqParams ReqParams, action string, msg *cmn.Bucket
}

// ListObjects returns list of objects in a bucket. `numObjects` is the
// maximum number of objects returned (0 - return all objects in a bucket).
// maximum number of objects to be returned (0 - return all objects in a bucket).
// This API supports numerous options and flags. In particular, `cmn.SelectMsg`
// supports "opening" objects formatted as one of the supported
// archival types and including contents of archived directories in generated
// result sets.
// See also: CLI and CLI usage examples
// See also: `cmn.SelectMsg`
// See also: `api.ListObjectsInvalidateCache`
// See also: `api.ListObjectsPage`
func ListObjects(baseParams BaseParams, bck cmn.Bck, smsg *cmn.SelectMsg, numObjects uint,
args ...*ProgressContext) (bckList *cmn.BucketList, err error) {
baseParams.Method = http.MethodGet
Expand Down Expand Up @@ -401,8 +409,12 @@ func ListObjects(baseParams BaseParams, bck cmn.Bck, smsg *cmn.SelectMsg, numObj
}

// ListObjectsPage returns the first page of bucket objects.
// On success the function updates `smsg.ContinuationToken`, so a client can reuse
// the message to fetch the next page.
// On success the function updates `smsg.ContinuationToken`, which the client can then reuse
// to fetch the next page.
// See also: CLI and CLI usage examples
// See also: `cmn.SelectMsg`
// See also: `api.ListObjectsInvalidateCache`
// See also: `api.ListObjects`
func ListObjectsPage(baseParams BaseParams, bck cmn.Bck, smsg *cmn.SelectMsg) (*cmn.BucketList, error) {
baseParams.Method = http.MethodGet
if smsg == nil {
Expand Down Expand Up @@ -430,7 +442,7 @@ func ListObjectsPage(baseParams BaseParams, bck cmn.Bck, smsg *cmn.SelectMsg) (*
return page, nil
}

// TODO: remove this function after introducing mechanism detecting bucket changes.
// TODO: obsolete this function after introducing a mechanism to detect remote bucket changes.
func ListObjectsInvalidateCache(params BaseParams, bck cmn.Bck) error {
params.Method = http.MethodPost
var (
Expand Down
3 changes: 1 addition & 2 deletions cmd/cli/commands/arch_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,5 @@ func listArchHandler(c *cli.Context) (err error) {
if err != nil {
return err
}

return _listArchObjects(c, bck, objName, true)
return _doListObj(c, bck, objName, true /*list arch*/)
}
27 changes: 10 additions & 17 deletions cmd/cli/commands/bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,32 +169,26 @@ func listBuckets(c *cli.Context, query cmn.QueryBcks) (err error) {
return
}

// Lists objects in bucket
// Lists objects in a bucket; includes archived content if requested
func listObjects(c *cli.Context, bck cmn.Bck) error {
prefix := parseStrFlag(c, prefixFlag)
listArch := flagIsSet(c, listArchiveFlag)
return _listArchObjects(c, bck, prefix, listArch)
listArch := flagIsSet(c, listArchFlag)
return _doListObj(c, bck, prefix, listArch)
}

// Lists objects in bucket
func _listArchObjects(c *cli.Context, bck cmn.Bck, prefix string, isArch bool) error {
objectListFilter, err := newObjectListFilter(c)
func _doListObj(c *cli.Context, bck cmn.Bck, prefix string, listArch bool) error {
var (
showUnmatched = flagIsSet(c, showUnmatchedFlag)
msg = &cmn.SelectMsg{Prefix: prefix}
objectListFilter, err = newObjectListFilter(c)
)
if err != nil {
return err
}

var (
showUnmatched = flagIsSet(c, showUnmatchedFlag)

msg = &cmn.SelectMsg{
Prefix: prefix,
}
)

if flagIsSet(c, cachedFlag) {
msg.SetFlag(cmn.SelectCached)
}
if isArch {
if listArch {
msg.SetFlag(cmn.SelectArchDir)
}
props := strings.Split(parseStrFlag(c, objPropsFlag), ",")
Expand All @@ -210,7 +204,6 @@ func _listArchObjects(c *cli.Context, bck cmn.Bck, prefix string, isArch bool) e
msg.AddProps(cmn.GetPropsStatus)
msg.SetFlag(cmn.SelectMisplaced)
}

if flagIsSet(c, startAfterFlag) {
msg.StartAfter = parseStrFlag(c, startAfterFlag)
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/commands/bucket_hdlr.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ var (
maxPagesFlag,
startAfterFlag,
cachedFlag,
listArchiveFlag,
listArchFlag,
},
subcmdSummary: {
cachedFlag,
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/commands/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ var (
checksumFlags = getCksumFlags()

// begin archive
listArchiveFlag = cli.BoolFlag{Name: "list-archive", Usage: "list archive content"}
listArchFlag = cli.BoolFlag{Name: "list-archive", Usage: "list archived content"}
createArchFlag = cli.BoolFlag{Name: "archive", Usage: "archive a list or a range of objects"}
archpathFlag = cli.StringFlag{Name: "archpath", Usage: "filename in archive"}
includeSrcBucketNameFlag = cli.BoolFlag{
Expand Down
35 changes: 17 additions & 18 deletions cmn/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,6 @@ import (
"github.com/NVIDIA/aistore/cmn/debug"
)

// SelectMsg extended flags
const (
SelectCached = 1 << iota // list only cached (Cloud buckets only)
SelectMisplaced // Include misplaced
SelectDeleted // Include marked for deletion
SelectArchDir // expand archives as directories
)

// ActionMsg is a JSON-formatted control structures for the REST API
type (
ActionMsg struct {
Expand All @@ -44,10 +36,16 @@ type (
}
)

// Bucket LIST and Bucket Summary
// SelectMsg extended flags: bit values (each constant is a distinct power of two)
// that can be combined in `SelectMsg.Flags`.
const (
SelectCached = 1 << iota // list only cached (Cloud buckets only)
SelectMisplaced // include misplaced
SelectDeleted // include marked for deletion
SelectArchDir // expand archives as directories
)

type (
// TODO: `UUID` should be merged into `ContinuationToken`.
// SelectMsg represents properties and options for listing objects.
// options and flags to list objects
SelectMsg struct {
UUID string `json:"uuid"` // ID to identify a single multi-page request
Props string `json:"props"` // e.g. "checksum,size"
Expand All @@ -56,23 +54,24 @@ type (
PageSize uint `json:"pagesize"` // max entries returned by list objects call
StartAfter string `json:"start_after"` // start listing after (AIS buckets only)
ContinuationToken string `json:"continuation_token"` // `BucketList.ContinuationToken`
Flags uint64 `json:"flags,string"` // advanced filtering (SelectMsg extended flags)
Flags uint64 `json:"flags,string"` // enum {SelectCached, ..., SelectArchDir } - see above
UseCache bool `json:"use_cache"` // use proxy cache to speed up listing objects
}

// control message to generate bucket summary or summaries
BucketSummaryMsg struct {
UUID string `json:"uuid"`
Fast bool `json:"fast"`
Cached bool `json:"cached"`
}
// bucket summary (result) for a given bucket
BucketSummary struct {
Bck
ObjCount uint64 `json:"count,string"`
Size uint64 `json:"size,string"`
TotalDisksSize uint64 `json:"disks_size,string"`
UsedPct float64 `json:"used_pct"`
}
// BucketSummaryMsg represents options that can be set when asking for bucket summary.
BucketSummaryMsg struct {
UUID string `json:"uuid"`
Fast bool `json:"fast"`
Cached bool `json:"cached"`
}
BucketsSummaries []BucketSummary
)

Expand Down
19 changes: 19 additions & 0 deletions docs/archive.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Training on very large datasets is not easy. One of the many associated challenges is a so-called [small-file problem](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=%22small+file+problem%22) - the problem that gets progressively worse given continuous random access to the entirety of an underlying dataset.

Addressing the problem often means providing some sort of serialization (formatting, logic) that, ideally, also hides the fact and allows unmodified clients and apps to run. The AIS approach to this and closely related problems (choices, tradeoffs) can be summarized in one word: TAR. As in: TAR archive.

More precisely, AIS equally supports several archival mime types, including TAR, TGZ (TAR.GZ), and ZIP.

The support itself started way back when we introduced [distributed shuffle](/docs/dsort.md) (extension) that works with all three listed formats and performs massively-parallel custom sorting of any-size datasets. Version 3.7 adds an API-level native capability to read, write and list archives.

In particular, the `list-objects` API supports "opening" objects formatted as one of the supported archival types and including the contents of archived directories in generated result sets.

APPEND to existing archives is also supported, although at the time of this writing it is limited to the TAR format.

In addition, clients can run concurrent multi-object (source bucket to destination bucket) transactions to generate new archives, and more.

See also:

* [CLI examples](/docs/cli/archive.md)
* [More CLI examples](/docs/cli/object.md)
* [API](/docs/http_api.md)
4 changes: 2 additions & 2 deletions docs/cli/bucket.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ List all objects contained in `BUCKET` bucket.
| `--no-headers` | `bool` | Display tables without headers | `false` |
| `--cached` | `bool` | For a remote bucket, shows only objects that have already been downloaded and are cached on local drives (ignored for ais buckets) | `false` |
| `--use-cache` | `bool` | Use proxy cache to speed up list object request | `false` |
| `--start-after` | `string` | Object name after which the listing should start | `""` |
| `--list-archive` | `bool` | Treat archives as directories and include their content into the bucket list |
| `--start-after` | `string` | Object name (marker) after which the listing should start | `""` |
| `--list-archive` | `bool` | List contents of archives (i.e., objects formatted as TAR, TGZ, or ZIP archives) | `false` |

### Examples

Expand Down
4 changes: 2 additions & 2 deletions docs/cli/object.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ redirect_from:
---

# CLI Reference for Objects
This document contains `ais object` commands, such as GET, PUT, APPEND, PROMOTE, PREFETCH, EVICT, and many more.
This document describes `ais object` commands - the commands to read (GET), write (PUT), APPEND, PROMOTE, PREFETCH, and EVICT user data, and more.

## Table of Contents
- [GET object](#get-object)
Expand Down Expand Up @@ -422,7 +422,7 @@ Environment variable `ARCH_PATH` defines the path inside the archive for the new
Add a file to an archive

```console
$ # list archive content before operation
$ # list archived content prior to appending new files
$ ais ls ais://bck --prefix test --list-archive
NAME SIZE
test.tar 42.00KiB
Expand Down
12 changes: 6 additions & 6 deletions docs/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ redirect_from:
- Tutorials
- [Tutorials](/docs/tutorials/README.md)
- [Videos](/docs/videos.md)
- Power tools and extensions
- [Reading, writing, and listing *archives*](/docs/archive.md)
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- [Tools and utilities](/docs/tools.md)
- Benchmarking and tuning Performance
- [AIS Load Generator: integrated benchmark tool](/docs/aisloader.md)
- [How to benchmark](/docs/howto_benchmark.md)
Expand All @@ -45,12 +51,6 @@ redirect_from:
- [Storage Services](/docs/storage_svcs.md)
- [Checksumming: brief theory of operations](/docs/checksum.md)
- [S3 compatibility](/docs/s3compat.md)
- Power tools and extensions
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- Reading, writing, and listing *archives* (objects formatted as TAR, TGZ, ZIP, etc.)
- [Tools and utilities](/docs/tools.md)
- Cluster Management
- [Joining AIS cluster](/docs/join_cluster.md)
- [Leaving AIS cluster](/docs/leave_cluster.md)
Expand Down
1 change: 1 addition & 0 deletions docs/http_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ These APIs also require specific node ID (to identify the target in the cluster
|--- | --- | ---|--- |
| Create multi-object archive _or_ append multiple objects to an existing one | (to be added) | (to be added) | `api.CreateArchMultiObj` |
| APPEND to an existing archive | (to be added) | (to be added) | `api.AppendToArch` |
| List archived content | (to be added) | (to be added) | `api.ListObjects` and friends |

## Starting, stopping, and querying batch operations (jobs)

Expand Down
12 changes: 6 additions & 6 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ Further, there's the capability referred to as [global namespace](/docs/provider
- Tutorials
- [Tutorials](/docs/tutorials/README.md)
- [Videos](/docs/videos.md)
- Power tools and extensions
- [Reading, writing, and listing *archives*](/docs/archive.md)
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- [Tools and utilities](/docs/tools.md)
- Benchmarking and tuning Performance
- [AIS Load Generator: integrated benchmark tool](/docs/aisloader.md)
- [How to benchmark](/docs/howto_benchmark.md)
Expand All @@ -96,12 +102,6 @@ Further, there's the capability referred to as [global namespace](/docs/provider
- [Storage Services](/docs/storage_svcs.md)
- [Checksumming: brief theory of operations](/docs/checksum.md)
- [S3 compatibility](/docs/s3compat.md)
- Power tools and extensions
- [Distributed Shuffle](/docs/dsort.md)
- [Downloader](/docs/downloader.md)
- [Extract, Transform, Load](/docs/etl.md)
- Reading, writing, and listing *archives* (objects formatted as TAR, TGZ, ZIP, etc.)
- [Tools and utilities](/docs/tools.md)
- Cluster Management
- [Joining AIS cluster](/docs/join_cluster.md)
- [Leaving AIS cluster](/docs/leave_cluster.md)
Expand Down

0 comments on commit f85b4c8

Please sign in to comment.