-
Notifications
You must be signed in to change notification settings - Fork 483
adds function to compute load plans when splits are unknown #5982
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,13 @@ | |
| import org.apache.accumulo.core.client.admin.TableOperations.ImportMappingOptions; | ||
| import org.apache.accumulo.core.client.rfile.RFile; | ||
| import org.apache.accumulo.core.clientImpl.bulk.BulkImport; | ||
| import org.apache.accumulo.core.crypto.CryptoFactoryLoader; | ||
| import org.apache.accumulo.core.dataImpl.KeyExtent; | ||
| import org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile; | ||
| import org.apache.accumulo.core.spi.crypto.CryptoEnvironment; | ||
| import org.apache.accumulo.core.spi.crypto.CryptoService; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hadoop.io.Text; | ||
|
|
||
|
|
@@ -90,13 +96,19 @@ public enum RangeType { | |
| * row and end row can be null. The start row is exclusive and the end row is inclusive (like | ||
| * Accumulo tablets). A common use case for this would be when files were partitioned using a | ||
| * table's splits. When using this range type, the start and end row must exist as splits in the | ||
| * table or an exception will be thrown at load time. | ||
| * table or an exception will be thrown at load time. This RangeType is the most efficient for | ||
| * Accumulo to load, and it enables only loading files to tablets that overlap data in the file. | ||
| */ | ||
| TABLE, | ||
| /** | ||
| * Range that correspond to known rows in a file. For this range type, the start row and end row | ||
| * must be non-null. The start row and end row are both considered inclusive. At load time, | ||
| * these data ranges will be mapped to table ranges. | ||
| * Range that corresponds to the minimum and maximum rows in a file. For this range type, the | ||
| * start row and end row must be non-null. The start row and end row are both considered | ||
| * inclusive. At load time, these data ranges will be mapped to table ranges. For this RangeType | ||
| * Accumulo has to do more work at load to map the file range to tablets. Also, this will map a | ||
| * file to all tablets in the range even if the file has no data for that tablet. For example if | ||
| * a range overlapped 10 tablets but the file only had data for 8 of those tablets, the file | ||
| * would still be loaded to all 10. This will not cause problems for scans or compactions other | ||
| * than the unnecessary work of opening a file and finding it has no data for the tablet. | ||
| */ | ||
| FILE | ||
| } | ||
|
|
@@ -459,6 +471,7 @@ static SplitResolver from(SortedSet<Text> splits) { | |
| * Computes a load plan for a given rfile. This will open the rfile and find every | ||
| * {@link TableSplits} that overlaps rows in the file and add those to the returned load plan. | ||
| * | ||
| * @return a load plan of type {@link RangeType#TABLE} | ||
| * @since 2.1.4 | ||
| */ | ||
| public static LoadPlan compute(URI file, SplitResolver splitResolver) throws IOException { | ||
|
|
@@ -475,6 +488,7 @@ public static LoadPlan compute(URI file, SplitResolver splitResolver) throws IOE | |
| * | ||
| * @param properties used when opening the rfile, see | ||
| * {@link org.apache.accumulo.core.client.rfile.RFile.ScannerOptions#withTableProperties(Map)} | ||
| * @return a load plan of type {@link RangeType#TABLE} | ||
| * @since 2.1.4 | ||
| */ | ||
| public static LoadPlan compute(URI file, Map<String,String> properties, | ||
|
|
@@ -510,4 +524,41 @@ public static LoadPlan compute(URI file, Map<String,String> properties, | |
| return builder.build(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Computes a load plan for a rfile based on the minimum and maximum row present across all | ||
| * locality groups. The rfile is opened using a default Hadoop {@link Configuration} and the | ||
| * resulting plan maps the file's name (the last component of its path) to the inclusive row | ||
| * range spanning its first and last keys. | ||
| * | ||
| * @param file the location of the rfile to examine | ||
| * @param properties used when opening the rfile, see | ||
| * {@link org.apache.accumulo.core.client.rfile.RFile.ScannerOptions#withTableProperties(Map)} | ||
| * | ||
| * @return a load plan of type {@link RangeType#FILE} | ||
| * @throws IOException if the rfile can not be opened or read | ||
| * @throws IllegalArgumentException if the rfile contains no data, because a | ||
| * {@link RangeType#FILE} range requires a non-null start and end row | ||
| * @since 2.1.5 | ||
| */ | ||
| public static LoadPlan compute(URI file, Map<String,String> properties) throws IOException { | ||
| var path = new Path(file); | ||
| var conf = new Configuration(); | ||
| var fs = FileSystem.get(path.toUri(), conf); | ||
| CryptoService cs = | ||
| CryptoFactoryLoader.getServiceForClient(CryptoEnvironment.Scope.TABLE, properties); | ||
| CachableBlockFile.CachableBuilder cb = | ||
| new CachableBlockFile.CachableBuilder().fsPath(fs, path).conf(conf).cryptoService(cs); | ||
| try (var reader = new org.apache.accumulo.core.file.rfile.RFile.Reader(cb)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason not to use FileOperations.ReaderBuilder?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure, I will look into that. I think I copied this code from somewhere (maybe the Rfile print info code) that was doing what I needed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated in a10cd12 to use FileOperations |
||
| var firstKey = reader.getFirstKey(); | ||
| var lastKey = reader.getLastKey(); | ||
| if (firstKey == null || lastKey == null) { | ||
| // An rfile with no entries has no first or last key, so there are no rows to build a | ||
| // FILE range from. Fail with a clear message instead of a NullPointerException. | ||
| throw new IllegalArgumentException( | ||
| "Unable to compute a load plan for " + file + " because it contains no data"); | ||
| } | ||
| return LoadPlan.builder() | ||
| .loadFileTo(path.getName(), RangeType.FILE, firstKey.getRow(), lastKey.getRow()).build(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Computes a load plan for a rfile based on the minimum and maximum row present across all | ||
| * locality groups. | ||
| * | ||
| * @return a load plan of type {@link RangeType#FILE} | ||
| * @since 2.1.5 | ||
| */ | ||
| public static LoadPlan compute(URI file) throws IOException { | ||
| return compute(file, Map.of()); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.