Skip to content

Added VGGT as an option for sfm-tool #3642

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "vggt"]
path = vggt
url = https://github.com/jckhng/vggt.git
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# limitations under the License.

"""Base class to processes a video or image sequence to a nerfstudio compatible dataset."""
import sys
sys.path.append('./vggt')

from dataclasses import dataclass
from pathlib import Path
Expand All @@ -35,7 +37,7 @@ class ColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
"""Feature matching method to use. Vocab tree is recommended for a balance of speed
and accuracy. Exhaustive is slower but more accurate. Sequential is faster but
should only be used for videos."""
sfm_tool: Literal["any", "colmap", "hloc"] = "any"
sfm_tool: Literal["any", "colmap", "hloc", "vggt"] = "any"
"""Structure from motion tool to use. Colmap will use sift features, hloc can use
many modern methods such as superpoint features and superglue matcher"""
refine_pixsfm: bool = False
Expand Down Expand Up @@ -238,6 +240,34 @@ def _run_colmap(self, mask_path: Optional[Path] = None):
refine_pixsfm=self.refine_pixsfm,
use_single_camera_mode=self.use_single_camera_mode,
)
elif sfm_tool == "vggt":
from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points

model, device = load_model()
predictions, image_names = process_images(image_dir, model, device)

quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"])
points3D, image_points2D = filter_and_prepare_points(
predictions,
conf_threshold=50.0,
mask_sky=False,
mask_black_bg=True,
mask_white_bg=True,
stride=1,
)

# Save COLMAP-compatible files
colmap_utils.save_colmap_files(
self.output_dir,
quaternions,
translations,
points3D,
image_points2D,
image_names,
predictions["intrinsic"],
predictions["images"].shape[2],
predictions["images"].shape[1],
)
else:
raise RuntimeError("Invalid combination of sfm_tool, feature_type, and matcher_type, exiting")

Expand Down
77 changes: 77 additions & 0 deletions nerfstudio/process_data/colmap_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,3 +712,80 @@ def create_ply_from_colmap(
x, y, z = coord
r, g, b = color
f.write(f"{x:8f} {y:8f} {z:8f} {r} {g} {b}\n")


def save_colmap_files(
    output_dir: Path,
    quaternions: np.ndarray,
    translations: np.ndarray,
    points3D: list,
    image_points2D: list,
    image_names: list,
    intrinsic: np.ndarray,
    width: int,
    height: int,
):
    """Save COLMAP-compatible reconstruction files in both text and binary formats.

    Six files are written to ``<output_dir>/colmap/sparse/0`` (created if it does
    not exist): ``cameras``, ``images`` and ``points3D``, each as ``.txt`` and
    ``.bin``. Serialization itself is delegated to the writer functions of the
    ``vggt_to_colmap`` module, which is imported lazily from the ``vggt``
    directory resolved relative to this file.

    Args:
        output_dir: Root directory under which ``colmap/sparse/0`` is created.
        quaternions: Camera orientations as quaternions (Nx4).
        translations: Camera translations (Nx3), one per image.
        points3D: List of 3D points with rgb and observation info.
        image_points2D: 2D point observations for each image.
        image_names: Names of the images.
        intrinsic: Camera intrinsics, forwarded unchanged to the camera writers.
        width: Image width.
        height: Image height.
    """
    import sys

    # Make ``vggt_to_colmap`` importable. Guard the append so repeated calls do
    # not keep growing sys.path with duplicate entries.
    vggt_root = str(Path(__file__).parent.parent.parent / "vggt")
    if vggt_root not in sys.path:
        sys.path.append(vggt_root)
    from vggt_to_colmap import (
        write_colmap_cameras_bin,
        write_colmap_cameras_txt,
        write_colmap_images_bin,
        write_colmap_images_txt,
        write_colmap_points3D_bin,
        write_colmap_points3D_txt,
    )

    # Path(...) is a no-op for Path inputs and lets callers pass a plain string.
    sparse_dir = Path(output_dir) / "colmap" / "sparse" / "0"
    sparse_dir.mkdir(parents=True, exist_ok=True)

    # Text format.
    write_colmap_cameras_txt(sparse_dir / "cameras.txt", intrinsic, width, height)
    write_colmap_images_txt(
        sparse_dir / "images.txt",
        quaternions,
        translations,
        image_points2D,
        image_names,
    )
    write_colmap_points3D_txt(sparse_dir / "points3D.txt", points3D)

    # Binary format.
    write_colmap_cameras_bin(sparse_dir / "cameras.bin", intrinsic, width, height)
    write_colmap_images_bin(
        sparse_dir / "images.bin",
        quaternions,
        translations,
        image_points2D,
        image_names,
    )
    write_colmap_points3D_bin(sparse_dir / "points3D.bin", points3D)
28 changes: 19 additions & 9 deletions nerfstudio/process_data/process_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def downscale_images(


def find_tool_feature_matcher_combination(
sfm_tool: Literal["any", "colmap", "hloc"],
sfm_tool: Literal["any", "colmap", "hloc", "vggt"],
feature_type: Literal[
"any",
"sift",
Expand Down Expand Up @@ -518,11 +518,11 @@ def find_tool_feature_matcher_combination(
) -> Union[
Tuple[None, None, None],
Tuple[
Literal["colmap", "hloc"],
Literal["colmap", "hloc", "vggt"],
Literal[
"sift",
"superpoint_aachen",
"superpoint_max",
"superpoint_max",
"superpoint_inloc",
"r2d2",
"d2net-ss",
Expand All @@ -546,7 +546,7 @@ def find_tool_feature_matcher_combination(
Basically, replace the default parameters 'any' by usable value

Args:
sfm_tool: Sfm tool name (any, colmap, hloc)
sfm_tool: Sfm tool name (any, colmap, hloc, vggt)
feature_type: Type of image features (any, sift, superpoint, ...)
matcher_type: Type of matching algorithm (any, NN, superglue,...)

Expand All @@ -555,10 +555,19 @@ def find_tool_feature_matcher_combination(
Returns (None,None,None) if no valid combination can be found
"""
if sfm_tool == "any":
if (feature_type in ("any", "sift")) and (matcher_type in ("any", "NN")):
sfm_tool = "colmap"
else:
sfm_tool = "hloc"
sfm_tool = "colmap"

if sfm_tool == "vggt":
# VGGT does not require feature_type or matcher_type
return ("vggt", None, None)
elif sfm_tool == "colmap":
feature_type = "sift"
matcher_type = "NN"
elif sfm_tool == "hloc":
feature_type = feature_type or "superpoint"
matcher_type = matcher_type or "superglue"
else:
raise ValueError(f"Invalid sfm_tool: {sfm_tool}")

if sfm_tool == "colmap":
if (feature_type not in ("any", "sift")) or (matcher_type not in ("any", "NN")):
Expand All @@ -573,7 +582,8 @@ def find_tool_feature_matcher_combination(
elif matcher_type == "NN":
matcher_type = "NN-mutual"

return (sfm_tool, feature_type, matcher_type)
return ("hloc", feature_type, matcher_type)

return (None, None, None)


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import sys
sys.path.append('./vggt')

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import shutil

from nerfstudio.process_data.base_converter_to_nerfstudio_dataset import BaseConverterToNerfstudioDataset
from nerfstudio.process_data.colmap_utils import colmap_to_json
from nerfstudio.utils.rich_utils import CONSOLE

# Adjusted imports for VGGT utilities
from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points, write_colmap_cameras_txt, write_colmap_images_txt, write_colmap_points3D_txt, write_colmap_cameras_bin, write_colmap_images_bin, write_colmap_points3D_bin

@dataclass
class VGGTColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
    """Class to process VGGT data into a Nerfstudio-compatible dataset.

    The pipeline runs VGGT on the images in ``data``, writes the predicted
    cameras and points as COLMAP text and binary files directly into
    ``output_dir``, and then converts them with ``colmap_to_json``.
    """

    data: Path
    """Path to the input image directory. It is passed to VGGT as-is and must contain a
    `downscaled` subdirectory whose files are copied to `<output_dir>/images`."""
    output_dir: Path
    """Path to the output directory."""
    conf_threshold: float = 50.0
    """Confidence threshold for filtering points."""
    mask_sky: bool = False
    """Whether to mask sky regions."""
    mask_black_bg: bool = False
    """Whether to mask black background regions."""
    mask_white_bg: bool = False
    """Whether to mask white background regions."""
    stride: int = 1
    """Stride for point sampling."""

    @property
    def image_dir(self) -> Path:
        """Directory holding the input images; this is ``data`` itself."""
        return self.data

    def _run_vggt_to_colmap(self):
        """Run VGGT to generate COLMAP-compatible data.

        Returns:
            A tuple ``(quaternions, translations, points3D, image_points2D,
            image_names, predictions)`` where ``predictions`` is the raw output
            of ``process_images`` and the rest are derived from it.
        """
        model, device = load_model()
        predictions, image_names = process_images(self.image_dir, model, device)

        quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"])
        points3D, image_points2D = filter_and_prepare_points(
            predictions,
            self.conf_threshold,
            mask_sky=self.mask_sky,
            mask_black_bg=self.mask_black_bg,
            mask_white_bg=self.mask_white_bg,
            stride=self.stride,
        )

        return quaternions, translations, points3D, image_points2D, image_names, predictions

    def _save_transforms(self, num_frames: int) -> List[str]:
        """Save transforms.json after processing VGGT data.

        Args:
            num_frames: Currently unused; kept so the signature stays compatible
                with existing callers.

        Returns:
            Summary lines describing the conversion result.
        """
        summary_log = []
        quaternions, translations, points3D, image_points2D, image_names, predictions = self._run_vggt_to_colmap()

        with CONSOLE.status("[bold yellow]Saving results to transforms.json", spinner="balloon"):
            # Look these up once; both the text and binary writers need them.
            intrinsic = predictions["intrinsic"]
            image_shape = predictions["images"].shape
            width, height = image_shape[2], image_shape[1]
            recon_dir = self.output_dir

            # The COLMAP files must exist before colmap_to_json reads recon_dir.
            # Text format first.
            write_colmap_cameras_txt(recon_dir / "cameras.txt", intrinsic, width, height)
            write_colmap_images_txt(
                recon_dir / "images.txt",
                quaternions,
                translations,
                image_points2D,
                image_names,
            )
            write_colmap_points3D_txt(recon_dir / "points3D.txt", points3D)

            # Binary format.
            write_colmap_cameras_bin(recon_dir / "cameras.bin", intrinsic, width, height)
            write_colmap_images_bin(
                recon_dir / "images.bin",
                quaternions,
                translations,
                image_points2D,
                image_names,
            )
            write_colmap_points3D_bin(recon_dir / "points3D.bin", points3D)

            num_matched_frames = colmap_to_json(
                recon_dir=recon_dir,
                output_dir=self.output_dir,
                camera_mask_path=None,
                image_id_to_depth_path=None,
                image_rename_map=None,
                keep_original_world_coordinate=False,
                use_single_camera_mode=True,
            )
            summary_log.append(f"VGGT-Colmap matched {num_matched_frames} images")

        return summary_log

    def __post_init__(self) -> None:
        """Validate that the input image directory exists.

        Raises:
            RuntimeError: If ``image_dir`` does not exist.
        """
        super().__post_init__()
        if not self.image_dir.exists():
            raise RuntimeError(f"Image directory {self.image_dir} does not exist.")

    def main(self) -> None:
        """Main method to process VGGT data into a Nerfstudio-compatible dataset.

        Raises:
            FileNotFoundError: If ``<image_dir>/downscaled`` does not exist.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Copy downscaled images from the input directory to the output directory.
        # NOTE(review): VGGT itself runs on image_dir, not on this subdirectory;
        # confirm the file names in both locations are expected to match.
        downscaled_image_dir = self.image_dir / "downscaled"
        if not downscaled_image_dir.exists():
            raise FileNotFoundError(f"Expected downscaled directory at {downscaled_image_dir}, but it does not exist.")

        output_image_dir = self.output_dir / "images"
        output_image_dir.mkdir(parents=True, exist_ok=True)

        for image_file in downscaled_image_dir.iterdir():
            if image_file.is_file():
                shutil.copy(image_file, output_image_dir)

        # Route through the shared console like the rest of the output;
        # markup=False keeps bracketed path components from being read as rich tags.
        CONSOLE.print(f"Copied downscaled images to {output_image_dir}", markup=False)

        summary_log = self._save_transforms(num_frames=0)

        for summary in summary_log:
            CONSOLE.print(summary, justify="center")
        CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:")
1 change: 0 additions & 1 deletion nerfstudio/scripts/process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,6 @@ def main(self) -> None: ...
],
]


def entrypoint():
"""Entrypoint for use with pyproject scripts."""
tyro.extras.set_accent_color("bright_yellow")
Expand Down
3 changes: 3 additions & 0 deletions notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1. Need to read in a large batch of images.
2. The pain point is that VGGT is a memory hog.
3. How do we fix this? Do we need to process the images in batches, and if so, how do we combine the results afterwards?
1 change: 1 addition & 0 deletions vggt
Submodule vggt added at 89f1ec
Loading