nerfstudio-project · jckhng · Apr 15, 2025 · Apr 15, 2025 · Apr 15, 2025 · Apr 23, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vggt"]
+	path = vggt
+	url = https://github.com/jckhng/vggt.git
diff --git a/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py b/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 """Base class to processes a video or image sequence to a nerfstudio compatible dataset."""
+import sys
+sys.path.append('./vggt')
 
 from dataclasses import dataclass
 from pathlib import Path
@@ -35,7 +37,7 @@ class ColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
     """Feature matching method to use. Vocab tree is recommended for a balance of speed
     and accuracy. Exhaustive is slower but more accurate. Sequential is faster but
     should only be used for videos."""
-    sfm_tool: Literal["any", "colmap", "hloc"] = "any"
+    sfm_tool: Literal["any", "colmap", "hloc", "vggt"] = "any"
     """Structure from motion tool to use. Colmap will use sift features, hloc can use
     many modern methods such as superpoint features and superglue matcher"""
     refine_pixsfm: bool = False
@@ -238,6 +240,34 @@ def _run_colmap(self, mask_path: Optional[Path] = None):
                 refine_pixsfm=self.refine_pixsfm,
                 use_single_camera_mode=self.use_single_camera_mode,
             )
+        elif sfm_tool == "vggt":
+            from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points
+
+            model, device = load_model()
+            predictions, image_names = process_images(image_dir, model, device)
+
+            quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"])
+            points3D, image_points2D = filter_and_prepare_points(
+                predictions,
+                conf_threshold=50.0,
+                mask_sky=False,
+                mask_black_bg=True,
+                mask_white_bg=True,
+                stride=1,
+            )
+
+            # Save COLMAP-compatible files
+            colmap_utils.save_colmap_files(
+                self.output_dir,
+                quaternions,
+                translations,
+                points3D,
+                image_points2D,
+                image_names,
+                predictions["intrinsic"],
+                predictions["images"].shape[2],
+                predictions["images"].shape[1],
+            )
         else:
             raise RuntimeError("Invalid combination of sfm_tool, feature_type, and matcher_type, exiting")
 

diff --git a/nerfstudio/process_data/colmap_utils.py b/nerfstudio/process_data/colmap_utils.py
@@ -712,3 +712,80 @@ def create_ply_from_colmap(
             x, y, z = coord
             r, g, b = color
             f.write(f"{x:8f} {y:8f} {z:8f} {r} {g} {b}\n")
+
+
+def save_colmap_files(
+    output_dir: Path,
+    quaternions: np.ndarray,
+    translations: np.ndarray,
+    points3D: list,
+    image_points2D: list,
+    image_names: list,
+    intrinsic: np.ndarray,
+    width: int,
+    height: int,
+) -> None:
+    """Save COLMAP-compatible reconstruction files in both text and binary formats.
+
+    The files are written to ``output_dir / "colmap" / "sparse" / "0"`` (created if
+    missing) using the writer functions of the ``vggt_to_colmap`` module. Each of
+    cameras, images and points3D is written twice: once as ``.txt`` and once as ``.bin``.
+
+    Args:
+        output_dir: Root output directory; the sparse model goes into a subdirectory of it
+        quaternions: Per-image camera orientations as quaternions (Nx4)
+        translations: Per-image translation vectors (Nx3), paired with ``quaternions``
+        points3D: List of 3D points with rgb and observation info
+        image_points2D: 2D point observations for each image
+        image_names: Names of the images
+        intrinsic: Camera intrinsics, forwarded unchanged to the camera writers
+        width: Image width
+        height: Image height
+
+    Raises:
+        ImportError: If the ``vggt_to_colmap`` module cannot be imported.
+    """
+    import sys
+
+    # ``vggt_to_colmap`` is expected in a ``vggt`` directory three levels above this file.
+    # Only extend sys.path when the entry is missing so that repeated calls do not keep
+    # appending duplicates of the same directory.
+    vggt_dir = str(Path(__file__).parent.parent.parent / "vggt")
+    if vggt_dir not in sys.path:
+        sys.path.append(vggt_dir)
+    from vggt_to_colmap import (
+        write_colmap_cameras_txt,
+        write_colmap_images_txt,
+        write_colmap_points3D_txt,
+        write_colmap_cameras_bin,
+        write_colmap_images_bin,
+        write_colmap_points3D_bin,
+    )
+
+    sparse_dir = output_dir / "colmap" / "sparse" / "0"
+    sparse_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save text format
+    write_colmap_cameras_txt(
+        sparse_dir / "cameras.txt",
+        intrinsic,
+        width,
+        height,
+    )
+    write_colmap_images_txt(
+        sparse_dir / "images.txt",
+        quaternions,
+        translations,
+        image_points2D,
+        image_names,
+    )
+    write_colmap_points3D_txt(
+        sparse_dir / "points3D.txt",
+        points3D,
+    )
+
+    # Save binary format
+    write_colmap_cameras_bin(
+        sparse_dir / "cameras.bin",
+        intrinsic,
+        width,
+        height,
+    )
+    write_colmap_images_bin(
+        sparse_dir / "images.bin",
+        quaternions,
+        translations,
+        image_points2D,
+        image_names,
+    )
+    write_colmap_points3D_bin(
+        sparse_dir / "points3D.bin",
+        points3D,
+    )
diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py
@@ -490,7 +490,7 @@ def downscale_images(
 
 
 def find_tool_feature_matcher_combination(
-    sfm_tool: Literal["any", "colmap", "hloc"],
+    sfm_tool: Literal["any", "colmap", "hloc", "vggt"],
     feature_type: Literal[
         "any",
         "sift",
@@ -518,11 +518,11 @@ def find_tool_feature_matcher_combination(
 ) -> Union[
     Tuple[None, None, None],
     Tuple[
-        Literal["colmap", "hloc"],
+        Literal["colmap", "hloc", "vggt"],
         Literal[
             "sift",
             "superpoint_aachen",
-            "superpoint_max",
+            "superpoint_max", 
             "superpoint_inloc",
             "r2d2",
             "d2net-ss",
@@ -546,7 +546,7 @@ def find_tool_feature_matcher_combination(
     Basically, replace the default parameters 'any' by usable value
 
     Args:
-        sfm_tool: Sfm tool name (any, colmap, hloc)
+        sfm_tool: Sfm tool name (any, colmap, hloc, vggt)
         feature_type: Type of image features (any, sift, superpoint, ...)
         matcher_type: Type of matching algorithm (any, NN, superglue,...)
 
@@ -555,10 +555,19 @@ def find_tool_feature_matcher_combination(
         Returns (None,None,None) if no valid combination can be found
     """
     if sfm_tool == "any":
-        if (feature_type in ("any", "sift")) and (matcher_type in ("any", "NN")):
-            sfm_tool = "colmap"
-        else:
-            sfm_tool = "hloc"
+        sfm_tool = "colmap"
+
+    if sfm_tool == "vggt":
+        # VGGT does not require feature_type or matcher_type
+        return ("vggt", None, None)
+    elif sfm_tool == "colmap":
+        feature_type = "sift"
+        matcher_type = "NN"
+    elif sfm_tool == "hloc":
+        feature_type = feature_type or "superpoint"
+        matcher_type = matcher_type or "superglue"
+    else:
+        raise ValueError(f"Invalid sfm_tool: {sfm_tool}")
 
     if sfm_tool == "colmap":
         if (feature_type not in ("any", "sift")) or (matcher_type not in ("any", "NN")):
@@ -573,7 +582,8 @@ def find_tool_feature_matcher_combination(
         elif matcher_type == "NN":
             matcher_type = "NN-mutual"
 
-        return (sfm_tool, feature_type, matcher_type)
+        return ("hloc", feature_type, matcher_type)
+
     return (None, None, None)
 
 

diff --git a/nerfstudio/process_data/vggt_colmap_converter_to_nerfstudio_dataset.old b/nerfstudio/process_data/vggt_colmap_converter_to_nerfstudio_dataset.old
@@ -0,0 +1,140 @@
+import sys
+sys.path.append('./vggt')
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import shutil
+
+from nerfstudio.process_data.base_converter_to_nerfstudio_dataset import BaseConverterToNerfstudioDataset
+from nerfstudio.process_data.colmap_utils import colmap_to_json
+from nerfstudio.utils.rich_utils import CONSOLE
+
+# Adjusted imports for VGGT utilities
+from vggt_to_colmap import load_model, process_images, extrinsic_to_colmap_format, filter_and_prepare_points, write_colmap_cameras_txt, write_colmap_images_txt, write_colmap_points3D_txt, write_colmap_cameras_bin, write_colmap_images_bin, write_colmap_points3D_bin
+
+@dataclass
+class VGGTColmapConverterToNerfstudioDataset(BaseConverterToNerfstudioDataset):
+    """Class to process VGGT data into a Nerfstudio-compatible dataset.
+
+    ``main`` copies the files of ``<data>/downscaled`` into ``<output_dir>/images``, runs the
+    ``vggt_to_colmap`` helpers on ``data``, writes the result as COLMAP text and binary
+    files directly into ``output_dir`` and finally calls ``colmap_to_json`` on them.
+    """
+
+    data: Path
+    """Path to the input data, either a video file or a directory of images."""
+    output_dir: Path
+    """Path to the output directory."""
+    conf_threshold: float = 50.0
+    """Confidence threshold for filtering points."""
+    mask_sky: bool = False
+    """Whether to mask sky regions."""
+    mask_black_bg: bool = False
+    """Whether to mask black background regions."""
+    mask_white_bg: bool = False
+    """Whether to mask white background regions."""
+    stride: int = 1
+    """Stride for point sampling."""
+
+    @property
+    def image_dir(self) -> Path:
+        """Directory the images are read from; this is ``self.data`` itself."""
+        # NOTE(review): this property has no setter. If the base class assigns
+        # ``self.image_dir`` (for example in its ``__post_init__``), that assignment
+        # will raise AttributeError - confirm against the base class.
+        return self.data
+
+    def _run_vggt_to_colmap(self):
+        """Run VGGT to generate COLMAP-compatible data.
+
+        Returns:
+            The tuple ``(quaternions, translations, points3D, image_points2D, image_names,
+            predictions)`` produced by the ``vggt_to_colmap`` helpers.
+        """
+        model, device = load_model()
+        predictions, image_names = process_images(self.image_dir, model, device)
+
+        quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"])
+        points3D, image_points2D = filter_and_prepare_points(
+            predictions,
+            self.conf_threshold,
+            mask_sky=self.mask_sky,
+            mask_black_bg=self.mask_black_bg,
+            mask_white_bg=self.mask_white_bg,
+            stride=self.stride,
+        )
+
+        return quaternions, translations, points3D, image_points2D, image_names, predictions
+
+    def _save_transforms(self, num_frames: int) -> List[str]:
+        """Save transforms.json after processing VGGT data.
+
+        Args:
+            num_frames: Not read by this method.
+
+        Returns:
+            A list of summary messages; it contains one entry reporting the value
+            returned by ``colmap_to_json``.
+        """
+        summary_log = []
+        quaternions, translations, points3D, image_points2D, image_names, predictions = self._run_vggt_to_colmap()
+
+        # Look the camera parameters up once; the text and binary writers take the same values.
+        intrinsic = predictions["intrinsic"]
+        width = predictions["images"].shape[2]
+        height = predictions["images"].shape[1]
+
+        with CONSOLE.status("[bold yellow]Saving results to transforms.json", spinner="balloon"):
+            # Save COLMAP-compatible files before calling colmap_to_json
+            write_colmap_cameras_txt(self.output_dir / "cameras.txt", intrinsic, width, height)
+            write_colmap_images_txt(
+                self.output_dir / "images.txt",
+                quaternions,
+                translations,
+                image_points2D,
+                image_names,
+            )
+            write_colmap_points3D_txt(self.output_dir / "points3D.txt", points3D)
+
+            # Save binary COLMAP-compatible files
+            write_colmap_cameras_bin(self.output_dir / "cameras.bin", intrinsic, width, height)
+            write_colmap_images_bin(
+                self.output_dir / "images.bin",
+                quaternions,
+                translations,
+                image_points2D,
+                image_names,
+            )
+            write_colmap_points3D_bin(self.output_dir / "points3D.bin", points3D)
+
+            num_matched_frames = colmap_to_json(
+                recon_dir=self.output_dir,
+                output_dir=self.output_dir,
+                camera_mask_path=None,
+                image_id_to_depth_path=None,
+                image_rename_map=None,
+                keep_original_world_coordinate=False,
+                use_single_camera_mode=True,
+            )
+            summary_log.append(f"VGGT-Colmap matched {num_matched_frames} images")
+
+        return summary_log
+
+    def __post_init__(self) -> None:
+        """Run the base-class initialisation, then require that ``image_dir`` exists.
+
+        Raises:
+            RuntimeError: If ``image_dir`` does not exist.
+        """
+        super().__post_init__()
+        if not self.image_dir.exists():
+            raise RuntimeError(f"Image directory {self.image_dir} does not exist.")
+
+    def main(self) -> None:
+        """Main method to process VGGT data into a Nerfstudio-compatible dataset.
+
+        Raises:
+            FileNotFoundError: If ``<image_dir>/downscaled`` does not exist.
+        """
+        # Validate the input before creating anything, so that a missing input
+        # directory does not leave an empty output tree behind.
+        downscaled_image_dir = self.image_dir / "downscaled"
+        if not downscaled_image_dir.exists():
+            raise FileNotFoundError(f"Expected downscaled directory at {downscaled_image_dir}, but it does not exist.")
+
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        output_image_dir = self.output_dir / "images"
+        output_image_dir.mkdir(parents=True, exist_ok=True)
+
+        # Copy downscaled images from the input directory to the output directory
+        # (regular files only; subdirectories are skipped).
+        for image_file in downscaled_image_dir.iterdir():
+            if image_file.is_file():
+                shutil.copy(image_file, output_image_dir)
+
+        # Report through CONSOLE like the rest of this class instead of a bare print.
+        CONSOLE.print(f"Copied downscaled images to {output_image_dir}")
+
+        summary_log = self._save_transforms(num_frames=0)
+
+        for summary in summary_log:
+            CONSOLE.print(summary, justify="center")
+        CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:")
diff --git a/nerfstudio/scripts/process_data.py b/nerfstudio/scripts/process_data.py
@@ -554,7 +554,6 @@ def main(self) -> None: ...
         ],
     ]
 
-
 def entrypoint():
     """Entrypoint for use with pyproject scripts."""
     tyro.extras.set_accent_color("bright_yellow")

diff --git a/notes.md b/notes.md
@@ -0,0 +1,3 @@
+1. We need to read in a bunch of images.
+2. The pain point is that VGGT is a memory hog.
+3. How do we fix this? Do we need to batch the images, and if so, how do we combine the results after batching?
diff --git a/vggt b/vggt
-Original file line number
+Diff line change
@@ Expand Up / @@ -554,7 +554,6 @@ def main(self) -> None: ... @@
             ],
         ]
     def entrypoint():
         """Entrypoint for use with pyproject scripts."""
         tyro.extras.set_accent_color("bright_yellow")
@@ Expand Down @@